# Modeling

### Imports

In [1]:
#import statements
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as ply
import seaborn as sns

#sci-kit learn
import sklearn
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.dummy import DummyClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### Import cleaned data

In [2]:
# Load the cleaned training set from the project's relative data directory.
clean_train_path = '../data/water_well_train_clean.csv'
df = pd.read_csv(clean_train_path)

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0.1,Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,...,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,id_label,status_group,date_recorded_datetime
0,0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,...,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,69572,functional,2011-03-14
1,1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,...,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,8776,functional,2013-03-06
2,2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,...,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,34310,functional,2013-02-25
3,3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,...,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,67743,non functional,2013-01-28
4,4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,...,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,19728,functional,2011-07-13


In [4]:
# Summarize column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              59400 non-null  int64  
 1   id                      59400 non-null  int64  
 2   amount_tsh              59400 non-null  float64
 3   date_recorded           59400 non-null  object 
 4   funder                  59400 non-null  object 
 5   gps_height              59400 non-null  int64  
 6   installer               59400 non-null  object 
 7   longitude               59400 non-null  float64
 8   latitude                59400 non-null  float64
 9   wpt_name                59400 non-null  object 
 10  num_private             59400 non-null  int64  
 11  basin                   59400 non-null  object 
 12  subvillage              59400 non-null  object 
 13  region                  59400 non-null  object 
 14  region_code             59400 non-null

In [5]:
# Tally the missing values in each column of the loaded frame.
missing_counts = df.isna().sum()
missing_counts

Unnamed: 0                0
id                        0
amount_tsh                0
date_recorded             0
funder                    0
gps_height                0
installer                 0
longitude                 0
latitude                  0
wpt_name                  0
num_private               0
basin                     0
subvillage                0
region                    0
region_code               0
district_code             0
lga                       0
ward                      0
population                0
public_meeting            0
recorded_by               0
scheme_management         0
scheme_name               0
permit                    0
construction_year         0
extraction_type           0
extraction_type_group     0
extraction_type_class     0
management                0
management_group          0
payment                   0
payment_type              0
water_quality             0
quality_group             0
quantity                  0
quantity_group      

# First Model

## Setting up Pipelines

The first model includes the following features:
    
amount_tsh,
installer, 
gps_height, 
basin, 
region, 
district_code (listed here, but not currently present in `picked_features` below), 
lga, 
population, 
public_meeting, 
management, 
permit, 
extraction_type, 
payment, 
water_quality, 
quantity, 
source, 
waterpoint_type

With the target:
status_group

In [6]:
# Feature matrix: every column except the target label.
X = df.drop(columns='status_group')

In [7]:
# Target vector: the status label for each row.
y = df.loc[:, 'status_group']

In [8]:
# Columns fed to the first model, one per line so additions and removals
# show up cleanly in diffs.
picked_features = [
    'amount_tsh',
    'installer',
    'gps_height',
    'basin',
    'region',
    'lga',
    'population',
    'public_meeting',
    'management',
    'permit',
    'extraction_type',
    'payment',
    'water_quality',
    'quantity',
    'source',
    'waterpoint_type',
]
# Display how many features were selected.
len(picked_features)

16

In [9]:
# Hold out a test set; the split size is left at scikit-learn's default and
# the seed is fixed so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [10]:
# Restrict the full feature matrix to the columns chosen for the first model.
X_picked = X.loc[:, picked_features]

In [11]:
# Preview the selected feature columns.
X_picked.head()

Unnamed: 0,amount_tsh,installer,gps_height,basin,region,lga,population,public_meeting,management,permit,extraction_type,payment,water_quality,quantity,source,waterpoint_type
0,6000.0,Roman,1390,Lake Nyasa,Iringa,Ludewa,109,True,vwc,False,gravity,pay annually,soft,enough,spring,communal standpipe
1,0.0,GRUMETI,1399,Lake Victoria,Mara,Serengeti,280,True,wug,True,gravity,never pay,soft,insufficient,rainwater harvesting,communal standpipe
2,25.0,World vision,686,Pangani,Manyara,Simanjiro,250,True,vwc,True,gravity,pay per bucket,soft,enough,dam,communal standpipe multiple
3,0.0,UNICEF,263,Ruvuma / Southern Coast,Mtwara,Nanyumbu,58,True,vwc,True,submersible,never pay,soft,dry,machine dbh,communal standpipe multiple
4,0.0,Artisan,0,Lake Victoria,Kagera,Karagwe,0,True,other,True,gravity,never pay,soft,seasonal,rainwater harvesting,communal standpipe


In [12]:
# Keep only the numeric-dtype columns of the selected features.
X_num = X_picked.select_dtypes(include=np.number)
X_num.head()

Unnamed: 0,amount_tsh,gps_height,population
0,6000.0,1390,109
1,0.0,1399,280
2,25.0,686,250
3,0.0,263,58
4,0.0,0,0


In [13]:
# Names of the numeric columns, used later to route them through the scaler.
num_cols = X_num.columns.tolist()
num_cols

['amount_tsh', 'gps_height', 'population']

In [14]:
# Everything that is not a numeric dtype is treated as categorical.
X_cat = X_picked.select_dtypes(exclude=np.number)
X_cat.head()

Unnamed: 0,installer,basin,region,lga,public_meeting,management,permit,extraction_type,payment,water_quality,quantity,source,waterpoint_type
0,Roman,Lake Nyasa,Iringa,Ludewa,True,vwc,False,gravity,pay annually,soft,enough,spring,communal standpipe
1,GRUMETI,Lake Victoria,Mara,Serengeti,True,wug,True,gravity,never pay,soft,insufficient,rainwater harvesting,communal standpipe
2,World vision,Pangani,Manyara,Simanjiro,True,vwc,True,gravity,pay per bucket,soft,enough,dam,communal standpipe multiple
3,UNICEF,Ruvuma / Southern Coast,Mtwara,Nanyumbu,True,vwc,True,submersible,never pay,soft,dry,machine dbh,communal standpipe multiple
4,Artisan,Lake Victoria,Kagera,Karagwe,True,other,True,gravity,never pay,soft,seasonal,rainwater harvesting,communal standpipe


In [15]:
# Names of the categorical columns, used later to route them through the encoder.
cat_cols = X_cat.columns.tolist()
cat_cols

['installer',
 'basin',
 'region',
 'lga',
 'public_meeting',
 'management',
 'permit',
 'extraction_type',
 'payment',
 'water_quality',
 'quantity',
 'source',
 'waterpoint_type']

In [16]:
# Sub-pipelines for the numeric and categorical feature groups.
# Numeric features: standardize with StandardScaler.
subpipe_num = Pipeline(steps=[('ss', StandardScaler())])
# Categorical features: one-hot encode, dropping one column only for binary
# features. handle_unknown='ignore' encodes categories that were not seen
# during fit as all zeros instead of raising, so held-out rows (test split,
# cross-validation folds) can still be transformed.
# NOTE: combining drop= with handle_unknown='ignore' needs scikit-learn >= 1.0.
subpipe_cat = Pipeline(steps=[('ohe', OneHotEncoder(drop='if_binary', sparse=False, handle_unknown='ignore'))])

In [17]:
# Route each column group through its own preprocessing sub-pipeline; any
# column not named in num_cols or cat_cols is discarded.
column_steps = [
    ('subpipe_num', subpipe_num, num_cols),
    ('subpipe_cat', subpipe_cat, cat_cols),
]
CT = ColumnTransformer(transformers=column_steps, remainder='drop')

In [18]:
# Decision tree with a fixed seed so repeated runs are reproducible.
dtc = DecisionTreeClassifier(random_state=42)

# Full model: column preprocessing followed by the classifier.
model_steps = [('ct', CT), ('dtc', dtc)]
first_model_pipe = Pipeline(steps=model_steps)

In [19]:
# Fit the preprocessing steps and the decision tree on the training split only.
first_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'population']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(drop='if_binary',
                                                                                 sparse=False))]),
                                                  ['installer', 'basin',
                                                   'region', 'lga',
                                                   'public_meeting',
                                                   'manage

In [20]:
# Accuracy on the same rows the model was fitted on: this is a training score,
# not an estimate of performance on unseen data.
first_model_pipe.score(X_train, y_train)

0.9485521885521886

# DummyClassifier Baseline

In [21]:
# Baseline that always predicts the most frequent class seen during fit.
dummy_clf = DummyClassifier(strategy='most_frequent')

In [22]:
# Fit the baseline on the same training split used for the first model.
dummy_clf.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [23]:
# Training accuracy of the baseline, for comparison with the first model's training score.
dummy_clf.score(X_train, y_train)

0.542334455667789