# Modeling

### Imports

In [1]:
#import statements
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as ply
import seaborn as sns

#sci-kit learn
import sklearn
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.dummy import DummyClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### Import cleaned data

In [2]:
df = pd.read_csv('../data/water_well_train_clean.csv')

In [3]:
df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,id_label,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,69572,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,8776,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,34310,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,67743,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,19728,functional


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 42 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 59400 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              59400 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59400 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [5]:
df.isna().sum()

id                       0
amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
subvillage               0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
public_meeting           0
recorded_by              0
scheme_management        0
scheme_name              0
permit                   0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
s

In [6]:
df['region_code'] = df['region_code'].astype(str)
df['district_code'] = df['district_code'].astype(str)

# First Model

## Setting up Pipelines

First model will include the following features:
    
amount_tsh,
installer, 
gps_height, 
basin, 
region, 
district_code, 
lga, 
population, 
public_meeting, 
management, 
permit, 
extraction_type, 
payment, 
water_quality, 
quantity, 
source, 
waterpoint_type

With the target:
status_group

In [7]:
X = df.drop('status_group', axis=1)

In [8]:
y = df['status_group']

In [9]:
picked_features = ['amount_tsh', 'installer',  'gps_height', 'basin', 'region', 'district_code', 'lga', 'population', 
                   'public_meeting', 'management', 'permit', 'extraction_type', 'payment', 'water_quality', 'quantity', 
                   'source', 'waterpoint_type']
len(picked_features)

17

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
X_picked = X[picked_features]

In [12]:
X_picked.head()

Unnamed: 0,amount_tsh,installer,gps_height,basin,region,district_code,lga,population,public_meeting,management,permit,extraction_type,payment,water_quality,quantity,source,waterpoint_type
0,6000.0,Roman,1390,Lake Nyasa,Iringa,5,Ludewa,109,True,vwc,False,gravity,pay annually,soft,enough,spring,communal standpipe
1,0.0,GRUMETI,1399,Lake Victoria,Mara,2,Serengeti,280,True,wug,True,gravity,never pay,soft,insufficient,rainwater harvesting,communal standpipe
2,25.0,World vision,686,Pangani,Manyara,4,Simanjiro,250,True,vwc,True,gravity,pay per bucket,soft,enough,dam,communal standpipe multiple
3,0.0,UNICEF,263,Ruvuma / Southern Coast,Mtwara,63,Nanyumbu,58,True,vwc,True,submersible,never pay,soft,dry,machine dbh,communal standpipe multiple
4,0.0,Artisan,0,Lake Victoria,Kagera,1,Karagwe,0,True,other,True,gravity,never pay,soft,seasonal,rainwater harvesting,communal standpipe


In [13]:
X_num = X_picked.select_dtypes(include='number')
X_num.head()

Unnamed: 0,amount_tsh,gps_height,population
0,6000.0,1390,109
1,0.0,1399,280
2,25.0,686,250
3,0.0,263,58
4,0.0,0,0


In [14]:
num_cols = list(X_num.columns)
num_cols

['amount_tsh', 'gps_height', 'population']

In [15]:
X_cat = X_picked.select_dtypes(include='object')
X_cat.head()

Unnamed: 0,installer,basin,region,district_code,lga,management,extraction_type,payment,water_quality,quantity,source,waterpoint_type
0,Roman,Lake Nyasa,Iringa,5,Ludewa,vwc,gravity,pay annually,soft,enough,spring,communal standpipe
1,GRUMETI,Lake Victoria,Mara,2,Serengeti,wug,gravity,never pay,soft,insufficient,rainwater harvesting,communal standpipe
2,World vision,Pangani,Manyara,4,Simanjiro,vwc,gravity,pay per bucket,soft,enough,dam,communal standpipe multiple
3,UNICEF,Ruvuma / Southern Coast,Mtwara,63,Nanyumbu,vwc,submersible,never pay,soft,dry,machine dbh,communal standpipe multiple
4,Artisan,Lake Victoria,Kagera,1,Karagwe,other,gravity,never pay,soft,seasonal,rainwater harvesting,communal standpipe


In [16]:
cat_cols = list(X_cat.columns)
cat_cols

['installer',
 'basin',
 'region',
 'district_code',
 'lga',
 'management',
 'extraction_type',
 'payment',
 'water_quality',
 'quantity',
 'source',
 'waterpoint_type']

In [17]:
X_bool = X_picked.select_dtypes(include='boolean')
X_bool.head()

Unnamed: 0,public_meeting,permit
0,True,False
1,True,True
2,True,True
3,True,True
4,True,True


In [18]:
bool_cols = list(X_bool.columns)
bool_cols

['public_meeting', 'permit']

In [19]:
#create subpipes for numeric and categorical features
#StandardScaler on numerical features
subpipe_num = Pipeline(steps=[('ss', StandardScaler() )])
#OHE categorical features, handle_unknown = ignore, drop first if binary
subpipe_cat = Pipeline(steps=[('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore') )])
#boolean pipeline with ordinal encoder
subpipe_bool = Pipeline(steps=[('ord', OrdinalEncoder() )])

In [20]:
CT = ColumnTransformer(transformers=[('subpipe_num', subpipe_num, num_cols),
                                     ('subpipe_cat', subpipe_cat, cat_cols),
                                     ('subpipe_bool', subpipe_bool, bool_cols)],
                       remainder='drop')

In [21]:
rfc = RandomForestClassifier(random_state=42)

first_model_pipe = Pipeline(steps = [('ct', CT),
                            ('rfc', rfc )])

In [22]:
first_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  ['amount_tsh', 'gps_height',
                                                   'population']),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['installer', 'basin',
                                                   'region', 'district_code',
                                                   'lga', 'management',
                                      

In [23]:
first_model_pipe.score(X_train, y_train)

0.9486644219977554

In [24]:
first_model_pipe.score(X_test, y_test)

0.7844444444444445

In [25]:
rfc.feature_importances_

array([0.03235295, 0.1525929 , 0.08621218, ..., 0.02874709, 0.0093724 ,
       0.01144891])

In [26]:
rfc

RandomForestClassifier(random_state=42)

In [27]:
len(X_train.columns), len(rfc.feature_importances_)

(41, 2107)

In [28]:
first_model_pipe.named_steps
#dictionary with keys as names, values as objects associated with name

{'ct': ColumnTransformer(transformers=[('subpipe_num',
                                  Pipeline(steps=[('ss', StandardScaler())]),
                                  ['amount_tsh', 'gps_height', 'population']),
                                 ('subpipe_cat',
                                  Pipeline(steps=[('ohe',
                                                   OneHotEncoder(handle_unknown='ignore',
                                                                 sparse=False))]),
                                  ['installer', 'basin', 'region',
                                   'district_code', 'lga', 'management',
                                   'extraction_type', 'payment', 'water_quality',
                                   'quantity', 'source', 'waterpoint_type']),
                                 ('subpipe_bool',
                                  Pipeline(steps=[('ord', OrdinalEncoder())]),
                                  ['public_meeting', 'permit'])]),
 'rfc': Rand

In [29]:
feature_importance = {name:score for name, score in zip(X_train.columns, rfc.feature_importances_)}
feature_importance
#access OHE feature names, combine them with categorical, figure out a way to sort

{'id': 0.03235294712569168,
 'amount_tsh': 0.15259290212476806,
 'date_recorded': 0.08621217983103296,
 'funder': 2.2395872427717503e-05,
 'gps_height': 0.0004901450108233817,
 'installer': 3.101675458972177e-05,
 'longitude': 0.00011260575086136527,
 'latitude': 6.295163399621074e-05,
 'wpt_name': 5.782916429473131e-06,
 'num_private': 1.0074142713492508e-06,
 'basin': 0.0002599373026673585,
 'subvillage': 1.6493957457162314e-05,
 'region': 3.938020005189533e-06,
 'region_code': 7.040896308694836e-05,
 'district_code': 2.8506098507988067e-05,
 'lga': 6.131284578434692e-05,
 'ward': 7.4785752385342e-05,
 'population': 5.2596074025014724e-05,
 'public_meeting': 0.00018206653340206765,
 'recorded_by': 9.620625759138111e-06,
 'scheme_management': 1.015088374680087e-05,
 'scheme_name': 5.666680534884074e-05,
 'permit': 1.417463055604765e-06,
 'construction_year': 1.1070571977721396e-06,
 'extraction_type': 9.325191108548348e-06,
 'extraction_type_group': 1.3038769934013236e-05,
 'extractio

# DummyClassifier Baseline

In [30]:
dummy_clf = DummyClassifier(strategy='most_frequent')

In [31]:
dummy_clf.fit(X_train, y_train)

DummyClassifier(strategy='most_frequent')

In [30]:
dummy_clf.score(X_train, y_train)

0.542334455667789

# GridSearch 

In [34]:
params = {'rfc__n_estimators':[100, 200],
          'rfc__criterion':['gini', 'entropy'],
          'rfc__max_depth':[None, 50, 100],
          'rfc__min_samples_leaf': [1, 10],
          'rfc__min_samples_split': [2, 5, 10]}
gs = GridSearchCV(estimator = first_model_pipe, param_grid = params, cv=5, verbose=3)

In [35]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=100, score=0.786, total=  24.9s
[CV] rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.8s remaining:    0.0s


[CV]  rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=100, score=0.784, total=  24.9s
[CV] rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   49.7s remaining:    0.0s


[CV]  rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=100, score=0.786, total=  24.9s
[CV] rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=100 
[CV]  rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=100, score=0.784, total=  24.9s
[CV] rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=100 
[CV]  rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=100, score=0.781, total=  24.0s
[CV] rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.787, total=  48.2s
[CV] rfc__criterion=gini, rfc__max_depth=

[CV]  rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=100, score=0.743, total=  14.1s
[CV] rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=100 
[CV]  rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=100, score=0.743, total=  13.8s
[CV] rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.756, total=  27.0s
[CV] rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.744, total=  27.1s
[CV] rfc__criterion=gini, rfc__max

[CV]  rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=100, score=0.783, total=  24.0s
[CV] rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.788, total=  46.9s
[CV] rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.788, total=  46.6s
[CV] rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.788, total=  47.3s
[CV] rfc__criterion=gini, rfc__max_depth=50, rfc__min_s

[CV]  rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.756, total=  27.0s
[CV] rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.744, total=  27.1s
[CV] rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.743, total=  27.1s
[CV] rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.745, total=  28.2s
[CV] rfc__criterion=gini, rfc__max_depth=50, rfc

[CV]  rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.785, total=  50.4s
[CV] rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.784, total=  49.6s
[CV] rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.784, total=  51.7s
[CV] rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.782, total=  50.3s
[CV] rfc__criterion=gini, rfc__max_depth=100, rf

[CV]  rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.743, total=  29.2s
[CV] rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.745, total=  30.2s
[CV] rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.742, total=  29.1s
[CV] rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=gini, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.756, total=  15.0s
[CV] rfc__criterion=gini, rfc__max_depth=

[CV]  rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.783, total=  55.5s
[CV] rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.780, total=  56.0s
[CV] rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.796, total=  27.4s
[CV] rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.792, total=  27.4s
[CV] rfc__criterion=

[CV]  rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.746, total=  32.0s
[CV] rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.740, total=  31.9s
[CV] rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.749, total=  16.8s
[CV] rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=None, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.741, total=  16.3s
[CV] rfc__cri

[CV]  rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.788, total=  56.2s
[CV] rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.782, total=  54.1s
[CV] rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.798, total=  25.6s
[CV] rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.790, total=  25.3s
[CV] rfc__criterion=entropy, rfc__

[CV]  rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.746, total=  30.7s
[CV] rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.739, total=  29.9s
[CV] rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.749, total=  15.9s
[CV] rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=50, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.741, total=  15.3s
[CV] rfc__criterion=entropy

[CV]  rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.783, total=  52.5s
[CV] rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.781, total=  51.6s
[CV] rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.796, total=  24.1s
[CV] rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=1, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.792, total=  24.4s
[CV] rfc__criterion=entropy

[CV]  rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.746, total=  29.9s
[CV] rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200 
[CV]  rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=2, rfc__n_estimators=200, score=0.740, total=  28.4s
[CV] rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.749, total=  15.2s
[CV] rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100 
[CV]  rfc__criterion=entropy, rfc__max_depth=100, rfc__min_samples_leaf=10, rfc__min_samples_split=5, rfc__n_estimators=100, score=0.741, total=  15.0s
[CV] rfc__criterion=

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed: 172.5min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         ['amount_tsh',
                                                                          'gps_height',
                                                                          'population']),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                            

In [36]:
gs.best_params_

{'rfc__criterion': 'gini',
 'rfc__max_depth': 50,
 'rfc__min_samples_leaf': 1,
 'rfc__min_samples_split': 10,
 'rfc__n_estimators': 200}

In [37]:
gs.best_score_

0.7958698092031424

In [39]:
gs.score(X_train, y_train)

0.8675869809203143

In [None]:
#FunctionTransformer to select features

In [None]:
#pass in a list of features
#create col lists of numeric, categorical, and boolean data
def GrabTypes(features, df):
    df_features = df[features]
    num_df = df_features.select_dtypes(include='number')
    num_cols = list(num_df.columns)
    cat_df = df_features.select_dtypes(include='object')
    cat_cols = list(cat_df.columns)
    bool_df = df_features.select_dtypes(include='boolean')
    bool_cols = list(bool_df.columns)
    return (num_cols, cat_cols, bool_cols)