## Imports and Data Setup

In [1]:
# DISCLAIMER: For all non-water columns feature engineering, I took the easy way out just to get my resampling and models 
# up and running. 

In [2]:
# data
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import EditedNearestNeighbours
from collections import Counter
import regex as re

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

  import pandas.util.testing as tm


In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [4]:
data = pd.read_csv('../data/train_data.csv')
targets = pd.read_csv('../data/train_targets.csv')
test_data = pd.read_csv('../data/test_set_values.csv')

In [5]:
data = data.merge(targets, on='id', how='inner')

## Basic EDA

In [6]:
data.shape

(59400, 41)

In [7]:
data['status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [8]:
data.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group', 'status_group'],
      dtype='object')

#### Columns Addressed in this Notebook

amount_tsh - total static head
water_quality - The quality of the water
quality_group - The quality of the water
source - The source of the water
source_type - The source of the water
source_class - The source of the water
waterpoint_type - The kind of waterpoint
waterpoint_type_group - The kind of waterpoint

In [9]:
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [10]:
data.drop('scheme_name', axis=1, inplace=True)
data.dropna(axis=0, how='any', inplace=True)

In [11]:
# data['amount_tsh'].value_counts()

In [12]:
# data.isnull().sum()
# no missing data in water columns

In [13]:
data['water_quality'].value_counts()

soft                  42667
salty                  3718
unknown                 880
coloured                320
milky                   299
salty abandoned         231
fluoride                160
fluoride abandoned       13
Name: water_quality, dtype: int64

In [14]:
data['quality_group'].value_counts()

good        42667
salty        3949
unknown       880
colored       320
milky         299
fluoride      173
Name: quality_group, dtype: int64

In [15]:
data['source'].value_counts()

spring                  14852
shallow well            12415
machine dbh              8730
river                    8506
rainwater harvesting     1607
hand dtw                  779
lake                      604
dam                       581
other                     182
unknown                    32
Name: source, dtype: int64

In [16]:
data['source_type'].value_counts()

spring                  14852
shallow well            12415
borehole                 9509
river/lake               9110
rainwater harvesting     1607
dam                       581
other                     214
Name: source_type, dtype: int64

In [17]:
data['source_class'].value_counts()

groundwater    36776
surface        11298
unknown          214
Name: source_class, dtype: int64

In [18]:
data['waterpoint_type'].value_counts()

communal standpipe             23837
hand pump                      13602
communal standpipe multiple     5459
other                           4651
improved spring                  651
cattle trough                     82
dam                                6
Name: waterpoint_type, dtype: int64

In [19]:
data['waterpoint_type_group'].value_counts()

communal standpipe    29296
hand pump             13602
other                  4651
improved spring         651
cattle trough            82
dam                       6
Name: waterpoint_type_group, dtype: int64

In [20]:
# data.dtypes

# amount_tsh - float64
# water_quality - object = 
# quality_group - object = 
# source - object = 
# source_type - object = 
# source_class - object = 
# waterpoint_type - object = 
# waterpoint_type_group - object = 

In [21]:
data.isnull().sum()

id                       0
amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
subvillage               0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
public_meeting           0
recorded_by              0
scheme_management        0
permit                   0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
source_class             0
w

In [22]:
data.shape

(48288, 40)

In [23]:
# data.dtypes

## Non-water Columns Management

In [24]:
data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
5,9944,20.0,2011-03-13,Mkinga Distric Coun,0,DWE,39.172796,-4.765587,Tajiri,0,Pangani,Moa/Mwereme,Tanga,4,8,Mkinga,Moa,1,True,GeoData Consultants Ltd,VWC,True,2009,submersible,submersible,submersible,vwc,user-group,pay per bucket,per bucket,salty,salty,enough,enough,other,other,unknown,communal standpipe multiple,communal standpipe,functional
6,19816,0.0,2012-10-01,Dwsp,0,DWSP,33.36241,-3.766365,Kwa Ngomho,0,Internal,Ishinabulandi,Shinyanga,17,3,Shinyanga Rural,Samuye,0,True,GeoData Consultants Ltd,VWC,True,0,swn 80,swn 80,handpump,vwc,user-group,never pay,never pay,soft,good,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,non functional


In [25]:
# good to go:

# id, amount_tsh, gps_height, population

In [26]:
# date_recorded AND construction_year

data['year'] = pd.DatetimeIndex(data['date_recorded']).year
data['age_in_years'] = data['year'] - data['construction_year']

data.drop(['date_recorded', 'year', 'construction_year'], axis=1, inplace=True)

In [27]:
# funder

data['funder'] = data['funder'].astype('category')
data['funder'] = data['funder'].cat.codes

In [28]:
# installer

data['installer'] = data['installer'].astype('category')
data['installer'] = data['installer'].cat.codes

In [29]:
# wtp_name

data.drop('wpt_name', axis=1, inplace=True)

In [30]:
# num_private has no descriptor in data source. since we don't know what it is, 
# we can't really derive insights from it. dropping.

data.drop('num_private', axis=1, inplace=True)

In [31]:
# basin

data['basin'] = data['basin'].astype('category')
data['basin'] = data['basin'].cat.codes

In [32]:
# subvillage - label encoding (too many for onehot or binary, still want to preserve unique identities)
# although I believe we can bin by distance/geographic region from Jacob's location work

data['subvillage'] = data['subvillage'].astype('category')
data['subvillage'] = data['subvillage'].cat.codes

In [33]:
# region - label encoding (too many for onehot or binary, still want to preserve unique identities)
# although I believe we can bin by distance/geographic region from Jacob's location work

data['region'] = data['region'].astype('category')
data['region'] = data['region'].cat.codes

In [34]:
# region code - drop because this is just a number and we want to use the label encoded 'region'

data.drop('region_code', axis=1, inplace=True)

In [35]:
# district_code - label encode

data['district_code'] = data['district_code'].astype('category')
data['district_code'] = data['district_code'].cat.codes

In [36]:
# lga - label encoding (too many for onehot or binary, still want to preserve unique identities)
# although I believe we can bin by distance/geographic region from Jacob's location work

data['lga'] = data['lga'].astype('category')
data['lga'] = data['lga'].cat.codes

In [37]:
# ward - label encoding (too many for onehot or binary, still want to preserve unique identities)
# although I believe we can bin by distance/geographic region from Jacob's location work

data['ward'] = data['ward'].astype('category')
data['ward'] = data['ward'].cat.codes

##### public meeting go to binary

In [38]:
data['public_meeting'] = data['public_meeting'].astype(int)

##### recorded_by - only has one value, GeoData Consultants Ltd. Dropping.

In [39]:
data.drop('recorded_by', axis=1, inplace=True)

##### scheme_management - label encode based on value counts, except move single None value into Other

In [40]:
data['scheme_management'].replace(to_replace='None', value='Other', inplace=True)

In [41]:
data['scheme_management'] = data['scheme_management'].astype('category')
data['scheme_management'] = data['scheme_management'].cat.codes

##### permit go to binary

In [42]:
data['permit'] = data['permit'].astype(int)

##### construction year: see data_recorded cell above

In [43]:
# extraction_type
# data['extraction_type'].value_counts()
# change other- rope pump to rope pump
# for now we will drop and just use extraction_type_class

data.drop('extraction_type', axis=1, inplace=True)

In [44]:
# extraction_type_group
# data['extraction_type_group'].value_counts()
# MISSING: ksb, swn 81, cemo, play pump, walimi, climax, mkuima/shinyanga
# change wind-powered to windmill
# for now we will drop and just use extraction_type_class

data.drop('extraction_type_group', axis=1, inplace=True)

In [45]:
# extraction_type_class compare to above 2, run a function to count rows where values 
# in these 3 are different from each other AFTER i've gone over options in each
# for now we do easy peasy encode

data['extraction_type_class'] = data['extraction_type_class'].astype('category')
data['extraction_type_class'] = data['extraction_type_class'].cat.codes

In [46]:
# management
# for now we drop, but we actually prefer this column over management_group, more info.

data.drop('management', axis=1, inplace=True)

In [47]:
# management_group - easy encode for now, but we will eventually drop this column in favor of management
data['management_group'] = data['management_group'].astype('category')
data['management_group'] = data['management_group'].cat.codes

In [48]:
# payment is same as payment_type, dropping
data.drop('payment', axis=1, inplace=True)

In [49]:
# payment_type - find someway to turn this into frequency amount, for now easy encode

data['payment_type'] = data['payment_type'].astype('category')
data['payment_type'] = data['payment_type'].cat.codes

In [50]:
data.head()

Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,district_code,lga,ward,population,public_meeting,scheme_management,permit,extraction_type_class,management_group,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,age_in_years
0,69572,6000.0,1131,1390,1254,34.938093,-9.856322,1,9941,3,5,47,1270,109,1,6,0,0,4,0,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,12
2,34310,25.0,696,686,1713,37.460664,-3.821329,5,7638,8,4,100,1455,250,1,6,1,0,4,5,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional,4
3,67743,0.0,1449,263,1540,38.486161,-11.155298,7,7562,12,16,80,1408,58,1,6,1,5,4,2,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional,27
5,9944,20.0,818,0,322,39.172796,-4.765587,5,9945,20,8,62,1180,1,1,6,1,5,4,5,salty,salty,enough,enough,other,other,unknown,communal standpipe multiple,communal standpipe,functional,2
6,19816,0.0,288,0,333,33.36241,-3.766365,0,3165,17,3,96,1650,0,1,6,1,1,4,2,soft,good,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,non functional,2012


## Water Columns Management

In [51]:
# WATER COLUMNS CLEANING

### amount_tsh - get rid of decimals
data['amount_tsh'] = data['amount_tsh'].astype(int)
    
### water_quality - create binary encoded columns and delete old column
data['salt_quality'] = data['water_quality'].map(lambda x : 1 if x=='salty' else 0)
data['soft_quality'] = data['water_quality'].map(lambda x : 1 if x=='soft' else 0)
data['color_quality'] = data['water_quality'].map(lambda x : 1 if x=='coloured' else 0)
data['opaque_quality'] = data['water_quality'].map(lambda x : 1 if x=='milky' else 0)
data['flouride_quality'] = data['water_quality'].map(lambda x : 1 if x=='flouride' else 0)
    
### quality_group - drop since water_quality is similar but more detailed
data.drop('quality_group', axis=1, inplace=True)
    
### quantity
data['quantity'] = data['quantity'].astype('category')
data['quantity'] = data['quantity'].cat.codes

### quantity_group - same as quantity, drop
data.drop('quantity_group', axis=1, inplace=True)

### source - drop, source_type is less redundant
data.drop('source', axis=1, inplace=True)

### source_type - category encode
data['source_type'] = data['source_type'].astype('category')
data['source_type'] = data['source_type'].cat.codes

### source_class
data.drop('source_class', axis=1, inplace=True)

### waterpoint_type - drop, it only had one more variable than waterpoint_type_group which belonged under another
data.drop('waterpoint_type', axis=1, inplace=True)

### waterpoint_type_group
data['waterpoint_type_group'] = data['waterpoint_type_group'].astype('category')
data['waterpoint_type_group'] = data['waterpoint_type_group'].cat.codes

In [52]:
data.drop('water_quality', axis=1, inplace=True)

In [53]:
data

Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,district_code,lga,ward,population,public_meeting,scheme_management,permit,extraction_type_class,management_group,payment_type,quantity,source_type,waterpoint_type_group,status_group,age_in_years,salt_quality,soft_quality,color_quality,opaque_quality,flouride_quality
0,69572,6000,1131,1390,1254,34.938093,-9.856322,1,9941,3,5,47,1270,109,1,6,0,0,4,0,1,6,1,functional,12,0,1,0,0,0
2,34310,25,696,686,1713,37.460664,-3.821329,5,7638,8,4,100,1455,250,1,6,1,0,4,5,1,1,1,functional,4,0,1,0,0,0
3,67743,0,1449,263,1540,38.486161,-11.155298,7,7562,12,16,80,1408,58,1,6,1,5,4,2,0,0,1,non functional,27,0,1,0,0,0
5,9944,20,818,0,322,39.172796,-4.765587,5,9945,20,8,62,1180,1,1,6,1,5,4,5,1,2,1,functional,2,1,0,0,0,0
6,19816,0,288,0,333,33.362410,-3.766365,0,3165,17,3,96,1650,0,1,6,1,1,4,2,1,0,3,non functional,2012,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59394,11164,500,1558,351,864,37.634053,-6.124830,8,5898,11,6,76,191,89,1,6,1,5,4,1,1,0,1,non functional,4,0,1,0,0,0
59395,60739,10,372,1210,163,37.169807,-3.253847,5,4736,6,5,14,967,125,1,9,1,0,4,5,1,6,1,functional,14,0,1,0,0,0
59396,27263,4700,140,1212,217,35.249991,-9.070629,6,2495,3,4,84,310,56,1,6,1,0,4,0,1,4,1,functional,15,0,1,0,0,0
59398,31282,0,739,0,1006,35.861315,-6.378573,6,11780,2,4,9,1290,0,1,6,1,1,4,2,2,5,3,functional,2011,0,1,0,0,0


## Visualizations

## Notes

In [54]:
# when stretching this, build a proposed schedule as business question that predicts how often various
# types of maintence are needed

## Initial Models

In [55]:
# model prep
X = data.drop('status_group', axis=1)
y = data['status_group']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [56]:
# scaling
ss = StandardScaler()
X_train = ss.fit_transform(X_train)

In [57]:
# instantiate each model type
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

In [58]:
# KNearestNeighbor
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_train)

knn_recall = recall_score(y_train, y_pred_knn, average='macro')
knn_precision = precision_score(y_train, y_pred_knn, average='macro')
knn_accuracy = accuracy_score(y_train, y_pred_knn)
knn_f1 = f1_score(y_train, y_pred_knn, average='macro')

In [59]:
# DecisionTree
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_train)

dt_recall = recall_score(y_train, y_pred_dt, average='macro')
dt_precision = precision_score(y_train, y_pred_dt, average='macro')
dt_accuracy = accuracy_score(y_train, y_pred_dt)
dt_f1 = f1_score(y_train, y_pred_dt, average='macro')

In [60]:
# RandomForest
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_train)

rf_recall = recall_score(y_train, y_pred_rf, average='macro')
rf_precision = precision_score(y_train, y_pred_rf, average='macro')
rf_accuracy = accuracy_score(y_train, y_pred_rf)
rf_f1 = f1_score(y_train, y_pred_rf, average='macro')



In [61]:
# scores table

base_scores = pd.DataFrame({'Model': ['KNN', 'Decision Tree', 'Random Forest'], 
                            'Recall_Score' : [knn_recall, dt_recall, rf_recall],
                            'Precision_Score' : [knn_precision, dt_precision, rf_precision],
                            'Accuracy_Score' : [knn_accuracy, dt_accuracy, rf_accuracy],
                            'F1_Score' : [knn_f1, dt_f1, rf_f1]
                           })

base_scores

Unnamed: 0,Model,Recall_Score,Precision_Score,Accuracy_Score,F1_Score
0,KNN,0.686959,0.774674,0.817401,0.716994
1,Decision Tree,1.0,1.0,1.0,1.0
2,Random Forest,0.969401,0.985585,0.984979,0.977226


## Resampling Code

In [62]:
# data.dtypes

In [63]:
# resample method 1 - RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train)

In [64]:
# resample method 2 - SMOTE

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_sample(X_train, y_train)

In [65]:
# resample method 3 - EditedNearestNeighbor

edit = EditedNearestNeighbours(sampling_strategy='auto', random_state=42)
X_edit, y_edit = edit.fit_sample(X_train, y_train)

## Resampled Models

In [66]:
# KNN on RandomUnderSampler
knn.fit(X_rus, y_rus)
y_pred_knn_rus = knn.predict(X_rus)

knn_recall_rus = recall_score(y_rus, y_pred_knn_rus, average='macro')
knn_prec_rus = precision_score(y_rus, y_pred_knn_rus, average='macro')
knn_acc_rus = accuracy_score(y_rus, y_pred_knn_rus)
knn_f1_rus = f1_score(y_rus, y_pred_knn_rus, average='macro')

# KNN on SMOTE
knn.fit(X_smote, y_smote)
y_pred_knn_smote = knn.predict(X_smote)

knn_recall_smote = recall_score(y_smote, y_pred_knn_smote, average='macro')
knn_prec_smote = precision_score(y_smote, y_pred_knn_smote, average='macro')
knn_acc_smote = accuracy_score(y_smote, y_pred_knn_smote)
knn_f1_smote = f1_score(y_smote, y_pred_knn_smote, average='macro')

# KNN on EditedNearestNeighbors
knn.fit(X_edit, y_edit)
y_pred_knn_edit = knn.predict(X_edit)

knn_recall_edit = recall_score(y_edit, y_pred_knn_edit, average='macro')
knn_prec_edit = precision_score(y_edit, y_pred_knn_edit, average='macro')
knn_acc_edit = accuracy_score(y_edit, y_pred_knn_edit)
knn_f1_edit = f1_score(y_edit, y_pred_knn_edit, average='macro')

In [70]:
# Decision Tree on RandomUnderSampler
dt.fit(X_rus, y_rus)
y_pred_dt_rus = dt.predict(X_rus)

dt_recall_rus = recall_score(y_rus, y_pred_dt_rus, average='macro')
dt_prec_rus = precision_score(y_rus, y_pred_dt_rus, average='macro')
dt_acc_rus = accuracy_score(y_rus, y_pred_dt_rus)
dt_f1_rus = f1_score(y_rus, y_pred_dt_rus, average='macro')

# DecisionTree on SMOTE
dt.fit(X_smote, y_smote)
y_pred_dt_smote = dt.predict(X_smote)

dt_recall_smote = recall_score(y_smote, y_pred_dt_smote, average='macro')
dt_prec_smote = precision_score(y_smote, y_pred_dt_smote, average='macro')
dt_acc_smote = accuracy_score(y_smote, y_pred_dt_smote)
dt_f1_smote = f1_score(y_smote, y_pred_dt_smote, average='macro')

# DecisionTree on EditedNearestNeighbors
dt.fit(X_edit, y_edit)
y_pred_dt_edit = dt.predict(X_edit)

dt_recall_edit = recall_score(y_edit, y_pred_dt_edit, average='macro')
dt_prec_edit = precision_score(y_edit, y_pred_dt_edit, average='macro')
dt_acc_edit = accuracy_score(y_edit, y_pred_dt_edit)
dt_f1_edit = f1_score(y_edit, y_pred_dt_edit, average='macro')

In [72]:
# RF on RandomUnderSampler
rf.fit(X_rus, y_rus)
y_pred_rf_rus = rf.predict(X_rus)

rf_recall_rus = recall_score(y_rus, y_pred_rf_rus, average='macro')
rf_prec_rus = precision_score(y_rus, y_pred_rf_rus, average='macro')
rf_acc_rus = accuracy_score(y_rus, y_pred_rf_rus)
rf_f1_rus = f1_score(y_rus, y_pred_rf_rus, average='macro')

# RF on SMOTE
rf.fit(X_smote, y_smote)
y_pred_rf_smote = rf.predict(X_smote)

rf_recall_smote = recall_score(y_smote, y_pred_rf_smote, average='macro')
rf_prec_smote = precision_score(y_smote, y_pred_rf_smote, average='macro')
rf_acc_smote = accuracy_score(y_smote, y_pred_rf_smote)
rf_f1_smote = f1_score(y_smote, y_pred_rf_smote, average='macro')

# RF on EditedNearestNeighbors
rf.fit(X_edit, y_edit)
y_pred_rf_edit = rf.predict(X_edit)

rf_recall_edit = recall_score(y_edit, y_pred_rf_edit, average='macro')
rf_prec_edit = precision_score(y_edit, y_pred_rf_edit, average='macro')
rf_acc_edit = accuracy_score(y_edit, y_pred_rf_edit)
rf_f1_edit = f1_score(y_edit, y_pred_rf_edit, average='macro')

In [88]:
recall_scores = pd.DataFrame({'Resample Method': ['None', 'RandomUndersampler', 'SMOTE', 'EditedNearestNeighbor'],
                             'KNN': [knn_recall, knn_recall_rus, knn_recall_smote, knn_recall_edit],
                             'Decision Tree': [dt_recall, dt_recall_rus, dt_recall_smote, dt_recall_edit],
                             'Random Forest': [rf_recall, rf_recall_rus, rf_recall_smote, rf_recall_edit]})
recall_scores = recall_scores.style.set_caption("All Recall Scores")

In [89]:
precision_scores = pd.DataFrame({'Resample Method': ['None', 'RandomUndersampler', 'SMOTE', 'EditedNearestNeighbor'],
                             'KNN': [knn_precision, knn_prec_rus, knn_prec_smote, knn_prec_edit],
                             'Decision Tree': [dt_precision, dt_prec_rus, dt_prec_smote, dt_prec_edit],
                             'Random Forest': [rf_precision, rf_prec_rus, rf_prec_smote, rf_prec_edit]})
precision_scores = precision_scores.style.set_caption("All Precision Scores")

In [91]:
accuracy_scores = pd.DataFrame({'Resample Method': ['None', 'RandomUndersampler', 'SMOTE', 'EditedNearestNeighbor'],
                             'KNN': [knn_accuracy, knn_acc_rus, knn_acc_smote, knn_acc_edit],
                             'Decision Tree': [dt_accuracy, dt_acc_rus, dt_acc_smote, dt_acc_edit],
                             'Random Forest': [rf_accuracy, rf_acc_rus, rf_acc_smote, rf_acc_edit]})
accuracy_scores = accuracy_scores.style.set_caption("All Accuracy Scores")

In [92]:
F1_scores = pd.DataFrame({'Resample Method': ['None', 'RandomUndersampler', 'SMOTE', 'EditedNearestNeighbor'],
                             'KNN': [knn_f1, knn_f1_rus, knn_f1_smote, knn_f1_edit],
                             'Decision Tree': [dt_f1, dt_f1_rus, dt_f1_smote, dt_f1_edit],
                             'Random Forest': [rf_f1, rf_f1_rus, rf_f1_smote, rf_f1_edit]})
F1_scores = F1_scores.style.set_caption("All F1 Scores")

In [94]:
recall_scores

Unnamed: 0,Resample Method,KNN,Decision Tree,Random Forest
0,,0.686959,1.0,0.969401
1,RandomUndersampler,0.728309,1.0,0.984589
2,SMOTE,0.87125,1.0,0.992764
3,EditedNearestNeighbor,0.875313,1.0,0.988373


In [95]:
precision_scores

Unnamed: 0,Resample Method,KNN,Decision Tree,Random Forest
0,,0.774674,1.0,0.985585
1,RandomUndersampler,0.739932,1.0,0.984662
2,SMOTE,0.873179,1.0,0.992814
3,EditedNearestNeighbor,0.928427,1.0,0.996143


In [96]:
accuracy_scores

Unnamed: 0,Resample Method,KNN,Decision Tree,Random Forest
0,,0.817401,1.0,0.984979
1,RandomUndersampler,0.728309,1.0,0.984589
2,SMOTE,0.87125,1.0,0.992764
3,EditedNearestNeighbor,0.937229,1.0,0.994956


In [97]:
F1_scores

Unnamed: 0,Resample Method,KNN,Decision Tree,Random Forest
0,,0.716994,1.0,0.977226
1,RandomUndersampler,0.728314,1.0,0.984596
2,SMOTE,0.869162,1.0,0.992764
3,EditedNearestNeighbor,0.896644,1.0,0.992174
