In [1]:
#import statements
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as ply
import seaborn as sns

#sci-kit learn
import sklearn
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.dummy import DummyClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Data Cleaning and EDA

In [2]:
# Load the training features.
# The held-out test data must not be inspected until validation.
df_train = pd.read_csv(filepath_or_buffer='../data/water_well_train_data.csv')

In [3]:
# Preview the first five rows of the training features
df_train.head(n=5)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
# Summary of the training features: column names, non-null counts and dtypes
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [5]:
# Count missing values per column and keep only the columns that have any
df_train_nans = df_train.isna().sum().loc[lambda counts: counts > 0]
df_train_nans

funder                3635
installer             3655
subvillage             371
public_meeting        3334
scheme_management     3877
scheme_name          28166
permit                3056
dtype: int64

In [6]:
# Names of the columns that contain at least one missing value
df_train_nacolumns = df_train_nans.index
# Display only those columns
df_train.loc[:, df_train_nacolumns]

Unnamed: 0,funder,installer,subvillage,public_meeting,scheme_management,scheme_name,permit
0,Roman,Roman,Mnyusi B,True,VWC,Roman,False
1,Grumeti,GRUMETI,Nyamara,,Other,,True
2,Lottery Club,World vision,Majengo,True,VWC,Nyumba ya mungu pipe scheme,True
3,Unicef,UNICEF,Mahakamani,True,VWC,,True
4,Action In A,Artisan,Kyanyamisa,True,,,True
...,...,...,...,...,...,...,...
59395,Germany Republi,CES,Kiduruni,True,Water Board,Losaa Kia water supply,True
59396,Cefa-njombe,Cefa,Igumbilo,True,VWC,Ikondo electrical water sch,True
59397,,,Madungulu,True,VWC,,False
59398,Malec,Musa,Mwinyi,True,VWC,,True


In [7]:
# Frequency of each non-missing `permit` value
df_train.loc[:, 'permit'].value_counts()

True     38852
False    17492
Name: permit, dtype: int64

In [8]:
# Load the target labels (one `status_group` per well `id`)
df_label = pd.read_csv(filepath_or_buffer='../data/water_well_train_labels.csv')

In [9]:
# (rows, columns) of the label table
df_label.shape

(59400, 2)

In [10]:
# Column names, non-null counts and dtypes of the label table
df_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            59400 non-null  int64 
 1   status_group  59400 non-null  object
dtypes: int64(1), object(1)
memory usage: 928.2+ KB


In [11]:
# Number of wells in each target class
df_label.loc[:, 'status_group'].value_counts()

functional                 32259
non functional             22824
functional needs repair     4317
Name: status_group, dtype: int64

In [12]:
# Share of wells in each target class (fractions summing to 1)
df_label.loc[:, 'status_group'].value_counts(normalize=True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

There are three target classes: functional (54%), non-functional (38%), and functional needs repair (7%).

In [13]:
# Attach the target to the features by matching on the well `id` instead of on
# row position, so a reordered or filtered label file cannot silently pair a
# well with the wrong status. `validate='one_to_one'` raises if an id is
# duplicated on either side.
# The label's id column is kept under the name `id_label`, which is the name
# the column-dropping cell further down expects.
df = df_train.merge(
    df_label.rename(columns={'id': 'id_label'}),
    how='left',
    left_on='id',
    right_on='id_label',
    validate='one_to_one',
)


In [14]:
# Parse the `date_recorded` strings into a proper datetime column
df['date_recorded_datetime'] = df['date_recorded'].pipe(pd.to_datetime)

In [15]:
# Remove the original string column now that `date_recorded_datetime` holds
# the parsed values. Reassigning the result (rather than inplace=True) avoids
# mutating a frame that earlier cells may already have displayed.
df = df.drop(columns=['date_recorded'])

In [16]:
# Number of distinct values in each object-dtype column
df.select_dtypes(include='object').nunique()

funder                    1897
installer                 2145
wpt_name                 37400
basin                        9
subvillage               19287
region                      21
lga                        125
ward                      2092
public_meeting               2
recorded_by                  1
scheme_management           12
scheme_name               2696
permit                       2
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
management_group             5
payment                      7
payment_type                 7
water_quality                8
quality_group                6
quantity                     5
quantity_group               5
source                      10
source_type                  7
source_class                 3
waterpoint_type              7
waterpoint_type_group        6
status_group                 3
dtype: int64

In [80]:
# Number of distinct values in each numeric column
df.select_dtypes(include=np.number).nunique()

id                   59400
amount_tsh              98
gps_height            2428
longitude            57516
latitude             57517
num_private             65
region_code             27
district_code           20
population            1049
construction_year       55
id_label             59400
dtype: int64

In [19]:
# Drop columns that are not carried forward as features:
#   - id / id_label: unique for every row (59400 distinct values each)
#   - recorded_by: only one distinct value
#   - wpt_name, subvillage, ward, scheme_name: free-text columns with thousands
#     of distinct values; scheme_name is also missing in roughly 47% of rows
#   - longitude, latitude, num_private: NOTE(review): the notebook does not
#     state why these are excluded -- confirm this is intentional
# `columns=` already identifies the axis, so `axis=1` is not needed, and the
# result is reassigned instead of using inplace=True.
df = df.drop(columns=[
    'id',
    'longitude',
    'latitude',
    'num_private',
    'id_label',
    'wpt_name',
    'subvillage',
    'recorded_by',
    'ward',
    'scheme_name',
])

In [20]:
# Columns remaining after the drop above
df.columns

Index(['amount_tsh', 'funder', 'gps_height', 'installer', 'basin', 'region',
       'region_code', 'district_code', 'lga', 'population', 'public_meeting',
       'scheme_management', 'permit', 'construction_year', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group',
       'status_group', 'date_recorded_datetime'],
      dtype='object')

Columns kept for modeling:

amount_tsh
installer 
gps_height
basin
region
district_code
lga
population
public_meeting
management
permit
extraction_type
payment
water_quality
quantity
source
waterpoint_type
status_group

In [25]:
# Inspect the remaining object-dtype (categorical / text) columns
df.select_dtypes(include='object')

Unnamed: 0,funder,installer,basin,region,lga,public_meeting,scheme_management,permit,extraction_type,extraction_type_group,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,Roman,Roman,Lake Nyasa,Iringa,Ludewa,True,VWC,False,gravity,gravity,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,Grumeti,GRUMETI,Lake Victoria,Mara,Serengeti,,Other,True,gravity,gravity,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,Lottery Club,World vision,Pangani,Manyara,Simanjiro,True,VWC,True,gravity,gravity,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,Unicef,UNICEF,Ruvuma / Southern Coast,Mtwara,Nanyumbu,True,VWC,True,submersible,submersible,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,Action In A,Artisan,Lake Victoria,Kagera,Karagwe,True,,True,gravity,gravity,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,Germany Republi,CES,Pangani,Kilimanjaro,Hai,True,Water Board,True,gravity,gravity,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
59396,Cefa-njombe,Cefa,Rufiji,Iringa,Njombe,True,VWC,True,gravity,gravity,...,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional
59397,,,Rufiji,Mbeya,Mbarali,True,VWC,False,swn 80,swn 80,...,fluoride,fluoride,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,functional
59398,Malec,Musa,Rufiji,Dodoma,Chamwino,True,VWC,True,nira/tanira,nira/tanira,...,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional


In [29]:
# Columns carried forward, with the target `status_group` listed last
selected_columns = [
    'amount_tsh',
    'installer',
    'gps_height',
    'basin',
    'region',
    'district_code',
    'lga',
    'population',
    'public_meeting',
    'management',
    'permit',
    'extraction_type',
    'payment',
    'water_quality',
    'quantity',
    'source',
    'waterpoint_type',
    'status_group',
]

# .copy() makes new_df an independent frame, so later edits leave `df` untouched
new_df = df.loc[:, selected_columns].copy()

In [30]:
# Preview the first five rows of the reduced frame
new_df.head(n=5)

Unnamed: 0,amount_tsh,installer,gps_height,basin,region,district_code,lga,population,public_meeting,management,permit,extraction_type,payment,water_quality,quantity,source,waterpoint_type,status_group
0,6000.0,Roman,1390,Lake Nyasa,Iringa,5,Ludewa,109,True,vwc,False,gravity,pay annually,soft,enough,spring,communal standpipe,functional
1,0.0,GRUMETI,1399,Lake Victoria,Mara,2,Serengeti,280,,wug,True,gravity,never pay,soft,insufficient,rainwater harvesting,communal standpipe,functional
2,25.0,World vision,686,Pangani,Manyara,4,Simanjiro,250,True,vwc,True,gravity,pay per bucket,soft,enough,dam,communal standpipe multiple,functional
3,0.0,UNICEF,263,Ruvuma / Southern Coast,Mtwara,63,Nanyumbu,58,True,vwc,True,submersible,never pay,soft,dry,machine dbh,communal standpipe multiple,non functional
4,0.0,Artisan,0,Lake Victoria,Kagera,1,Karagwe,0,True,other,True,gravity,never pay,soft,seasonal,rainwater harvesting,communal standpipe,functional


In [41]:
# Missing-value count per column of new_df.
# NOTE(review): this cell's execution counter (41) is higher than those of the
# fillna cells that follow it (33, 36, 40), so the all-zero output shown here
# was produced after those fills had already run. On a fresh top-to-bottom run
# this cell would execute before them.
new_df.isna().sum()

amount_tsh         0
installer          0
gps_height         0
basin              0
region             0
district_code      0
lga                0
population         0
public_meeting     0
management         0
permit             0
extraction_type    0
payment            0
water_quality      0
quantity           0
source             0
waterpoint_type    0
status_group       0
dtype: int64

In [33]:
# Treat a missing `permit` as False.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `new_df['permit']` acts on an intermediate object and does not reliably
# update `new_df` itself.
# NOTE(review): True is the more common observed value (38852 vs 17492), so
# confirm that False is the intended default for missing permits.
new_df['permit'] = new_df['permit'].fillna(False)

In [36]:
# Label a missing `installer` explicitly as 'Not known'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `new_df['installer']` acts on an intermediate object and does not reliably
# update `new_df` itself.
new_df['installer'] = new_df['installer'].fillna('Not known')

In [40]:
# Treat a missing `public_meeting` as True.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `new_df['public_meeting']` acts on an intermediate object and does not
# reliably update `new_df` itself.
new_df['public_meeting'] = new_df['public_meeting'].fillna(True)

In [42]:
# Persist the cleaned frame next to the notebook.
# The row index is written as the first (unnamed) column, which is the
# to_csv default.
new_df.to_csv(path_or_buf='clean.csv')

In [43]:
# List the working directory (presumably to confirm clean.csv was written)
!ls


Anton.ipynb
clean.csv
Yuhkai.ipynb
