Download the training and testing datasets to our local directory.

In [9]:
import requests
from contextlib import closing
import csv

# Competition CSVs, keyed by the base name of the local file each one is saved to.
urls = {
        'X_train' : "https://s3.amazonaws.com/drivendata/data/7/public/4910797b-ee55-40a7-8668-10efd5c1b960.csv",
        'y_train' : "https://s3.amazonaws.com/drivendata/data/7/public/0bf8bc6e-30d0-4c50-956a-603fc693d966.csv",
        'X_test' : "https://s3.amazonaws.com/drivendata/data/7/public/702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv"
        }

# Save every response body verbatim as <name>.csv in the working directory.
# The bytes are written as received instead of being parsed and re-serialised
# with the csv module, so the files are exact copies and the cell behaves the
# same on Python 2 and Python 3.
for name, url in urls.items():
    # stream=True avoids holding a whole file in memory; closing() releases the
    # connection even if the status check or the write raises.
    with closing(requests.get(url, stream=True, timeout=60)) as r:
        # Fail loudly on 4xx/5xx rather than saving an error page as a .csv file.
        r.raise_for_status()
        with open(name + '.csv', 'wb') as fp:
            for chunk in r.iter_content(chunk_size=1 << 16):
                fp.write(chunk)

Load our train and test datasets into pandas.

In [334]:
import pandas as pd
import numpy as np

# Read the three downloaded CSV files into DataFrames, in the order
# training features, training labels, test features.
X_train, y_train, X_test = [
    pd.read_csv(csv_path)
    for csv_path in ('X_train.csv', 'y_train.csv', 'X_test.csv')
]

Let's inspect our target variable:

In [68]:
# Share of training rows in each target class (normalize=True returns proportions, not counts).
y_train['status_group'].value_counts(normalize = True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

In [51]:
# One summary line per feature: share of nulls, Python type of the first value,
# and number of distinct values.
for col in X_train.columns:
    series = X_train[col]
    # A single pre-formatted string prints the same text under Python 2 and 3.
    # .iloc[0] reads the first row by position, so it does not depend on an
    # index label 0 existing.
    print('{} , {:.1%} nulls , {} {}'.format(
        col, series.isnull().mean(), type(series.iloc[0]), series.nunique()))

id , 0.0% nulls , <type 'numpy.int64'> 59400
amount_tsh , 0.0% nulls , <type 'numpy.float64'> 98
date_recorded , 0.0% nulls , <type 'str'> 356
funder , 6.1% nulls , <type 'str'> 1897
gps_height , 0.0% nulls , <type 'numpy.int64'> 2428
installer , 6.2% nulls , <type 'str'> 2145
longitude , 0.0% nulls , <type 'numpy.float64'> 57516
latitude , 0.0% nulls , <type 'numpy.float64'> 57517
wpt_name , 0.0% nulls , <type 'str'> 37400
num_private , 0.0% nulls , <type 'numpy.int64'> 65
basin , 0.0% nulls , <type 'str'> 9
subvillage , 0.6% nulls , <type 'str'> 19287
region , 0.0% nulls , <type 'str'> 21
region_code , 0.0% nulls , <type 'numpy.int64'> 27
district_code , 0.0% nulls , <type 'numpy.int64'> 20
lga , 0.0% nulls , <type 'str'> 125
ward , 0.0% nulls , <type 'str'> 2092
population , 0.0% nulls , <type 'numpy.int64'> 1049
public_meeting , 5.6% nulls , <type 'bool'> 2
recorded_by , 0.0% nulls , <type 'str'> 1
scheme_management , 6.5% nulls , <type 'str'> 12
scheme_name , 47.4% nulls , <type '

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
X_train.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,0.474141,15.297003,5.629747,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,12.23623,17.587406,9.633649,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


In [335]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Variable Clean-up

### Variables to delete

In [386]:
# Columns to drop before modelling, with the reason for each:
# id: we drop the id column because it is not a useful predictor.
# amount_tsh: mostly zeros (the median is 0) - delete
# construction_year: we will delete this column since ~35% of the values are zeros.
# wpt_name: not useful, delete (too many values)
# subvillage: too many values, delete
# scheme_name: this is almost 50% nulls, so we will delete this column
# num_private: we will delete this column because ~99% of the values are zeros.

# Each name is listed exactly once and spelled as it appears in the data
# ('scheme_name', with an underscore); a repeated or misspelled name makes a
# column delete raise KeyError.
remove = ['id', 'amount_tsh', 'num_private', 'construction_year', 'wpt_name', 'subvillage', 'scheme_name']

In [None]:
def removal(z):
    """Delete the columns named in ``z`` from the global X_train and X_test, in place.

    Names repeated in ``z`` are deleted only once. Every name is checked
    against both frames before anything is deleted, so a bad name raises
    without leaving the two frames with different sets of columns.

    Parameters
    ----------
    z : iterable of str
        Column names to delete.

    Raises
    ------
    KeyError
        If any name is missing from X_train or X_test; nothing is deleted.
    """
    # De-duplicate while keeping the caller's order.
    wanted = []
    for col in z:
        if col not in wanted:
            wanted.append(col)

    missing = [col for col in wanted
               if col not in X_train.columns or col not in X_test.columns]
    if missing:
        raise KeyError('columns not present in both X_train and X_test: {}'.format(missing))

    for col in wanted:
        del X_train[col]
        del X_test[col]
    return

`date_recorded`: this may be a useful variable for the analysis, although the recording year itself would not help when predicting for future years. We convert the column to a datetime and also create `year_recorded` and `month_recorded` columns in case those levels prove useful. A visual inspection of both casts significant doubt on that possibility, but we'll proceed for now.

In [354]:
# Parse the 'date_recorded' column of X_train into pandas datetimes.
# col_date_recorded() performs the same conversion as its first step.
X_train['date_recorded'] = pd.to_datetime(X_train['date_recorded'])

In [58]:
def col_date_recorded(data):
    """Convert ``date_recorded`` to datetime and derive year and month columns, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date_recorded`` column that ``pd.to_datetime`` can parse.
        The frame is modified in place: ``date_recorded`` becomes a datetime
        column, and ``year_recorded`` / ``month_recorded`` hold each row's
        calendar year and month.

    Returns
    -------
    None
    """
    data['date_recorded'] = pd.to_datetime(data['date_recorded'])
    # Take the year/month of each row. Assigning the value_counts() of those
    # values instead would store a frequency table aligned on the row index,
    # not the per-row year or month.
    data['year_recorded'] = data['date_recorded'].dt.year
    data['month_recorded'] = data['date_recorded'].dt.month
    return

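`gps_height`, `latitude`, `longitude` (and `population`): we treat zeros in these columns as missing values, convert them to NaN, and fill the three location columns with the mean of the row's `district_code`.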

In [391]:
# Columns in which a value of 0 is treated as "missing".
trans = ['longitude', 'latitude', 'gps_height', 'population']
# Subset of `trans` that is imputed below. 'population' is only converted to
# NaN here and is left for a later step.
fill_cols = ['longitude', 'latitude', 'gps_height']

# Step 1: mark zeros as missing in both frames.
for frame in [X_train, X_test]:
    # Rows with longitude == 0 get latitude 0 as well, so that the replace
    # below blanks both coordinates together.
    # NOTE(review): presumably longitude 0 means "no GPS fix" - confirm.
    frame.loc[frame.longitude == 0, 'latitude'] = 0
    for col in trans:
        # Assign the result back: replace(..., inplace=True) on a column
        # selection may act on a temporary copy and leave the frame unchanged.
        frame[col] = frame[col].replace(0, np.nan)

# Step 2: fill the gaps with the mean of the row's district_code. The means
# come from the training data only and are applied to both frames, so the test
# set is imputed with the same values the model was trained on.
for col in fill_cols:
    district_mean = X_train.groupby('district_code')[col].mean()
    overall_mean = X_train[col].mean()
    for frame in [X_train, X_test]:
        # Districts with no known value (or unseen in training) fall back to
        # the overall training mean.
        fill = frame['district_code'].map(district_mean).fillna(overall_mean)
        frame[col] = frame[col].fillna(fill)

In [267]:
# Mean longitude of each district_code, as a {district_code: mean} dict.
fill_values = dict(X_train['longitude'].groupby(X_train['district_code']).mean())

# Look up every row's district mean and use it only where longitude is missing.
# map() keeps the original row index, so the assignment lines up row for row;
# a transform-style groupby.apply can return a group-keyed index on newer
# pandas versions and break this assignment.
X_train['longitude'] = X_train['longitude'].fillna(X_train['district_code'].map(fill_values))

In [387]:
# public_meeting: we will fill the nulls as 'False'
# permit: we will fill the nulls as 'False'

binary = ['public_meeting', 'permit']

def bools(z):
    """Replace nulls with False in the given columns of the global X_train and X_test.

    Parameters
    ----------
    z : iterable of str
        Names of the true/false columns to clean.

    Returns
    -------
    None
        Both frames are modified in place.
    """
    for col in z:
        for frame in (X_train, X_test):
            # Assign the filled column back: fillna(inplace=True) on a column
            # selection may act on a temporary copy. Once no nulls remain, the
            # cast gives the column a real bool dtype instead of object.
            frame[col] = frame[col].fillna(False).astype(bool)
    return

string columns

In [108]:
def dummies(data, column):
    """Append the dummy columns of ``column`` that are associated with the target to X_train.

    Nulls in ``data[column]`` are replaced by the string 'NULL', the column is
    one-hot encoded, and each dummy is tested against
    ``y_train['status_group']`` with a chi-squared test of independence.
    Dummies with a p-value below .001 are concatenated onto the global
    X_train, and their names are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``column``; its nulls in that column are filled in place.
        The crosstab pairs rows with y_train by index, so this should be the
        training frame.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    None
    """
    # NOTE(review): X_test is declared global but never updated here, so the
    # selected dummy columns are not added to the test set - confirm whether a
    # later cell handles that.
    global X_train, X_test
    # Imported locally so the function does not rely on an import made in
    # another cell.
    from scipy.stats import chi2_contingency

    # Assign back instead of fillna(inplace=True) on a column selection, which
    # may act on a temporary copy.
    data[column] = data[column].fillna('NULL')
    dumms = pd.get_dummies(data[column])
    # Index [1] of chi2_contingency's result is the p-value.
    good_cols = [
        level for level in dumms.columns
        if chi2_contingency(pd.crosstab(dumms[level], y_train['status_group']))[1] < .001
    ]
    # Reuse the dummies computed above rather than encoding the column twice.
    X_train = pd.concat((X_train, dumms[good_cols]), axis = 1)
    print(good_cols)
    return

In [388]:
# Share of training rows at each distinct population value (proportions, most frequent first).
X_train['population'].value_counts(normalize = True)

0       0.359949
1       0.118266
200     0.032660
150     0.031852
250     0.028300
300     0.024848
100     0.019293
50      0.019175
500     0.016987
350     0.016599
120     0.015421
400     0.013047
60      0.011886
30      0.010539
40      0.009293
80      0.008973
450     0.008401
20      0.007778
600     0.007374
230     0.006532
75      0.004865
1000    0.004680
800     0.004529
90      0.004461
130     0.004444
25      0.004293
320     0.004192
35      0.004125
360     0.003737
140     0.003620
          ...   
8848    0.000017
628     0.000017
4520    0.000017
468     0.000017
693     0.000017
725     0.000017
789     0.000017
821     0.000017
5300    0.000017
3127    0.000017
2345    0.000017
3031    0.000017
886     0.000017
392     0.000017
424     0.000017
2807    0.000017
726     0.000017
694     0.000017
2569    0.000017
4788    0.000017
662     0.000017
4660    0.000017
406     0.000017
1032    0.000017
1160    0.000017
3241    0.000017
1960    0.000017
1685    0.0000