Download the training and testing datasets to our local directory.

In [642]:
import requests
import csv

urls = {
        'X_train' : "https://s3.amazonaws.com/drivendata/data/7/public/4910797b-ee55-40a7-8668-10efd5c1b960.csv",
        'y_train' : "https://s3.amazonaws.com/drivendata/data/7/public/0bf8bc6e-30d0-4c50-956a-603fc693d966.csv",
        'X_test' : "https://s3.amazonaws.com/drivendata/data/7/public/702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv",
        'y_test' : "https://s3.amazonaws.com/drivendata/data/7/public/SubmissionFormat.csv"
        }

for i in urls:
    r = requests.get(urls[i])

    text = r.iter_lines()

    reader = csv.reader(text, delimiter=',')

    mylist = list(reader)

    with open(str(i)+'.csv', 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        data = mylist
        a.writerows(data)

Load our train and test datasets into pandas.

In [489]:
import pandas as pd
import numpy as np

In [697]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')

Let's inspect our target variable:

In [698]:
del y_train['id']

In [699]:
y_train['status_group'].value_counts(normalize = True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

In [497]:
for i in X_train.columns:
    print i,',' ,'{:.1%}'.format(np.mean(X_train[i].isnull())),'nulls',',',type(X_train[i][0]), X_train[i].nunique()

date_recorded , 0.0% nulls , <class 'pandas.tslib.Timestamp'> 356
gps_height , 0.0% nulls , <type 'numpy.int64'> 2428
longitude , 0.0% nulls , <type 'numpy.float64'> 57516
latitude , 0.0% nulls , <type 'numpy.float64'> 57517
population , 0.0% nulls , <type 'numpy.int64'> 1049
public_meeting , 0.0% nulls , <type 'numpy.bool_'> 2
permit , 0.0% nulls , <type 'numpy.bool_'> 2
year_recorded , 100.0% nulls , <type 'numpy.float64'> 5
month_recorded , 100.0% nulls , <type 'numpy.float64'> 12
funder__0 , 0.0% nulls , <type 'numpy.float64'> 2
funder__Abasia , 0.0% nulls , <type 'numpy.float64'> 2
funder__Acra , 0.0% nulls , <type 'numpy.float64'> 2
funder__Adb , 0.0% nulls , <type 'numpy.float64'> 2
funder__Adra , 0.0% nulls , <type 'numpy.float64'> 2
funder__African Development Bank , 0.0% nulls , <type 'numpy.float64'> 2
funder__Ai , 0.0% nulls , <type 'numpy.float64'> 2
funder__Aict , 0.0% nulls , <type 'numpy.float64'> 2
funder__Amref , 0.0% nulls , <type 'numpy.float64'> 2
funder__Anglican 

In [459]:
X_train.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,0.474141,15.297003,5.629747,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,12.23623,17.587406,9.633649,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


In [460]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Variable Clean-up

### Variables to delete

In [473]:
def removal():
    # id: we drop the id column because it is not a useful predictor.
    # 'amount_tsh' is mostly blank - delete
    # construction_year: we will delete this column since ~35% of the values are zeros.
    # wpt_name: not useful, delete (too many values)
    # subvillage: too many values, delete
    # scheme_name: this is almost 50% nulls, so we will delete this column
    # num_private: we will delete this column because ~99% of the values are zeros.
    # region: drop this b/c is seems very similar to region_code, though not 100% sure about this one!
    z = ['id', 'amount_tsh', 'num_private', 'construction_year', 'wpt_name', 
          'subvillage', 'scheme_name', 'region']
    for i in z:
        del X_train[i]
        del X_test[i]
    return

date_recorded: this might be a useful variable for this analysis, although the year itself would be useless in a practical scenario moving into the future. We will convert this column into a datetime, and we will also create 'year_recorded' and 'month_recorded' columns just in case those levels prove to be useful. A visual inspection of both casts significant doubt on that possibility, but we'll proceed for now. We will delete date_recorded itself, since random forest cannot accept datetime

In [527]:
def dates():
    for i in [X_train, X_test]:
        i['date_recorded'] = pd.to_datetime(i['date_recorded'])
        i['date_recorded'] = pd.to_datetime(i['date_recorded'])
        i['year_recorded'] = i['date_recorded'].apply(lambda x: x.year)
        i['month_recorded'] = i['date_recorded'].apply(lambda x: x.month)
        del i['date_recorded']
    return

`gps_height`, `latitude`, `longitude`

In [603]:
def locs():
    trans = ['longitude', 'latitude', 'gps_height', 'population']
    for i in [X_train, X_test]:
        i.loc[i.longitude == 0, 'latitude'] = 0
        for z in trans:
            i[z].replace(0., np.NaN, inplace = True)
            data = X_train.groupby(['district_code'])[z]
            i[z] = data.transform(lambda x: x.fillna(x.mean()))
        i['gps_height'].fillna(X_train['gps_height'].mean(), inplace = True)
        X_train['population'].fillna(X_train['population'].mean(), inplace = True)
    return

In [538]:
# public_meeting: we will fill the nulls as 'False'
# permit: we will fill the nulls as 'False'

def bools():
    z = ['public_meeting', 'permit']
    for i in z:
        X_train[i].fillna(False, inplace = True)
        X_train[i] = X_train[i].apply(lambda x: float(x))
        X_test[i].fillna(False, inplace = True)
        X_test[i] = X_test[i].apply(lambda x: float(x))
    return

string columns

In [480]:
def codes():
    # convert region_code and district_code to string objects, since they are actually categorical variables
    for i in ['region_code', 'district_code']:
        X_train[i] = X_train[i].apply(lambda x: str(x))
        X_test[i] = X_test[i].apply(lambda x: str(x))
    return

In [696]:
def dummies():
    global X_train, X_test
    columns = [i for i in X_train.columns if type(X_train[i].iloc[1]) == str]
    for column in columns:
        good_cols = []
        X_train[column].fillna('NULL', inplace = True)
        dumms = pd.get_dummies(X_train[column], prefix = column+'_')
        for i in dumms.columns:
        #    if chi2_contingency(pd.crosstab(dumms[i], y_train['status_group']))[1] < .001:
            good_cols.append(i)
        good_cols = [i for i in good_cols if i in pd.get_dummies(X_test[column], prefix = column+'_').columns]
        X_train = pd.concat((X_train, pd.get_dummies(X_train[column], prefix = column+'_')[good_cols]), axis = 1)
        X_test = pd.concat((X_test, pd.get_dummies(X_test[column], prefix = column+'_')[good_cols]), axis = 1)
        del X_train[column]
        del X_test[column]
    return

In [700]:
removal()

In [701]:
locs()

In [702]:
dates()

In [703]:
bools()

In [704]:
codes()

In [None]:
dummies()

In [None]:
np.shape(X_train)

In [690]:
np.shape(X_test)

(14850, 1115)

In [657]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(criterion='entropy',
                                n_estimators=100,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)
                            
rf.fit(X_train, y_train.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=True, random_state=1,
            verbose=0, warm_start=False)

In [658]:
print "%.4f" % rf.oob_score_ 

0.8044


In [659]:
predictions = rf.predict(X_test)
y_test = pd.read_csv('y_test.csv')
pred = pd.DataFrame(predictions, columns = [y_test.columns[1]])
del y_test['status_group']
y_test = pd.concat((y_test, pred), axis = 1)
y_test.to_csv('y_test.csv', sep=",", index = False)

ACTUAL SCORE: 0.7812

let's try dropping some of the lower-count columns...

In [667]:
for i in X_train.columns:
    if sum(X_train[i]) < 100:
        del X_train[i]
        del X_test[i]

In [668]:
print np.shape(X_train)
print np.shape(X_test)

(59400, 432)
(14850, 432)


In [669]:
from sklearn.ensemble import RandomForestClassifier

rf2 = RandomForestClassifier(criterion='entropy',
                                n_estimators=500,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)
                            
rf2.fit(X_train, y_train.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=True, random_state=1,
            verbose=0, warm_start=False)

In [671]:
print "%.4f" % rf2.oob_score_ 

0.7941


In [672]:
predictions = rf2.predict(X_test)
y_test = pd.read_csv('y_test.csv')
pred = pd.DataFrame(predictions, columns = [y_test.columns[1]])
del y_test['status_group']
y_test = pd.concat((y_test, pred), axis = 1)
y_test.to_csv('y_test.csv', sep=",", index = False)

ACTUAL SCORE: 0.7701

In [674]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [691]:
from sklearn.ensemble import AdaBoostClassifier

tree = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=1,
                              random_state=0)

ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=900, 
                         learning_rate=.5,
                         random_state=0)

In [694]:
scores = cross_val_score(ada, X_train, y_train.values.ravel())

In [695]:
scores.mean()

0.75670033670033676