Download the training and testing datasets to our local directory.

In [1]:
import requests
import csv

urls = {
        'X_train' : "https://s3.amazonaws.com/drivendata/data/7/public/4910797b-ee55-40a7-8668-10efd5c1b960.csv",
        'y_train' : "https://s3.amazonaws.com/drivendata/data/7/public/0bf8bc6e-30d0-4c50-956a-603fc693d966.csv",
        'X_test' : "https://s3.amazonaws.com/drivendata/data/7/public/702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv",
        'y_test' : "https://s3.amazonaws.com/drivendata/data/7/public/SubmissionFormat.csv"
        }

for i in urls:
    r = requests.get(urls[i])

    text = r.iter_lines()

    reader = csv.reader(text, delimiter=',')

    mylist = list(reader)

    with open(str(i)+'.csv', 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        data = mylist
        a.writerows(data)

Load our train and test datasets into pandas.

In [2]:
import pandas as pd
import numpy as np

In [153]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')

Let's inspect our target variable:

In [154]:
del y_train['id']

In [155]:
y_train['status_group'].value_counts(normalize = True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

In [6]:
for i in X_train.columns:
    print i,',' ,'{:.1%}'.format(np.mean(X_train[i].isnull())),'nulls',',',type(X_train[i][0]), X_train[i].nunique()

id , 0.0% nulls , <type 'numpy.int64'> 59400
amount_tsh , 0.0% nulls , <type 'numpy.float64'> 98
date_recorded , 0.0% nulls , <type 'str'> 356
funder , 6.1% nulls , <type 'str'> 1897
gps_height , 0.0% nulls , <type 'numpy.int64'> 2428
installer , 6.2% nulls , <type 'str'> 2145
longitude , 0.0% nulls , <type 'numpy.float64'> 57516
latitude , 0.0% nulls , <type 'numpy.float64'> 57517
wpt_name , 0.0% nulls , <type 'str'> 37400
num_private , 0.0% nulls , <type 'numpy.int64'> 65
basin , 0.0% nulls , <type 'str'> 9
subvillage , 0.6% nulls , <type 'str'> 19287
region , 0.0% nulls , <type 'str'> 21
region_code , 0.0% nulls , <type 'numpy.int64'> 27
district_code , 0.0% nulls , <type 'numpy.int64'> 20
lga , 0.0% nulls , <type 'str'> 125
ward , 0.0% nulls , <type 'str'> 2092
population , 0.0% nulls , <type 'numpy.int64'> 1049
public_meeting , 5.6% nulls , <type 'bool'> 2
recorded_by , 0.0% nulls , <type 'str'> 1
scheme_management , 6.5% nulls , <type 'str'> 12
scheme_name , 47.4% nulls , <type '

In [459]:
X_train.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,0.474141,15.297003,5.629747,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,12.23623,17.587406,9.633649,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


## Variable Clean-up

### Variables to delete

In [7]:
def removal():
    # id: we drop the id column because it is not a useful predictor.
    # 'amount_tsh' is mostly blank - delete
    # construction_year: we will NOT YET delete this column since ~35% of the values are zeros.
    # wpt_name: not useful, delete (too many values)
    # subvillage: too many values, delete
    # scheme_name: this is almost 50% nulls, so we will delete this column
    # num_private: we will delete this column because ~99% of the values are zeros.
    # region: drop this b/c is seems very similar to region_code, though not 100% sure about this one!
    z = ['id', 'amount_tsh', 'num_private', 'wpt_name', 
          'subvillage', 'scheme_name', 'region', 'extraction_type_group',
         'payment', 'quality_group', 'quantity_group', 'source_type', 'waterpoint_type_group', 'ward']
    for i in z:
        del X_train[i]
        del X_test[i]
    return

construction year: let's try filling in the nulls with a given value

In [50]:
def construction():
    for i in [X_train, X_test]:
        i['construction_year'].replace(0, X_train['construction_year'].mean(), inplace=True)

date_recorded: this might be a useful variable for this analysis, although the year itself would be useless in a practical scenario moving into the future. We will convert this column into a datetime, and we will also create 'year_recorded' and 'month_recorded' columns just in case those levels prove to be useful. A visual inspection of both casts significant doubt on that possibility, but we'll proceed for now. We will delete date_recorded itself, since random forest cannot accept datetime

In [9]:
def dates():
    for i in [X_train, X_test]:
        i['date_recorded'] = pd.to_datetime(i['date_recorded'])
        i['date_recorded'] = pd.to_datetime(i['date_recorded'])
        i['year_recorded'] = i['date_recorded'].apply(lambda x: x.year)
        i['month_recorded'] = i['date_recorded'].apply(lambda x: x.month)
        del i['date_recorded']
    return

`gps_height`, `latitude`, `longitude`

In [113]:
def locs():
    trans = ['longitude', 'latitude', 'gps_height', 'population']
    for i in [X_train, X_test]:
        i.loc[i.longitude == 0, 'latitude'] = 0
        for z in trans:
            i[z].replace(0., np.NaN, inplace = True)
            i[z].replace(1., np.NaN, inplace = True)
            data = X_train.groupby(['subvillage'])[z]
            i[z] = data.transform(lambda x: x.fillna(x.mean()))
            data = X_train.groupby(['district_code'])[z]
            i[z] = data.transform(lambda x: x.fillna(x.mean()))
            data = X_train.groupby(['basin'])[z]
            i[z] = data.transform(lambda x: x.fillna(x.mean()))
            i[z] = i[z].fillna(X_train[z].mean())
    return

In [11]:
# public_meeting: we will fill the nulls as 'False'
# permit: we will fill the nulls as 'False'

def bools():
    z = ['public_meeting', 'permit']
    for i in z:
        X_train[i].fillna(False, inplace = True)
        X_train[i] = X_train[i].apply(lambda x: float(x))
        X_test[i].fillna(False, inplace = True)
        X_test[i] = X_test[i].apply(lambda x: float(x))
    return

string columns

In [12]:
def codes():
    # convert region_code and district_code to string objects, since they are actually categorical variables
    for i in ['region_code', 'district_code']:
        X_train[i] = X_train[i].apply(lambda x: str(x))
        X_test[i] = X_test[i].apply(lambda x: str(x))
    return

In [14]:
def dummies():
    global X_train, X_test
    columns = [i for i in X_train.columns if type(X_train[i].iloc[1]) == str]
    for column in columns:
        good_cols = []
        X_train[column].fillna('NULL', inplace = True)
        dumms = pd.get_dummies(X_train[column], prefix = column+'_')
        for i in dumms.columns:
        #    if chi2_contingency(pd.crosstab(dumms[i], y_train['status_group']))[1] < .001:
            good_cols.append(i)
        good_cols = [i for i in good_cols if i in pd.get_dummies(X_test[column], prefix = column+'_').columns]
        X_train = pd.concat((X_train, pd.get_dummies(X_train[column], prefix = column+'_')[good_cols]), axis = 1)
        X_test = pd.concat((X_test, pd.get_dummies(X_test[column], prefix = column+'_')[good_cols]), axis = 1)
        del X_train[column]
        del X_test[column]
    return

In [34]:
def dummies3():
    global X_train, X_test
    columns = [i for i in X_train.columns if type(X_train[i].iloc[1]) == str]
    status = pd.get_dummies(y_train['status_group'])
    for column in columns:
        good_cols = []
        X_train[column].fillna('NULL', inplace = True)
        dumms = pd.get_dummies(X_train[column], prefix = column+'_')
        for i in dumms.columns:
            if status[dumms[i] == 1]['functional'].mean() > (status['functional'].mean() + .1):
                good_cols.append(i)
            elif status[dumms[i] == 1]['non functional'].mean() > (status['non functional'].mean() + .1):
                good_cols.append(i)
            elif status[dumms[i] == 1]['functional needs repair'].mean() > (status['functional needs repair'].mean() + .1):
                good_cols.append(i)
        good_cols = [i for i in good_cols if i in pd.get_dummies(X_test[column], prefix = column+'_').columns]
        X_train = pd.concat((X_train, pd.get_dummies(X_train[column], prefix = column+'_')[good_cols]), axis = 1)
        X_test = pd.concat((X_test, pd.get_dummies(X_test[column], prefix = column+'_')[good_cols]), axis = 1)
        del X_train[column]
        del X_test[column]
    return

In [15]:
def dummies2():
    global X_train, X_test
    columns = [i for i in X_train.columns if type(X_train[i].iloc[1]) == str]
    status = pd.get_dummies(y_train['status_group'])
    for column in columns:
        func = []
        non_func = []
        repair = []
        X_train[column].fillna('NULL', inplace = True)
        dumms = pd.get_dummies(X_train[column], prefix = column+'_')
        for i in dumms.columns:
            if status[dumms[i] == 1]['functional'].mean() > (status['functional'].mean() + .1):
                func.append(i)
            elif status[dumms[i] == 1]['non functional'].mean() > (status['non functional'].mean() + .1):
                non_func.append(i)
            elif status[dumms[i] == 1]['functional needs repair'].mean() > (status['functional needs repair'].mean() + .1):
                repair.append(i)
        func = [i for i in func if i in pd.get_dummies(X_test[column], prefix = column+'_').columns]
        non_func = [i for i in non_func if i in pd.get_dummies(X_test[column], prefix = column+'_').columns]
        repair = [i for i in repair if i in pd.get_dummies(X_test[column], prefix = column+'_').columns]
        
        for i in [X_train, X_test]:
            if len(func) > 0:
                i['func_'+column] = pd.get_dummies(i[column], prefix = column+'_')[func].max(axis=1)
            if len(non_func) > 0:    
                i['non_func_'+column] = pd.get_dummies(i[column], prefix = column+'_')[non_func].max(axis=1)
            if len(repair) > 0:
                i['repair_'+column] = pd.get_dummies(i[column], prefix = column+'_')[repair].max(axis=1)
        del X_train[column]
        del X_test[column]
    return

In [156]:
construction()
locs()
dates()
bools()
codes()
removal()
dummies()

In [157]:
print np.shape(X_train)
print np.shape(X_test)

(59400, 1845)
(14850, 1845)


### Run Random Forest classifier

In [158]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(criterion='entropy',
                                n_estimators=1000,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)
                            
rf.fit(X_train, y_train.values.ravel())
print "%.4f" % rf.oob_score_ 

0.8083


In [159]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf, X_train, y_train.values.ravel())
scores.mean()

0.80099326599326603

### Generate submission file

In [161]:
predictions = rf.predict(X_test)
y_test = pd.read_csv('y_test.csv')
pred = pd.DataFrame(predictions, columns = [y_test.columns[1]])
del y_test['status_group']
y_test = pd.concat((y_test, pred), axis = 1)
y_test.to_csv('y_test.csv', sep=",", index = False)

In [674]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [850]:
from sklearn.ensemble import AdaBoostClassifier

tree = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=1,
                              random_state=0)

ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=100, 
                         learning_rate=.5,
                         random_state=0)

In [851]:
scores = cross_val_score(ada, X_train, y_train.values.ravel())

In [852]:
scores.mean()

0.73193602693602688

In [786]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_range = [0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{'C': param_range}]

gs = GridSearchCV(estimator=LogisticRegression(random_state=0),
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

scores = cross_val_score(gs, X_train, y_train.values.ravel(), scoring='accuracy', cv=5)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy: 0.747 +/- 0.004


In [None]:
from sklearn.svm import SVC

param_range = [0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [{'C': param_range, 
               'kernel': ['rbf']}]

gs = GridSearchCV(estimator=SVC(),
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

scores = cross_val_score(gs, X_train, y_train.values.ravel(), scoring='accuracy', cv=5)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [27]:
nonbi = []
for i in X_train.columns:
    if X_train[i].nunique() > 2:
        nonbi.append(i)

for i in nonbi:
    del X_train[i]

In [33]:
from sklearn.naive_bayes import BernoulliNB

nb = BernoulliNB()

scores = cross_val_score(nb, X_train, y_train.values.ravel())
scores.mean()

0.70023569023569021