# Prepare data (just for materialHardship)

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# from missing_values import final_over_balanced, df_train, median_impute_df_train, median_impute_df_train

In [2]:
mean_impute_df_train, median_impute_df_train, final_over_balanced = pd.read_pickle('mean_impute_df_train'), pd.read_pickle('median_impute_df_train'), pd.read_pickle('final_over_balanced')

In [3]:
final_over_balanced.shape

(5668, 11986)

See missing_values notebook for full explanation of the preceding dataframes. We'll do some testing here to see which seems to work best on validation data.

In [4]:
np.random.seed(1234)
print('reading in csv files')
# File read in
# See documentation for more descriptions
# http://www.fragilefamilieschallenge.org/apply/
background = pd.read_csv('background.csv', low_memory=False)
print('read background.csv')
train = pd.read_csv('train.csv', low_memory=False)
print('read train.csv')
prediction = pd.read_csv('prediction.csv', low_memory=False)
print('read prediction.csv')
print("Files read")

reading in csv files
read background.csv
read train.csv
read prediction.csv
Files read


Background is super WIDE. We have only 4242 IDs, but almost 13k features!

In [5]:
background.shape

(4242, 12943)

OK...so they want predictions for...everything. So we are going to make a **training** set using the rows of background that have non-NA values for materialHardship.

Then we need to make predictions for every challengeID that is a) not train or b) is in train by has NA for materialHardship 

In [35]:
all_y_train_materialHardship = train[['challengeID', 'materialHardship']]
# non NA y_train ds and data
non_na_y_train_materialHardship = all_y_train_materialHardship.dropna()
non_na_y_train_materialHardship.head()

Unnamed: 0,challengeID,materialHardship
2,6,0.090909
3,7,0.0
5,9,0.181818
6,10,0.090909
7,13,0.181818


In [34]:
# y_train_ids that are na
na_y_train_materialHardship = all_y_train_materialHardship.loc[~all_y_train_materialHardship['challengeID'].isin(non_na_y_train_materialHardship['challengeID'])]
na_y_train_materialHardship.head()

Unnamed: 0,challengeID,materialHardship
0,1,
1,3,
4,8,
14,28,
17,36,


Our training set will have all ids that we have non-NA material hardship data for. 

In [19]:
df_train = background.loc[background['challengeID'].isin(non_na_y_train_materialHardship['challengeID'])]
df_train.head()

Unnamed: 0,challengeID,m1intmon,m1intyr,m1lenhr,m1lenmin,cm1twoc,cm1fint,cm1tdiff,cm1natsm,m1natwt,...,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,m5k10,f5c6,k5f1
5,6,-3,,0,25,,1,,,,...,8.5157,10.558813,-3.0,-3.0,7.022328,-3.0,10.564085,-3,-3.0,10.255825
6,7,-3,,0,35,,1,,,,...,-3.0,-3.0,9.660643,9.861125,-3.0,10.991854,-3.0,-3,10.972726,10.8598
8,9,-3,,0,30,,1,,,,...,-3.0,-3.0,11.689877,9.373199,-3.0,8.194868,-3.0,-3,9.84238,9.566678
9,10,-3,,0,33,,1,,,,...,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,10.564085,-3,-3.0,10.10587
12,13,-3,,1,-3,,1,,,,...,10.78074,-3.0,-3.0,-3.0,8.944119,-3.0,9.781718,-3,-3.0,9.566678


In [32]:
# prediction set
# all ids in background, excluding train_ids
background_non_train_ids = background.loc[~background['challengeID'].isin(train['challengeID'])]

# add back in ids that had NA in material hardship train set
background_mh_nas = background.loc[background['challengeID'].isin(na_y_train_materialHardship['challengeID'])]
# background_mh_nas.head()

# combine 
df_prediction = background_non_train_ids.append(background_mh_nas)
df_prediction.sort_values(by='challengeID').head(10)

Unnamed: 0,challengeID,m1intmon,m1intyr,m1lenhr,m1lenmin,cm1twoc,cm1fint,cm1tdiff,cm1natsm,m1natwt,...,m4d9,m4e23,f4d6,f4d7,f4d9,m5c6,m5d20,m5k10,f5c6,k5f1
0,1,-3,,-3,40,,0,,,,...,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3,-3.0,-3.0
1,2,-3,,0,40,,1,,,,...,-3.0,8.473318,-3.0,-3.0,-3.0,-3.0,9.845074,-3,-3.0,9.723551
2,3,-3,,0,35,,1,,,,...,-3.0,-3.0,9.097495,10.071504,-3.0,-3.0,-3.0,-3,-3.0,-3.0
3,4,-3,,0,30,,1,,,,...,-3.0,-3.0,9.512706,10.286578,-3.0,10.677285,-3.0,-3,8.522331,10.608137
4,5,-3,,0,25,,1,,,,...,-3.0,-3.0,11.076016,9.615958,-3.0,9.731979,-3.0,-3,10.115313,9.646466
7,8,-3,,1,10,,1,,,,...,-3.0,10.558813,-3.0,-3.0,-3.0,-3.0,-3.0,-3,-3.0,-3.0
10,11,-3,,0,40,,1,,,,...,-3.0,10.558813,-3.0,-3.0,-3.0,-3.0,-3.0,-3,-3.0,10.724639
11,12,-3,,0,23,,1,,,,...,11.548771,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,-3,-3.0,9.566678
14,15,-3,,0,25,,1,,,,...,-3.0,-3.0,-3.0,-3.0,-3.0,-3.0,10.085272,-3,-3.0,10.255825
16,17,-3,,-3,32,,1,,,,...,-3.0,-3.0,-3.0,-3.0,-3.0,10.991854,-3.0,-3,10.972726,10.415491


SO, we have the data that we will train on:
    
    - 'df_train' with labels 'non_na_y_train_materialHardship'
    - We'll use this model to predict output (materialHardship) values for 'df_prediction'

# XGBoost Model Build

In [37]:
###############
# XGBoost train
###############

# Only include columns with DataFrame.dtypes
# that are int, float or bool.
print('drop non xgb data types')
df_train_mh_good_dtypes = df_train.select_dtypes(include=(int, float, bool))
df_test_mh_good_dtypes = df_prediction.select_dtypes(include=(int, float, bool))

drop non xgb data types


In [38]:
# Get rid of challengeID column for training
y_train_mh = non_na_y_train_materialHardship['materialHardship']
y_train_mh.head()

2    0.090909
3    0.000000
5    0.181818
6    0.090909
7    0.181818
Name: materialHardship, dtype: float64

In [39]:
# Create validation set
x_train, x_valid, y_train, y_valid = train_test_split(df_train_mh_good_dtypes, y_train_mh, test_size=0.2)
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
print('validation and training data ready')

validation and training data ready


In [40]:
# Set our parameters for xgboost
params = {}
params['objective'] = 'reg:logistic'
params['eval_metric'] = 'rmse'
params['eta'] = 0.02
params['max_depth'] = 20

watchlist = [(d_train, 'train'), (d_valid, 'valid')]


In [42]:
# XGB train
# model = xgb.XGBClassifier()
print('train xgb')
# bst = model.fit(df_test_mh, y_train_mh)
bst = xgb.train(params, d_train, 5, watchlist, early_stopping_rounds=50, verbose_eval=10)


train xgb
[0]	train-rmse:0.41714	valid-rmse:0.421895
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[4]	train-rmse:0.387825	valid-rmse:0.395099


In [43]:
# Feature importances df
d = bst.get_fscore()  # print feature scores
feature = []
f_importance = []
for w in sorted(d, key=d.get, reverse=True):
    feature.append(w)
    f_importance.append(d[w])

feature_import_df = pd.DataFrame(
    {'feature': feature,
     'feature importance': f_importance,
    })

feature_import_df.head(10)

Unnamed: 0,feature,feature importance
0,m2b5c1,6
1,f2h8c2,5
2,m4k24e,5
3,f2k12a,5
4,m5f23l,5
5,f5a6a,4
6,m3c35,4
7,f2k9b,4
8,m3k27b,4
9,hv4f1j,4


In [44]:
print('predict with xgb')
d_test = xgb.DMatrix(df_test_mh_good_dtypes)
p_test = bst.predict(d_test)

predict with xgb


In [46]:
pd.DataFrame(p_test).head()

Unnamed: 0,0
0,0.457971
1,0.457945
2,0.457078
3,0.457945
4,0.483691


# Create Submission file

In [50]:
# Need to add train labels to submission too, I think
print('create submission')
preds = pd.DataFrame()
preds['challengeID'] = df_prediction['challengeID']
preds['materialHardship'] = p_test
preds.sort_values(by='challengeID').head(10)

create submission


Unnamed: 0,challengeID,materialHardship
0,1,0.457971
1,2,0.457971
2,3,0.464431
3,4,0.457945
4,5,0.457078
7,8,0.464091
10,11,0.457945
11,12,0.483691
14,15,0.457078
16,17,0.457078


In [58]:
# tack on given train values
sub = preds.append(non_na_y_train_materialHardship)
sub.sort_values(by='challengeID').head(15)

# sub.to_csv('simple_xgb.csv', index=False)

Unnamed: 0,challengeID,materialHardship
0,1,0.457971
1,2,0.457971
2,3,0.464431
3,4,0.457945
4,5,0.457078
2,6,0.090909
3,7,0.0
7,8,0.464091
5,9,0.181818
6,10,0.090909


Here, we send in a submission that predicts all missing values (both missing MH values from train set and also values for IDs that were never in train to start). 