# Prepare data (just for materialHardship)

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
# import matplotlib.pyplot as plt

# from missing_values import final_over_balanced, df_train, median_impute_df_train, median_impute_df_train

In [212]:
# Load the candidate training frames (see the missing_values notebook for
# how each one was built).
mean_impute_df_train = pd.read_pickle('mean_impute_df_train')
median_impute_df_train = pd.read_pickle('median_impute_df_train')
final_over_balanced = pd.read_pickle('final_over_balanced')

In [185]:
# Training labels; the 'materialHardship' column is used as the target below.
non_na_y_train_materialHardship = pd.read_pickle('non_na_y_train_materialHardship')

In [186]:
# Rows to score; supplies 'challengeID' and the feature columns used later.
df_prediction = pd.read_pickle('df_prediction')

See missing_values notebook for full explanation of the preceding dataframes. We'll do some testing here to see which seems to work best on validation data.

# XGBoost Model Build - median imputation

Here, we use "good dtypes" and impute missing values to predict missing hardship on 0-1 scale using decision trees and an objective of logistic regression.

In [213]:
# Hold out 20% of the median-imputed training rows for validation.
# A fixed random_state keeps the split (and therefore every score reported
# below) reproducible across kernel restarts.
x_train, x_valid, y_train, y_valid = train_test_split(
    median_impute_df_train,
    non_na_y_train_materialHardship['materialHardship'],
    test_size=0.2,
    random_state=42,
)
# Wrap both partitions in xgboost DMatrix objects for xgb.train.
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
print('validation and training data ready')

validation and training data ready


We need to be clear on what columns we will want in our prediction set. So below, I take the prediction columns that make it into the x_train set. There are some that are excluded because of dtype or NAs.

In [7]:
# Restrict the prediction frame to exactly the feature columns used for training.
df_prediction_good_cols = df_prediction.loc[:, x_train.columns.values]

In [8]:
# Booster hyper-parameters: logistic-regression objective scored by RMSE,
# a learning rate (eta) of 0.02 and trees up to 20 levels deep.
params = {
    'objective': 'reg:logistic',
    'eval_metric': 'rmse',
    'eta': 0.02,
    'max_depth': 20,
}

# Evaluation sets reported during training, labelled 'train' and 'valid'.
watchlist = [
    (d_train, 'train'),
    (d_valid, 'valid'),
]

In [124]:
# Fit the booster: at most 200 rounds, stopping once the monitored metric has
# not improved for 50 rounds, and logging every 10th round.
print('train xgb')
bst = xgb.train(
    params,
    d_train,
    num_boost_round=200,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

train xgb
[0]	train-rmse:0.418423	valid-rmse:0.416287
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[10]	train-rmse:0.348895	valid-rmse:0.352355
[20]	train-rmse:0.291984	valid-rmse:0.301555
[30]	train-rmse:0.245217	valid-rmse:0.262343
[40]	train-rmse:0.206623	valid-rmse:0.230887
[50]	train-rmse:0.174954	valid-rmse:0.206442
[60]	train-rmse:0.149035	valid-rmse:0.188827
[70]	train-rmse:0.127659	valid-rmse:0.17615
[80]	train-rmse:0.11	valid-rmse:0.167096
[90]	train-rmse:0.095386	valid-rmse:0.160258
[100]	train-rmse:0.083191	valid-rmse:0.155774
[110]	train-rmse:0.073105	valid-rmse:0.152577
[120]	train-rmse:0.06485	valid-rmse:0.150967
[130]	train-rmse:0.057856	valid-rmse:0.149973
[140]	train-rmse:0.052047	valid-rmse:0.149379
[150]	train-rmse:0.047143	valid-rmse:0.149116
[160]	train-rmse:0.043036	valid-rmse:0.149133
[170]	train-rmse:0.039461	valid-rmse:0.149065
[180]	train-rmse:0.036432	valid-rm

In [152]:
# Feature importances df
# d = bst.get_fscore()  # print feature scores
# feature = []
# f_importance = []
# for w in sorted(d, key=d.get, reverse=True):
#     feature.append(w)
#     f_importance.append(d[w])

# feature_import_df = pd.DataFrame(
#     {'feature': feature,
#      'feature importance': f_importance,
#     })

# feature_import_df.head(10)

In [126]:
print('predict with xgb')
# Score the column-matched prediction rows with the trained booster.
d_test = xgb.DMatrix(data=df_prediction_good_cols)
p_test = bst.predict(d_test)

predict with xgb


In [127]:
# Peek at the first five predictions.
pd.DataFrame(data=p_test).head(n=5)

Unnamed: 0,0
0,0.228553
1,0.208423
2,0.192934
3,0.217176
4,0.100321


In [128]:
# Check how many predictions were produced.
pd.DataFrame(data=p_test).shape

(2783, 1)

# Create Submission file

In [129]:
# The known training labels still need to be added to the submission
# (done in the next cell).
print('create submission')
# Pair every challengeID in the prediction frame with its predicted score.
preds = pd.DataFrame({
    'challengeID': df_prediction['challengeID'],
    'materialHardship': p_test,
})
preds.sort_values(by='challengeID').head(10)

create submission


Unnamed: 0,challengeID,materialHardship
0,1,0.228553
1,2,0.208423
2,3,0.192934
3,4,0.217176
4,5,0.100321
7,8,0.186167
10,11,0.210725
11,12,0.084569
14,15,0.121868
16,17,0.103562


In [131]:
# Stack the rows of non_na_y_train_materialHardship underneath the
# predictions so the submission carries both.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with its default arguments performs the same row-wise stacking.
sub = pd.concat([preds, non_na_y_train_materialHardship])

# Write the combined frame without the index column.
sub.to_csv('simple_xgb.csv', index=False)

Here, we send in a submission that predicts all missing values (both missing MH values from train set and also values for IDs that were never in train to start). 

# Create another model with balanced data. How will this do?

In [214]:
# Pull the target column out of the balanced frame so that only features
# remain in final_over_balanced.
# pop() mutates the frame in place, so guard it: re-running this cell after
# the column has already been removed keeps the existing `mh` instead of
# raising a KeyError.
if 'materialHardship' in final_over_balanced.columns:
    mh = final_over_balanced.pop('materialHardship')
mh.head()

0    1
1    0
2    2
3    1
4    2
Name: materialHardship, dtype: object

In [215]:
# Split the balanced data, holding out 20%.
# A fixed random_state keeps the partition reproducible across kernel restarts.
x_train_bal, x_valid_bal, y_train_bal, y_valid_bal = train_test_split(
    final_over_balanced,
    mh,
    test_size=0.2,
    random_state=42,
)
# mh is stored with object dtype, so cast to float explicitly before scaling;
# dividing by 11 brings the labels into the 0-1 range.
d_train = xgb.DMatrix(x_train_bal, label=y_train_bal.astype(float) / 11)

In [216]:
# Progress marker: the balanced training matrix has been built at this point
# (the matching validation matrix is created in a later cell).
print("validation and training data ready")

validation and training data ready


In [219]:
# Restrict the validation features from the earlier median-imputation split
# to the columns present in the balanced training frame.
x_valid_matched = x_valid.loc[:, x_train_bal.columns.values]

Above, we restrict the validation features to the columns we train on so we can evaluate the balanced model on them. The prediction frame is matched to the same columns further below.

In [220]:
# Validation labels come from the earlier median-imputation split and are
# passed through as-is (no division by 11 is applied here).
# NOTE(review): if final_over_balanced was derived from the same rows as
# x_valid, validation rows may also appear in the balanced training data,
# which would make valid-rmse optimistic - confirm.
d_valid = xgb.DMatrix(data=x_valid_matched, label=y_valid)

In [221]:
# Booster hyper-parameters: logistic-regression objective scored by RMSE,
# a learning rate (eta) of 0.02 and trees up to 20 levels deep.
params = {
    'objective': 'reg:logistic',
    'eval_metric': 'rmse',
    'eta': 0.02,
    'max_depth': 20,
}

# Evaluation sets reported during training, labelled 'train' and 'valid'.
watchlist = [
    (d_train, 'train'),
    (d_valid, 'valid'),
]

# Fit the booster: at most 200 rounds, stopping once the monitored metric has
# not improved for 30 rounds, and logging every 10th round.
print('train xgb')
bst = xgb.train(
    params,
    d_train,
    num_boost_round=200,
    evals=watchlist,
    early_stopping_rounds=30,
    verbose_eval=10,
)

train xgb
[0]	train-rmse:0.285918	valid-rmse:0.417607
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 30 rounds.
[10]	train-rmse:0.238287	valid-rmse:0.352763
[20]	train-rmse:0.199262	valid-rmse:0.299537
[30]	train-rmse:0.167175	valid-rmse:0.25524
[40]	train-rmse:0.140774	valid-rmse:0.218636
[50]	train-rmse:0.118972	valid-rmse:0.188779
[60]	train-rmse:0.100983	valid-rmse:0.163795
[70]	train-rmse:0.086177	valid-rmse:0.143259
[80]	train-rmse:0.073904	valid-rmse:0.126565
[90]	train-rmse:0.063751	valid-rmse:0.112856
[100]	train-rmse:0.055348	valid-rmse:0.102085
[110]	train-rmse:0.048345	valid-rmse:0.093159
[120]	train-rmse:0.042504	valid-rmse:0.085623
[130]	train-rmse:0.037614	valid-rmse:0.079452
[140]	train-rmse:0.033484	valid-rmse:0.074847
[150]	train-rmse:0.029994	valid-rmse:0.071361
[160]	train-rmse:0.027079	valid-rmse:0.068714
[170]	train-rmse:0.024641	valid-rmse:0.066611
[180]	train-rmse:0.022537	val

In [224]:
import pickle

# Persist the trained booster to disk. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises.
with open("xgb.pickle.dat", "wb") as model_file:
    pickle.dump(bst, model_file)

In [225]:
# Restrict the prediction frame to exactly the feature columns of the
# balanced training data.
df_prediction_good_cols = df_prediction.loc[:, x_train_bal.columns.values]

In [226]:
print('predict with xgb')
# Score the column-matched prediction rows with the trained booster.
d_test = xgb.DMatrix(data=df_prediction_good_cols)
p_test = bst.predict(d_test)

predict with xgb


In [227]:
# Peek at the first five predictions.
pd.DataFrame(data=p_test).head(n=5)

Exception ignored in: <bound method Booster.__del__ of <xgboost.core.Booster object at 0x11fbd8da0>>
Traceback (most recent call last):
  File "/Users/wnowak/Downloads/xgboost-0.6a2/xgboost/xgboost/python-package/xgboost/core.py", line 669, in __del__
    _LIB.XGBoosterFree(self.handle)
AttributeError: 'Booster' object has no attribute 'handle'


Unnamed: 0,0
0,0.174514
1,0.14691
2,0.129642
3,0.208358
4,0.107629


In [228]:
# The known training labels still need to be added to the submission
# (done in the next cell).
print('create submission')
# Pair every challengeID in the prediction frame with its predicted score.
preds = pd.DataFrame({
    'challengeID': df_prediction['challengeID'],
    'materialHardship': p_test,
})
preds.sort_values(by='challengeID').head(10)

create submission


Unnamed: 0,challengeID,materialHardship
0,1,0.174514
1,2,0.14691
2,3,0.129642
3,4,0.208358
4,5,0.107629
7,8,0.155597
10,11,0.15137
11,12,0.132926
14,15,0.083359
16,17,0.069868


In [229]:
# Stack the rows of non_na_y_train_materialHardship underneath the
# predictions so the submission carries both.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with its default arguments performs the same row-wise stacking.
sub = pd.concat([preds, non_na_y_train_materialHardship])
sub.head()

Unnamed: 0,challengeID,materialHardship
0,1,0.174514
1,2,0.14691
2,3,0.129642
3,4,0.208358
4,5,0.107629


In [230]:
# Write the combined frame to disk without the index column.
sub.to_csv(path_or_buf='xgb_balance.csv', index=False)