In [1]:
# ff.py for demo purposes

In [18]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt

import timeit

start = timeit.timeit()

In [3]:
np.random.seed(1234)
print('reading in csv files')
# File read in
# See documentation for more descriptions
# http://www.fragilefamilieschallenge.org/apply/
background = pd.read_csv('background.csv', low_memory=False)
print('read background.csv')
train = pd.read_csv('train.csv', low_memory=False)
print('read train.csv')
prediction = pd.read_csv('prediction.csv', low_memory=False)
print('read prediction.csv')
print("Files read")

reading in csv files
read background.csv
read train.csv
read prediction.csv
Files read


In [20]:
# Select each column individually from train df
# y_train_grit = train[['challengeID', 'grit']]
# y_train_gpa = train[['challengeID', 'gpa']]
y_train_materialHardship = train[['challengeID', 'materialHardship']]
# y_train_eviction = train[['challengeID', 'eviction']]
# y_train_layoff = train[['challengeID', 'layoff']]
# y_train_jobTraining = train[['challengeID', 'jobTraining']]
y_train_materialHardship.head(30)

Unnamed: 0,challengeID,materialHardship
0,1,
1,3,
2,6,0.090909
3,7,0.0
4,8,
5,9,0.181818
6,10,0.090909
7,13,0.181818
8,14,0.272727
9,16,0.090909


In [5]:


# Let's try to just predict materialHardship first
#########################################
########### materialHardship #############
#########################################

# Get rid of NAs in material hardship.
# There are individuals in the training set
# (which contains 6 total vars) that have NA
# for certain of the variables...so let's eliminate.
y_train_materialHardship = y_train_materialHardship.dropna()


# Create train df with rows in y_train
# Subset background df to do so.
df_train_mh = background.loc[background['challengeID'].isin(y_train_materialHardship['challengeID'])]

# Create a test dataframe too. This df contains
# individuals who are NOT in the training set.
# We will need to make predictions on these.
df_test_mh = background.loc[~background['challengeID'].isin(y_train_materialHardship['challengeID'])]



In [6]:



###############
# XGBoost train
###############

# Only include columns with DataFrame.dtypes
# that are int, float or bool.
print('drop non xgb data types')
df_train_mh_good_dtypes = df_train_mh.select_dtypes(include=(int, float, bool))

df_test_mh_good_dtypes = df_test_mh.select_dtypes(include=(int, float, bool))

# Get rid of challengeID column for training
y_train_mh = y_train_materialHardship['materialHardship']


drop non xgb data types


In [7]:

# Create validation set
x_train, x_valid, y_train, y_valid = train_test_split(df_train_mh_good_dtypes, y_train_mh, test_size=0.2)
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
print('validation and training data ready')

validation and training data ready


In [8]:

# Set our parameters for xgboost
params = {}
params['objective'] = 'reg:logistic'
params['eval_metric'] = 'rmse'
params['eta'] = 0.02
params['max_depth'] = 20

watchlist = [(d_train, 'train'), (d_valid, 'valid')]


In [21]:


# XGB train
# model = xgb.XGBClassifier()
print('train xgb')
tqdm.pandas()
# bst = model.fit(df_test_mh, y_train_mh)
bst = xgb.train(params, d_train, 100, watchlist, early_stopping_rounds=50, verbose_eval=10)


train xgb
[0]	train-rmse:0.41714	valid-rmse:0.421895
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[10]	train-rmse:0.347951	valid-rmse:0.358895
[20]	train-rmse:0.29121	valid-rmse:0.308537
[30]	train-rmse:0.244752	valid-rmse:0.268224
[40]	train-rmse:0.206586	valid-rmse:0.236487
[50]	train-rmse:0.175107	valid-rmse:0.211977
[60]	train-rmse:0.149115	valid-rmse:0.192985
[70]	train-rmse:0.127745	valid-rmse:0.17808
[80]	train-rmse:0.109966	valid-rmse:0.166582
[90]	train-rmse:0.09515	valid-rmse:0.158094
[99]	train-rmse:0.08409	valid-rmse:0.152514


In [23]:
# Feature importances df
d = bst.get_fscore()  # print feature scores
feature = []
f_importance = []
for w in sorted(d, key=d.get, reverse=True):
    feature.append(w)
    f_importance.append(d[w])

feature_import_df = pd.DataFrame(
    {'feature': feature,
     'feature importance': f_importance,
    })

feature_import_df.head(10)

Unnamed: 0,feature,feature importance
0,m5f23e,66
1,m4i23o,64
2,m5f23a,54
3,m4d6,47
4,m2j14a,44
5,m5f23l,42
6,f2h8c2,37
7,kind_b1a,36
8,cm4hhinc,35
9,k5c6,35


In [12]:
print('predict with xgb')
tqdm.pandas()
d_test = xgb.DMatrix(df_test_mh_good_dtypes)
tqdm.pandas()
p_test = bst.predict(d_test)

predict with xgb


In [17]:


# Need to add train labels to submission too, I think
print('create submission')
preds = pd.DataFrame()
preds['challengeID'] = df_test_mh['challengeID']
preds['materialHardship'] = p_test
# tack on given train values
orig = pd.DataFrame()
orig['challengeID'] = y_train_materialHardship['challengeID']
orig['materialHardship'] = y_train_materialHardship['materialHardship']
sub = preds.append(orig)
sub.sort_values(by='challengeID')

sub.to_csv('simple_xgb.csv', index=False)



create submission
