# Prepare data (just for materialHardship)

In [6]:
import math

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
# import matplotlib.pyplot as plt

# from missing_values import final_over_balanced, df_train, median_impute_df_train, median_impute_df_train

In [60]:
# Load the three candidate training frames (see the missing_values notebook for how
# each one was built). All three pickles are read before any name is bound.
mean_impute_df_train, median_impute_df_train, final_over_balanced = map(
    pd.read_pickle,
    ('mean_impute_df_train', 'median_impute_df_train', 'final_over_balanced'),
)

In [61]:
# Label frame for the materialHardship outcome: its 'materialHardship' column is the
# training target below, and the whole frame is appended to the submission at the end.
non_na_y_train_materialHardship =  pd.read_pickle('non_na_y_train_materialHardship')

In [62]:
# Rows to predict: supplies the feature columns fed to the model and the
# 'challengeID' column used to key the submission.
df_prediction =  pd.read_pickle('df_prediction')

In [63]:
# Sanity-check the (rows, columns) dimensions of the over-balanced frame.
final_over_balanced.shape

(5668, 11986)

See the missing_values notebook for a full explanation of the preceding dataframes. We'll do some testing here to see which one seems to work best on validation data.

# XGBoost Model Build

In [64]:
# Create validation set: hold out 20% of the median_impute_df_train rows.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible on Restart & Run All; without it each run draws a different
# validation set and the reported RMSE values are not comparable.
SPLIT_SEED = 42
x_train, x_valid, y_train, y_valid = train_test_split(
    median_impute_df_train,
    non_na_y_train_materialHardship['materialHardship'],
    test_size=0.2,
    random_state=SPLIT_SEED,
)
# Wrap both partitions in DMatrix objects for the native xgboost training API.
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
print('validation and training data ready')

validation and training data ready


In [71]:
# Project the prediction frame onto exactly the training feature columns,
# in the same order as the training data.
df_prediction_good_cols = df_prediction[list(x_train.columns)]

In [72]:
# Booster settings for xgboost: 'reg:logistic' objective scored by RMSE,
# with a small learning rate (eta) and deep trees.
params = {
    'objective': 'reg:logistic',
    'eval_metric': 'rmse',
    'eta': 0.02,
    'max_depth': 20,
}

# Evaluation sets reported during training; the training log shows that
# 'valid-rmse' is the metric used for early stopping.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [73]:
# Fit the booster with the native xgboost training loop.
# NOTE(review): only 5 boosting rounds are requested, so the 50-round
# early-stopping patience can never trigger, and verbose_eval=10 ends up
# logging only the first and last round. Raise num_boost_round for a real fit.
print('train xgb')
bst = xgb.train(
    params,
    d_train,
    num_boost_round=5,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)


train xgb
[0]	train-rmse:0.418161	valid-rmse:0.417785
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[4]	train-rmse:0.388627	valid-rmse:0.391049


In [74]:
# Tabulate the booster's f-scores, highest first.
fscore_by_feature = bst.get_fscore()
ranked_features = sorted(fscore_by_feature, key=fscore_by_feature.get, reverse=True)

feature_import_df = pd.DataFrame({
    'feature': ranked_features,
    'feature importance': [fscore_by_feature[name] for name in ranked_features],
})

feature_import_df.head(10)

Unnamed: 0,feature,feature importance
0,m4i23o,5
1,m5f23e,4
2,m5f23l,4
3,hv3pvtwceil_m,3
4,p5q3bo,3
5,m4i16a_4,3
6,m5g18,3
7,hv3j16,3
8,m5f23a,3
9,m5b91,3


In [75]:
# Score the prediction rows with the trained booster.
print('predict with xgb')
# Built from the column-aligned frame so the features match the training columns.
d_test = xgb.DMatrix(df_prediction_good_cols)
# d_test
p_test = bst.predict(d_test)

In [76]:
# Peek at the first few raw predictions.
pd.DataFrame(p_test).head()

Unnamed: 0,0
0,0.479083
1,0.479083
2,0.479083
3,0.47188
4,0.47039


So the above works to train a simple model. But I want to see if I can use scikit-learn to do a hyper-parameter grid search. I found a cool Kaggle [kernel](https://www.kaggle.com/tanitter/introducing-kaggle-scripts/grid-search-xgboost-with-scikit-learn/run/23363) that might help.

In [None]:
# TO DO...work through
class XGBoostClassifier():
    """Minimal scikit-learn-style wrapper around the native ``xgb.train`` API.

    Provides ``fit`` / ``predict`` / ``predict_proba`` / ``score`` together with
    ``get_params`` / ``set_params`` so the model can be driven by tools such as
    ``GridSearchCV``. The objective is always forced to ``'multi:softprob'``.
    """

    def __init__(self, num_boost_round=10, **params):
        """Store the round count and the booster parameters.

        Args:
            num_boost_round: default number of boosting rounds used by ``fit``.
            **params: forwarded to ``xgb.train``; any ``objective`` supplied
                here is overwritten with ``'multi:softprob'``.
        """
        self.clf = None
        self.num_boost_round = num_boost_round
        self.params = params
        self.params.update({'objective': 'multi:softprob'})

    def fit(self, X, y, num_boost_round=None):
        """Train a booster on ``X`` with labels ``y``.

        Labels are mapped to consecutive integers in sorted order; the mapping
        is kept in ``self.label2num`` so ``predict`` can translate back.

        Args:
            X: feature matrix accepted by ``xgb.DMatrix``.
            y: iterable of hashable, sortable class labels.
            num_boost_round: optional override of the instance default
                (a falsy value falls back to the default).

        Returns:
            self, following the scikit-learn estimator convention.
        """
        num_boost_round = num_boost_round or self.num_boost_round
        self.label2num = dict((label, i) for i, label in enumerate(sorted(set(y))))
        dtrain = xgb.DMatrix(X, label=[self.label2num[label] for label in y])
        self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round)
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the original label with the highest score."""
        num2label = dict((i, label) for label, i in self.label2num.items())
        Y = self.predict_proba(X)
        y = np.argmax(Y, axis=1)
        return np.array([num2label[i] for i in y])

    def predict_proba(self, X):
        """Return the booster's raw prediction for ``X``.

        NOTE(review): with ``multi:softprob`` this is presumably one row of
        class probabilities per sample; confirm against the xgboost version in use.
        """
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)

    def score(self, X, y):
        """Return the reciprocal of the log loss on ``(X, y)``, so larger is better."""
        Y = self.predict_proba(X)
        return 1 / logloss(y, Y)

    def get_params(self, deep=True):
        """Return every constructor argument as a new dict.

        ``num_boost_round`` is included alongside the booster parameters so that
        ``XGBoostClassifier(**est.get_params())``, which is what
        ``sklearn.base.clone`` does, rebuilds an equivalent estimator instead of
        silently falling back to the default round count.
        """
        params = dict(self.params)
        params['num_boost_round'] = self.num_boost_round
        return params

    def set_params(self, **params):
        """Update parameters in place and return ``self``.

        ``num_boost_round`` is stored on the instance; ``objective`` is ignored
        because the wrapper always trains with ``'multi:softprob'``; everything
        else is merged into the booster parameters.
        """
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        if 'objective' in params:
            del params['objective']
        self.params.update(params)
        return self
    
    
def logloss(y_true, Y_pred):
    """Mean negative log-probability assigned to the true labels.

    Args:
        y_true: sequence of true class labels (hashable and sortable).
        Y_pred: sequence of per-sample score rows; ``Y_pred[k][j]`` is read as
            the probability of the j-th label in sorted order.

    Returns:
        The average of ``-log(p_true)`` over the samples. A sample whose true
        label has probability <= 0 contributes ``+inf``.

    Raises:
        ZeroDivisionError: if ``Y_pred`` is empty.

    NOTE(review): the label -> column mapping is rebuilt from ``y_true`` alone,
    so it only lines up with the model's columns when ``y_true`` contains every
    class the model was trained on; verify for small validation folds.
    """
    label2num = {name: i for i, name in enumerate(sorted(set(y_true)))}
    log_likelihood = sum(
        math.log(row[label2num[label]]) if row[label2num[label]] > 0 else -np.inf
        for row, label in zip(Y_pred, y_true)
    )
    return -1 * log_likelihood / len(Y_pred)


def main():
    """Smoke-test the wrapper: run a small grid search on a toy two-class dataset.

    Prints the best cross-validated score, the winning parameter values (sorted
    by name), and the prediction for one sample.
    """
    base_clf = XGBoostClassifier(
        eval_metric='auc',
        num_class=2,
        nthread=4,
        eta=0.1,
        num_boost_round=80,
        max_depth=12,
        subsample=0.5,
        colsample_bytree=1.0,
        silent=1,
    )
    parameters = {
        'num_boost_round': [100, 250, 500],
        'eta': [0.05, 0.1, 0.3],
        'max_depth': [6, 9, 12],
        'subsample': [0.9, 1.0],
        'colsample_bytree': [0.9, 1.0],
    }
    clf = GridSearchCV(base_clf, parameters, n_jobs=1, cv=2)

    clf.fit([[1, 2], [3, 4], [2, 1], [4, 3]], ['a', 'b', 'a', 'b'])
    # grid_scores_ no longer exists on GridSearchCV; best_params_ / best_score_
    # expose the same winning candidate (highest mean cross-validated score).
    best_parameters, score = clf.best_params_, clf.best_score_
    print(score)
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))

    print(clf.predict([[1, 2]]))


# Run the grid-search demo only when this code executes as the main module.
if __name__ == '__main__':
    main()

# Create Submission file

In [None]:
# Pair each prediction-row challengeID with its predicted materialHardship.
# (The known training labels probably need to be in the submission too.)
print('create submission')
preds = pd.DataFrame({
    'challengeID': df_prediction['challengeID'],
    'materialHardship': p_test,
})
preds.sort_values(by='challengeID').head(10)

In [None]:
# Tack the given train values onto the predictions.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent and stacks the rows in the same order.
sub = pd.concat([preds, non_na_y_train_materialHardship])
sub.sort_values(by='challengeID').head(15)

# sub.to_csv('simple_xgb.csv', index=False)

Here, we send in a submission that predicts all missing values: both the materialHardship (MH) values that are missing from the train set and the values for IDs that were never in the train set to begin with.