# Intro

As the title indicates, this ipynb aims to import dataframes (features plus labels) and then develop functions / clases to process data, fit models, and make predictions!

# Import data (just for materialHardship)

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import sys
import math
from sklearn.grid_search import GridSearchCV



In [2]:
mean_impute_df_train_with_labels = pd.read_pickle('mean_impute_df_train_with_labels')
median_impute_df_train_with_labels  = pd.read_pickle('median_impute_df_train_with_labels')

In [3]:
final_over_balanced =  pd.read_pickle('final_over_balanced_decimals')

In [4]:
df_prediction =  pd.read_pickle('df_prediction')

Read files from pickles...we are reading full dataframes, with columns of features attached to label columns. Function below is used to split features from class labels and make models and predictions.

# Train

## Prepare data

Split features and labels.

In [5]:
# split data into features and labels
def split_data(df, label='materialHardship'):
    copy = df.copy() # copy df so i dont alter original df by popping
    y = copy.pop(label) # pop label
    return copy, y

Return features and labels...feed split_data function a full data frame. This can be any from the above imports.

In [6]:
copy, mh = split_data(median_impute_df_train_with_labels)

Test_train split. I call the test data the "validation" data because I think it is a better, less ambiguous term.

In [7]:
x_train_med, x_valid_med, y_train_med, y_valid_med = train_test_split(copy, mh, test_size=0.2) #train test split

In [238]:
num_rd = [1,5]
i=1
paramms = {
        "eval_metric" : 'mae',
        'objective' : 'reg:logistic',
        'eta' : 1,
        'max_depth': 1
       }

In [241]:
d_train = xgb.DMatrix(x_train_med,y_train_med/11) # make training matrix
d_valid = xgb.DMatrix(x_valid_med,y_valid_med/11) # make valid matrix
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

xgb.train(params=paramms,  # train on train matrix with params
            dtrain=d_train,
            evals = watchlist,
         num_boost_round=11) 

[0]	train-mae:0.147203	valid-mae:0.144564
[1]	train-mae:0.114837	valid-mae:0.114989
[2]	train-mae:0.108244	valid-mae:0.110495
[3]	train-mae:0.105297	valid-mae:0.110216
[4]	train-mae:0.102058	valid-mae:0.108428
[5]	train-mae:0.100458	valid-mae:0.107735
[6]	train-mae:0.098985	valid-mae:0.108918
[7]	train-mae:0.097252	valid-mae:0.105368
[8]	train-mae:0.095077	valid-mae:0.105486
[9]	train-mae:0.09456	valid-mae:0.106338
[10]	train-mae:0.093475	valid-mae:0.10722


<xgboost.core.Booster at 0x111841fd0>

Create XGB Classifier class. This was inspired by [this](https://www.kaggle.com/tanitter/introducing-kaggle-scripts/grid-search-xgboost-with-scikit-learn/run/23363) Kaggle script. I don't have CV functionality of SKlearn working here, though. As a result, I just wrote my own way to iterate through options to choose best model.

This class is a wrapper to create a XGB object. Then, in sequence, you can a) fit a model, b) predict a model, and c) find the mean squared error score of your model on some validation / test data.

In [222]:
class XGBoostClassifier():
    def __init__(self, **params): 
        self.clf = None 
        self.predd = None
        self.params = params # params attribute set initially
 
    def fit(self, x_train, y_train, x_valid, y_valid):
        d_train = xgb.DMatrix(x_train,y_train) # make training matrix
        d_valid = xgb.DMatrix(x_valid,y_valid) # make valid matrix
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        self.clf = xgb.train(
                            params=self.params['params'],  # train on train matrix with params
                            dtrain=d_train,
                            evals = watchlist) 
        return self.clf
 
    def predict(self, x_test):
        dtest = xgb.DMatrix(x_test)
        predd = self.clf.predict(dtest)
        return self.predd

#     def score(self, x_valid, y_valid):
#         Y = self.clf.predict(xgb.DMatrix(x_valid))
#         return mean_squared_error(y_valid, Y)
    
    def get_feat_score(self):# print feature scores
        return self.clf.get_fscore()

In [223]:
num_rd = [1,5]
i=1
paramms = {
        "eval_metric" : 'rmse',
        'objective' : 'reg:logistic',
        'eta' : 1,
        'max_depth': 1,
        'num_boost_round' : num_rd[i]
       }

In [224]:
# xclass = XGBoostClassifier(params = paramms)
xclass = XGBoostClassifier(params = paramms)

In [230]:
paramms

{'eta': 1,
 'eval_metric': 'rmse',
 'max_depth': 1,
 'num_boost_round': 5,
 'objective': 'reg:logistic'}

In [226]:
xclass.params

{'params': {'eta': 1,
  'eval_metric': 'rmse',
  'max_depth': 1,
  'num_boost_round': 5,
  'objective': 'reg:logistic'}}

In [227]:
fitt = xclass.fit(x_train_med, 
                  y_train_med/11,
                  x_valid_med, 
                  y_valid_med/11)

[0]	train-rmse:0.165802	valid-rmse:0.165117
[1]	train-rmse:0.14707	valid-rmse:0.149667
[2]	train-rmse:0.143777	valid-rmse:0.145141
[3]	train-rmse:0.141702	valid-rmse:0.145921
[4]	train-rmse:0.140933	valid-rmse:0.145243
[5]	train-rmse:0.140124	valid-rmse:0.144267
[6]	train-rmse:0.138483	valid-rmse:0.146228
[7]	train-rmse:0.136432	valid-rmse:0.141445
[8]	train-rmse:0.134645	valid-rmse:0.142783
[9]	train-rmse:0.13329	valid-rmse:0.142738


In [172]:
fitt

<xgboost.core.Booster at 0x114aea7f0>

In [173]:
fitt.predict(xgb.DMatrix(df_prediction_good_cols))

array([ 0.3254022 ,  0.18412694,  0.21786654, ...,  0.24428976,
        0.29976332,  0.08016646], dtype=float32)

In [174]:
d = xclass.get_feat_score()

def get_feature_df(d):
    '''given xgboost feature score object
    put into pretty df'''
    feature = []
    f_importance = []
    for w in sorted(d, key=d.get, reverse=True):
        feature.append(w)
        f_importance.append(d[w])

    feature_import_df = pd.DataFrame(
        {'feature': feature,
         'feature importance': f_importance,
        })
    return feature_import_df

In [175]:
get_feature_df(d).head()

Unnamed: 0,feature,feature importance
0,challengeID,47
1,m1citywt,21
2,m1lenmin,11
3,cm1bsex,4
4,m1a15,4


In [176]:
df_prediction_good_cols = df_prediction[x_train_med.columns.values] # keep the columns that are in training data...

In [177]:
predy = fitt.predict(xgb.DMatrix(df_prediction_good_cols))

In [178]:
train_preds = fitt.predict(xgb.DMatrix(x_train_med))

In [179]:
predy

array([ 0.3254022 ,  0.18412694,  0.21786654, ...,  0.24428976,
        0.29976332,  0.08016646], dtype=float32)

In [181]:
train_preds

array([ 0.07566336,  0.04096115,  0.41326064, ...,  0.03218818,
        0.27110511,  0.04082593], dtype=float32)

In [149]:
# errs = xclass.score(x_train_med, y_train_med/11)

In [150]:
# errs

0.0027180740026536234

In [182]:
mean_squared_error(y_train_med, train_preds)

3.7004435820937411

# Grid search cv?

In [79]:
num_rd = [1,20]
errors = []
for i in range(len(depths)):
    paramms = {
        "eval_metric" : 'rmse',
        'objective' : 'reg:logistic',
        'eta' : 0.02,
        'max_depth': 20,
        'num_boost_round' : num_rd[i]
       }
    xclass = XGBoostClassifier(params= paramms)
    fitt = xclass.fit(x_train_med, 
                  y_train_med/11,
                  x_valid_med, 
                  y_valid_med/11)
    predy = xclass.predict(df_prediction_good_cols)
    errs = xclass.score(x_train_med, y_train_med/11)
    errors.append(errs)

In [81]:
errors

[0.0027180740026536234, 0.0027180740026536234]

In [None]:
paramms = {"eval_metric" : 'rmse',
        'num_class' : 2,
        'eta' : 0.1,
        'num_boost_round' : 8,
        'max_depth' : 2,
        'subsample' : 0.5,
        'colsample_bytree' : 1.0,
        'objective': 'reg:logistic',
        'evals' :[ (xgb.DMatrix(x_train_med), 'train'),  (xgb.DMatrix(x_valid_med), 'valid')],
        'verbose_eval':True}

# focus here!!!!!!!!!!

## overfitting b/c valid error > train error

In [36]:
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# Build a classification task using 3 informative features
# X, y = make_classification(n_samples=1000,
#                            n_features=10,
#                            n_informative=3,
#                            n_redundant=0,
#                            n_repeated=0,
#                            n_classes=2,
#                            random_state=0,
#                            shuffle=False)


# rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 

param_grid = { 
    'num_boost_round': [10, 25, 50]
}
# d_train = xgb.DMatrix(x_train_med,y_train_med)

CV_rfc = GridSearchCV(xclass, param_grid=param_grid, cv= 2)
CV_rfc.fit(x_train_med,y_train_med/11)
# print(CV_rfc.best_params_)

TypeError: fit() takes from 2 to 3 positional arguments but 5 were given

In [37]:
clf = xgb.XGBClassifier(n_estimators = 100, objective= 'binary:logistic',)
clf.fit(x_train_med,y_train_med/11,  early_stopping_rounds=10, eval_metric="rmse", 
    eval_set=[(x_valid_med, y_valid_med/11)])

XGBoostError: b'[08:17:43] src/metric/elementwise_metric.cc:28: Check failed: preds.size() == info.labels.size() (2920 vs. 292) label and prediction size not match, hint: use merror or mlogloss for multi-class classification\n\nStack trace returned 6 entries:\n[bt] (0) 0   libxgboost.so                       0x000000010f2811d5 _ZN4dmlc15LogMessageFatalD2Ev + 37\n[bt] (1) 1   libxgboost.so                       0x000000010f2c8f73 _ZNK7xgboost6metric13EvalEWiseBaseINS0_8EvalRMSEEE4EvalERKNSt3__16vectorIfNS4_9allocatorIfEEEERKNS_8MetaInfoEb + 995\n[bt] (2) 2   libxgboost.so                       0x000000010f27e319 _ZN7xgboost11LearnerImpl11EvalOneIterEiRKNSt3__16vectorIPNS_7DMatrixENS1_9allocatorIS4_EEEERKNS2_INS1_12basic_stringIcNS1_11char_traitsIcEENS5_IcEEEENS5_ISE_EEEE + 889\n[bt] (3) 3   libxgboost.so                       0x000000010f293884 XGBoosterEvalOneIter + 612\n[bt] (4) 4   _ctypes.so                          0x0000000103918127 ffi_call_unix64 + 79\n[bt] (5) 5   python                              0x00007fff5bffbe10 __progname + 140730441903472\n'

In [185]:
main()

TypeError: __init__() got an unexpected keyword argument 'params'

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x12007f7f0>>
Traceback (most recent call last):
  File "/Users/wnowak/Downloads/xgboost-0.6a2/xgboost/xgboost/python-package/xgboost/core.py", line 337, in __del__
    _check_call(_LIB.XGDMatrixFree(self.handle))
AttributeError: 'DMatrix' object has no attribute 'handle'
Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x1200570f0>>
Traceback (most recent call last):
  File "/Users/wnowak/Downloads/xgboost-0.6a2/xgboost/xgboost/python-package/xgboost/core.py", line 337, in __del__
    _check_call(_LIB.XGDMatrixFree(self.handle))
AttributeError: 'DMatrix' object has no attribute 'handle'


In [170]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
 
import sys
import math
 
import numpy as np
from sklearn.grid_search import GridSearchCV
 
# sys.path.append('xgboost/wrapper/')
import xgboost as xgb
 
 
class XGBoostClassifier():
    def __init__(self, num_boost_round=10, **params):
        self.clf = None
        self.num_boost_round = num_boost_round
        self.params = params
        self.params.update({'objective': 'multi:softprob'})
 
    def fit(self, X, y, num_boost_round=None):
        num_boost_round = num_boost_round or self.num_boost_round
        self.label2num = dict((label, i) for i, label in enumerate(sorted(set(y))))
        dtrain = xgb.DMatrix(X, label=[self.label2num[label] for label in y])
        self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round)
 
    def predict(self, X):
        num2label = dict((i, label)for label, i in self.label2num.items())
        Y = self.predict_proba(X)
        y = np.argmax(Y, axis=1)
        return np.array([num2label[i] for i in y])
 
    def predict_proba(self, X):
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
 
    def score(self, X, y):
        Y = self.predict_proba(X)
        return 1 / logloss(y, Y)
 
    def get_params(self, deep=True):
        return self.params
 
    def set_params(self, **params):
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        if 'objective' in params:
            del params['objective']
        self.params.update(params)
        return self
    
    
def logloss(y_true, Y_pred):
    label2num = dict((name, i) for i, name in enumerate(sorted(set(y_true))))
    return -1 * sum(math.log(y[label2num[label]]) if y[label2num[label]] > 0 else -np.inf for y, label in zip(Y_pred, y_true)) / len(Y_pred)


def main():
    clf = XGBoostClassifier(
        eval_metric = 'auc',
        num_class = 2,
        nthread = 4,
        eta = 0.1,
        num_boost_round = 80,
        max_depth = 12,
        subsample = 0.5,
        colsample_bytree = 1.0,
        silent = 1,
        )
    parameters = {
        'num_boost_round': [100, 250, 500],
        'eta': [0.05, 0.1, 0.3],
        'max_depth': [6, 9, 12],
        'subsample': [0.9, 1.0],
        'colsample_bytree': [0.9, 1.0],
    }
    clf = GridSearchCV(clf, parameters, n_jobs=1, cv=2)
    
    clf.fit([[1,2], [3,4], [2,1], [4,3]], ['a', 'b', 'a', 'b'])
    best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
    print(score)
    for param_name in sorted(best_parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
                
    print(clf.predict([[1,2]]))


if __name__ == '__main__':
    main()


1.4426950408889634
colsample_bytree: 0.9
eta: 0.05
max_depth: 6
num_boost_round: 100
subsample: 0.9
['b']


# Create Submission file

In [129]:
# Need to add train labels to submission too, I think
print('create submission')
preds = pd.DataFrame()
preds['challengeID'] = df_prediction['challengeID']
preds['materialHardship'] = p_test
preds.sort_values(by='challengeID').head(10)

create submission


Unnamed: 0,challengeID,materialHardship
0,1,0.228553
1,2,0.208423
2,3,0.192934
3,4,0.217176
4,5,0.100321
7,8,0.186167
10,11,0.210725
11,12,0.084569
14,15,0.121868
16,17,0.103562


In [131]:
# tack on given train values
sub = preds.append(non_na_y_train_materialHardship)
sub.sort_values(by='challengeID').head(15)

sub.to_csv('simple_xgb.csv', index=False)

Here, we send in a submission that predicts all missing values (both missing MH values from train set and also values for IDs that were never in train to start). 