# Intro

As the title indicates, this ipynb aims to import dataframes (features plus labels) and then develop functions / clases to process data, fit models, and make predictions!

# Import data (just for materialHardship)

In [27]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import sys
import math
from sklearn.grid_search import GridSearchCV

In [12]:
mean_impute_df_train_with_labels = pd.read_pickle('mean_impute_df_train_with_labels')
median_impute_df_train_with_labels  = pd.read_pickle('median_impute_df_train_with_labels')

In [4]:
final_over_balanced =  pd.read_pickle('final_over_balanced_decimals')

In [20]:
df_prediction =  pd.read_pickle('df_prediction')

Read files from pickles...we are reading full dataframes, with columns of features attached to label columns. Function below is used to split features from class labels and make models and predictions.

# Train

## Prepare data

Split features and labels.

In [28]:
# split data into features and labels
def split_data(df, label='materialHardship'):
    copy = df.copy() # copy df so i dont alter original df by popping
    y = copy.pop(label) # pop label
    return copy, y

Return features and labels...feed split_data function a full data frame. This can be any from the above imports.

In [55]:
copy, mh = split_data(median_impute_df_train_with_labels)

Test_train split. I call the test data the "validation" data because I think it is a better, less ambiguous term.

In [39]:
x_train_med, x_valid_med, y_train_med, y_valid_med = train_test_split(copy, mh, test_size=0.2) #train test split

Create XGB Classifier class. This was inspiring by [this](https://www.kaggle.com/tanitter/introducing-kaggle-scripts/grid-search-xgboost-with-scikit-learn/run/23363) Kaggle script. I don't have CV functionality of SKlearn working here, though.

This class is a wrapper to create a XGB object. Then, in sequence, you can a) fit a model, b) predict a model, and c) find the mean squared error score of your model on some validation / test data.

In [157]:
class XGBoostClassifier():
    def __init__(self, num_boost_round=10, **params):
        self.clf = None
        self.num_boost_round = num_boost_round
        self.params = params
#         self.params.update({'objective': 'reg:logistic'})
 
    def fit(self, x_train, y_train, x_valid, y_valid):
        d_train = xgb.DMatrix(x_train,y_train)
        d_valid = xgb.DMatrix(x_valid,y_valid)
        self.clf = xgb.train(params=self.params, 
                             dtrain=d_train, 
                             num_boost_round=self.num_boost_round) # watchlist=watchlist
 
    def predict(self, x_test):
        dtest = xgb.DMatrix(x_test)
        predd = self.clf.predict(dtest)
        return predd

    def score(self, x_valid, y_valid):
        Y = self.clf.predict(xgb.DMatrix(x_valid))
        return mean_squared_error(y_valid, Y)
    
    def get_params(self, deep=True):
        return self.params
 
    def set_params(self, **params):
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        if 'objective' in params:
            del params['objective']
        self.params.update(params)
        return self
    
    
    def get_feat_score(self):# print feature scores
        return self.clf.get_fscore()
#         return self.d
#         feature = []
#         f_importance = []
#         for w in sorted(d, key=d.get, reverse=True):
#             feature.append(w)
#             f_importance.append(d[w])
#         feature_import_df = pd.DataFrame(
#         {'feature': feature,
#          'feature importance': f_importance,
#         })
#         return self.feature_import_df

In [158]:
paramms = {"eval_metric" : 'rmse',
        'num_class' : 2,
        'nthread' : 4,
        'eta' : 0.1,
        'num_boost_round' : 8,
        'max_depth' : 2,
        'subsample' : 0.5,
        'colsample_bytree' : 1.0,
        'objective': 'reg:logistic',
        'evals' :[ (xgb.DMatrix(x_train_med), 'train'),  (xgb.DMatrix(x_valid_med), 'valid')],
        'verbose_eval':True}

In [159]:
xclass = XGBoostClassifier(params = paramms)

In [160]:
xclass.params

{'params': {'colsample_bytree': 1.0,
  'eta': 0.1,
  'eval_metric': 'rmse',
  'evals': [(<xgboost.core.DMatrix at 0x130a4c748>, 'train'),
   (<xgboost.core.DMatrix at 0x130a4ceb8>, 'valid')],
  'max_depth': 2,
  'nthread': 4,
  'num_boost_round': 8,
  'num_class': 2,
  'objective': 'reg:logistic',
  'subsample': 0.5,
  'verbose_eval': True}}

In [161]:
fitt = xclass.fit(x_train_med, 
                  y_train_med/11,
                  x_valid_med, 
                  y_valid_med/11)

In [163]:
d = xclass.get_feat_score()

def get_feature_df(d):
    '''given xgboost feature score object
    put into pretty df'''
    feature = []
    f_importance = []
    for w in sorted(d, key=d.get, reverse=True):
        feature.append(w)
        f_importance.append(d[w])

    feature_import_df = pd.DataFrame(
        {'feature': feature,
         'feature importance': f_importance,
        })
    return feature_import_df

In [164]:
get_feature_df(d)

Unnamed: 0,feature,feature importance
0,challengeID,33
1,m1lenmin,13
2,m1citywt,11
3,m1lenhr,7
4,m1a8,4
5,innatsm,4
6,m5f23l,4
7,m1e1d2,3
8,m1f7,3
9,m1d2f,2


In [111]:
df_prediction_good_cols = df_prediction[x_train_med.columns.values] # keep the columns that are in training data...

In [112]:
predy = xclass.predict(df_prediction_good_cols)

In [113]:
errs = xclass.score(x_train_med, y_train_med/11)

In [114]:
errs

0.0028335675516232978

# Create Submission file

In [129]:
# Need to add train labels to submission too, I think
print('create submission')
preds = pd.DataFrame()
preds['challengeID'] = df_prediction['challengeID']
preds['materialHardship'] = p_test
preds.sort_values(by='challengeID').head(10)

create submission


Unnamed: 0,challengeID,materialHardship
0,1,0.228553
1,2,0.208423
2,3,0.192934
3,4,0.217176
4,5,0.100321
7,8,0.186167
10,11,0.210725
11,12,0.084569
14,15,0.121868
16,17,0.103562


In [131]:
# tack on given train values
sub = preds.append(non_na_y_train_materialHardship)
sub.sort_values(by='challengeID').head(15)

sub.to_csv('simple_xgb.csv', index=False)

Here, we send in a submission that predicts all missing values (both missing MH values from train set and also values for IDs that were never in train to start). 