# Model Building

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
from scipy.stats import skew, boxcox
from scipy import sparse
from sklearn.model_selection import KFold,GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics, preprocessing
import time
import matplotlib.pyplot as plt
%matplotlib inline

## Load data

In [2]:
df_ori = pd.read_csv('../data/input_proc2.csv')
df_catcomb = pd.read_csv('../data/input_cat_comb1.csv')

In [4]:
train = pd.read_csv('../ProjectFiles/File2.csv')
train_size = train.shape[0]
df_ori.SPENDINGRESPONSE[train_size:].isnull().sum()

9231

In [5]:
df = pd.concat([df_ori,df_catcomb], axis=1)
df = df.loc[:,~df.columns.duplicated()]
df.shape

(29231, 4332)

In [6]:
print(df.columns.values)

['ID' 'f3_A' 'f3_B' ... 'f119_f120_US' 'f119_f120_UX' 'f119_f120_nan']


In [7]:
df_test = df.drop('ID', axis=1).values
train_x = df[:train_size].drop(['ID','SPENDINGRESPONSE'], axis=1).values
train_y = df[:train_size].SPENDINGRESPONSE.values

## Modeling: XGBoost

Use the same paramters from previous test to build the model. 

In [65]:
dtrain = xgb.DMatrix(train_x, label=train_y, missing = np.NAN)

In [64]:
%%time
scores = []
for max_depth in [5, 6, 7, 8, 9, 10]:

    params = dict()
    params['objective'] = 'binary:logistic'
    params['eta'] = 0.1
    params['max_depth'] = max_depth
    params['min_child_weight'] = 1
    params['colsample_bytree'] = 1
    params['subsample'] = 1
    params['gamma'] = 0
    params['seed'] = 1234
    #params['eval_metric'] = 'error'
    cv_results = xgb.cv(params, dtrain,
                    num_boost_round=10000,
                    nfold=5,
                    maximize=True, 
                    stratified=True,
                    shuffle=True,
                    verbose_eval=500,
                    seed=1234,
                    early_stopping_rounds=50)

    best_score = cv_results['test-error-mean'].min()
    best_iteration = len(cv_results)           
    print(max_depth, best_score, best_iteration)
    scores.append([best_score, params['eta'], params['max_depth'], params['min_child_weight'],
                   params['colsample_bytree'], params['subsample'], params['gamma'], best_iteration])

scores = pd.DataFrame(scores, columns=['score', 'eta', 'max_depth', 'min_child_weight',
                                       'colsample_bytree', 'subsample', 'gamma', 'best_iteration'])
best_max_depth = scores.sort_values(by='score', ascending=True)['max_depth'].values[0]
print('Best max_depth is', best_max_depth)   

[0]	train-error:0.303837+0.00144003	test-error:0.3119+0.00203429
5 0.3119002 1
[0]	train-error:0.301037+0.00216212	test-error:0.31215+0.00239473
6 0.31215020000000004 2
[0]	train-error:0.298712+0.00283042	test-error:0.31305+0.00254858
7 0.31305020000000006 2
[0]	train-error:0.295213+0.00387493	test-error:0.316+0.00231814
8 0.3160002 2
[0]	train-error:0.291388+0.00519291	test-error:0.31825+0.00467022
9 0.3182501999999999 2
[0]	train-error:0.2855+0.00803079	test-error:0.32035+0.00534301
10 0.32035 2
Best max_depth is 5
CPU times: user 1h 37min 58s, sys: 1min 36s, total: 1h 39min 34s
Wall time: 27min 21s


In [78]:
xgb_params = dict()
xgb_params['colsample_bytree'] = 0.5556186387129007
xgb_params['gamma'] = 1.9426003754806411
xgb_params['max_depth'] = 3
xgb_params['min_child_weight'] = 80
xgb_params['subsample'] = 0.2678974963534382
xgb_params['objective'] = 'binary:logistic'
xgb_params['eta'] = 0.01  # Smaller
xgb_params['max_depth'] = int(xgb_params['max_depth'])
xgb_params['min_child_weight'] = int(xgb_params['min_child_weight'])
xgb_params['subsample'] = xgb_params['subsample']
xgb_params['colsample_bytree'] = xgb_params['colsample_bytree']
xgb_params['gamma'] = xgb_params['gamma']
xgb_params['seed'] = 1234

cv_results = xgb.cv(params, xgb.DMatrix(train_x, label=train_y.reshape(train_x.shape[0], 1), missing=np.NAN),
                    num_boost_round=10000,
                    nfold=5,
                    maximize=True,
                    stratified=True,
                    shuffle=True,
                    seed=1234,
                    early_stopping_rounds=50)

best_iteration = len(cv_results)
best_score = cv_results['test-error-mean'].min()
print(best_score, best_iteration)

0.30845 1


In [79]:
model = xgb.train(xgb_params,
                  xgb.DMatrix(train_x, label=train_y.reshape(train_x.shape[0], 1), missing=np.NAN),
                  num_boost_round=best_iteration
                  )

df_test = df.drop(['ID','SPENDINGRESPONSE'], axis=1).values
preds = model.predict(xgb.DMatrix(df_test))

In [80]:
sub_df = pd.DataFrame({'ID': df.ID, 'Probability': preds})
sub_df.to_csv("../data/xgb_catcomb_prob_opt1.csv", index=False)

In [81]:
sub_df.shape

(29231, 2)