# Gradient Boosting Classifier

In [47]:
import pandas as pd 
import numpy as np


from sklearn.model_selection import GridSearchCV
import sklearn.metrics
from sklearn.metrics import log_loss
import gc; gc.enable()
import xgboost as xgb
from sklearn.model_selection import train_test_split

import matplotlib
import matplotlib.pyplot as plt

import pickle
import os
import os.path as path
import sys

In [31]:
# Data-loading cell: resolves project paths, restores pickled artefacts and
# builds the train/test feature matrices used by every later cell.

# The notebook is expected to run from a sub-directory of the project; the
# 'data' and 'models' folders are looked up one level above the working dir.
project_root = path.dirname(os.getcwd())
save_dir = path.join(project_root, 'models')
data_dir = path.join(project_root, 'data', 'train_out.csv')


def _load_pickle(filename):
    """Return the object stored in the pickle file `filename`.

    The file is opened in a `with` block so the handle is released even if
    unpickling raises.

    NOTE(review): pickle can execute arbitrary code while loading; only use
    this on files produced by this project.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


ranked_features = _load_pickle('best_features.pkl')
train_na = _load_pickle('train_na.pkl')
test_na = _load_pickle('test_na.pkl')

#ranked_features = ['expiration_date',
#                   '201702',
#                  'membership_expire_date',
#                  'consecutive_ones',
#                  'consecutive_zeros',
#                  'registered_via_7.0',
#                  'is_cancel',
#                  'is_auto_renew']

# --- training set ---------------------------------------------------------
train_out = pd.read_csv(data_dir)
train_out.index = train_out.msno
# Mean churn label over the rows selected by `train_na`.
# NOTE(review): presumably `train_na` is a boolean mask aligned with the msno
# index (the `~` negation below requires that) -- confirm against the notebook
# that wrote train_na.pkl.
avg = np.mean(train_out.loc[train_na, 'is_churn'])
# Keep only the rows NOT selected by `train_na` for model fitting.
train_out = train_out[~train_na]
train_X = train_out.drop(['msno', 'concated', 'is_churn'], axis = 1)
#train_X = train_X[ranked_features]
train_y = train_out.is_churn

# --- test set -------------------------------------------------------------
# `data_dir` is re-pointed at the test CSV; it holds a file path, not a folder.
data_dir = path.join(project_root, 'data', 'test_out.csv')
test_out = pd.read_csv(data_dir)
test_out.index = test_out.msno
test_X = test_out.drop(['msno', 'concated', 'is_churn'], axis = 1)
#test_X = test_X[ranked_features]
test_y = test_out.is_churn

In [70]:
# Hand-written feature shortlist. When this cell runs after the data-loading
# cell it replaces the `ranked_features` list restored from best_features.pkl.
# The feature-selection lines that would consume it (`train_X[ranked_features]`)
# are commented out in the data-loading cell.
ranked_features = [
    'expiration_date',
    '201702',
    'membership_expire_date',
    'consecutive_ones',
    'consecutive_zeros',
    'registered_via_7.0',
    'is_cancel',
    'is_auto_renew',
]

['expiration_date',
 '201702',
 'is_auto_renew',
 'membership_expire_date',
 '201509',
 'city_12.0',
 'city_14.0',
 'city_18.0',
 '201502',
 'city_10.0',
 'registered_via_13.0',
 'consecutive_ones',
 'is_cancel',
 'registered_via_7.0',
 '201505',
 'city_21.0',
 'city_6.0',
 'num_unq',
 '201511']

### Training the Model

In [32]:
def xgb_score(preds, dtrain):
    """Custom XGBoost evaluation function that reports binary log loss.

    Parameters
    ----------
    preds : array-like
        Predictions supplied by the caller for the rows of `dtrain`.
    dtrain : object exposing ``get_label()``
        The evaluated data set (an ``xgboost.DMatrix`` when this function is
        passed as ``feval``); only its labels are read.

    Returns
    -------
    tuple
        ``('log_loss', value)`` -- the metric name followed by its score.
    """
    labels = dtrain.get_label()
    # Declare both classes explicitly: without `labels`, sklearn's log_loss
    # raises ValueError when the evaluated labels happen to contain only one
    # class (possible for a small or unlucky validation split).
    return 'log_loss', log_loss(labels, preds, labels=[0, 1])

# Train `fold` boosters on different random 70/30 splits of the training data
# and average their predictions on the test set.
fold = 1

# The test matrix does not depend on the split, so it is built once.
dtest = xgb.DMatrix(test_X)
pred = None  # running sum of per-fold test predictions

for i in range(fold):

    params = {
        'eta': 0.02, #use 0.002
        'max_depth': 7,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'seed': i,
        'silent': True
    }

    # The fold number doubles as the seed for both the split and the booster.
    x1, x2, y1, y2 = train_test_split(train_X, train_y, test_size=0.3, random_state=i)

    # Build each DMatrix once and share it between the watchlist and the
    # trainer instead of constructing the training matrix twice.
    dtrain = xgb.DMatrix(x1, y1)
    dvalid = xgb.DMatrix(x2, y2)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # At most 150 boosting rounds; stop early after 50 rounds without
    # improvement (maximize=False: lower score is better).
    model = xgb.train(params, dtrain, 150, watchlist,
                      feval=xgb_score, maximize=False, verbose_eval=50, early_stopping_rounds=50)

    # Predict with the trees up to the best iteration found by early stopping.
    fold_pred = model.predict(dtest, ntree_limit=model.best_ntree_limit)
    pred = fold_pred if pred is None else pred + fold_pred

# Average over folds, then keep predictions strictly inside (0, 1).
# NOTE(review): the clip bounds are not symmetric (1e-7 vs 1 - 1e-6); kept as-is.
pred /= fold
pred = pred.clip(0.0000001, 0.999999)

[0]	train-log_loss:0.674897	valid-log_loss:0.67489
Multiple eval metrics have been passed: 'valid-log_loss' will be used for early stopping.

Will train until valid-log_loss hasn't improved in 50 rounds.
[50]	train-log_loss:0.237721	valid-log_loss:0.237788
[100]	train-log_loss:0.120564	valid-log_loss:0.120955


In [33]:
# Sanity check: average of the predictions over the whole test set.
pred.mean()

0.063030228

### Creating Submission File

In [34]:
# Preview the first five rows of the test feature matrix.
test_X.head(n=5)

Unnamed: 0_level_0,201501,201502,201503,201504,201505,201506,201507,201508,201509,201510,...,city_999.0,gender_NA,gender_female,gender_male,registered_via_3.0,registered_via_4.0,registered_via_7.0,registered_via_9.0,registered_via_13.0,registered_via_999.0
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,-1.0,-1.0,1.0,0.0,-1.0,1.0,1.0,1.0,-1.0,-1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1.0,1.0,1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [35]:
# Build the submission table: one row per test member id (msno, taken from the
# index of `test_out`) with its prediction from the training cell.
xgb_submission = pd.DataFrame(
    {
        'msno': test_out.index,
        'is_churn': pred
    },
    # Pin the column order explicitly; without it the order depends on how the
    # running Python/pandas version orders dict keys.
    columns=['msno', 'is_churn'],
)

save_dir = path.join(path.dirname(os.getcwd()), 'submissions')
# Create the output folder on first use so to_csv does not fail on a fresh checkout.
if not path.isdir(save_dir):
    os.makedirs(save_dir)

submission_path = path.join(save_dir, 'xgb_submission.csv')
xgb_submission.to_csv(submission_path, index = False)
print('Saved XGB Predictions to:      %s' % submission_path)

Saved XGB Predictions to:      C:\Users\Michael\Documents\python\kkbox\submissions\xgb_submission.csv


# Second Fold Prediction

In [36]:
# Slide part of the test feature matrix one column to the left and store the
# first-pass predictions in column position 25.
#
# `.iloc` replaces the removed `.ix` indexer; all indices here are positions.
# The right-hand side is taken as a plain NumPy copy: assigning a DataFrame can
# make pandas re-align the values by column label, which would defeat a
# positional shift.
#
# NOTE(review): positions 2..24 are copied into 1..23, position 24 keeps its
# old value and position 25 is overwritten with `pred` -- confirm this matches
# the intended column layout.
# NOTE(review): this cell mutates `test_X` in place, so running it twice shifts
# the columns twice.
temp = test_X.iloc[:, 2:25].values.copy()
test_X.iloc[:, 1:24] = temp
test_X.iloc[:, 25] = pred

In [37]:
# Round the values held in column position 25 to the nearest integer.
# `.iloc` replaces the removed `.ix` indexer (the index is positional).
test_X.iloc[:, 25] = np.round(test_X.iloc[:, 25])

In [38]:
# NOTE(review): a function named `xgb_score` is also defined in the first
# training cell; this later definition shadows it. Consider keeping one copy.
def xgb_score(preds, dtrain):
    """Custom XGBoost evaluation function that reports binary log loss.

    Parameters
    ----------
    preds : array-like
        Predictions supplied by the caller for the rows of `dtrain`.
    dtrain : object exposing ``get_label()``
        The evaluated data set (an ``xgboost.DMatrix`` when this function is
        passed as ``feval``); only its labels are read.

    Returns
    -------
    tuple
        ``('log_loss', value)`` -- the metric name followed by its score.
    """
    labels = dtrain.get_label()
    # Declare both classes explicitly: without `labels`, sklearn's log_loss
    # raises ValueError when the evaluated labels happen to contain only one
    # class (possible for a small or unlucky validation split).
    return 'log_loss', log_loss(labels, preds, labels=[0, 1])

# Second pass: train again on `train_X`/`train_y` and predict on `test_X`,
# which the preceding cells have modified in place.
# NOTE(review): the training inputs, parameters and seeds are the same as in
# the first training cell, so this appears to re-fit an equivalent model;
# reusing the earlier `model` may be enough -- confirm before changing.
fold = 1

# The test matrix does not depend on the split, so it is built once.
dtest = xgb.DMatrix(test_X)
pred = None  # running sum of per-fold test predictions

for i in range(fold):

    params = {
        'eta': 0.02, #use 0.002
        'max_depth': 7,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'seed': i,
        'silent': True
    }

    # The fold number doubles as the seed for both the split and the booster.
    x1, x2, y1, y2 = train_test_split(train_X, train_y, test_size=0.3, random_state=i)

    # Build each DMatrix once and share it between the watchlist and the
    # trainer instead of constructing the training matrix twice.
    dtrain = xgb.DMatrix(x1, y1)
    dvalid = xgb.DMatrix(x2, y2)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # At most 150 boosting rounds; stop early after 50 rounds without
    # improvement (maximize=False: lower score is better).
    model = xgb.train(params, dtrain, 150, watchlist,
                      feval=xgb_score, maximize=False, verbose_eval=50, early_stopping_rounds=50)

    # Predict with the trees up to the best iteration found by early stopping.
    fold_pred = model.predict(dtest, ntree_limit=model.best_ntree_limit)
    pred = fold_pred if pred is None else pred + fold_pred

# Average over folds, then keep predictions strictly inside (0, 1).
# NOTE(review): the clip bounds are not symmetric (1e-7 vs 1 - 1e-6); kept as-is.
pred /= fold
pred = pred.clip(0.0000001, 0.999999)

[0]	train-log_loss:0.674897	valid-log_loss:0.67489
Multiple eval metrics have been passed: 'valid-log_loss' will be used for early stopping.

Will train until valid-log_loss hasn't improved in 50 rounds.
[50]	train-log_loss:0.237721	valid-log_loss:0.237788
[100]	train-log_loss:0.120564	valid-log_loss:0.120955


In [39]:
# Build the second submission table: one row per test member id (msno, taken
# from the index of `test_out`) with its prediction from the second pass.
xgb2_submission = pd.DataFrame(
    {
        'msno': test_out.index,
        'is_churn': pred
    },
    # Pin the column order explicitly; without it the order depends on how the
    # running Python/pandas version orders dict keys.
    columns=['msno', 'is_churn'],
)

save_dir = path.join(path.dirname(os.getcwd()), 'submissions')
# Create the output folder on first use so to_csv does not fail on a fresh checkout.
if not path.isdir(save_dir):
    os.makedirs(save_dir)

submission_path = path.join(save_dir, 'xgb2_submission.csv')
xgb2_submission.to_csv(submission_path, index = False)
print('Saved XGB2 Predictions to:      %s' % submission_path)

Saved XGB2 Predictions to:      C:\Users\Michael\Documents\python\kkbox\submissions\xgb2_submission.csv


In [40]:
# Sanity check: average of the second-pass predictions over the test set.
pred.mean()

0.090402849

In [53]:
# Draw the feature-importance chart for the `model` left by the last training
# cell; the returned axes are kept in a name so no repr is echoed.
importance_ax = xgb.plot_importance(model)
plt.show()

AttributeError: Unknown property max_num_features