# XGBoost

In [11]:
import pandas as pd
import numpy as np

import gc; gc.enable()
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.figure

import pickle
import os
import os.path as path
import sys

In [35]:
# Load the pickled NA masks produced by an earlier preprocessing step.
# SECURITY NOTE: pickle.load can execute arbitrary code; only load files
# that were generated locally by a trusted pipeline.
# `with` guarantees the handles are closed even if unpickling fails.
with open('train_na.pkl', 'rb') as fh:
    train_na = pickle.load(fh)

# NOTE(review): test_na is loaded but not applied anywhere in the visible
# cells -- confirm whether the test frame is meant to stay unfiltered.
with open('test_na.pkl', 'rb') as fh:
    test_na = pickle.load(fh)

# The CSVs live in a `data` folder that sits next to the notebook's
# working directory (i.e. <parent of cwd>/data).
data_root = path.join(path.dirname(os.getcwd()), 'data')

# --- Training frame ---
data_dir = path.join(data_root, 'train_out.csv')
train_out = pd.read_csv(data_dir)
train_out.index = train_out.msno
# Drop the rows flagged in train_na.
# presumably train_na is a boolean mask aligned with the msno index -- verify
# against the notebook that wrote train_na.pkl.
train_out = train_out[~train_na]
train_X = train_out.drop(['msno', 'concated',
                          'is_churn'], axis = 1)
train_y = train_out.is_churn

# --- Test frame (kept in file order; msno is reused later for the submission) ---
data_dir = path.join(data_root, 'test_out.csv')
test_out = pd.read_csv(data_dir)
test_X = test_out.drop(['msno', 'concated',
                        'is_churn'], axis = 1)
test_y = test_out.is_churn

### Training

In [36]:
def xgb_score(preds, dtrain):
    """Custom evaluation metric handed to ``xgb.train`` via ``feval``.

    Parameters
    ----------
    preds : array-like
        Predictions for the rows of ``dtrain``.
    dtrain : object exposing ``get_label()``
        The matrix being evaluated; its labels are the ground truth.

    Returns
    -------
    tuple
        ``('log_loss', value)`` where ``value`` is ``log_loss(labels, preds)``.
    """
    y_true = dtrain.get_label()
    score = log_loss(y_true, preds)
    return 'log_loss', score

# Number of independently seeded train/validation splits; the test-set
# predictions of all splits are averaged below.
fold = 1

# The test matrix does not depend on the split, so build it once instead of
# re-creating it for every prediction call.
dtest = xgb.DMatrix(test_X)

pred = None  # running sum of per-split test predictions
for i in range(fold):
    params = {
        'eta': 0.02, #use 0.002
        'max_depth': 7,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'seed': i,
        'silent': True
    }
    # 70/30 split, seeded with the split number so each split differs
    # but stays reproducible.
    x1, x2, y1, y2 = train_test_split(train_X, train_y, test_size=0.3, random_state=i)

    # Build each DMatrix exactly once and share the training matrix between
    # the watchlist and xgb.train (it was previously constructed twice).
    dtrain = xgb.DMatrix(x1, y1)
    dvalid = xgb.DMatrix(x2, y2)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # 150 boosting rounds at most; training stops early when the metric on the
    # last watchlist entry ('valid') has not improved for 50 rounds.
    model = xgb.train(params, dtrain, 150, watchlist, feval=xgb_score, maximize=False, verbose_eval=50, early_stopping_rounds=50) #use 1500

    # Predict with the trees up to the best early-stopping iteration only.
    fold_pred = model.predict(dtest, ntree_limit=model.best_ntree_limit)
    pred = fold_pred if pred is None else pred + fold_pred

# Average over the splits.
pred /= fold
# Keep probabilities strictly inside (0, 1) so a log-loss score stays finite.
# NOTE(review): the bounds are asymmetric (1e-7 below, 1 - 1e-6 above) --
# confirm whether that is intentional.
pred = pred.clip(0.0000001, 0.999999)

[0]	train-log_loss:0.674897	valid-log_loss:0.67489
Multiple eval metrics have been passed: 'valid-log_loss' will be used for early stopping.

Will train until valid-log_loss hasn't improved in 50 rounds.
[50]	train-log_loss:0.237721	valid-log_loss:0.237788
[100]	train-log_loss:0.120564	valid-log_loss:0.120955


In [42]:
# Sanity check: average predicted churn probability over the test set.
pred.mean()

0.063030228

### Make Submission

In [43]:
# Member ids in the same row order as the predictions (test_out was not
# reordered or filtered after loading).
index = test_out.msno

# Fix the column order explicitly instead of relying on dict ordering.
xgb_submission = pd.DataFrame(
    {'msno': index, 'is_churn': pred},
    columns=['msno', 'is_churn'],
)

# Submissions go to <parent of cwd>/submissions; create the folder on first
# use so to_csv does not fail on a fresh checkout.
save_dir = path.join(path.dirname(os.getcwd()), 'submissions')
if not path.isdir(save_dir):
    os.makedirs(save_dir)

save_path = path.join(save_dir, 'xgb_submission.csv')
xgb_submission.to_csv(save_path, index = False)
print('Saved XGB Predictions to:      %s' % save_path)

Saved XGB Predictions to:      C:\Users\Michael\Documents\python\kkbox\submissions\xgb_submission.csv
