# Gradient Boosting Classifier

In [9]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import GridSearchCV
import sklearn.metrics
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier


import pickle
import os
import os.path as path
import sys

In [2]:
# All locations are resolved relative to the parent of the current working directory.
save_dir = path.join(path.dirname(os.getcwd()), 'models')
data_dir = path.join(path.dirname(os.getcwd()), 'data', 'train_out.csv')

# NOTE(review): pickle.load can execute arbitrary code -- only load pickles
# that were produced by this project.
# `with` guarantees each handle is closed even if unpickling raises.
with open('best_features.pkl', 'rb') as read:
    ranked_features = pickle.load(read)

# train_na / test_na are used below as boolean row masks (`~mask`, `.loc[mask]`);
# presumably boolean Series indexed by msno -- TODO confirm against the notebook that wrote them.
with open('train_na.pkl', 'rb') as read:
    train_na = pickle.load(read)

with open('test_na.pkl', 'rb') as read:
    test_na = pickle.load(read)

# Alternative hand-written feature list, kept for reference (disabled):
#ranked_features = ['expiration_date',
#                   '201702',
#                  'membership_expire_date',
#                  'consecutive_ones',
#                  'consecutive_zeros',
#                  'registered_via_7.0',
#                  'is_cancel',
#                  'is_auto_renew']

# --- Training set ---
train_out = pd.read_csv(data_dir)
train_out.index = train_out.msno
# Mean is_churn over the rows flagged by train_na; computed before those rows are dropped.
avg = np.mean(train_out.loc[train_na, 'is_churn'])
# Keep only the rows that are NOT flagged by train_na for model fitting.
train_out = train_out[~train_na]
train_X = train_out.drop(['msno', 'concated', 'is_churn'], axis = 1)
# Restrict (and order) the columns to the ranked feature list.
train_X = train_X[ranked_features]
train_y = train_out.is_churn

# --- Test set --- (data_dir is re-pointed at the test CSV)
data_dir = path.join(path.dirname(os.getcwd()), 'data', 'test_out.csv')
test_out = pd.read_csv(data_dir)
test_out.index = test_out.msno
test_X = test_out.drop(['msno', 'concated', 'is_churn'], axis = 1)
test_X = test_X[ranked_features]
test_y = test_out.is_churn

In [7]:
# Display the feature list loaded from best_features.pkl; these are the columns kept in train_X / test_X.
ranked_features

['expiration_date',
 '201702',
 'is_auto_renew',
 '201506',
 'membership_expire_date',
 'payment_plan_days',
 '201607',
 '201606']

### Training the Model

In [3]:
# Hyper-parameters for the boosted ensemble (random_state is pinned to 1).
gbc_params = {
    'n_estimators': 150,
    'max_depth': 4,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 1,
    'verbose': 1,
}
gbc = GradientBoostingClassifier(**gbc_params).fit(train_X, train_y)

# Class probabilities for the same rows the model was fitted on (in-sample).
prob = gbc.predict_proba(train_X)

      Iter       Train Loss   Remaining Time 
         1           0.3683            2.56m
         2           0.3296            2.55m
         3           0.3026            2.50m
         4           0.2816            2.48m
         5           0.2648            2.46m
         6           0.2501            2.45m
         7           0.2378            2.44m
         8           0.2274            2.41m
         9           0.2187            2.47m
        10           0.2113            2.59m
        20           0.1664            2.63m
        30           0.1515            2.61m
        40           0.1453            2.41m
        50           0.1425            2.24m
        60           0.1408            1.98m
        70           0.1396            1.74m
        80           0.1386            1.49m
        90           0.1377            1.25m
       100           0.1368            1.03m


In [4]:
# Log loss of the in-sample probabilities (computed on the training rows themselves).
print('Gradient Boosting log loss:           %f' % log_loss(train_y, prob))

# Make sure the target directory exists so the writes below cannot fail on a fresh checkout.
models_dir = path.join(path.dirname(os.getcwd()), 'models')
os.makedirs(models_dir, exist_ok = True)

# Persist the training-set probabilities.
# (save_dir holds a full file path here, despite its name.)
save_dir = path.join(models_dir, 'gbc_probs.pkl')
with open(save_dir, 'wb') as output:  # `with` closes the handle even if dump raises
    pickle.dump(prob, output, protocol = pickle.HIGHEST_PROTOCOL)
print('Gradient Boosting probs saved!')

# Persist the fitted classifier itself.
save_dir = path.join(models_dir, 'gbc.pkl')
with open(save_dir, 'wb') as output:
    pickle.dump(gbc, output, protocol = pickle.HIGHEST_PROTOCOL)
print('Gradient Boosting Classifier Saved!')

Gradient Boosting log loss:           0.066893
Gradient Boosting probs saved!
Gradient Boosting Classifier Saved!


### Creating Submission File

In [5]:
# Score only the test rows that are NOT flagged by test_na.
test_X = test_X.loc[~test_na]

pred = gbc.predict_proba(test_X)
# Keep the second probability column (presumably P(is_churn == 1); order follows gbc.classes_).
pred = pred[:,1]
index = test_out.loc[~test_na, 'msno']
d = {
    'msno': index,
    'is_churn': pred
}

gbc_submission = pd.DataFrame(d)

# Rows flagged by test_na receive the constant `avg` instead of a model score.
missing_msno = test_na[test_na].index
missing_rows = pd.DataFrame({
    'msno': missing_msno,
    # np.full keeps the column float even when there are no flagged rows.
    'is_churn': np.full(len(missing_msno), avg),
})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
gbc_submission = pd.concat([gbc_submission, missing_rows])

gbc_submission.index = gbc_submission.msno

save_dir = path.join(path.dirname(os.getcwd()), 'submissions')
# Create the output directory if it is missing so to_csv cannot fail on a fresh checkout.
os.makedirs(save_dir, exist_ok = True)
submission_path = path.join(save_dir, 'gbc_submission.csv')
# index = False: msno is already a regular column, so the index is not written again.
gbc_submission.to_csv(submission_path, index = False)
print('Saved GBC Predictions to:      %s' % submission_path)

Saved GBC Predictions to:      C:\Users\Michael\Documents\python\kkbox\submissions\gbc_submission.csv


In [6]:
# Sanity check: average churn probability over every row of the submission.
gbc_submission['is_churn'].mean()

0.0426113948224453