# AdaBoost Classifier

In [1]:
import pandas as pd 
import numpy as np


from sklearn.model_selection import GridSearchCV
import sklearn.metrics
from sklearn.metrics import log_loss
from sklearn.ensemble import AdaBoostClassifier

import pickle
import os
import os.path as path
import sys

In [2]:
# Project folders are resolved relative to the parent of the current working
# directory.
save_dir = path.join(path.dirname(os.getcwd()), 'models')
data_dir = path.join(path.dirname(os.getcwd()), 'data', 'train_out.csv')

# Load the list of feature columns used to select the model inputs.
# The context manager guarantees the handle is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('best_features.pkl', 'rb') as read:
    ranked_features = pickle.load(read)

# Manual alternative to the pickled feature list, kept for reference:
#ranked_features = ['expiration_date',
#                  'membership_expire_date',
#                  'registered_via_7.0',
#                  'is_auto_renew',
#                  'is_cancel']

# Training set: drop the identifier and the target, then keep only the
# selected feature columns.
train_out = pd.read_csv(data_dir)
train_X = train_out.drop(['msno', 'is_churn'], axis = 1)
train_X = train_X[ranked_features]
train_y = train_out.is_churn

# Test set: same column selection as the training set.
data_dir = path.join(path.dirname(os.getcwd()), 'data', 'test_out.csv')
test_out = pd.read_csv(data_dir)
test_X = test_out.drop(['msno', 'is_churn'], axis = 1)
test_X = test_X[ranked_features]
test_y = test_out.is_churn

### Training the Model

In [12]:
# Fit an AdaBoost ensemble with 150 estimators. The seed is pinned so that a
# fresh "Restart & Run All" reproduces the same model and metrics.
AdaB = AdaBoostClassifier(n_estimators = 150, random_state = 42)
AdaB.fit(train_X, train_y)

# Class probabilities for the rows the model was fitted on (training data).
prob = AdaB.predict_proba(train_X)

In [13]:
# `prob` was predicted on train_X, so this is the log loss on the training
# data, not a held-out estimate.
print('AdaBoost log loss:           %f' % log_loss(train_y, prob))

# Both artefacts go to the project-level 'models' folder; create it if absent
# so the dumps below cannot fail on a missing directory.
models_dir = path.join(path.dirname(os.getcwd()), 'models')
os.makedirs(models_dir, exist_ok = True)

# Persist the training-set probabilities. The context manager closes the file
# even if pickling raises.
with open(path.join(models_dir, 'adab_probs.pkl'), 'wb') as output:
    pickle.dump(prob, output, protocol = pickle.HIGHEST_PROTOCOL)
print('AdaBoost probs saved!')

# Persist the fitted classifier itself.
with open(path.join(models_dir, 'adab.pkl'), 'wb') as output:
    pickle.dump(AdaB, output, protocol = pickle.HIGHEST_PROTOCOL)
print('AdaBoost Classifier Saved!')

AdaBoost log loss:           0.673674
AdaBoost probs saved!
Gradient Boosting Classifier Saved!


### Creating Submission File

In [6]:
# Replace missing values in the '201702' column of the test features with 0
# so the classifier receives no NaNs from that column.
missing_201702 = test_X['201702'].isna()
test_X.loc[missing_201702, '201702'] = 0

In [9]:
# Keep only column 1 of the predicted probabilities.
# NOTE(review): presumably this is the churn (positive) class -- confirm via AdaB.classes_.
pred = AdaB.predict_proba(test_X)[:, 1]

adab_submission = pd.DataFrame({
    'msno': test_out['msno'],
    'is_churn': pred
})

# Mean of the training target, used below as a fallback prediction.
avg = np.mean(train_y)

# Load the mask that marks which test rows get the fallback value.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('test_mask.pkl', 'rb') as mask_file:
    mask = pickle.load(mask_file)

# Index labels of the entries flagged True in the mask; the submission is
# re-indexed by msno so those labels can be used for the overwrite.
# NOTE(review): assumes the mask is indexed by msno -- confirm where it is built.
replace = mask[mask == True].index
adab_submission.index = adab_submission.msno
adab_submission.loc[replace, 'is_churn'] = avg

# Write the submission, creating the folder if needed, and report the path
# that was actually written.
save_dir = path.join(path.dirname(os.getcwd()), 'submissions')
os.makedirs(save_dir, exist_ok = True)
submission_path = path.join(save_dir, 'adab_submission.csv')
adab_submission.to_csv(submission_path, index = False)
print('Saved AdaB Predictions to:      %s' % submission_path)

Saved AdaB Predictions to:      C:\Users\Michael\Documents\python\kkbox\submissions\gbc_submission.csv


In [10]:
# Sanity check: average predicted churn probability across the submission.
adab_submission['is_churn'].mean()

0.42170554420349804

# adab_submission.is_churn