# Logistic Regression

In [1]:
import pandas as pd 
import numpy as np


from sklearn.model_selection import GridSearchCV
import sklearn.metrics
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression

import pickle
import os
import os.path as path
import sys

In [9]:
# All project paths are resolved against the parent of the current working
# directory; the pickled masks are read from the working directory itself.
project_root = path.dirname(os.getcwd())
save_dir = path.join(project_root, 'models')
data_dir = path.join(project_root, 'data')

# NOTE(security): pickle.load can execute arbitrary code. Only load files that
# were produced by this project.
# train_na / test_na are used below as boolean row masks (True = row is
# excluded from modelling) -- presumably indexed by msno; verify against the
# notebook that writes them.
with open('train_na.pkl', 'rb') as fh:
    train_na = pickle.load(fh)

with open('test_na.pkl', 'rb') as fh:
    test_na = pickle.load(fh)

# Hand-picked feature subset. An earlier version also loaded a list from
# 'best_features.pkl', but it was unconditionally overwritten by this literal,
# so that dead load has been removed.
ranked_features = ['expiration_date',
                   '201702',
                   'membership_expire_date',
                   'consecutive_ones',
                   'consecutive_zeros',
                   'registered_via_7.0',
                   'is_cancel',
                   'is_auto_renew']

# --- Training data ---
train_out = pd.read_csv(path.join(data_dir, 'train_out.csv'))
train_out = train_out.set_index('msno', drop=False)

# Mean churn of the masked training rows. This must be computed BEFORE those
# rows are dropped; it is used later as the constant prediction for masked
# test rows.
avg = train_out.loc[train_na, 'is_churn'].mean()

train_out = train_out.loc[~train_na]
train_X = train_out[ranked_features]
train_y = train_out.is_churn

# --- Test data ---
# The test frame is NOT filtered by test_na here; that happens in the
# submission cell.
test_out = pd.read_csv(path.join(data_dir, 'test_out.csv'))
test_out = test_out.set_index('msno', drop=False)
test_X = test_out[ranked_features]
test_y = test_out.is_churn

### Training the Model

In [10]:
# Display the feature columns selected for train_X / test_X.
ranked_features

['expiration_date',
 '201702',
 'membership_expire_date',
 'consecutive_ones',
 'consecutive_zeros',
 'registered_via_7.0',
 'is_cancel',
 'is_auto_renew']

In [11]:
# Fit a logistic regression with scikit-learn's default settings, then score
# the same rows it was trained on (in-sample class probabilities).
log = LogisticRegression().fit(train_X, train_y)
prob = log.predict_proba(train_X)

In [12]:
# `prob` was predicted on train_X, so this is the in-sample (training) log loss.
print('Logistic Regression log loss:           %f' % log_loss(train_y, prob))

models_dir = path.join(path.dirname(os.getcwd()), 'models')

# Previously written to 'gbc_probs.pkl'. That prefix does not match this model
# (it looks like a copy-paste leftover) and would overwrite any other model's
# probabilities stored under that name, so the logistic-regression
# probabilities get their own file.
probs_path = path.join(models_dir, 'log_probs.pkl')
with open(probs_path, 'wb') as output:
    pickle.dump(prob, output, protocol=pickle.HIGHEST_PROTOCOL)
print('Logistic Regression probs saved!')

# Persist the fitted estimator itself.
model_path = path.join(models_dir, 'log.pkl')
with open(model_path, 'wb') as output:
    pickle.dump(log, output, protocol=pickle.HIGHEST_PROTOCOL)
print('Logistic Regression Classifier Saved!')

Logistic Regression log loss:           0.091552
Logistic Regression probs saved!
Logistic Regression Classifier Saved!


### Creating Submission File

In [13]:
# Rows flagged in test_na are not scored by the model; they receive the
# constant `avg` further down.
test_X = test_X.loc[~test_na]

# Column 1 of predict_proba corresponds to log.classes_[1] -- the churn class,
# assuming is_churn is coded 0/1.
pred = log.predict_proba(test_X)[:, 1]

scored = pd.DataFrame({
    'msno': test_out.loc[~test_na, 'msno'],
    'is_churn': pred
})

# Masked rows: use the index labels of the True entries of test_na as msno and
# assign every one of them the fallback probability `avg`.
missing_msno = test_na[test_na].index
fallback = pd.DataFrame({
    'msno': missing_msno,
    'is_churn': [avg] * len(missing_msno)
})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and yields the same rows in the same order.
log_submission = pd.concat([scored, fallback])

save_dir = path.join(path.dirname(os.getcwd()), 'submissions')
submission_path = path.join(save_dir, 'log_submission.csv')
log_submission.to_csv(submission_path, index=False)
# Report the path that was actually written (the old message named
# 'gbc_submission.csv', a file this cell never creates).
print('Saved Log Predictions to:      %s' % submission_path)

Saved Log Predictions to:      C:\Users\Michael\Documents\python\kkbox\submissions\gbc_submission.csv


In [14]:
# Sanity check: average predicted churn probability over the whole submission.
log_submission['is_churn'].mean()

0.05956939778864728