# Logistic Regression

In [1]:
import pandas as pd 
import numpy as np


from sklearn.model_selection import GridSearchCV
import sklearn.metrics
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression

import pickle
import os
import os.path as path
import sys

In [2]:
save_dir = path.join(path.dirname(os.getcwd()), 'models')
data_dir = path.join(path.dirname(os.getcwd()), 'data', 'train_out.csv')

read = open('best_features.pkl', 'rb')
ranked_features = pickle.load(read)
read.close()

read = open('train_na.pkl', 'rb')
train_na = pickle.load(read)
read.close()

read = open('test_na.pkl', 'rb')
test_na = pickle.load(read)
read.close()

#ranked_features = ['expiration_date',
#                  'membership_expire_date',
#                  'registered_via_7.0',
#                  'is_auto_renew',
#                  'is_cancel']

train_out = pd.read_csv(data_dir)
train_out.index = train_out.msno
avg = np.mean(train_out.loc[train_na, 'is_churn'])
train_out = train_out[~train_na]
train_X = train_out.drop(['msno', 'is_churn'], axis = 1)
train_X = train_X[ranked_features]
train_y = train_out.is_churn

data_dir = path.join(path.dirname(os.getcwd()), 'data', 'test_out.csv')
test_out = pd.read_csv(data_dir)
test_out.index = test_out.msno
test_X = test_out.drop(['msno', 'is_churn'], axis = 1)
test_X = test_X[ranked_features]
test_y = test_out.is_churn

### Training the Model

In [3]:
ranked_features

['expiration_date',
 '201702',
 'is_auto_renew',
 'membership_expire_date',
 '201509',
 'city_12.0',
 'city_14.0',
 'city_18.0',
 '201502',
 'city_10.0',
 'registered_via_13.0',
 'consecutive_ones',
 'is_cancel',
 'registered_via_7.0',
 '201505',
 'city_21.0',
 'city_6.0',
 'num_unq',
 '201511']

In [4]:
log = LogisticRegression()
log.fit(train_X, train_y)
prob = log.predict_proba(train_X)

In [5]:
print('Logistic Regression log loss:           %f' % log_loss(train_y, prob))
save_dir = path.join(path.dirname(os.getcwd()), 'models', 'gbc_probs.pkl')
output = open(save_dir, 'wb')
pickle.dump(prob, output, protocol = pickle.HIGHEST_PROTOCOL)
output.close()
print('Logistic Regression probs saved!')
save_dir = path.join(path.dirname(os.getcwd()), 'models', 'log.pkl')
output = open(save_dir, 'wb')
pickle.dump(log, output, protocol = pickle.HIGHEST_PROTOCOL)
output.close()
print('Logistic Regression Classifier Saved!')

Logistic Regression log loss:           0.092731
Logistic Regression probs saved!
Logistic Regression Classifier Saved!


### Creating Submission File

In [6]:
test_X = test_X.loc[~test_na]

pred = log.predict_proba(test_X)
pred = pred[:,1]
index = test_out.loc[~test_na, 'msno']
d = {
    'msno': index,
    'is_churn': pred
}

log_submission = pd.DataFrame(d)

log_submission = log_submission.append(pd.DataFrame({'msno': test_na[test_na == True].index, 
                                    'is_churn': [avg] * len(test_na[test_na == True].index)}))

save_dir = path.join(path.dirname(os.getcwd()), 'submissions')
log_submission.to_csv(path.join(save_dir, 'log_submission.csv'), index = False)
print('Saved Log Predictions to:      %s' % path.join(save_dir, 'gbc_submission.csv'))

ValueError: array length 849780 does not match index length 970960

In [42]:
log_submission.loc[replace]

Unnamed: 0_level_0,is_churn,msno
msno,Unnamed: 1_level_1,Unnamed: 2_level_1
c147tEGAJb25r45hf3QBGOMgDZY4Dj+4r5C55UzCo+Q=,0.013124,c147tEGAJb25r45hf3QBGOMgDZY4Dj+4r5C55UzCo+Q=
2Oo0AioVkvZnYs5U0WchMbdQtkIyPtkr59MUEPmDJ9g=,0.013124,2Oo0AioVkvZnYs5U0WchMbdQtkIyPtkr59MUEPmDJ9g=
kSySd4rJkUzI7BUoH9YC0Xhw+jq90DGaAZduXE8Q0rQ=,0.005399,kSySd4rJkUzI7BUoH9YC0Xhw+jq90DGaAZduXE8Q0rQ=
65+TjvOvEPR7r/5p8SxJc8kNkuTcpK9rtQJT4FeFEFo=,0.013124,65+TjvOvEPR7r/5p8SxJc8kNkuTcpK9rtQJT4FeFEFo=
+8WKqzN6WM1FymTIq5seEiz4gNQcvboGEjZtJlzfhFc=,,
S/A6p+3q+SW9bRzqbyHG5vLMSo5Db5DhsnhHv8qegV0=,0.005399,S/A6p+3q+SW9bRzqbyHG5vLMSo5Db5DhsnhHv8qegV0=
+2FaW6slSzLNdwYOh0QOHZnMzodKMtPHCZF1d4C7Fb8=,0.013124,+2FaW6slSzLNdwYOh0QOHZnMzodKMtPHCZF1d4C7Fb8=
pciLjGsP1RFA8vIEnTk97g7IJsrGiiZxpcIqPoKbkdY=,0.013124,pciLjGsP1RFA8vIEnTk97g7IJsrGiiZxpcIqPoKbkdY=
4UoGsqadop9RCCDFQTBEI1PbD97h+1Omr+3uBAZ1Rjw=,,
eAItuBGgGO511ed4RfCT1M6AqA0+7h+Og2fpd8XrkzQ=,,
