# Logistic Regression

In [7]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pickle
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Directory (relative to the notebook) that holds the pickled inputs and the outputs
data_path = 'Data/'

# Restore the train/test frames that the "preprocessing" step pickled.
# NOTE: pickle.load executes arbitrary code from the file -- only load files you produced yourself.
with open(data_path + 'train_pp.obj', 'rb') as train_file:
    with open(data_path + 'test_pp.obj', 'rb') as test_file:
        # Both files are opened before either is read, so a missing file fails fast.
        train_df = pickle.load(train_file)
        test_df = pickle.load(test_file)

In [8]:
# Split the training frame: "Survived" is the label, every other column is a feature
y_train = train_df["Survived"]
X_train = train_df.drop(columns=["Survived"])

## Model Building

#### GridSearchCV

In [9]:
# Exhaustive search over the logistic-regression "C" hyper-parameter,
# scored by accuracy with 10-fold cross-validation.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

clf = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
clf.fit(X_train, y_train)

# Report the best mean CV score and the parameter setting that achieved it
for template, value in (
    ('Best score: {}', clf.best_score_),
    ('Best parameters: {}', clf.best_params_),
):
    print(template.format(value))

Best score: 0.8204264870931538
Best parameters: {'C': 100}


#### Standard K-fold (left only for reference)

In [10]:
# kfold = model_selection.KFold(n_splits=10, shuffle = True, random_state=7)
# model = LogisticRegression()
# scoring = 'accuracy'
# results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
# print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

## Prediction and data output

In [11]:
# Predict with the best estimator found by the grid search.
# NOTE(review): the whole of test_df (including "PassengerId") is passed to predict;
# this assumes its columns match those of X_train -- confirm against the preprocessing step.
raw_pred = clf.predict(test_df)

# Threshold to hard 0/1 labels (values below 0.5 -> 0, otherwise 1) in one vectorized step.
# The Series is built on test_df's own index so the column-wise concat below pairs every
# prediction with the right PassengerId even when test_df does not carry a 0..n-1 index
# (pd.concat(axis=1) aligns on index labels, not on row position).
y_out = pd.Series((raw_pred >= 0.5).astype(int), index=test_df.index, name="Survived")

# Two-column submission frame: PassengerId, Survived
df_out = pd.concat([test_df["PassengerId"], y_out], axis=1)
df_out.to_csv(data_path+"outputs/logreg-res.csv", sep=",", index=False)