In [None]:
# import libraries
import numpy as np
import pandas as pd

In [None]:
# Read the Titanic training data, using the PassengerId column as the row index
train = pd.read_csv(
    'train_titanic.csv',
    index_col='PassengerId',
)

In [None]:
# Predictor columns fed to the model, and the column to be predicted
features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Sex_male',
    'Embarked_Q',
    'Embarked_S',
]
target = ['Survived']

# Both selections use a list of column names, so X and y are DataFrames
# (y is a one-column frame rather than a Series)
y = train[target]
X = train[features]

In [None]:
from sklearn import preprocessing

In [None]:
# Standardize attributes to z-scores (per-column zero mean and unit variance).
# NOTE(review): the scaler is fitted on every row before cross-validation is run on
# X_transform, so each fold's held-out rows contribute to the scaling statistics.
# Wrapping the scaler and the model in a Pipeline would avoid that — confirm whether
# it matters for this analysis.
zscore_scaler = preprocessing.StandardScaler().fit(X)

# Carry over X's index (PassengerId) together with its column names so the rows of
# X_transform still line up with X and y by label; without index= pandas would
# substitute a fresh 0..n-1 index.
X_transform = pd.DataFrame(
    zscore_scaler.transform(X),
    index=X.index,
    columns=X.columns,
)

In [None]:
# Preview the first five rows of the unscaled attributes
X.head(n=5)

In [None]:
# Preview the first five rows of the scaled attributes, for comparison with X
X_transform.head(n=5)

In [None]:
# import Logistic Regression from sklearn
from sklearn.linear_model import LogisticRegression

In [None]:
# Configure an L1-regularized logistic regression classifier.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
#   penalty='l1'   -> L1 regularization (recall LASSO regression); scikit-learn's default is penalty='l2'.
#   C=1.0          -> inverse of the regularization strength; must be a positive float.
#   solver='saga'  -> algorithm used in the optimization problem (finding the optimal coefficient values).
#   random_state=0 -> fixed seed passed to the estimator.
lr = LogisticRegression(
    solver='saga',
    penalty='l1',
    C=1.0,
    random_state=0,
)

In [None]:
# import cross-validation and other evaluation tools
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Flatten the one-column target frame into a 1-D NumPy array of labels
yact = np.ravel(y.values)

In [None]:
# Score the model with 10-fold cross-validation on the scaled attributes
score_cv = cross_val_score(
    estimator=lr,
    X=X_transform,
    y=yact,
    cv=10,
)

In [None]:
# Average of the cross-validation scores (accuracy)
np.mean(score_cv)

In [None]:
# Predict the target for every row via 10-fold cross-validation
pred_y = cross_val_predict(
    estimator=lr,
    X=X_transform,
    y=yact,
    cv=10,
)

In [None]:
# Confusion matrix of the actual target against the cross-validated predictions
print(
    confusion_matrix(y_true=y, y_pred=pred_y)
)

In [None]:
# Text report of classification metrics for the cross-validated predictions
print(
    classification_report(y_true=yact, y_pred=pred_y)
)

### ROC

In [None]:
# Class probabilities for each prediction, obtained via 10-fold cross-validation
# (method='predict_proba' asks for probabilities instead of class labels)
proba_y = cross_val_predict(
    estimator=lr,
    X=X_transform,
    y=yact,
    cv=10,
    method='predict_proba',
)

In [None]:
# Display the cross-validated probability array
# NOTE(review): the cells that follow treat column 0 as the probability of class '0' — confirm the column order
proba_y

In [None]:
# Predicted probability of being '0' for all instances (first column of proba_y).
# These are raw probabilities: no decision threshold has been applied to them yet.
proba_y_0 = proba_y[:, 0]
# Preview the first ten values rather than echoing the whole array
proba_y_0[:10]

In [None]:
# Lower the threshold for '0' to 0.4: label an instance 0 when its probability of
# being '0' is at least 0.4, otherwise label it 1.
# Despite its name, proba_y_0_lt holds class labels (0/1), not probabilities.
# np.where applies the rule to the whole array at once; .tolist() keeps the result a
# plain Python list, as the cells that consume it received before.
proba_y_0_lt = np.where(proba_y_0 >= 0.4, 0, 1).tolist()
# Preview the first ten labels rather than echoing the whole list
proba_y_0_lt[:10]

In [None]:
# Evaluate the labels produced with the 0.4 threshold against the actual target
print(
    "Confusion Matrix:",
    '\n',
    confusion_matrix(y_true=y, y_pred=proba_y_0_lt),
)
print(
    "Accuracy:",
    accuracy_score(y_true=y, y_pred=proba_y_0_lt, normalize=True, sample_weight=None),
)

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Compute the false/true positive rates across decision thresholds, scoring with
# the class-0 probabilities and treating class 0 as the positive label
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=yact,
    y_score=proba_y_0,
    pos_label=0,
)

In [None]:
# Area under the ROC curve traced by (fpr, tpr)
metrics.auc(x=fpr, y=tpr)

In [None]:
# Plot the ROC curve computed from fpr/tpr (class 0 is the positive label there),
# labelled so the figure can be read on its own.
fig, ax = plt.subplots()
roc_auc = metrics.auc(fpr, tpr)
ax.plot(fpr, tpr, label='Logistic regression (AUC = {:.3f})'.format(roc_auc))
# Diagonal reference: the curve a classifier guessing at random would trace
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
ax.set(
    xlabel='False positive rate',
    ylabel='True positive rate',
    title='ROC curve (positive class = 0)',
)
ax.legend(loc='lower right')
# plt.show() renders the figure without leaving a list-of-lines repr as cell output
plt.show()

In [None]:
# Fit the logistic regression on the whole (scaled) training dataset
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.fit
lr.fit(X=X_transform, y=yact)

In [None]:
# Show the intercept (Theta_0) of the trained model; requires lr to have been fitted first
lr.intercept_

In [None]:
# Tabulate the fitted coefficient of each independent attribute.
# lr.coef_ is two-dimensional (presumably 1 x 8 here: one row, one entry per attribute),
# so .ravel() collapses it to a flat vector that lines up with X.columns.
coeff_df = pd.DataFrame(
    {'Coefficient': lr.coef_.ravel()},
    index=X.columns,
)
coeff_df