In [30]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from scipy import stats

from pylab import rcParams
from sklearn import preprocessing, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
# from sklearn import cross_validation
from sklearn.model_selection import cross_val_score, cross_val_predict

import statsmodels.api as sm
import pylab as pl

from sklearn.model_selection import RandomizedSearchCV


In [6]:
df_titanic = pd.DataFrame.from_csv('cleaned_up_training_set.csv')

In [7]:
df_titanic.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,male,Q,S
0,0,3,-0.512251,1,0,1,0,1
4,0,3,0.399814,0,0,1,0,1
6,0,1,1.732832,0,0,1,0,1
7,0,3,-1.915427,3,1,1,0,1
12,0,3,-0.652568,0,0,1,0,1


In [8]:
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 848 entries, 0 to 190
Data columns (total 8 columns):
Survived    848 non-null int64
Pclass      848 non-null int64
Age         848 non-null float64
SibSp       848 non-null int64
Parch       848 non-null int64
male        848 non-null int64
Q           848 non-null int64
S           848 non-null int64
dtypes: float64(1), int64(7)
memory usage: 59.6 KB


In [9]:
df_titanic['Survived'].value_counts()

1    424
0    424
Name: Survived, dtype: int64

In [10]:
X = df_titanic.iloc[:, 1:].values
y = df_titanic.iloc[:, 0].values

In [11]:
TEST_SIZE = 0.2
RANDOM_STATE = 123
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = TEST_SIZE, random_state = RANDOM_STATE)


# l1 logistic with sklearn

https://scikit-learn.org/0.18/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression. in the current version, penalty can only have l1 and l2. in the new sklearn version. there's also regular logistic regression as well.

In [14]:
clf = LogisticRegression(penalty = 'l1', random_state = RANDOM_STATE) # fit_intercept bool, default=True
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)

In [16]:
print(metrics.classification_report(y_test, y_pred))
print(metrics.roc_auc_score(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.86      0.79      0.82        86
          1       0.80      0.87      0.83        84

avg / total       0.83      0.83      0.83       170

0.8298726467331118
0.8294117647058824


In [29]:
pd.DataFrame(list(zip(df_titanic.columns[1:], clf.coef_[0])))

Unnamed: 0,0,1
0,Pclass,-1.447666
1,Age,-0.618417
2,SibSp,-0.365191
3,Parch,-0.048539
4,male,-2.594886
5,Q,0.0
6,S,0.0


### cv this quickly

In [34]:
clf = LogisticRegression(penalty = 'l1')
y_all_pred_cv = cross_val_predict(clf, X, y, cv = 5)
print(metrics.classification_report(y, y_all_pred_cv))

             precision    recall  f1-score   support

          0       0.82      0.81      0.82       424
          1       0.81      0.82      0.82       424

avg / total       0.82      0.82      0.82       848



In [35]:
cross_val_score(clf, X, y, cv = 5, scoring = 'f1')

array([0.82022472, 0.84705882, 0.76646707, 0.83832335, 0.80952381])

# do it with statsmodel

no easy way to do l1 and l2 with statsmdoel unfortunately

In [36]:
logit = sm.Logit(y_train, X_train)
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.581930
         Iterations 5


In [37]:
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,678.0
Model:,Logit,Df Residuals:,671.0
Method:,MLE,Df Model:,6.0
Date:,"Sat, 07 Mar 2020",Pseudo R-squ.:,0.1604
Time:,18:44:21,Log-Likelihood:,-394.55
converged:,True,LL-Null:,-469.95
,,LLR p-value:,5.2300000000000006e-30

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.2523,0.093,-2.727,0.006,-0.434,-0.071
x2,-0.0997,0.095,-1.047,0.295,-0.286,0.087
x3,-0.0837,0.106,-0.793,0.428,-0.290,0.123
x4,0.3414,0.127,2.679,0.007,0.092,0.591
x5,-1.6185,0.180,-8.975,0.000,-1.972,-1.265
x6,0.9791,0.480,2.041,0.041,0.039,1.919
x7,1.3356,0.232,5.766,0.000,0.882,1.790


In [40]:
y_test_pred_prob = result.predict(X_test)
y_test_pred = (y_test_pred_prob > 0.5).astype(int)

print(metrics.classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

          0       0.75      0.86      0.80        86
          1       0.83      0.70      0.76        84

avg / total       0.79      0.78      0.78       170

