# titanic survival classification with logistic reg


In [24]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [25]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [26]:
import pandas as pd
url = 'https://raw.githubusercontent.com/austinlasseter/plotly_dash_tutorial/master/00%20resources/titanic.csv'
df = pd.read_csv(url, index_col='Unnamed: 0')
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.25,Southampton
1,1,1,female,38.0,71.2833,Cherbourg
2,1,3,female,26.0,7.925,Southampton
3,1,1,female,35.0,53.1,Southampton
4,0,3,male,35.0,8.05,Southampton


# Preprocessing

In [27]:
df['Age'].describe()

count    712.000000
mean      29.642093
std       14.492933
min        0.420000
25%       20.000000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [28]:
# bin the age and create groups for the app
bin=[0,20,30,40,80]

In [29]:
agegr=pd.cut(df.Age,bin)
agegr=agegr.to_frame()
agegr.columns=['range']

In [32]:
agegr.head()

Unnamed: 0,range
0,"(20, 30]"
1,"(30, 40]"
2,"(20, 30]"
3,"(30, 40]"
4,"(30, 40]"


In [34]:
# better solution
df['Agegroup'] = '1'    # initially set all values to '1'; youngest
df.loc[df.Age.between(20, 28), 'Agegroup'] = '2'     
df.loc[df.Age.between(29, 38), 'Agegroup'] = '3'    
df.loc[df.Age.between(39, 60), 'Agegroup'] = '4' 
df.loc[df.Age.between(61, 80), 'Agegroup'] = '5'   

In [38]:
# display a cross-tabulation of two Series
df.Agegroup.value_counts()

2    198
3    172
1    166
4    155
5     21
Name: Agegroup, dtype: int64

In [39]:
df.head(5)

# convert 'agegroup' into the 'category' data type
#df['Agegroup'] = pd.Categorical(df.Agegroup, categories=['child', 'youngadult', 'adult', 'middleaged', 'elderly'])
#df.sort_values('Agegroup')   # sorts by the categorical ordering (low to high)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Agegroup
0,0,3,male,22.0,7.25,Southampton,2
1,1,1,female,38.0,71.2833,Cherbourg,3
2,1,3,female,26.0,7.925,Southampton,2
3,1,1,female,35.0,53.1,Southampton,3
4,0,3,male,35.0,8.05,Southampton,3


In [40]:
# create dummies for gender and few others
pd.get_dummies(df['Sex'])

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
6,0,1
7,0,1
8,1,0
9,1,0
10,1,0


In [41]:
# create dummies in one run for multiple variables...entire dataset...sex disappeared and generated two dummies instead
df2=pd.get_dummies(df, columns=['Sex'], prefix='', prefix_sep='')
df2.head(5)

Unnamed: 0,Survived,Pclass,Age,Fare,Embarked,Agegroup,female,male
0,0,3,22.0,7.25,Southampton,2,0,1
1,1,1,38.0,71.2833,Cherbourg,3,1,0
2,1,3,26.0,7.925,Southampton,2,1,0
3,1,1,35.0,53.1,Southampton,3,1,0
4,0,3,35.0,8.05,Southampton,3,0,1


In [42]:
# for embarked
df3=pd.get_dummies(df2, columns=['Embarked'], prefix='', prefix_sep='')
df3.head(5)

Unnamed: 0,Survived,Pclass,Age,Fare,Agegroup,female,male,Cherbourg,Queenstown,Southampton
0,0,3,22.0,7.25,2,0,1,0,0,1
1,1,1,38.0,71.2833,3,1,0,1,0,0
2,1,3,26.0,7.925,2,1,0,0,0,1
3,1,1,35.0,53.1,3,1,0,0,0,1
4,0,3,35.0,8.05,3,0,1,0,0,1


In [None]:
# we skip scaling now but you should scale, pickle...

In [None]:
# check missing values

In [45]:
# Pickle the dataset
with open('my_dataset.pkl', 'wb') as output:
    pickle.dump(df3, output)

In [46]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression: log-odds of a categorical response being "true" (1) is modeled as a linear combination of the features
# look at 00b_logistic_regression_theory file for details about probability, odds and logodds, which last one is given as coef

In [60]:
# move on and start building the model
# institiate the model
# run logistic reg
# instantiate the sklearn class
log_model = LogisticRegression()
# Establish your target
y = df3['Survived']
# Establish your predictors - which is everything except the target variable
X = df3.drop(['Survived', 'Age'], axis=1)
# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4, test_size=.3)
# fit on the training data
log_model.fit(X_train, y_train)
# predictx on the test data
y_preds = log_model.predict(X_test)
# check out the first 5
print(y_preds[:5])
print(list(y_test[:5]))

[0 0 1 0 0]
[0, 0, 1, 0, 0]




In [61]:
# evaluation code...three matrixes
from sklearn import metrics

In [62]:
# accuracy
metrics.accuracy_score(y_test, y_preds)

0.7897196261682243

In [63]:
# confusion matrix
metrics.confusion_matrix(y_test, y_preds)

array([[114,  18],
       [ 27,  55]], dtype=int64)

In [64]:
# f1 score is more meaninful than the accuray score, it accounts for the inbalanceness of the two categories. 
metrics.f1_score(y_test, y_preds)

0.7096774193548389

In [65]:
# what about coefficients of survival?
log_model.coef_

array([[-9.36054114e-01,  1.53092358e-03, -2.73678171e-01,
         1.90747301e+00, -5.87070530e-01,  7.79016201e-01,
        -7.61796363e-02,  6.17565915e-01]])

In [66]:
# as pclass goes up survival goes down; age goes up, survive goes down by -2.73...odds ratio
list(X.columns)

['Pclass',
 'Fare',
 'Agegroup',
 'female',
 'male',
 'Cherbourg',
 'Queenstown',
 'Southampton']

In [67]:
# Let's pickle the results
import pickle
file = open('my-titanic-model.pkl', 'wb')
pickle.dump(log_model, file)
file.close()

# Let's pretend we are in app.py

In [68]:
file = open('my-titanic-model.pkl', 'rb')
mymodel=pickle.load(file)
file.close()

In [69]:
X.loc[1]

Pclass               1
Fare           71.2833
Agegroup             3
female               1
male                 0
Cherbourg            1
Queenstown           0
Southampton          0
Name: 1, dtype: object

In [70]:
# let's make new data. gonna_die -- third calss, 80 years old, man, ....; gonna_live -- first calss, 25 years old, female...
gonna_die=[3,80,5,0,1,0,0,1]
gonna_live=[1,25,100,1,0,1,0,0]

In [71]:
mymodel.predict([gonna_live])

array([0], dtype=int64)

In [72]:
# Save X-train for visualization purposes...how?
train = X_train.copy()
train.to_pickle('train.pkl')