In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import acquire as acq
import prepare as prep

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluating, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

Create a new notebook, logistic_regression, use it to answer the following questions:

- Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [4]:
titanic = acq.get_titanic_data()

titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [17]:
# Keep only the passengers whose 'age' is recorded (rows with a NaN age are discarded).

titanic = titanic[titanic['age'].notna()]

In [18]:
# Check: after the drop, 'age' should report the same non-null count as the total number of rows.

titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  714 non-null    int64  
 1   survived      714 non-null    int64  
 2   pclass        714 non-null    int64  
 3   sex           714 non-null    object 
 4   age           714 non-null    float64
 5   sibsp         714 non-null    int64  
 6   parch         714 non-null    int64  
 7   fare          714 non-null    float64
 8   embarked      712 non-null    object 
 9   class         714 non-null    object 
 10  deck          184 non-null    object 
 11  embark_town   712 non-null    object 
 12  alone         714 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 78.1+ KB


In [20]:
# Not going to run the dataset through prep_titanic, since that would drop age, and I can just set the X_train
# to the values I want myself. Also, don't really want to encode pclass either.
# NOTE(review): split_data is a project helper; the second argument is presumably the target column
# used to stratify the split -- confirm in prepare.py.

train_titanic, validate_titanic, test_titanic = prep.split_data(titanic, 'survived')

In [23]:
# Model 0 uses age, fare and pclass only; the target is 'survived' in every split.
features = ['age', 'fare', 'pclass']

X_train, y_train = train_titanic[features], train_titanic.survived
X_validate, y_validate = validate_titanic[features], validate_titanic.survived
X_test, y_test = test_titanic[features], test_titanic.survived

In [145]:
# Baseline Prediction: always guess the most frequent class of the training split.

train_titanic['survived'].mode() # 0
train_titanic['survived'].eq(0).mean() #59.35%

0.5934579439252337

In [24]:
# Create the model with default hyperparameters and fit it on the training split
# (fit returns the estimator itself, so it can be chained onto the constructor).

logit0 = LogisticRegression().fit(X_train, y_train)
logit0

LogisticRegression()

In [34]:
# Predict on the training split and peek at the first ten predicted labels.

y_pred = logit0.predict(X=X_train)
y_pred[:10]

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0])

In [31]:
# Put predictions next to the true labels (sharing y_train's index) and compute accuracy by hand.
titanic_pred = pd.DataFrame({'predicted': y_pred}, index=y_train.index).assign(actual=y_train)
titanic_pred['predicted'].eq(titanic_pred['actual']).mean()

0.6985981308411215

In [32]:
# Evaluating: mean accuracy of logit0 on the training split.

logit0.score(X=X_train, y=y_train)

# I would say the model performs marginally better than the baseline.

0.6985981308411215

In [36]:
# Confusion matrix for logit0 on the training split.
# confusion_matrix is imported with the other sklearn imports in the first cell of the notebook.
# `labels` pins the class order to [0, 1] so the row/column captions below always line up with the
# matrix, independent of which classes happen to occur in y_train / y_pred.

cm = confusion_matrix(y_train, y_pred, labels=[0, 1])
pd.DataFrame(cm, index=['Actual_Not', 'Actual_Sur'],
            columns = ['Pred_Not', 'Pred_Sur'])

Unnamed: 0,Pred_Not,Pred_Sur
Actual_Not,208,46
Actual_Sur,83,91


In [38]:
# Per-class precision / recall / f1 for logit0 on the training split.
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.71      0.82      0.76       254
           1       0.66      0.52      0.59       174

    accuracy                           0.70       428
   macro avg       0.69      0.67      0.67       428
weighted avg       0.69      0.70      0.69       428



- Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [44]:
# Encode sex as a single 0/1 indicator column, 'male'.
#
# The indicator is built by comparing against the literal 'male' instead of calling
# pd.get_dummies(..., drop_first=True) on each split separately: get_dummies derives its columns
# from whatever categories are present in that particular split, so the encoding is only
# consistent across splits by coincidence (and a split with a single sex would yield no column).
# `.assign` returns a new frame, so no column is written into a frame that may still be a slice
# of `titanic` (the situation that triggers pandas' SettingWithCopyWarning).

train_titanic = train_titanic.assign(male=(train_titanic['sex'] == 'male').astype(int))
validate_titanic = validate_titanic.assign(male=(validate_titanic['sex'] == 'male').astype(int))
test_titanic = test_titanic.assign(male=(test_titanic['sex'] == 'male').astype(int))

In [46]:
# Rebuild the feature matrices with the new 'male' indicator added; targets are unchanged.
features = ['age', 'fare', 'pclass', 'male']

X_train, y_train = train_titanic[features], train_titanic.survived
X_validate, y_validate = validate_titanic[features], validate_titanic.survived
X_test, y_test = test_titanic[features], test_titanic.survived

In [47]:
# Create and fit the second model (fit returns the estimator, shown as the cell output).
logit1 = LogisticRegression().fit(X_train, y_train)
logit1

LogisticRegression()

In [50]:
# Predicting: show logit1's training predictions side by side with the true labels.

train_pred1 = logit1.predict(X_train)
pd.DataFrame({'predicted': train_pred1, 'actual': y_train})

Unnamed: 0,predicted,actual
548,0,0
133,1,1
540,1,1
2,1,1
649,1,1
...,...,...
774,1,1
664,0,1
842,1,1
286,0,1


In [51]:
# Scoring: mean accuracy of logit1 on the training split.

logit1.score(X=X_train, y=y_train)

# Much higher score than the model without the encoded sex column.

0.7873831775700935

In [53]:
# Training-split predictions of logit1, kept for the confusion matrix and report below.
y_pred1 = logit1.predict(X=X_train)

In [54]:
# Confusion matrix for logit1 on the training split (rows = actual, columns = predicted).
cm = confusion_matrix(y_true=y_train, y_pred=y_pred1)
pd.DataFrame(
    data=cm,
    index=['Actual_Not', 'Actual_Sur'],
    columns=['Pred_Not', 'Pred_Sur'],
)

Unnamed: 0,Pred_Not,Pred_Sur
Actual_Not,215,39
Actual_Sur,52,122


In [55]:
# Per-class precision / recall / f1 for logit1 on the training split.
print(classification_report(y_true=y_train, y_pred=y_pred1))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       254
           1       0.76      0.70      0.73       174

    accuracy                           0.79       428
   macro avg       0.78      0.77      0.78       428
weighted avg       0.79      0.79      0.79       428



In [56]:
# Accuracy on the validate split -- a good score compared to the training dataset.
logit1.score(X=X_validate, y=y_validate)

0.7692307692307693

- Try out other combinations of features and models.

In [71]:
# This time including 'alone' on top of the previous feature set.
features = ['age', 'fare', 'pclass', 'alone', 'male']

X_train, y_train = train_titanic[features], train_titanic.survived
X_validate, y_validate = validate_titanic[features], validate_titanic.survived
X_test, y_test = test_titanic[features], test_titanic.survived

In [62]:
# Fit the third model and report its accuracy on the training split.
logit2 = LogisticRegression().fit(X_train, y_train)
logit2.score(X=X_train, y=y_train)

0.7920560747663551

In [63]:
# Accuracy of logit2 on the validate split.
logit2.score(X=X_validate, y=y_validate)

0.7902097902097902

In [64]:
# This time removing 'pclass' from the feature set.
features = ['age', 'fare', 'alone', 'male']

X_train, y_train = train_titanic[features], train_titanic.survived
X_validate, y_validate = validate_titanic[features], validate_titanic.survived
X_test, y_test = test_titanic[features], test_titanic.survived

In [65]:
# Fit the fourth model and report its accuracy on the training split.
logit3 = LogisticRegression().fit(X_train, y_train)
logit3.score(X=X_train, y=y_train)

0.7733644859813084

In [66]:
# Accuracy of logit3 on the validate split.
logit3.score(X=X_validate, y=y_validate)

0.7622377622377622

- Use your best 3 models to predict and evaluate on your validate sample.

In [None]:
# Already done above: logit1, logit2 and logit3 were each scored on the validate split.

- Choose your best model based on validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [72]:
# Final evaluation of the chosen model (logit2) on the test split.
# logit2 was fit on five features (age, fare, pclass, alone, male), but by this point in the
# notebook the global X_test has been rebuilt for logit3 without 'pclass'. The test matrix is
# therefore built explicitly here, so the result does not depend on which feature cell ran last.
logit2_features = ['age', 'fare', 'pclass', 'alone', 'male']

logit2.score(test_titanic[logit2_features], test_titanic.survived)

# Performed even better than the validation and training datasets!

0.8251748251748252

Unnamed: 0,age,fare,pclass,alone,male
548,33.0,20.525,3,0,1
133,29.0,26.000,2,0,0
540,36.0,71.000,1,0,0
2,26.0,7.925,3,1,0
649,23.0,7.550,3,1,0
...,...,...,...,...,...
774,54.0,23.000,2,0,0
664,20.0,7.925,3,0,1
842,30.0,31.000,1,1,0
286,30.0,9.500,3,1,1


Bonus3: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyperparameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.
Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

In [75]:
# Compare regularization strengths on the logit2 feature set (smaller C = stronger penalty).
# The matrices are built locally instead of reading the global X_train / X_validate, because in
# notebook order those globals were last rebuilt for logit3 (without 'pclass').
c_features = ['age', 'fare', 'pclass', 'alone', 'male']
X_train_c, y_train_c = train_titanic[c_features], train_titanic.survived
X_validate_c, y_validate_c = validate_titanic[c_features], validate_titanic.survived

C = [.01, .1, 1, 10, 100, 1000]

for c in C:
    logitc = LogisticRegression(C=c)
    logitc.fit(X_train_c, y_train_c)
    print(f'For C = {c}, score on training dataset is {round(logitc.score(X_train_c, y_train_c), 2)}, and on validate is {round(logitc.score(X_validate_c, y_validate_c), 2)}')
    # The exercise also asks how the coefficients react to C, so show them next to the scores.
    print(f'    coefficients: {dict(zip(c_features, logitc.coef_[0].round(3).tolist()))}')
    

For C = 0.01, score on training dataset is 0.71, and on validate is 0.75
For C = 0.1, score on training dataset is 0.79, and on validate is 0.79
For C = 1, score on training dataset is 0.79, and on validate is 0.79
For C = 10, score on training dataset is 0.79, and on validate is 0.78
For C = 100, score on training dataset is 0.79, and on validate is 0.78
For C = 1000, score on training dataset is 0.79, and on validate is 0.78


Bonus 1: How do different strategies for handling the missing values in the age column affect model performance?

In [139]:
# Three ways of filling the missing ages: constant 0, mean age, most frequent age.
# Only the 'age' column is filled. A bare DataFrame.fillna(value) would also write the value into
# every other column that has gaps (embarked, deck and embark_town in this dataset), e.g. putting
# the mean age into 'deck'.
# NOTE(review): the mean / mode are computed on the full dataset before splitting, so they include
# validate and test rows; computing them from the training split only would avoid that leakage.

titanic_with_age_0 = acq.get_titanic_data()
titanic_with_age_0 = titanic_with_age_0.fillna({'age': 0})

titanic_with_age_mean = acq.get_titanic_data()
titanic_with_age_mean = titanic_with_age_mean.fillna({'age': titanic_with_age_mean.age.mean()})

titanic_with_age_mode = acq.get_titanic_data()
titanic_with_age_mode = titanic_with_age_mode.fillna({'age': titanic_with_age_mode.age.mode()[0]})

In [140]:
# Add the 'male' dummy column (first category dropped) to each imputed copy of the dataset.
for imputed in (titanic_with_age_0, titanic_with_age_mean, titanic_with_age_mode):
    imputed['male'] = pd.get_dummies(imputed['sex'], drop_first=True)

In [141]:
# Split each imputed dataset into train / validate / test with the same project helper and
# target column ('survived') that were used for the original split.
train_titanic_0, validate_titanic_0, test_titanic_0 = prep.split_data(titanic_with_age_0, 'survived')
train_titanic_mean, validate_titanic_mean, test_titanic_mean = prep.split_data(titanic_with_age_mean, 'survived')
train_titanic_mode, validate_titanic_mode, test_titanic_mode = prep.split_data(titanic_with_age_mode, 'survived')

In [105]:
# Feature matrices / targets for the dataset where missing ages were filled with 0.
features = ['age', 'fare', 'pclass', 'alone', 'male']

X_train, y_train = train_titanic_0[features], train_titanic_0.survived
X_validate, y_validate = validate_titanic_0[features], validate_titanic_0.survived
X_test, y_test = test_titanic_0[features], test_titanic_0.survived

In [106]:
# Fit on the zero-filled data and report (train accuracy, validate accuracy).
logit_0 = LogisticRegression().fit(X_train, y_train)
(logit_0.score(X=X_train, y=y_train), logit_0.score(X=X_validate, y=y_validate))

(0.7808988764044944, 0.7696629213483146)

In [107]:
# Feature matrices / targets for the dataset where missing ages were filled with the mean age.
features = ['age', 'fare', 'pclass', 'alone', 'male']

X_train, y_train = train_titanic_mean[features], train_titanic_mean.survived
X_validate, y_validate = validate_titanic_mean[features], validate_titanic_mean.survived
X_test, y_test = test_titanic_mean[features], test_titanic_mean.survived

In [108]:
# Fit on the mean-filled data and report (train accuracy, validate accuracy).
logit_mean = LogisticRegression().fit(X_train, y_train)
(logit_mean.score(X=X_train, y=y_train), logit_mean.score(X=X_validate, y=y_validate))

(0.8089887640449438, 0.7808988764044944)

In [142]:
# Feature matrices / targets for the dataset where missing ages were filled with the modal age.
features = ['age', 'fare', 'pclass', 'alone', 'male']

X_train, y_train = train_titanic_mode[features], train_titanic_mode.survived
X_validate, y_validate = validate_titanic_mode[features], validate_titanic_mode.survived
X_test, y_test = test_titanic_mode[features], test_titanic_mode.survived

In [143]:
# Fit on the mode-filled data and report (train accuracy, validate accuracy).
logit_mode = LogisticRegression().fit(X_train, y_train)
(logit_mode.score(X=X_train, y=y_train), logit_mode.score(X=X_validate, y=y_validate))

(0.795880149812734, 0.7808988764044944)

In [None]:
# Interesting that replacing NaNs with 0s still gave relatively good performance when it came to predicting. I had
# assumed that it would have completely thrown off the algorithm and scores would be low. Good to know. Nevertheless,
# it looks like replacing NaNs with age.mean() resulted in the best training score (on validate it tied with the mode fill).