In [30]:
import pandas as pd
import numpy as np
import plotly.express as px
px.defaults.width = 600

from sklearn.experimental import enable_halving_search_cv  
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, HalvingGridSearchCV
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
titanic_test = pd.read_csv(r'C:\Users\zsena\Documents\GitHub\handson-ml2\datasets\titanic\test.csv')
titanic_train = pd.read_csv(r'C:\Users\zsena\Documents\GitHub\handson-ml2\datasets\titanic\train.csv')

In [3]:
titanic_train.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


The attributes have the following meaning:
* **PassengerId**: a unique identifier for each passenger
* **Survived**: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
* **Pclass**: passenger class.
* **Name**, **Sex**, **Age**: self-explanatory
* **SibSp**: how many siblings & spouses of the passenger aboard the Titanic.
* **Parch**: how many children & parents of the passenger aboard the Titanic.
* **Ticket**: ticket id
* **Fare**: price paid (in pounds)
* **Cabin**: passenger's cabin number
* **Embarked**: where the passenger embarked the Titanic

In [4]:
titanic_test = titanic_test.set_index('PassengerId')
titanic_train = titanic_train.set_index('PassengerId')

In [5]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [6]:
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [7]:
titanic_train.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [8]:
titanic_train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [9]:
titanic_train.Pclass.value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [10]:
titanic_train.corr()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096066
SibSp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.257307,-0.5495,0.096066,0.159651,0.216225,1.0


In [11]:
num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse=False))
])

num_cols = ['Age', 'SibSp', 'Parch', 'Fare']
cat_cols = ['Sex', 'Embarked', 'Pclass']
labels = ['Survived']
titanic_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])

X_train = titanic_pipe.fit_transform(titanic_train)
"""X_train drop col 9"""
y_train = titanic_train[labels]


In [13]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

LogisticRegression()

In [14]:
rnd_for = RandomForestClassifier(random_state=69)
rnd_for.fit(X_train, y_train)

RandomForestClassifier(random_state=69)

In [15]:
X_test = titanic_pipe.fit_transform(titanic_test)


In [16]:
y_pred_log_reg = log_reg.predict(X_test)
y_pred_log_reg

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [18]:
log_reg_score = cross_val_score(log_reg, X_train, y_train, scoring='accuracy', cv=5)
log_reg_score

array([0.7877095 , 0.78651685, 0.78651685, 0.76966292, 0.83146067])

In [17]:
y_pred_rnd_for = rnd_for.predict(X_test)
y_pred_rnd_for

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [28]:
params = [{'max_features': ['sqrt', 'log2', 'auto'], 'n_estimators':[50, 100, 150]}]

rnd_for_grid_search = GridSearchCV(rnd_for, param_grid=params, cv=5, refit=True, scoring='accuracy')
rnd_for_grid_search.fit(X_train, y_train)
rnd_for_grid_search.best_score_

0.815956311593748

In [29]:
rnd_for_grid_search.best_params_

{'max_features': 'sqrt', 'n_estimators': 150}

In [33]:
rnd_for_score = cross_val_score(rnd_for, X_train, y_train, scoring='accuracy', cv=10)
rnd_for_score

array([0.76666667, 0.78651685, 0.76404494, 0.84269663, 0.87640449,
       0.84269663, 0.80898876, 0.76404494, 0.84269663, 0.84269663])

In [31]:
svc = SVC(random_state=69)
svc_params = [{'kernel': ['linear', 'rbf', 'sigmoid'], 'gamma':['auto', 'scaled']}]
svc_grid_search = GridSearchCV(svc, svc_params, scoring='accuracy', cv=5, verbose=2, refit=True)
svc_grid_search.fit(X_train, y_train)
svc_grid_search.best_score_

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ..........................gamma=auto, kernel=linear; total time=   0.0s
[CV] END ..........................gamma=auto, kernel=linear; total time=   0.0s
[CV] END ..........................gamma=auto, kernel=linear; total time=   0.0s
[CV] END ..........................gamma=auto, kernel=linear; total time=   0.0s
[CV] END ..........................gamma=auto, kernel=linear; total time=   0.0s
[CV] END .............................gamma=auto, kernel=rbf; total time=   0.0s
[CV] END .............................gamma=auto, kernel=rbf; total time=   0.0s
[CV] END .............................gamma=auto, kernel=rbf; total time=   0.0s
[CV] END .............................gamma=auto, kernel=rbf; total time=   0.0s
[CV] END .............................gamma=auto, kernel=rbf; total time=   0.0s
[CV] END .........................gamma=auto, kernel=sigmoid; total time=   0.0s
[CV] END .........................gamma=auto, ker

0.828278199736363

In [32]:
svc_grid_search.best_params_

{'gamma': 'auto', 'kernel': 'rbf'}

In [34]:
svc_score = cross_val_score(svc, X_train, y_train, scoring='accuracy', cv=10)
svc_score

array([0.8       , 0.84269663, 0.76404494, 0.86516854, 0.83146067,
       0.79775281, 0.83146067, 0.79775281, 0.86516854, 0.85393258])

In [41]:
px.box(y=[rnd_for_score, svc_score], points='all', notched=False, labels={'rnd_for_score': 'Random Forest', 'svc_score': 'SVC'})

In [46]:
titanic_test['AgeGroup'] = pd.cut(titanic_test['Age'], bins=[0, 15, 30, 45, 60, float('Inf')], labels=['<15', '<30', '<45', '<60', 'old'])
titanic_train['AgeGroup'] = pd.cut(titanic_train['Age'], bins=[0, 15, 30, 45, 60, float('Inf')], labels=['<15', '<30', '<45', '<60', 'old'])

In [47]:
num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse=False))
])

num_cols = ['SibSp', 'Parch', 'Fare']
cat_cols = ['Sex', 'Embarked', 'Pclass', 'AgeGroup']
labels = ['Survived']
titanic_pipe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])

X_train_age = titanic_pipe.fit_transform(titanic_train)

y_train = titanic_train[labels]

In [48]:
svc_score_age = cross_val_score(svc, X_train_age, y_train, scoring='accuracy', cv=10)
svc_score_age

array([0.81111111, 0.85393258, 0.76404494, 0.85393258, 0.83146067,
       0.80898876, 0.80898876, 0.7752809 , 0.86516854, 0.84269663])

In [51]:
px.box(y=[svc_score, svc_score_age])