In [1]:
# Data source: Titanic dataset on Kaggle, https://www.kaggle.com/datasets/yasserh/titanic-dataset

In [2]:
import numpy as np
from pandas import DataFrame
from pandas import Series
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.base import BaseEstimator
from optuna import create_study, Trial
from optuna.samplers import TPESampler
import joblib

In [3]:
from os import path
from zipfile import ZipFile
import logging
from functools import partial

In [4]:
# Configure the root logger: DEBUG and above, rendered as
# "<day-month-year time> - <LEVEL> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%d-%b-%y %H:%M:%S',
    level=logging.DEBUG,
)

In [5]:
# Folder (relative to the working directory) that extract_dataset() writes to
# and load_data() reads from.
data_folder: str = '../data'
# Folder (relative to the working directory) that save_model() writes to and
# load_model() reads from.
models_folder: str = '../models'

In [6]:
def extract_dataset(archive_name: str = 'archive.zip', file_name: str = 'Titanic-Dataset.csv') -> None:
    """Unpack one file from a downloaded zip archive into the data folder.

    The archive is looked up in the current user's ``~/Downloads`` directory
    and the member ``file_name`` is extracted into the module-level
    ``data_folder``. A missing archive, or an archive without that member, is
    printed and logged as an error instead of being raised.

    Args:
        archive_name: File name of the zip archive inside ``~/Downloads``.
        file_name: Name of the archive member to extract.
    """
    # Downloads directory of the current user (layout as found on Ubuntu).
    archive_path: str = path.join(path.expanduser('~'), 'Downloads', archive_name)
    try:
        with ZipFile(archive_path, 'r') as archive:
            try:
                archive.extract(file_name, data_folder)
                logging.info(f'The file {file_name} has been extracted to {data_folder}.')
            except KeyError:
                # ZipFile.extract raises KeyError when the member does not exist.
                missing_member_message: str = f'There is no file "{file_name}" in the archive "{archive_path}".'
                print(missing_member_message)
                logging.error(missing_member_message)
    except FileNotFoundError:
        missing_archive_message: str = f'There is no archive "{archive_path}".'
        print(missing_archive_message)
        logging.error(missing_archive_message)

In [7]:
# Extract the Titanic CSV from the downloaded archive using the default arguments
extract_dataset()

22-Sep-23 14:10:14 - INFO - The file Titanic-Dataset.csv has been extracted to ../data.


In [8]:
def load_data(file_name: str = 'Titanic-Dataset.csv') -> DataFrame:
    """Load the Titanic dataset into a dataframe."""
    file_path: str = path.join(data_folder, file_name)
    try:
        data: DataFrame = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f'There is no file such file "{file_name}" in the data folder.')
        logging.error(f'There is no file such file "{file_name}" in the data folder.')
    return data

In [9]:
# Load the dataset from the data folder and preview its first five rows
file_name: str = 'Titanic-Dataset.csv'
data: DataFrame = load_data(file_name)
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Get the features and labels
def get_feature_cols(columns_to_drop: list[str] = ["PassengerId", "Name", "Ticket", "Cabin"]) -> list[str]:
    columns: list[str] = [column for column in data.columns.values.tolist() if column not in columns_to_drop]
    return columns
    
def get_features(dataframe: DataFrame, feature_cols: list[str]) -> DataFrame:
    """Select the feature columns from a dataframe.

    Args:
        dataframe: Frame to select from. (The previous version ignored this
            argument and always read the notebook-global ``data``.)
        feature_cols: Names of the columns to keep, in the desired order.

    Returns:
        A DataFrame containing only ``feature_cols``.
    """
    features: DataFrame = dataframe[feature_cols]
    return features

def get_labels(dataframe: DataFrame, label_columns: list[str] = ['Survived']) -> DataFrame:
    """Select the label column(s) from a dataframe.

    Args:
        dataframe: Frame to select from. (The previous version ignored this
            argument and always read the notebook-global ``data``.)
        label_columns: Names of the label columns. The default list is only
            read, never mutated.

    Returns:
        A DataFrame with one column per entry in ``label_columns``. Selecting
        with a list yields a DataFrame rather than a Series; callers flatten
        it with ``.values.ravel()`` where a 1-D array is needed.
    """
    labels: DataFrame = dataframe[list(label_columns)]
    return labels

# Features: every column except the target. Identifier and text columns are
# still present here; they are dropped later by the preprocessing step.
features: DataFrame = get_features(data, get_feature_cols(['Survived']))
# get_labels selects with a list of column names, so this is a one-column DataFrame.
labels: DataFrame = get_labels(data)

In [11]:
# Preview the first three feature rows (target column removed)
features.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [12]:
# Preview the first three labels
labels[:3]

Unnamed: 0,Survived
0,0
1,1
2,1


In [14]:
# Hold out 20% of the rows as a test set. Stratifying on the labels keeps the
# class balance of both splits; the fixed seed makes the split repeatable.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [15]:
# Preprocess the data

# Columns removed before modelling.
columns_to_drop: list[str] = ["PassengerId", "Name", "Ticket", "Cabin"]
# Columns that are mean-imputed and standardised.
numerical_features: list[str] = ["Age", "Fare"]
# Columns that are mode-imputed and one-hot encoded.
categorical_features: list[str] = ["Pclass", "Sex", "Embarked"]

# Numerical branch: fill missing values with the column mean, then scale.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' encodes a category that was not seen
# during fit as all zeros instead of raising, so predicting on new passenger
# records cannot crash on an unexpected value.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its branch; columns not listed in any transformer
# are forwarded unchanged because of remainder='passthrough'.
preprocessor = ColumnTransformer(
    transformers=[
        ('drop_columns', 'drop', columns_to_drop),
        ('num', num_pipeline, numerical_features),
        ('cat', cat_pipeline, categorical_features)
    ],
    remainder='passthrough'
)

In [16]:
# Create the model; the fixed random_state makes repeated fits reproducible
model: BaseEstimator = RandomForestClassifier(random_state=42)

In [17]:
# Chain preprocessing and the classifier so both are fitted and applied together
pipeline: Pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', model),
    ]
)

In [18]:
# Estimate accuracy with shuffled, stratified 5-fold cross-validation on the training split
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# labels_train is a one-column frame, so it is flattened to 1-D with ravel().
# cross_val_score returns a NumPy array with one score per fold.
cv_scores: np.ndarray = cross_val_score(pipeline, features_train, labels_train.values.ravel(), cv=cv, scoring='accuracy')
cv_scores

array([0.82517483, 0.78321678, 0.82394366, 0.76056338, 0.80985915])

In [19]:
# Fit the pipeline on the full training split (the test split stays held out for evaluation)
pipeline.fit(features_train, labels_train.values.ravel())

In [20]:
# Score the fitted pipeline on the held-out test split
predictions: list[int] = pipeline.predict(features_test).tolist()
# Apply each metric to the same (true labels, predictions) pair, in this order.
accuracy, precision, recall, f1 = (
    metric(labels_test, predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [21]:
# Accuracy on the held-out test split
accuracy

0.8212290502793296

In [22]:
# Precision on the held-out test split
precision

0.8032786885245902

In [23]:
# Recall on the held-out test split
recall

0.7101449275362319

In [24]:
# F1 score on the held-out test split
f1

0.7538461538461538

In [None]:
# Deliberately NOT named `path`: that name is the `os.path` module imported at
# the top of the notebook and used by save_model() and load_model(). Rebinding
# it to a string makes every later `path.join(...)` call fail on a fresh
# Restart & Run All.
# TODO: replace this machine-specific absolute path with one built from models_folder.
gaussian_process_model_path: str = '/home/lyle/tutorial/titanic_analysis/analysis/models/trained/Gaussian Process.pkl'

In [25]:
def save_model(model: Pipeline, model_name: str) -> list[str]:
    """Save a trained model to the models folder with joblib.

    Args:
        model: The fitted pipeline to serialise.
        model_name: File name to write inside ``models_folder``.

    Returns:
        The list of file paths written, as returned by ``joblib.dump``.
    """
    # Local import: only `os.path` is imported at the top of the notebook.
    from os import makedirs

    # Create the target folder on first use so a fresh checkout can save models.
    makedirs(models_folder, exist_ok=True)
    model_path: str = path.join(models_folder, model_name)
    # Dump the object that was passed in, not whatever the notebook-global
    # `pipeline` happens to be bound to at call time.
    return joblib.dump(model, model_path)

In [26]:
# Save the fitted random-forest pipeline; the list of written paths is displayed
model_name: str = 'random_forest_classifier.pkl'
save_model(pipeline, model_name)

['../models/random_forest_classifier.pkl']

In [27]:
def load_model(model_name: str) -> Pipeline:
    """Load a saved model from the models folder.

    Args:
        model_name: File name of the model inside ``models_folder``.

    Returns:
        The deserialised pipeline.

    Raises:
        FileNotFoundError: If the model file does not exist. The error is
            logged and re-raised; previously the function fell through to
            ``return model`` with ``model`` unbound and failed with an
            UnboundLocalError.
    """
    model_path: str = path.join(models_folder, model_name)
    try:
        # SECURITY: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load model files that come from a trusted source.
        return joblib.load(model_path)
    except FileNotFoundError:
        logging.error(f'There is no model "{model_name}" in the models folder.')
        raise

In [28]:
# Reload the saved pipeline from disk. Note that this rebinds the notebook-global
# `model` (until now the bare RandomForestClassifier) to the loaded Pipeline.
model_name: str = 'random_forest_classifier.pkl'
model: Pipeline = load_model(model_name)
model

In [29]:
ship_classes_map: dict[str, int] = {
    'First': 1,
    'Second': 2,
    'Third': 3
}

def preprocess_data(sample_data: list[dict[str, str | int | float]]) ->  DataFrame:
    """Preprocess the data for the model."""
    df_data: list[dict[str, str | int | float]] = []
    for record in sample_data:
        data: dict[str, str | int | float] = {}
        data['PassengerId'] = record['PassengerId']
        data['Pclass'] = ship_classes_map[record['PassengerClass']]
        data['Name'] = record['Name']
        data['Sex'] =  record['Sex'].lower()
        data['Ticket'] = record['Ticket']
        data['Age'] = record['Age']
        data['SibSp'] = record['SiblingSpouse']
        data['Parch'] = record['ParentChild']
        data['Fare'] = record['Fare']
        data['Cabin'] = record['Cabin']
        data['Embarked'] = record['Embarked']   
        df_data.append(data)
    return DataFrame(df_data)

In [31]:
# Numeric Pclass used by the model -> passenger class label used in responses.
result_classes_map: dict[int, str] = {1: 'First', 2: 'Second', 3: 'Third'}


def post_process_data(passenger_data: Series) -> dict:
    """Convert one model-ready passenger row back to the external field names.

    The inverse of the renaming done before prediction: ``Pclass`` becomes the
    ``PassengerClass`` label, ``SibSp``/``Parch`` become
    ``SiblingSpouse``/``ParentChild`` and ``Sex`` is capitalised.

    Args:
        passenger_data: A row holding the dataset column names.

    Returns:
        A dict keyed by the external field names.
    """
    return dict(
        PassengerId=passenger_data['PassengerId'],
        PassengerClass=result_classes_map[passenger_data['Pclass']],
        Name=passenger_data['Name'],
        Sex=passenger_data['Sex'].capitalize(),
        Ticket=passenger_data['Ticket'],
        Age=passenger_data['Age'],
        SiblingSpouse=passenger_data['SibSp'],
        ParentChild=passenger_data['Parch'],
        Fare=passenger_data['Fare'],
        Cabin=passenger_data['Cabin'],
        Embarked=passenger_data['Embarked'],
    )

def predict_survival(data: DataFrame)-> dict[str, list[dict[str, str]]]:
    """Predict whether each passenger in ``data`` survived.

    Uses the notebook-global ``model`` to predict an outcome per row and pairs
    every row with its outcome.

    Args:
        data: Model-ready passenger rows, one row per passenger.

    Returns:
        ``{'predictions': [...]}`` with one entry per row, each holding the
        passenger's details under ``'PassengerDetails'`` and ``'Survived'`` or
        ``'Died'`` under ``'Outcome'``.
    """
    survival_map: dict[int, str] = {
        1: 'Survived',
        0: 'Died'
    }
    outcomes: list[int] = model.predict(data).tolist()
    # Iterate the frame that was passed in (not the notebook-global `df`) and
    # pair rows with outcomes by position, so a frame whose index is not
    # 0..n-1 is still handled correctly.
    return {
        'predictions': [
            {
                'PassengerDetails': post_process_data(row),
                'Outcome': survival_map[outcome]
            }
            for (_, row), outcome in zip(data.iterrows(), outcomes)
        ]
    }

In [32]:
from random import choice, randint, seed
from faker import Faker

# Seed both random sources so the generated passengers, and therefore the
# predictions displayed by this cell, are the same on every run.
seed(42)
Faker.seed(42)

fake = Faker()
cabin_letters: list[str] = ['A','B','C','D','E','F','G']

# Ten synthetic passenger records using the external field names expected by
# preprocess_data(). The values are random and independent of each other.
sample_data: list[dict[str, str | int]] = [
    {
        'PassengerId': randint(1,10),
        'PassengerClass': choice(['First', 'Second', 'Third']),
        'Name': fake.name(),
        'Sex': choice(['Male', 'Female']),
        'Ticket': 'A/5 21171',
        'Age': randint(1,80),
        'SiblingSpouse': choice([1, 0, 3, 4, 2, 5, 8]),
        'ParentChild': choice([0, 1, 2, 5, 3, 4, 6]),
        'Fare': randint(0,500),
        'Cabin': f'{choice(cabin_letters)}{randint(1,101)}',
        'Embarked': choice(['S', 'C', 'Q'])
    }
    for _ in range(10)
]

# Convert to model-ready rows, predict, and display the result.
df: DataFrame = preprocess_data(sample_data)
predictions = predict_survival(df)
predictions

{'predictions': [{'PassengerDetails': {'PassengerId': 9,
    'PassengerClass': 'Second',
    'Name': 'Michael Mcfarland',
    'Sex': 'Male',
    'Ticket': 'A/5 21171',
    'Age': 16,
    'SiblingSpouse': 1,
    'ParentChild': 2,
    'Fare': 8,
    'Cabin': 'G15',
    'Embarked': 'S'},
   'Outcome': 'Died'},
  {'PassengerDetails': {'PassengerId': 1,
    'PassengerClass': 'Second',
    'Name': 'Christopher Tran',
    'Sex': 'Female',
    'Ticket': 'A/5 21171',
    'Age': 45,
    'SiblingSpouse': 5,
    'ParentChild': 2,
    'Fare': 101,
    'Cabin': 'C96',
    'Embarked': 'S'},
   'Outcome': 'Survived'},
  {'PassengerDetails': {'PassengerId': 4,
    'PassengerClass': 'First',
    'Name': 'Adrienne Stewart',
    'Sex': 'Female',
    'Ticket': 'A/5 21171',
    'Age': 28,
    'SiblingSpouse': 3,
    'ParentChild': 0,
    'Fare': 283,
    'Cabin': 'F12',
    'Embarked': 'C'},
   'Outcome': 'Survived'},
  {'PassengerDetails': {'PassengerId': 10,
    'PassengerClass': 'Third',
    'Name': 'Mrs

In [33]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [34]:
# Candidate models as (display name, estimator) pairs. zip() transposes the
# pairs into the two parallel lists used by the comparison loop, so a name can
# never drift out of step with its estimator.
names, classifiers = (
    list(column)
    for column in zip(
        ("Nearest Neighbors", KNeighborsClassifier(3)),
        ("Linear SVM", SVC(kernel="linear", C=0.025, random_state=42)),
        ("RBF SVM", SVC(gamma=2, C=1, random_state=42)),
        ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42)),
        ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=42)),
        (
            "Random Forest",
            RandomForestClassifier(
                max_depth=5, n_estimators=10, max_features=1, random_state=42
            ),
        ),
        ("Neural Net", MLPClassifier(alpha=1, max_iter=1000, random_state=42)),
        ("AdaBoost", AdaBoostClassifier(random_state=42)),
        ("Naive Bayes", GaussianNB()),
        ("QDA", QuadraticDiscriminantAnalysis()),
    )
)

In [37]:
# Fit every candidate classifier behind the shared preprocessor and print its
# accuracy on the held-out test split.
# The loop-local names are deliberately distinct from the notebook globals
# `model`, `pipeline`, `predictions` and `accuracy`: reusing them would replace
# the loaded model that predict_survival() reads and overwrite earlier results.
# strict=True makes a length mismatch between names and classifiers an error.
for name, classifier in zip(names, classifiers, strict=True):
    candidate_pipeline: Pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])
    candidate_pipeline.fit(features_train, labels_train.values.ravel())
    candidate_predictions: list[int] = candidate_pipeline.predict(features_test).tolist()
    candidate_accuracy: float = round(accuracy_score(labels_test, candidate_predictions), 2)
    print(name, candidate_accuracy)

Nearest Neighbors 0.83
Linear SVM 0.78
RBF SVM 0.78
Gaussian Process 0.82
Decision Tree 0.76
Random Forest 0.77
Neural Net 0.8
AdaBoost 0.79
Naive Bayes 0.78
QDA 0.64




In [36]:
# Scratch check: round() to two decimal places, as used when printing accuracies
round(0.45678, 2)

0.46