In [29]:
import logging
from typing import Optional

import pandas as pd
from joblib import load, dump
from pandas import DataFrame, Series
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [19]:
from os import path

In [20]:
# Locations of the input data and the saved models, written relative to the
# notebook's working directory.
data_folder: str = "../data"
models_folder: str = "../models"

In [21]:
def load_data(file_name: str = 'Titanic-Dataset.csv', folder: Optional[str] = None) -> DataFrame:
    """Load a CSV dataset into a dataframe.

    Args:
        file_name: Name of the CSV file inside the folder being read.
        folder: Directory that contains the file. When omitted, the
            notebook-level ``data_folder`` setting is used.

    Returns:
        The parsed contents of the CSV file.

    Raises:
        FileNotFoundError: If the file does not exist. The problem is logged
            and the original exception is re-raised.
    """
    base_folder: str = data_folder if folder is None else folder
    file_path: str = path.join(base_folder, file_name)
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        # Log and re-raise instead of falling through: there is no dataframe
        # to return, and continuing would only surface an unrelated
        # UnboundLocalError that hides the real cause.
        logging.error(f'There is no such file "{file_name}" in the folder "{base_folder}".')
        raise

In [22]:
# Read the Titanic CSV and preview its first five rows.
file_name: str = 'Titanic-Dataset.csv'
data: DataFrame = load_data(file_name=file_name)
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
# Get the features and labels
def get_feature_cols(columns_to_drop: list[str] = ["PassengerId", "Name", "Ticket", "Cabin"], dataframe: Optional[DataFrame] = None) -> list[str]:
    """List the column names to keep as features.

    Args:
        columns_to_drop: Column names to exclude. The default list is only
            read, never mutated.
        dataframe: Frame whose columns are inspected. When omitted, the
            notebook-level ``data`` frame is used, which keeps existing
            single-argument calls working.

    Returns:
        The frame's column names, in their original order, without the
        excluded ones.
    """
    source: DataFrame = data if dataframe is None else dataframe
    return [column for column in source.columns.tolist() if column not in columns_to_drop]
    
def get_features(dataframe: DataFrame, feature_cols: list[str]) -> DataFrame:
    """Select the feature columns from a dataframe.

    Args:
        dataframe: Frame to select from.
        feature_cols: Names of the columns to keep, in the desired order.

    Returns:
        A dataframe holding only the requested columns.
    """
    # Index the frame that was passed in, not the notebook-level ``data``,
    # so the function works on any frame it is given.
    return dataframe[feature_cols]

def get_labels(dataframe: DataFrame, label_columns: list[str] = ['Survived']) -> DataFrame:
    """Select the label column(s) from a dataframe.

    Args:
        dataframe: Frame to select from.
        label_columns: Names of the label columns. The default list is only
            read, never mutated.

    Returns:
        A dataframe holding only the label columns. Because the selection is
        made with a list of names, the result is a dataframe even when a
        single column is requested.
    """
    # Index the frame that was passed in, not the notebook-level ``data``.
    return dataframe[label_columns]

# Only the label column is excluded from the features here: passing an
# explicit list replaces the default drop list of get_feature_cols.
features: DataFrame = get_features(
    dataframe=data,
    feature_cols=get_feature_cols(columns_to_drop=['Survived']),
)
labels: Series = get_labels(dataframe=data)

In [24]:
# Hold out 20% of the rows as a test set, stratified on the labels, with a
# fixed seed so the split is repeatable.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [11]:
def load_model(model_name: str, folder: Optional[str] = None) -> Pipeline:
    """Load a saved model from disk.

    Args:
        model_name: File name of the serialized model inside the folder
            being read.
        folder: Directory that contains the model file. When omitted, the
            notebook-level ``models_folder`` setting is used.

    Returns:
        The deserialized model object.

    Raises:
        FileNotFoundError: If the model file does not exist. The problem is
            logged and the original exception is re-raised.
    """
    base_folder: str = models_folder if folder is None else folder
    model_path: str = path.join(base_folder, model_name)
    try:
        # SECURITY: joblib deserializes with pickle, which can execute
        # arbitrary code. Only load model files from a trusted source.
        return load(model_path)
    except FileNotFoundError:
        # Log and re-raise instead of falling through: there is no model to
        # return, and continuing would only surface an unrelated
        # UnboundLocalError that hides the real cause.
        logging.error(f'There is no model "{model_name}" in the folder "{base_folder}".')
        raise

In [12]:
# Load the saved classifier and display it.
model_name: str = 'random_forest_classifier.pkl'
model: Pipeline = load_model(model_name=model_name)
model

In [28]:
# Predict on the held-out features and report accuracy rounded to two decimals.
predictions: list[int] = model.predict(features_test).tolist()
accuracy: float = round(
    accuracy_score(y_true=labels_test, y_pred=predictions),
    ndigits=2,
)
print(accuracy)

0.82


In [32]:
# Keep the report as a dictionary for later use, and print the text form.
report = classification_report(
    y_true=labels_test,
    y_pred=predictions,
    output_dict=True,
)
print(classification_report(y_true=labels_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       110
           1       0.80      0.71      0.75        69

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179



In [33]:
# Display the dictionary form of the classification report.
report

{'0': {'precision': 0.8305084745762712,
  'recall': 0.8909090909090909,
  'f1-score': 0.8596491228070176,
  'support': 110.0},
 '1': {'precision': 0.8032786885245902,
  'recall': 0.7101449275362319,
  'f1-score': 0.7538461538461538,
  'support': 69.0},
 'accuracy': 0.8212290502793296,
 'macro avg': {'precision': 0.8168935815504307,
  'recall': 0.8005270092226614,
  'f1-score': 0.8067476383265857,
  'support': 179.0},
 'weighted avg': {'precision': 0.8200120766010423,
  'recall': 0.8212290502793296,
  'f1-score': 0.8188647381237796,
  'support': 179.0}}