In [239]:
from __future__ import annotations
from os import path
from zipfile import ZipFile
import logging
from uuid import uuid4
from json import load, dump
import time

In [266]:
from pydantic import BaseModel, Field
from redis import Redis

In [316]:
import numpy as np
from pandas import DataFrame
from pandas import Series
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.base import BaseEstimator
import joblib

In [9]:
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')

In [10]:
data_folder: str = '../data'
models_folder: str = '../models'

In [17]:
def extract_dataset(archive_name: str = 'archive.zip', file_name: str = 'Titanic-Dataset.csv') -> None:
    """Extract the downloaded archive file into the data folder."""
    # Ubuntu OS
    downloads_path: str = path.join(path.expanduser('~'), 'Downloads')
    archive_path: str = path.join(downloads_path, archive_name)
    try:
        with ZipFile(archive_path, 'r') as zip_:
            try:
                zip_.extract(file_name, data_folder)
                logging.info(f'The file {file_name} has been extracted to {path.join(data_folder, file_name)}.')
            except KeyError:
                print(f'There is no file "{file_name}" in the archive "{archive_path}".')
                logging.error(f'There is no file "{file_name}" in the archive "{archive_path}".')
    except FileNotFoundError:
        print(f'There is no archive "{archive_path}".')
        logging.error(f'There is no archive "{archive_path}".')
    return path.join(data_folder, file_name)

def load_data(file_name: str = 'Titanic-Dataset.csv') -> DataFrame:
    """Load the Titanic dataset into a dataframe."""
    file_path: str = path.join(data_folder, file_name)
    try:
        data: DataFrame = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f'There is no file such file "{file_name}" in the data folder.')
        logging.error(f'There is no file such file "{file_name}" in the data folder.')
    return data

In [18]:
archive_name: str = 'archive.zip'
file_name: str = 'Titanic-Dataset.csv'
data_path: str = extract_dataset()
data_path

22-Sep-23 16:33:15 - INFO - The file Titanic-Dataset.csv has been extracted to ../data/Titanic-Dataset.csv.


'../data/Titanic-Dataset.csv'

In [15]:
# Load the dataset
file_name: str = 'Titanic-Dataset.csv'
data: DataFrame = load_data(file_name)
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [40]:
class DatasetMetadata(BaseModel):
    source: str
    cols: list[str] = Field(default_factory=list)
    description: str
    path: str
    id: str

    @classmethod
    def from_metadata(cls, metadata_file: str) -> Self:
        """Create metadata from metadata file."""
        try:
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata: dict = load(f)
        except FileNotFoundError: #JSONDecodeError
            logging.error(f'There is no file such path "{metadata_file}".')
        else:
            return cls(**metadata)

    def save(self, path: str) -> str:
        """Save the metadata to disk."""
        try:
            with open(path, 'w', encoding='utf-8') as f:
                dump(self.model_dump(), f)
        except FileNotFoundError:
            logging.error(f'There is no file such path "{path}".')
            return ''
        return path

dataset_metadata: DatasetMetadata = DatasetMetadata(
    source='https://www.kaggle.com/datasets/yasserh/titanic-dataset',
    cols=data.columns.values.tolist(),
    description='A dataset that shows the survivors of the titanic tragedy.',
    path=data_path,
    id=f'Dataset_{str(uuid4())}'
)

In [32]:
dataset_metadata.save('metadata.json')

'metadata.json'

In [41]:
metadata: DatasetMetadata = DatasetMetadata.from_metadata('metadata.json')
metadata

DatasetMetadata(source='https://www.kaggle.com/datasets/yasserh/titanic-dataset', cols=['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], description='A dataset that shows the survivors of the titanic tragedy.', path='../data/Titanic-Dataset.csv', id='Dataset_73ebe0e2-3224-470f-b80f-6174521544ac')

In [49]:
class DataSet(BaseModel):
    metadata: DatasetMetadata
    dataset_path: str = metadata.path

    def get_dataset(self) -> DataFrame:
        """Load the dataset."""
        data: DataFrame = pd.read_csv(self.dataset_path)
        return data

In [50]:
dataset: Dataset = DataSet(metadata=metadata)
dataset.dataset_path

'../data/Titanic-Dataset.csv'

In [52]:
data: DataFrame = dataset.get_dataset()
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [309]:
from datetime import datetime

class Metrics(BaseModel):
    accuracy: float
    precision: float
    recall: float
    f1: float
    
class Model:
    def __init__(self, name: str, model: BaseEstimator, save_path: str) -> None:
        self._model = model
        self._name: str = name
        self._metrics: Metrics = None
        self._train_time: str = None
        self.__save_path: str = save_path
        self.date: datetime = None
        self._trained_model: Pipeline = None
        self.redis: Redis = Redis(
            host='localhost',
            port=6379
        )

    @property
    def metrics(self) -> None:
        return self._metrics

    @metrics.setter
    def metrics(self, metrics: Metrics) -> None:
        self._metrics = metrics

    @property
    def trained_model(self) -> None:
        return self._trained_model

    @trained_model.setter
    def trained_model(self, trained_model: Pipeline) -> None:
        self._trained_model = trained_model

    @property
    def name(self) -> None:
        return self._name

    @name.setter
    def name(self, name: str) -> None:
        return self._name

    @property
    def model(self) -> None:
        return self._model

    @model.setter
    def model(self, model: BaseEstimator) -> None:
        return self._model
    
    @property
    def train_time(self) -> None:
        return self._train_time

    @train_time.setter
    def train_time(self, train_time: float) -> None:
        self._train_time = train_time

    @property
    def date(self) -> None:
        return self._date

    @date.setter
    def date(self, date: datetime) -> None:
        self._date = date

    def post_model_metrics(self) -> None:
        accuracy: str = 'models:accuracy'
        precision: str = 'models:precision'
        train_time: str = 'models:train_time'
        self.redis.zadd(name=accuracy, mapping={self.name: self.metrics.accuracy})
        self.redis.zadd(name=precision, mapping={self.name: self.metrics.precision})
        self.redis.zadd(name=train_time, mapping={self.name: self.train_time})

In [310]:
class ExperimentConfig(BaseModel):
    data_dir: str
    models_directory: str
    features_dir: str
    dataset_metadata: DatasetMetadata
    label_columns: list[str] = Field(default_factory=list)
    feature_cols: list[str] = Field(default_factory=list)
    columns_to_drop: list[str] = Field(default_factory=list)

class Experiment:
    def __init__(self, experiment_config: ExperimentConfig, preprocessor: ColumnTransformer, models: dict[str, BaseEstimator]):
        self.experiment_config = experiment_config
        self.preprocessor = preprocessor
        self.models = models
        self.dataset: DataSet = DataSet(metadata=experiment_config.dataset_metadata)
    
    def get_features(self) -> DataFrame:
        data: DataFrame = dataset.get_dataset()
        features: DataFrame = data[experiment_config.feature_cols]
        return features

    def get_labels(self) -> Series:
        data: DataFrame = dataset.get_dataset()
        labels: Series = data[experiment_config.label_columns]
        return labels

    def get_train_test_data(self) -> ((DataFrame, Series), (DataFrame, Series)):
        features = self.get_features()
        labels = self.get_labels()
        train_features, test_features, train_labels, test_labels = train_test_split(
            features, labels, test_size=0.2, random_state=42, stratify=labels
        )
        return (train_features, train_labels), (test_features, test_labels)

    def save_features(self) -> DataFrame:
        pas

    def save_labels(self) -> DataFrame:
        pass

    def train_model(self, model: Model) -> float:
        (train_features, train_labels), (test_features, test_labels) = self.get_train_test_data()
        pipeline: Pipeline = Pipeline(steps=[
            ('preprocessor', self.preprocessor),
            ('classifier', model.model)
        ])
        train_start_time: float = time.perf_counter()
        pipeline.fit(train_features, train_labels.values.ravel())
        train_stop_time: float = time.perf_counter()
        predictions: list[int] = pipeline.predict(test_features).tolist()
        accuracy: float = accuracy_score(test_labels, predictions)
        precision: float = precision_score(test_labels, predictions)
        recall: float = recall_score(test_labels, predictions)
        f1: float = f1_score(test_labels, predictions)
        model.metrics = Metrics(
            accuracy=round(accuracy,2),
            precision=round(precision,2),
            recall=round(recall,2),
            f1=round(f1,2)
        )
        model.train_time = train_stop_time - train_start_time
        model.data = datetime.now()
        model.trained_model = pipeline
        model.post_model_metrics()
        return model

    def run(self) -> None:
        for model in self.models:
            trained_model: Model = self.train_model(model)
            print(model.name, model.metrics)

In [311]:
metadata: DatasetMetadata = DatasetMetadata.from_metadata('metadata.json')
experiment_config: ExperimentConfig = ExperimentConfig(
    data_dir='../data',
    models_directory='../models',
    features_dir='../features',
    dataset_metadata=metadata,
    label_columns=['Survived'],
    feature_cols=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', "PassengerId", "Name", "Ticket", "Cabin"],
    columns_to_drop=["PassengerId", "Name", "Ticket", "Cabin"]
)

In [312]:
columns_to_drop: list[str] = ["PassengerId", "Name", "Ticket", "Cabin"]
numerical_features = ["Age", "Fare"]
categorical_features = ["Pclass", "Sex", "Embarked"]

num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('drop_columns', 'drop', columns_to_drop),
        ('num', num_pipeline, numerical_features),
        ('cat', cat_pipeline, categorical_features)
    ],
    remainder='passthrough'
)

In [313]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]
models: list[Model] = [
    Model(
        save_path=path.join(models_folder, model_name),
        model=model,
        name=model_name
    ) 
    for model_name, model in zip(names, classifiers)
]

In [314]:
experiment: Experiment = Experiment(
    experiment_config=experiment_config,
    preprocessor=preprocessor,
    models=models
)

In [315]:
experiment.run()

Nearest Neighbors accuracy=0.83 precision=0.79 recall=0.75 f1=0.77
Linear SVM accuracy=0.78 precision=0.74 recall=0.65 f1=0.69
RBF SVM accuracy=0.78 precision=0.72 recall=0.68 f1=0.7
Gaussian Process accuracy=0.82 precision=0.82 recall=0.67 f1=0.74
Decision Tree accuracy=0.82 precision=0.78 recall=0.75 f1=0.76
Random Forest accuracy=0.77 precision=0.75 recall=0.58 f1=0.66
Neural Net accuracy=0.8 precision=0.85 recall=0.58 f1=0.69
AdaBoost accuracy=0.79 precision=0.72 recall=0.74 f1=0.73
Naive Bayes accuracy=0.78 precision=0.72 recall=0.7 f1=0.71
QDA accuracy=0.64 precision=0.57 recall=0.23 f1=0.33




In [317]:
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

24-Sep-23 17:00:37 - DEBUG - matplotlib data path: /home/lyle/tutorial/titanic_analysis/venv/lib/python3.10/site-packages/matplotlib/mpl-data
24-Sep-23 17:00:37 - DEBUG - CONFIGDIR=/home/lyle/.config/matplotlib
24-Sep-23 17:00:37 - DEBUG - interactive is False
24-Sep-23 17:00:37 - DEBUG - platform is linux
24-Sep-23 17:00:37 - DEBUG - CACHEDIR=/home/lyle/.cache/matplotlib
24-Sep-23 17:00:37 - DEBUG - Using fontManager instance from /home/lyle/.cache/matplotlib/fontlist-v330.json


In [324]:
features = experiment.get_features()
labels = experiment.get_labels()

In [325]:
features.shape

(891, 11)

In [326]:
labels.shape

(891, 1)

In [344]:
def feature_elimination(model, faetures, labels):
    rfecv = RFECV(estimator=model, 
                  step=1, 
                  cv=StratifiedKFold(10),
                  scoring='accuracy')
    rfecv.fit(faetures, labels.values.ravel())
    return rfecv

In [382]:
(train_features, train_labels), (test_features, test_labels) = experiment.get_train_test_data()
train_features = preprocessor.fit_transform(train_features)
test_features = preprocessor.fit_transform(test_features)
for model_data in models[2:]:
    try:
        model = model_data.model
        rfecv = feature_elimination(model, train_features, train_labels)
        print(f"Optimum number of features for {model_data.name} is: {rfecv.n_features_}")
        df_features = pd.DataFrame(columns = ['feature', 'support', 'ranking'])

        for i in range(features.shape[1]):
            row = {'feature': i, 'support': rfecv.support_[i], 'ranking': rfecv.ranking_[i]}
            df_features = pd.concat([df_features, pd.DataFrame([row])], ignore_index=True)
            
        df_features.sort_values(by='ranking').head(10)
        df_features = df_features[df_features['support']==True]
        cols = df_features.feature.values.tolist()
        print(cols)
        train_features = train_features[:, cols]
        model.fit(train_features, train_labels.values.ravel())
        predictions: list[int] = model.predict(test_features[:, cols]).tolist()
        accuracy: float = accuracy_score(test_labels, predictions)
        print(accuracy)
    except (ValueError, IndexError) as e:
        print(e)

when `importance_getter=='auto'`, the underlying estimator SVC should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.
when `importance_getter=='auto'`, the underlying estimator GaussianProcessClassifier should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.
Optimum number of features for Decision Tree is: 7
[0, 1, 2, 4, 6, 9, 10]
0.6871508379888268
Optimum number of features for Random Forest is: 7
index 7 is out of bounds for axis 0 with size 7
when `importance_getter=='auto'`, the underlying estimator MLPClassifier should have `coef_` or `feature_importances_` attribute. Either pass a fitted estimator to feature selector or call fit before calling transform.
Optimum number of features for AdaBoost is: 7
index 7 is out of bounds for axis 0 with size 7
when `importance_getter=='auto'`, the underlying estima

In [333]:
print("Optimum number of features: %d" % rfecv.n_features_)

Optimum number of features: 6


In [364]:
df_features = pd.DataFrame(columns = ['feature', 'support', 'ranking'])

for i in range(features.shape[1]):
    row = {'feature': i, 'support': rfecv.support_[i], 'ranking': rfecv.ranking_[i]}
    df_features = pd.concat([df_features, pd.DataFrame([row])], ignore_index=True)
    
df_features.sort_values(by='ranking').head(10)
df_features = df_features[df_features['support']==True]
cols = df_features.feature.values.tolist()
cols

[0, 1, 3, 4, 6, 7, 9, 10, 11]

In [347]:
 x = df_features[df_features['support']==True]
x

Unnamed: 0,feature,support,ranking
0,0,True,1
1,1,True,1
4,4,True,1
5,5,True,1
10,10,True,1
11,11,True,1


In [349]:
x.feature.values.tolist()

[0, 1, 4, 5, 10, 11]

In [353]:
y = features[0:3]
y

array([[-0.5924806 , -0.50244517,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         1.        ,  0.        ],
       [ 0.63878901,  0.78684529,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [-0.2846632 , -0.48885426,  0.        ,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ]])

In [360]:
y[:, x.feature.values.tolist()]

array([[-0.5924806 , -0.50244517,  1.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.63878901,  0.78684529,  0.        ,  1.        ,  1.        ,
         0.        ],
       [-0.2846632 , -0.48885426,  1.        ,  1.        ,  0.        ,
         0.        ]])

In [388]:
def load_model(model_path: str) -> Pipeline:
    """Load a saved model."""
    try:
        model: Pipeline = joblib.load(model_path)
    except FileNotFoundError:
        logging.error(f'There is no such model "{model_path}".')
    return model

In [389]:
model_path = '/home/lyle/tutorial/titanic_analysis/analysis/models/trained/Nearest Neighbors'
model = load_model(model_path)
model

In [390]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [391]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('drop_columns', 'drop',
                                    ['PassengerId', 'Name', 'Ticket', 'Cabin']),
                                   ('num',
                                    Pipeline(steps=[('imputer', SimpleImputer()),
                                                    ('scaler', StandardScaler())]),
                                    ['Age', 'Fare']),
                                   ('cat',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('encoder', OneHotEncoder())]),
                                    ['Pclass', 'Sex', 'Embarked'])])),
  ('classifier', KNeighborsClassifier(n_neighbors=3))],
 'verbose': False,
 'preprocessor': ColumnTransformer(remainder='passthrough',
             

In [416]:
(train_features, train_labels), (test_features, test_labels) = experiment.get_train_test_data()
train_features = preprocessor.fit_transform(train_features)
test_features = preprocessor.fit_transform(test_features)

In [421]:
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]#Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)#Create new KNN object
knn_2 = KNeighborsClassifier()#Use GridSearch
clf = GridSearchCV(knn_2, hyperparameters, cv=10)#Fit the model
best_model = clf.fit(train_features, train_labels.values.ravel())

In [422]:
best_model.best_estimator_.get_params()

{'algorithm': 'auto',
 'leaf_size': 1,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 25,
 'p': 2,
 'weights': 'uniform'}

In [423]:
train_features.shape, test_features.shape

((712, 12), (179, 12))

In [424]:
preds = best_model.predict(test_features)
accuray = accuracy_score(preds, test_labels)
accuracy

0.6871508379888268