In [75]:
# imports 

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression

In [76]:
train_df = pd.read_csv("../data/transformed/train_df_clean.csv", index_col=0)
test_df = pd.read_csv("../data/transformed/test_df_clean.csv", index_col=0)

In [77]:
# Sex and Embarked Column should be encoded as Numerical Value
encoder = OrdinalEncoder()

columns_to_encode = ['Sex', 'Embarked']

train_df[columns_to_encode] = encoder.fit_transform(train_df[columns_to_encode])
test_df[columns_to_encode] = encoder.transform(test_df[columns_to_encode])

In [78]:
label = 'Survived'
features = train_df.columns.drop(label).tolist()

X_train, X_test, y_train, y_test = train_test_split(train_df[features], train_df[label], test_size=0.2)
X_test_unseen = test_df[features]

In [79]:
def evaluvate_model(model, train, test):
    X_train, y_train = train
    X_test, y_test = test

    y_train_predicted = model.predict(X_train)
    y_test_predicted = model.predict(X_test)

    print("\n-------------- Train Classification Report --------------")
    print(classification_report(y_train, y_train_predicted))
    print("\n-------------- Test Classification Report --------------")
    print(classification_report(y_test, y_test_predicted))
    

In [84]:
lr = LogisticRegression(penalty='l2', max_iter=1000)
lr.fit(X_train, y_train)

evaluvate_model(lr, (X_train, y_train), (X_test, y_test))


-------------- Train Classification Report --------------
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       441
           1       0.78      0.72      0.75       271

    accuracy                           0.82       712
   macro avg       0.81      0.80      0.80       712
weighted avg       0.82      0.82      0.82       712


-------------- Test Classification Report --------------
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       108
           1       0.80      0.62      0.70        71

    accuracy                           0.79       179
   macro avg       0.79      0.76      0.77       179
weighted avg       0.79      0.79      0.78       179



In [81]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_leaf_nodes=40)
rfc.fit(X_train, y_train)

evaluvate_model(rfc, (X_train, y_train), (X_test, y_test))


-------------- Train Classification Report --------------
              precision    recall  f1-score   support

           0       0.90      0.97      0.94       441
           1       0.94      0.83      0.88       271

    accuracy                           0.92       712
   macro avg       0.92      0.90      0.91       712
weighted avg       0.92      0.92      0.92       712


-------------- Test Classification Report --------------
              precision    recall  f1-score   support

           0       0.80      0.93      0.86       108
           1       0.85      0.65      0.74        71

    accuracy                           0.82       179
   macro avg       0.83      0.79      0.80       179
weighted avg       0.82      0.82      0.81       179



In [89]:
from datetime import datetime

now = datetime.now().strftime("%Y%m%d_%H%M%S")

output = pd.read_csv("../data/input/gender_submission.csv", index_col=0)
output['Survived'] = rfc.predict(X_test_unseen)

output.to_csv(f"../data/output/titanic_prediction_{now}.csv", index=True)