In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
from ml.src.config.paths import (
    MODEL_PATH,
    FEATURES_TRAIN_PATH,
)

In [None]:
# Read the training table from FEATURES_TRAIN_PATH, then separate the model
# inputs (X) from the target column (y).
train_data = pd.read_csv(FEATURES_TRAIN_PATH)
feature_columns = ['Sex', 'Pclass', 'Age', 'Fare', 'Title', 'FamilySize']
# .copy() detaches X from train_data so later edits to X cannot touch the source frame.
X = train_data.loc[:, feature_columns].copy()
y = train_data['Survived']

In [None]:
# Preview the first five rows of the loaded training table.
train_data.head(n=5)

In [None]:
# The train/test split was tried with test sizes of 0.1, 0.2 and 0.3; 0.1 gave the highest test accuracy.
# This part only compares training on each single feature with training on all features combined;
# the final model exported to Django is the one trained with all features combined.
#
# Fit one logistic regression per single feature and record its hold-out
# accuracy in `accuracy`, in the same order as X.columns.
#
# The loop uses its own variable names (`*_single`, `single_feature_model`) so
# it never rebinds the notebook-level `model` / `X_train` / `y_pred` used for
# the final all-features model. Otherwise, running the export cell right after
# this one would pickle a throw-away one-feature model.
accuracy = []
number_of_features = X.shape[1]
for feature_index in range(number_of_features):
    # The list index keeps the selection 2-D (a one-column DataFrame).
    single_feature = X.iloc[:, [feature_index]]
    X_train_single, X_test_single, y_train_single, y_test_single = train_test_split(
        single_feature, y, test_size=0.1, random_state=42
    )
    single_feature_model = LogisticRegression()
    single_feature_model.fit(X_train_single, y_train_single)
    # Only the hard predictions are needed for accuracy, so no predict_proba call.
    accuracy.append(
        accuracy_score(y_test_single, single_feature_model.predict(X_test_single))
    )

In [None]:
# Train the final model on all feature columns together; `model` is the object
# pickled by the export cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Predicted probability of the positive class (second column of predict_proba).
y_prob = model.predict_proba(X_test)[:, 1]
# Keep only the per-feature scores (one per column of X) before adding the
# combined score. Re-running this cell then replaces the "all features" entry
# instead of appending a duplicate, which would make `accuracy` longer than
# the label list used by the plot.
accuracy = accuracy[:X.shape[1]] + [accuracy_score(y_test, y_pred)]

In [None]:
# Serialize the fitted `model` to MODEL_PATH with pickle (binary mode), then
# report where it was written.
with open(MODEL_PATH, mode="wb") as model_file:
    pickle.Pickler(model_file).dump(model)

print("Model saved to:", MODEL_PATH)

In [None]:
# Bar chart of hold-out accuracy: one bar per entry in `accuracy`, labelled
# with the matching column name of X, plus a final 'All features' label.
features = [*X.columns, 'All features']

fig, ax = plt.subplots(figsize=(6, 4))
for position, score in enumerate(accuracy):
    ax.bar(features[position], score, color='skyblue', width=0.4)
    # Write the accuracy, rounded to two decimals, just above the bar.
    ax.text(position, score + 0.02, f"{score:.2f}", ha='center')

ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy of Logistic Regression by Feature')
plt.show()