In [2]:
import sys
import os
sys.path.append(os.path.abspath(".."))

In [3]:
import pandas as pd
import numpy as np
import pickle

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_auc_score

In [5]:
import sys
import os
sys.path.append(os.path.abspath(".."))

from src.components.data_transformation import DataTransformation

project_root = os.path.abspath("..")
train_path = os.path.join(project_root, "artifacts", "train.csv")
test_path = os.path.join(project_root, "artifacts", "test.csv")

transformer = DataTransformation()
X_train, X_test, y_train, y_test, preprocessor_path = transformer.initiate_data_transformation(
    train_path, test_path
)

print("X_train shape:", X_train.shape)
print("Preprocessor saved at:", preprocessor_path)


X_train shape: (5634, 10978)
Preprocessor saved at: artifacts\preprocessor.pkl


In [6]:
log_reg = LogisticRegression(max_iter=1000, solver='liblinear')
log_reg.fit(X_train, y_train)

In [7]:
y_pred_lr = log_reg.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred_lr))
print("F1 Score: ", f1_score(y_test, y_pred_lr, pos_label="Yes"))
print("ROC AUC: ", roc_auc_score((y_test=="Yes").astype(int), (y_pred_lr=="Yes").astype(int)))
classification_report(y_test, y_pred_lr)

Accuracy:  0.8261178140525195
F1 Score:  0.6444121915820029
ROC AUC:  0.7522203360004969


'              precision    recall  f1-score   support\n\n          No       0.86      0.91      0.88      1036\n         Yes       0.70      0.60      0.64       373\n\n    accuracy                           0.83      1409\n   macro avg       0.78      0.75      0.76      1409\nweighted avg       0.82      0.83      0.82      1409\n'

In [8]:
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)


In [9]:
y_pred_rf = rf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred_rf))
print("F1 Score: ", f1_score(y_test, y_pred_rf, pos_label="Yes"))
print("ROC AUC: ", roc_auc_score((y_test=="Yes").astype(int), (y_pred_rf=="Yes").astype(int)))
classification_report(y_test, y_pred_rf)

Accuracy:  0.801277501774308
F1 Score:  0.5583596214511041
ROC AUC:  0.6967248750090573


'              precision    recall  f1-score   support\n\n          No       0.83      0.92      0.87      1036\n         Yes       0.68      0.47      0.56       373\n\n    accuracy                           0.80      1409\n   macro avg       0.75      0.70      0.72      1409\nweighted avg       0.79      0.80      0.79      1409\n'

In [10]:
best_model = log_reg
model_path = os.path.join("artifacts","model.pkl")

with open(model_path, "wb") as f:
  pickle.dump(best_model, f)

print("Model saved at: ", model_path)

Model saved at:  artifacts\model.pkl
