# Model Building and Training

#### Data Preparation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier  
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.data_loader import load_data

# Load the pre-split datasets. Feature tables are kept exactly as load_data
# returns them; each target table is flattened to 1-D via ``.values.ravel()``.
# NOTE(review): the training target comes from the "resampled" file while the
# training features come from the "scaled" file — confirm their rows line up.
X_train, X_test = (
    load_data(csv_path)
    for csv_path in ("../data/X_train_scaled.csv", "../data/X_test_scaled.csv")
)
y_train, y_test = (
    load_data(csv_path).values.ravel()
    for csv_path in ("../data/y_train_resampled.csv", "../data/y_test.csv")
)

#### Model Selection and Training

In [3]:
# Build both candidate classifiers up front, each seeded with random_state=42.
# Logistic Regression, with the iteration cap set to 1000.
log_clf = LogisticRegression(random_state=42, max_iter=1000)
# Ensemble model: XGBoost (RandomForestClassifier(random_state=42) is the
# alternative that was considered for this slot).
ensemble_clf = XGBClassifier(random_state=42)

# Fit each candidate on the same training split.
log_clf.fit(X_train, y_train)
ensemble_clf.fit(X_train, y_train)


#### Model Evaluation

In [5]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_recall_curve, auc

def evaluate_model(name, model, X_test, y_test):
    """Print a short evaluation summary for a fitted classifier.

    The summary contains, in this order: the F1 score of the predicted labels,
    the area under the precision-recall curve (computed with ``auc`` over the
    points returned by ``precision_recall_curve``), the confusion matrix, and
    the classification report.

    Args:
        name: Label used in the printed heading.
        model: Object exposing ``predict`` and ``predict_proba``.
        X_test: Features passed unchanged to both ``model`` methods.
        y_test: True labels the predictions are scored against.

    Returns:
        None. All results are written to standard output.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for the PR curve.
    positive_scores = model.predict_proba(X_test)[:, 1]

    pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_scores)
    # Recall is the x-axis and precision the y-axis of the integrated curve.
    pr_auc = auc(pr_recall, pr_precision)
    f1_value = f1_score(y_test, predicted_labels)
    matrix = confusion_matrix(y_test, predicted_labels)

    print(f"\n{name} Evaluation:")
    # Scalar metrics are reported rounded to four decimal places.
    for label, value in (("F1 Score:", f1_value), ("AUC-PR:", pr_auc)):
        print(label, round(value, 4))
    print("Confusion Matrix:\n", matrix)
    report = classification_report(y_test, predicted_labels)
    print("Classification Report:\n", report)



In [6]:
# Score both fitted classifiers on the same held-out split, one after the other.
for model_label, fitted_model in (
    ("Logistic Regression", log_clf),
    ("XGBoost", ensemble_clf),
):
    evaluate_model(model_label, fitted_model, X_test, y_test)


Logistic Regression Evaluation:
F1 Score: 0.5871
AUC-PR: 0.6676
Confusion Matrix:
 [[26096  1297]
 [ 1115  1715]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96     27393
           1       0.57      0.61      0.59      2830

    accuracy                           0.92     30223
   macro avg       0.76      0.78      0.77     30223
weighted avg       0.92      0.92      0.92     30223


XGBoost Evaluation:
F1 Score: 0.601
AUC-PR: 0.6611
Confusion Matrix:
 [[26454   939]
 [ 1211  1619]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96     27393
           1       0.63      0.57      0.60      2830

    accuracy                           0.93     30223
   macro avg       0.79      0.77      0.78     30223
weighted avg       0.93      0.93      0.93     30223



#### Saving the Best Model

In [None]:
import joblib

# Persist the fitted XGBoost classifier for later reuse.
# NOTE(review): in the evaluation output XGBoost has the higher F1 (0.601 vs
# 0.5871) but Logistic Regression has the slightly higher AUC-PR (0.6676 vs
# 0.6611) — confirm F1 is the intended selection criterion for "best".
MODEL_PATH = '../models/xgb_fraud_model.joblib'
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(ensemble_clf, MODEL_PATH)
print("✅ XGBoost model saved as models/xgb_fraud_model.joblib")

✅ XGBoost model saved as models/xgb_fraud_model.joblib
