## Data loading

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the processed data
fraud_data = pd.read_csv('../data/processed/fraud_one_hot_encoded.csv')
credit_data = pd.read_csv('../data/processed/credit_minmax_scaled.csv')

## Appending root directory amd autoloading

In [5]:
import sys
import os
sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2

## Spliting Fraud data as training and test

In [6]:
from src.data_split import DataSplitter

# For fraud_data, assume target column is 'class'
X_fraud, y_fraud = DataSplitter.separate_features_and_target(fraud_data, target_col='class')


In [7]:
# For credit_data, assume target column is 'Class'
X_credit, y_credit = DataSplitter.separate_features_and_target(credit_data, target_col='Class')


### Training and evaluating Logistic regretion with Fraud data

In [None]:
from scripts.logistic_regression import run_logistic_regression

# Use the processed fraud data from the data/processed directory
fraud_data_path = '../data/processed/fraud_one_hot_encoded.csv'

# Run logistic regression on the processed fraud data
fraud_logreg_metrics = run_logistic_regression(fraud_data_path, target_col='class')

# Display the results
print("Logistic Regression on processed fraud data:")
for metric, value in fraud_logreg_metrics.items():
    if metric != 'classification_report':
        print(f"{metric}: {value}")
    else:
        print("Classification Report:")
        print(value)

import joblib

# Save the trained logistic regression model to a file
# We need to retrain the model here to get the fitted model object
from scripts.logistic_regression import LogisticRegression
import pandas as pd

# Load the data
fraud_df = pd.read_csv(fraud_data_path)
from src.data_split import DataSplitter
X, y = DataSplitter.separate_features_and_target(fraud_df, target_col='class')

# Only keep numeric columns for modeling (as in the script)
X_numeric = X.select_dtypes(include=['float64', 'int64'])

# Split the data (as in the script)
splitter = DataSplitter()
X_train, X_test, y_train, y_test = splitter.train_test_split(X_numeric, y, test_size=0.2, random_state=42, stratify=True)

# Handle imbalance (as in the script)
from scripts.logistic_regression import ImbalanceHandler
imbalance_handler = ImbalanceHandler()
X_train_bal, y_train_bal = imbalance_handler.apply_smote(X_train, y_train)

# Train the model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_bal, y_train_bal)

# Save the model
joblib.dump(model, 'fraud_logreg_model.joblib')
print("Trained logistic regression model saved as 'fraud_logreg_model.joblib'")





Class distribution in y_train before SMOTE: {1: 11321, 0: 11320}
Columns used for modeling: ['user_id', 'purchase_value', 'age', 'ip_address', 'ip_int', 'hour_of_day', 'day_of_week', 'transaction_count']
Class distribution in y_train after SMOTE: {0: 11321, 1: 11321}
Logistic Regression on processed fraud data:
accuracy: 0.5046811517399753
precision: 0.5042819499341239
recall: 0.5409893992932863
f1: 0.5219911353562905
roc_auc: 0.5063436985519981
Classification Report:
{'0': {'precision': 0.5051428571428571, 'recall': 0.46838572942423173, 'f1-score': 0.48607038123167157, 'support': 2831.0}, '1': {'precision': 0.5042819499341239, 'recall': 0.5409893992932863, 'f1-score': 0.5219911353562905, 'support': 2830.0}, 'accuracy': 0.5046811517399753, 'macro avg': {'precision': 0.5047124035384905, 'recall': 0.5046875643587589, 'f1-score': 0.504030758293981, 'support': 5661.0}, 'weighted avg': {'precision': 0.5047124795769298, 'recall': 0.5046811517399753, 'f1-score': 0.5040275856430249, 'support':

### Training and evaluating Logistic regretion with Credit data

In [14]:
from scripts.logistic_regression import run_logistic_regression

# Use the processed fraud data from the data/processed directory
fraud_data_path = '../data/processed/credit_minmax_scaled.csv'

# Run logistic regression on the processed fraud data
fraud_logreg_metrics = run_logistic_regression(fraud_data_path, target_col='class')

# Display the results
print("Logistic Regression on processed fraud data:")
for metric, value in fraud_logreg_metrics.items():
    if metric != 'classification_report':
        print(f"{metric}: {value}")
    else:
        print("Classification Report:")
        print(value)






Loading data from: ../data/processed/credit_minmax_scaled.csv

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283726 entries, 0 to 283725
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    283726 non-null  float64
 1   V1      283726 non-null  float64
 2   V2      283726 non-null  float64
 3   V3      283726 non-null  float64
 4   V4      283726 non-null  float64
 5   V5      283726 non-null  float64
 6   V6      283726 non-null  float64
 7   V7      283726 non-null  float64
 8   V8      283726 non-null  float64
 9   V9      283726 non-null  float64
 10  V10     283726 non-null  float64
 11  V11     283726 non-null  float64
 12  V12     283726 non-null  float64
 13  V13     283726 non-null  float64
 14  V14     283726 non-null  float64
 15  V15     283726 non-null  float64
 16  V16     283726 non-null  float64
 17  V17     283726 non-null  float64
 18  V18     283726 non-null  float64
 19  V19   

### Training and evaluating Lightgbm with fraud data

In [20]:
from scripts.lightgbm_model import run_lightgbm

# Use the processed fraud data from the data/processed directory
fraud_data_path = '../data/processed/fraud_one_hot_encoded.csv'

# Run lightgbm on the processed fraud data
fraud_logreg_metrics = run_lightgbm(fraud_data_path, target_col='class')

# Display the results
print("lightgbm on processed fraud data:")
for metric, value in fraud_logreg_metrics.items():
    if metric != 'classification_report':
        print(f"{metric}: {value}")
    else:
        print("Classification Report:")
        print(value)




Loading data from: ../data/processed/fraud_one_hot_encoded.csv

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28302 entries, 0 to 28301
Columns: 164 entries, user_id to country_Zimbabwe
dtypes: bool(152), float64(7), int64(2), object(3)
memory usage: 6.7+ MB
None

Columns in DataFrame: ['user_id', 'signup_time', 'purchase_time', 'purchase_value', 'device_id', 'age', 'ip_address', 'class', 'ip_int', 'hour_of_day', 'day_of_week', 'transaction_count', 'source_Ads', 'source_Direct', 'source_SEO', 'browser_Chrome', 'browser_FireFox', 'browser_IE', 'browser_Opera', 'browser_Safari', 'sex_F', 'sex_M', 'country_Afghanistan', 'country_Albania', 'country_Algeria', 'country_Angola', 'country_Argentina', 'country_Armenia', 'country_Australia', 'country_Austria', 'country_Azerbaijan', 'country_Bahrain', 'country_Bangladesh', 'country_Barbados', 'country_Belarus', 'country_Belgium', 'country_Benin', 'country_Bermuda', 'country_Bolivia', 'country_Bosnia and Herzegowina', 'country

In [19]:
# Re-run the training to get the model object for saving
from scripts.lightgbm_model import DataSplitter, ImbalanceHandler
import pandas as pd
import lightgbm as lgb

df = pd.read_csv(fraud_data_path)
splitter = DataSplitter()
imbalance_handler = ImbalanceHandler()

# Find the correct target column (case-insensitive)
target_col = 'class'
if target_col not in df.columns:
    for col in df.columns:
        if col.lower() == target_col.lower():
            target_col = col
            break

X, y = splitter.separate_features_and_target(df, target_col)
X_train, X_test, y_train, y_test = splitter.train_test_split(X, y, test_size=0.2, random_state=42, stratify=True)
X_train_numeric = X_train.select_dtypes(include=['float64', 'int64'])
X_test_numeric = X_test.select_dtypes(include=['float64', 'int64'])
X_train_bal, y_train_bal = imbalance_handler.apply_smote(X_train_numeric, y_train)

model = lgb.LGBMClassifier(random_state=42)
model.fit(X_train_bal, y_train_bal)

joblib.dump(model, os.path.join(model_dir, model_filename))
print(f"LightGBM model saved to {os.path.join(model_dir, model_filename)}")


LightGBM model saved to ../model\logreg_credit_20250721_163443.joblib


### Training and evaluating Lightgbm with Credit data

In [None]:
from scripts.logistic_regression import run_logistic_regression

# Use the processed credit data from the data/processed directory
fraud_data_path = '../data/processed/credit_minmax_scaled.csv'

# Run lightgbm on the processed fraud data
fraud_logreg_metrics = run_logistic_regression(fraud_data_path, target_col='class')

# Display the results
print("lightgbm on processed fraud data:")
for metric, value in fraud_logreg_metrics.items():
    if metric != 'classification_report':
        print(f"{metric}: {value}")
    else:
        print("Classification Report:")
        print(value)

import os
import joblib
from datetime import datetime

# After training the model, save it to the model/ folder with a unique name
model_dir = "../model"
os.makedirs(model_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_filename = f"logreg_credit_{timestamp}.joblib"

# Re-run the training to get the model object
from scripts.logistic_regression import DataSplitter, ImbalanceHandler
import pandas as pd
from sklearn.linear_model import LogisticRegression

df = pd.read_csv(fraud_data_path)
splitter = DataSplitter()
imbalance_handler = ImbalanceHandler()

# Find the correct target column (case-insensitive)
target_col = 'class'
if target_col not in df.columns:
    for col in df.columns:
        if col.lower() == target_col.lower():
            target_col = col
            break

X, y = splitter.separate_features_and_target(df, target_col)
X_train, X_test, y_train, y_test = splitter.train_test_split(X, y, test_size=0.2, random_state=42, stratify=True)
X_train_numeric = X_train.select_dtypes(include=['float64', 'int64'])
X_test_numeric = X_test.select_dtypes(include=['float64', 'int64'])
X_train_bal, y_train_bal = imbalance_handler.apply_smote(X_train_numeric, y_train)

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_bal, y_train_bal)

joblib.dump(model, os.path.join(model_dir, model_filename))
print(f"Model saved to {os.path.join(model_dir, model_filename)}")





Loading data from: ../data/processed/credit_minmax_scaled.csv

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283726 entries, 0 to 283725
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    283726 non-null  float64
 1   V1      283726 non-null  float64
 2   V2      283726 non-null  float64
 3   V3      283726 non-null  float64
 4   V4      283726 non-null  float64
 5   V5      283726 non-null  float64
 6   V6      283726 non-null  float64
 7   V7      283726 non-null  float64
 8   V8      283726 non-null  float64
 9   V9      283726 non-null  float64
 10  V10     283726 non-null  float64
 11  V11     283726 non-null  float64
 12  V12     283726 non-null  float64
 13  V13     283726 non-null  float64
 14  V14     283726 non-null  float64
 15  V15     283726 non-null  float64
 16  V16     283726 non-null  float64
 17  V17     283726 non-null  float64
 18  V18     283726 non-null  float64
 19  V19   