In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import sys
import os

In [2]:
# Make the helper modules under ../scripts importable from this notebook.
# The relative path is resolved against the kernel's current working directory.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [3]:
from model_building_and_training import (
    separate_features_target_creditcard,
    separate_features_target_fraud,
    split_train_test,
    train_and_evaluate_model
)
from data_analysis_and_preprocessing import correct_data_types

In [4]:
# Locations of the two raw CSV files, relative to the notebook's working directory.
creditcard_csv = '../data/creditcard.csv'
merged_fraud_csv = '../data/merged_fraud_data.csv'

# Read each file into its own DataFrame.
creditcard_df = pd.read_csv(creditcard_csv)
merged_fraud_df = pd.read_csv(merged_fraud_csv)

# Run the project's dtype-correction helper on the merged fraud data.
# NOTE(review): whether correct_data_types returns a copy or mutates
# merged_fraud_df in place is not visible from this notebook — confirm.
merged_fraud_df1 = correct_data_types(merged_fraud_df)

In [5]:
# Remove the two timestamp columns before modelling.
# DataFrame.drop returns a new frame, so merged_fraud_df1 keeps its columns.
datetime_columns = ['signup_time', 'purchase_time']
fraud_df_cleaned = merged_fraud_df1.drop(columns=datetime_columns)

In [6]:
# Separate features from the target and build train/test splits for both datasets.
# The helper calls stay in the original order (credit card first, then fraud).

# Credit card data
X_creditcard, y_creditcard = separate_features_target_creditcard(creditcard_df)
creditcard_split = split_train_test(X_creditcard, y_creditcard)
X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = creditcard_split

# Merged fraud data (datetime columns already removed)
X_fraud, y_fraud = separate_features_target_fraud(fraud_df_cleaned)
fraud_split = split_train_test(X_fraud, y_fraud)
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = fraud_split



In [7]:
# Sanity check: report the (rows, columns) shape of every feature split.
print(
    f"Credit card train shape: {X_creditcard_train.shape}, "
    f"Credit card test shape: {X_creditcard_test.shape}"
)
print(
    f"Fraud data train shape: {X_fraud_train.shape}, "
    f"Fraud data test shape: {X_fraud_test.shape}"
)

Credit card train shape: (227845, 30), Credit card test shape: (56962, 30)
Fraud data train shape: (103316, 14), Fraud data test shape: (25830, 14)


In [8]:
# Feature groups for the fraud dataset, consumed by the ColumnTransformer below:
# numerical_columns are standardized, categorical_columns are one-hot encoded.
# NOTE(review): the IP-address fields are treated as plain numeric magnitudes and
# device_id is one-hot encoded (presumably high cardinality) — confirm both are intended.
numerical_columns = [
    'age',
    'ip_address',
    'purchase_value',
    'lower_bound_ip_address',
    'upper_bound_ip_address',
    'time_to_purchase',
    'hour_of_day',
    'day_of_week',
]
categorical_columns = [
    'device_id',
    'source',
    'browser',
    'sex',
    'country',
]

In [9]:
# Build the per-column-group preprocessing step:
#   'num' -> standardize the numerical columns
#   'cat' -> one-hot encode the categorical columns; categories unseen during fit
#            are ignored at transform time instead of raising an error
# NOTE(review): 13 columns are listed but the recorded split output shows 14 feature
# columns. ColumnTransformer's documented default is remainder='drop', so the
# unlisted column is discarded — confirm that is intended.
numeric_scaler = StandardScaler()
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler, numerical_columns),
        ('cat', categorical_encoder, categorical_columns),
    ]
)


In [10]:
# Initialize the candidate classifiers.
# The tree-based models and the MLP involve randomness (feature/sample selection,
# weight initialisation), so a shared seed is passed to every estimator to make the
# reported metrics reproducible on a fresh kernel. It is harmless for estimators
# whose chosen solver is already deterministic.
RANDOM_STATE = 42

# max_iter is raised to 1000 for the two iterative optimisers.
logistic_regression = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
decision_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
random_forest = RandomForestClassifier(random_state=RANDOM_STATE)
gradient_boosting = GradientBoostingClassifier(random_state=RANDOM_STATE)
mlp = MLPClassifier(max_iter=1000, random_state=RANDOM_STATE)

In [11]:
# Apply the preprocessing step to the fraud features (model training happens in a later cell).
# The preprocessor is fitted on the training split only; the test split is then
# transformed with those already-fitted parameters, so no test-set statistics
# feed into the scaling or the encoder's category list.
X_fraud_train_transformed = preprocessor.fit_transform(X_fraud_train)
X_fraud_test_transformed = preprocessor.transform(X_fraud_test)

In [12]:
# Train and evaluate every candidate classifier on the preprocessed fraud data.
models = [logistic_regression, decision_tree, random_forest, gradient_boosting, mlp]

# Keep each result, keyed by estimator class name, so every model remains
# available after the loop instead of only the last one.
# NOTE(review): train_and_evaluate_model presumably returns the fitted estimator;
# the recorded output shows it prints accuracy and a classification report — confirm.
trained_models = {}
for model in models:
    model_name = model.__class__.__name__
    print(f"Training and evaluating {model_name}...")
    # trained_model is still rebound on every pass for backward compatibility
    # with later cells that read the last result.
    trained_model = train_and_evaluate_model(
        model,
        X_fraud_train_transformed,
        X_fraud_test_transformed,
        y_fraud_train,
        y_fraud_test,
    )
    trained_models[model_name] = trained_model

Training and evaluating LogisticRegression...
LogisticRegression Accuracy: 0.9524
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     23427
           1       0.96      0.51      0.67      2403

    accuracy                           0.95     25830
   macro avg       0.96      0.75      0.82     25830
weighted avg       0.95      0.95      0.95     25830

Training and evaluating DecisionTreeClassifier...
DecisionTreeClassifier Accuracy: 0.9525
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     23427
           1       0.90      0.55      0.68      2403

    accuracy                           0.95     25830
   macro avg       0.93      0.77      0.83     25830
weighted avg       0.95      0.95      0.95     25830

Training and evaluating RandomForestClassifier...
RandomForestClassifier Accuracy: 0.9567
              precision    recall  f1-score   support

           0       0.95      