In [1]:
import os, sys
notebook_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(notebook_dir,'..')))

In [2]:
from scripts.data_load import load_data
from scripts.preprocess import preprocess_data
from scripts.data_visualization import *
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import ipaddress

In [3]:
# Load data
fraud_data = load_data('../data/cleaned_fraud_data.csv')
credit_data = load_data('../data/creditcard.csv')
ip_country = load_data('../data/IpAddress_to_Country.csv')

In [14]:
# Fraud_Data.csv
X_fraud = fraud_data.drop(columns=['device_id', 'class', 'country'])
y_fraud = fraud_data['class']

# creditcard.csv
X_credit = credit_data.drop(columns=['Class'])
y_credit = credit_data['Class']

In [15]:

# Fraud_Data.csv
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)

# creditcard.csv
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(
    X_credit, y_credit, test_size=0.2, random_state=42, stratify=y_credit
)

X_train_fraud['purchase_time'] = pd.to_datetime(X_train_fraud['purchase_time'])
X_train_fraud['purchase_time'] = X_train_fraud['purchase_time'].astype(int) / 10**9  # Convert to seconds

X_test_fraud['purchase_time'] = pd.to_datetime(X_test_fraud['purchase_time'])
X_test_fraud['purchase_time'] = X_test_fraud['purchase_time'].astype(int) / 10**9  # Convert to seconds


X_train_fraud['signup_time'] = pd.to_datetime(X_train_fraud['signup_time'])
X_train_fraud['signup_time'] = X_train_fraud['signup_time'].astype(int) / 10**9  # Convert to seconds

X_test_fraud['signup_time'] = pd.to_datetime(X_test_fraud['signup_time'])
X_test_fraud['signup_time'] = X_test_fraud['signup_time'].astype(int) / 10**9  # Convert to seconds


In [16]:
print(X_train_fraud.dtypes)

user_id                    int64
signup_time              float64
purchase_time            float64
purchase_value             int64
age                        int64
ip_address               float64
transaction_count          int64
time_diff                float64
hour_of_day                int64
day_of_week                int64
purchase_value_scaled    float64
source_Direct               bool
source_SEO                  bool
browser_FireFox             bool
browser_IE                  bool
browser_Opera               bool
browser_Safari              bool
sex_M                       bool
dtype: object


In [18]:
from sklearn.impute import SimpleImputer

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_fraud_imputed = imputer.fit_transform(X_train_fraud)
X_test_fraud_imputed = imputer.transform(X_test_fraud)

# Train Logistic Regression
model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train_fraud_imputed, y_train_fraud)

# Evaluate
y_pred = model.predict(X_test_fraud_imputed)
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, model.predict_proba(X_test_fraud_imputed)[:, 1]))



              precision    recall  f1-score   support

           0       0.95      0.64      0.76     27393
           1       0.17      0.69      0.27      2830

    accuracy                           0.64     30223
   macro avg       0.56      0.67      0.52     30223
weighted avg       0.88      0.64      0.72     30223

ROC-AUC: 0.7575597387019124


In [19]:
from sklearn.tree import DecisionTreeClassifier

# Train
model = DecisionTreeClassifier(class_weight='balanced', random_state=42)
model.fit(X_train_fraud, y_train_fraud)

# Evaluate
y_pred = model.predict(X_test_fraud)
print("Decision Tree:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))

Decision Tree:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     27393
           1       0.53      0.55      0.54      2830

    accuracy                           0.91     30223
   macro avg       0.74      0.75      0.75     30223
weighted avg       0.91      0.91      0.91     30223

ROC-AUC: 0.7514647225523428


In [20]:
from sklearn.ensemble import RandomForestClassifier

# Train
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train_fraud, y_train_fraud)

# Evaluate
y_pred = model.predict(X_test_fraud)
print("Random Forest:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))

Random Forest:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27393
           1       0.97      0.53      0.68      2830

    accuracy                           0.95     30223
   macro avg       0.96      0.76      0.83     30223
weighted avg       0.96      0.95      0.95     30223

ROC-AUC: 0.7630690580335773


In [21]:
from xgboost import XGBClassifier

# Train
model = XGBClassifier(scale_pos_weight=len(y_train_fraud[y_train_fraud == 0]) / len(y_train_fraud[y_train_fraud == 1]))
model.fit(X_train_fraud, y_train_fraud)

# Evaluate
y_pred = model.predict(X_test_fraud)
print("Gradient Boosting:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))

Gradient Boosting:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97     27393
           1       0.74      0.54      0.62      2830

    accuracy                           0.94     30223
   macro avg       0.85      0.76      0.79     30223
weighted avg       0.93      0.94      0.93     30223

ROC-AUC: 0.7581157601455789


In [23]:
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_fraud_imputed = imputer.fit_transform(X_train_fraud)
X_test_fraud_imputed = imputer.transform(X_test_fraud)

# Train
model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
model.fit(X_train_fraud_imputed, y_train_fraud)

# Evaluate
y_pred = model.predict(X_test_fraud_imputed)
print("MLP:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))



MLP:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27393
           1       0.00      0.00      0.00      2830

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223

ROC-AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

# Reshape data for CNN
X_train_cnn = X_train_fraud.values.reshape(X_train_fraud.shape[0], X_train_fraud.shape[1], 1)
X_test_cnn = X_test_fraud.values.reshape(X_test_fraud.shape[0], X_test_fraud.shape[1], 1)

# Build CNN
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(50, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
model.fit(X_train_cnn, y_train_fraud, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_test_cnn) > 0.5).astype(int)
print("CNN:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))