In [3]:
import ast
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_recall_curve,
    auc
)



In [5]:
%pip install imbalanced-learn

Collecting imbalanced-learn
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/65/60/103dc71019ec2fa987f42f9dbe88641a74edc57f8499fac8896955b66065/imbalanced_learn-0.14.0-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
   ---------------------------------------- 0.0/240.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/240.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/240.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/240.0 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/240.0 kB 163.8 kB/s eta 0:00:02
   ------ -------------------------------- 41.0/240.0 kB 163.4 kB/s eta 0:00:02
   --------- ----------------------------- 61.4/240.0 kB 233.8 kB/s eta 0:00:01
   -------------- ------------------------ 92.2/240.0 kB 308.0 kB/s eta 0:00:01
   ---------


[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from imblearn.over_sampling import SMOTE

In [4]:
# Loading the dataset
# The CSV location can be overridden with the FRAUD_DATA_PATH environment
# variable; it falls back to the original local path so existing runs keep working.
DATA_PATH = os.environ.get(
    "FRAUD_DATA_PATH",
    "C:\\Users\\Admin\\Downloads\\synthetic_fraud_data.csv",
)
df = pd.read_csv(DATA_PATH)

# Quick look at the first rows
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()


  transaction_id customer_id       card_number  \
0    TX_a0ad2a2a  CUST_72886  6646734767813109   
1    TX_3599c101  CUST_70474   376800864692727   
2    TX_a9461c6d  CUST_10715  5251909460951913   
3    TX_7be21fc4  CUST_16193   376079286931183   
4    TX_150f490b  CUST_87572  6172948052178810   

                          timestamp merchant_category merchant_type  \
0  2024-09-30 00:00:01.034820+00:00        Restaurant     fast_food   
1  2024-09-30 00:00:01.764464+00:00     Entertainment        gaming   
2  2024-09-30 00:00:02.273762+00:00           Grocery      physical   
3  2024-09-30 00:00:02.297466+00:00               Gas         major   
4  2024-09-30 00:00:02.544063+00:00        Healthcare       medical   

         merchant     amount currency    country  ...   device channel  \
0       Taco Bell     294.87      GBP         UK  ...  iOS App  mobile   
1           Steam    3368.97      BRL     Brazil  ...     Edge     web   
2     Whole Foods  102582.38      JPY      Japan  

In [8]:
### Defining features and target
target_col = "is_fraud"

# Identifier-like and sensitive columns that are kept out of the feature matrix
drop_cols = [
    "transaction_id",
    "customer_id",
    "card_number",
    "timestamp",
    "device_fingerprint",
    "ip_address",
    "merchant",
]

# Feature matrix: every column except the excluded ones and the label itself
X = df.drop(columns=[*drop_cols, target_col])

# Label vector, cast to integer
y = df[target_col].astype(int)


In [9]:
### Separating numeric & categorical features
# Columns are assigned to a group purely by dtype:
#   int64 / float64 -> numeric, object / bool -> categorical.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.to_list()
categorical_features = X.select_dtypes(include=["object", "bool"]).columns.to_list()

# Show which columns landed in which group
for label, names in [("Numeric:", numeric_features), ("Categorical:", categorical_features)]:
    print(label, names)


Numeric: ['amount', 'distance_from_home', 'transaction_hour']
Categorical: ['merchant_category', 'merchant_type', 'currency', 'country', 'city', 'city_size', 'card_type', 'card_present', 'device', 'channel', 'high_risk_merchant', 'weekend_transaction', 'velocity_last_hour']


In [11]:
# Preprocessing pipeline
from sklearn.preprocessing import OneHotEncoder

# Numeric columns are standardised.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown='ignore' lets
# transform() accept categories that were not present when the encoder was fit.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)


In [13]:
## Train-test split
# 80/20 split, stratified on the label, with a fixed seed for repeatability.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Transform features
# The preprocessor is fit on the training split only and then applied,
# unchanged, to the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)


In [None]:
### Handle class imbalance with SMOTE
# Only the (already transformed) training split is resampled; the test split
# is left untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_transformed, y_train)
X_train_resampled, y_train_resampled = resampled




In [None]:
### Logistic Regression (baseline)
# fit() returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000).fit(
    X_train_resampled, y_train_resampled
)

# Hard class labels and the probability of the positive class on the test split
y_pred_lr = log_reg.predict(X_test_transformed)
lr_class_probabilities = log_reg.predict_proba(X_test_transformed)
y_proba_lr = lr_class_probabilities[:, 1]

# Evaluation against the held-out labels
lr_report = classification_report(y_test, y_pred_lr)
lr_roc_auc = roc_auc_score(y_test, y_proba_lr)

print("Logistic Regression:")
print(lr_report)
print("ROC-AUC:", lr_roc_auc)


In [None]:
### Random Forest Classifier
rf_params = dict(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_resampled, y_train_resampled)

# Hard class labels and the probability of the positive class on the test split
y_pred_rf = rf.predict(X_test_transformed)
rf_class_probabilities = rf.predict_proba(X_test_transformed)
y_proba_rf = rf_class_probabilities[:, 1]

# Evaluation against the held-out labels
rf_report = classification_report(y_test, y_pred_rf)
rf_roc_auc = roc_auc_score(y_test, y_proba_rf)

print("Random Forest:")
print(rf_report)
print("ROC-AUC:", rf_roc_auc)


In [None]:
### Precision-Recall Curve for Random Forest
# Precision/recall pairs over all thresholds of the random-forest probabilities;
# the thresholds themselves are not needed here.
precision, recall, _ = precision_recall_curve(y_test, y_proba_rf)

# Area under the precision-recall curve
pr_auc = auc(recall, precision)

fig, ax = plt.subplots(figsize=(6,4))
ax.plot(recall, precision, label=f'PR-AUC={pr_auc:.4f}')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()


In [None]:
### Isolation Forest (Anomaly Detection)
# The model is fit without labels; labels are only used to set the expected
# share of anomalies. That rate is taken from the training labels (not the
# full dataset) so no information from the test split leaks into the model
# configuration.
iso_forest = IsolationForest(
    n_estimators=300,
    contamination=float(y_train.mean()),  # estimated fraud rate (training split only)
    random_state=42,
    n_jobs=-1
)
iso_forest.fit(X_train_transformed)

# Predictions: -1 = anomaly, 1 = normal → map to fraud=1
iso_raw_preds = iso_forest.predict(X_test_transformed)
iso_preds = np.where(iso_raw_preds == -1, 1, 0)

print("Isolation Forest:")
print(classification_report(y_test, iso_preds))


In [None]:
### Summary of Model ROC-AUC
# ROC-AUC only needs a score that ranks samples, not calibrated probabilities.
# IsolationForest.score_samples returns lower values for more abnormal points,
# so the score is negated to make "higher = more likely fraud".
iso_scores = -iso_forest.score_samples(X_test_transformed)

results = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest", "Isolation Forest"],
    "ROC_AUC": [
        roc_auc_score(y_test, y_proba_lr),
        roc_auc_score(y_test, y_proba_rf),
        roc_auc_score(y_test, iso_scores),
    ]
})
print(results)


In [None]:
### Expand the stringified `velocity_last_hour` dict into numeric features
# NOTE(review): in the cells shown above, the feature lists, the split and the
# models are all built before this cell runs, so they do not see these
# columns. Re-run those steps after this cell for the new features to be used.

# Each value is parsed exactly once. ast.literal_eval only accepts Python
# literals, so unlike eval() it cannot execute code embedded in the CSV.
velocity_parsed = df['velocity_last_hour'].map(ast.literal_eval)

df['velocity_num'] = velocity_parsed.map(lambda entry: entry['num_transactions'])
df['velocity_amount'] = velocity_parsed.map(lambda entry: entry['total_amount'])

# Replace the raw string column in X with the parsed numeric features
# (previously they were added to df only and never reached X).
# errors='ignore' keeps the cell re-runnable once the raw column is gone.
X = X.drop(columns=['velocity_last_hour'], errors='ignore').assign(
    velocity_num=df['velocity_num'],
    velocity_amount=df['velocity_amount'],
)

