In [114]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

In [101]:
datafile = "AIML_Dataset.csv"
data = pd.read_csv(datafile)

In [102]:
#feature engineering

data["amount_to_balance_ratio"] = data["amount"] / (data["oldbalanceOrg"] + 1)
data["balance_change_dest"] = data["newbalanceDest"] - data["oldbalanceDest"]
data["balance_change_org"] = data["oldbalanceOrg"] - data["newbalanceOrig"]
data["net_transaction"] = data["balance_change_dest"] - data["balance_change_org"]

In [103]:
data = data.drop(["step", "nameOrig", "nameDest", "isFlaggedFraud"], axis=1)

X = data.drop("isFraud", axis=1)
y = data["isFraud"]

In [105]:
categorical_features = ["type"]
numeric_features = [col for col in X.columns if col not in categorical_features]

In [106]:
# log-transforming
for col in ["amount", "balance_change_dest", "balance_change_org", "net_transaction"]:
    data[col] = np.log1p(np.abs(data[col]))  

In [107]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [108]:
# encoding categorical features
if "type" in data.columns:
    le = LabelEncoder()
    X_train["type"] = le.fit_transform(X_train["type"])
    X_test["type"] = le.transform(X_test["type"])

In [109]:
# scaling numerical features
scaler = StandardScaler()
num_cols = X_train.select_dtypes(include=np.number).columns
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [None]:
# smote
smote = SMOTE(random_state=42, sampling_strategy=0.1) 
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_resampled.value_counts())

Before SMOTE: isFraud
0    4448085
1       5749
Name: count, dtype: int64
After SMOTE: isFraud
0    4448085
1     444808
Name: count, dtype: int64


In [111]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features)
    ],
    remainder="drop"
)

In [117]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(n_estimators=100, max_depth=8, n_jobs = -1, random_state=42))
])

In [118]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [119]:
print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred, digits=4))

print("\nCONFUSION MATRIX:")
print(confusion_matrix(y_test, y_pred))

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0     1.0000    0.9999    0.9999   1906322
           1     0.9036    0.9963    0.9477      2464

    accuracy                         0.9999   1908786
   macro avg     0.9518    0.9981    0.9738   1908786
weighted avg     0.9999    0.9999    0.9999   1908786


CONFUSION MATRIX:
[[1906060     262]
 [      9    2455]]


CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0     0.9999    0.9468    0.9726   1906322
           1     0.0223    0.9387    0.0435      2464

    accuracy                         0.9467   1908786
   macro avg     0.5111    0.9427    0.5081   1908786
weighted avg     0.9987    0.9467    0.9714   1908786


CONFUSION MATRIX:
[[1804823  101499]
 [    151    2313]]