In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

In [29]:
datafile = "AIML_Dataset.csv"
data = pd.read_csv(datafile)

In [30]:
data = data.drop(["step", "nameOrig", "nameDest", "isFlaggedFraud"], axis=1)

X = data.drop("isFraud", axis=1)
y = data["isFraud"]

In [31]:
categorical_features = ["type"]
numeric_features = [col for col in X.columns if col not in categorical_features]

In [43]:
#feature engineering

data["amount_to_balance_ratio"] = data["amount"] / (data["oldbalanceOrg"] + 1)
data["balance_change_dest"] = data["newbalanceDest"] - data["oldbalanceDest"]
data["balance_change_org"] = data["oldbalanceOrg"] - data["newbalanceOrig"]
data["net_transaction"] = data["balance_change_dest"] - data["balance_change_org"]

In [None]:
# log-transforming
for col in ["amount", "balance_change_dest", "balance_change_org", "net_transaction"]:
    data[col] = np.log1p(np.abs(data[col]))  

In [47]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [48]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features)
    ],
    remainder="drop"
)

In [49]:
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

In [50]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [41]:
print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred, digits=4))

print("\nCONFUSION MATRIX:")
print(confusion_matrix(y_test, y_pred))

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0     0.9999    0.9468    0.9726   1906322
           1     0.0223    0.9387    0.0435      2464

    accuracy                         0.9467   1908786
   macro avg     0.5111    0.9427    0.5081   1908786
weighted avg     0.9987    0.9467    0.9714   1908786


CONFUSION MATRIX:
[[1804823  101499]
 [    151    2313]]
