# 📘 PaySim Anomaly Detection - Modeling Notebook

In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay

# Read the PaySim transactions CSV, then print a quick summary of it:
# the row count, the fraud / non-fraud label balance, and the distinct
# transaction types present in the file.
df = pd.read_csv('../data/transactions.csv')
print("Total transactions:", len(df))
print(df['isFraud'].value_counts())
print("Transaction types:", df['type'].unique())


In [None]:

# Bar chart of transaction counts per type, with one bar per isFraud value
# so the fraud share of each transaction type can be compared side by side.
sns.countplot(x='type', hue='isFraud', data=df)
plt.xticks(rotation=45)  # tilt the category labels on the x axis
plt.title("Fraudulent Transactions by Type")
plt.show()


In [None]:

# Feature engineering
# Add a log1p-transformed copy of the transaction amount (log(1 + amount)).
df['amount_log'] = np.log1p(df['amount'])
# Drop the account-name columns; they are not used as features below.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['nameOrig', 'nameDest'], errors='ignore')

# Select features: the model inputs, all numeric columns.
features = ['amount_log', 'step', 'isFlaggedFraud',
            'oldbalanceOrg','newbalanceOrig',
            'oldbalanceDest','newbalanceDest']
# X is the (n_rows, n_features) input matrix; y is the fraud label vector,
# used later for the contamination estimate and for evaluation.
X = df[features].to_numpy()
y = df['isFraud'].to_numpy()

# Scale features: standardise each column to zero mean and unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:

# Train Isolation Forest
# contamination is set to the observed fraud rate (mean of the 0/1 labels),
# so the model's outlier threshold flags roughly that share of rows.
# random_state is fixed for reproducible results.
iso = IsolationForest(n_estimators=200, contamination=y.mean(), random_state=42)
iso.fit(X_scaled)

# Predictions
# decision_function: lower = more anomalous; negative values are outliers.
scores = iso.decision_function(X_scaled)
# predict() labels a row -1 exactly when its decision_function value is
# negative, so derive the 0/1 anomaly flag from the scores already computed
# instead of scoring the whole dataset a second time.
y_pred = (scores < 0).astype(int)


In [None]:

# Evaluation
# Per-class precision / recall / F1 of the anomaly flags against isFraud.
print(classification_report(y, y_pred, digits=4))
# IsolationForest.decision_function is LOWER for more anomalous rows, while
# roc_auc_score expects HIGHER scores for the positive class (fraud = 1).
# Negate the scores so the reported AUC is not inverted (1 - true AUC).
print("ROC AUC Score:", roc_auc_score(y, -scores))
# Confusion matrix of true labels vs. predicted anomaly flags.
ConfusionMatrixDisplay.from_predictions(y, y_pred, cmap="Blues")
plt.title("Anomaly Detection Confusion Matrix")
plt.show()


In [None]:

# Histogram (50 bins, with a KDE overlay) of the Isolation Forest decision
# scores computed for every transaction.
sns.histplot(scores, bins=50, kde=True)
plt.xlabel("Anomaly Score (lower = more anomalous)")
plt.title("Anomaly Score Distribution")
plt.show()
