In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
df= pd.read_csv('pay sim.csv')
df.head()  # Replace with your dataset

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for column in df.select_dtypes(include=['object']).columns:
    df[column] = label_encoder.fit_transform(df[column])
    print(f'Label encoding {column} completed.')


Label encoding type completed.
Label encoding nameOrig completed.
Label encoding nameDest completed.


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            int32  
 2   amount          float64
 3   nameOrig        int32  
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        int32  
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int32(3), int64(3)
memory usage: 461.2 MB


In [8]:
df.value_counts('isFraud')

isFraud
0    6354407
1       8213
Name: count, dtype: int64

## Isolation Forest for Anomaly Detection

In [9]:
# import pandas as pd
# import numpy as np
# from sklearn.ensemble import IsolationForest
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# y = df.isFraud
# x = df.drop(columns=['isFraud'])

# x_train, x_test, y_train, y_test = train_test_split(x ,y,test_size = 0.3,random_state = 0)

# # Fit Isolation Forest
# iso_forest = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)
# iso_forest.fit(x)

# # Predict anomalies
# y_pred = iso_forest.predict(x)

# # Convert predictions: -1 -> fraud (1), 1 -> normal (0)
# y_pred_converted = np.where(y_pred == -1, 1, 0)

# # Accuracy
# accuracy = accuracy_score(y, y_pred_converted)
# print(f"Isolation Forest Accuracy: {accuracy:.4f}")

# # More metrics
# print("\nClassification Report:")
# print(classification_report(y, y_pred_converted))

# print("\nConfusion Matrix:")
# print(confusion_matrix(y, y_pred_converted))

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# 1. Data Preparation
y = df['isFraud']
x = df.drop(columns=['isFraud'])

# Split BEFORE any fitting
x_train, x_test, y_train, y_test = train_test_split(
    x, y, 
    test_size=0.3, 
    random_state=42,
    stratify=y  # Maintain fraud ratio
)

# 2. Model Training
iso_forest = IsolationForest(
    n_estimators=150,  # Increased from 100
    contamination='auto',  # Let model estimate
    max_samples=256,  # Smaller subsets reduce overfitting
    random_state=42,
    n_jobs=-1  # Use all cores
)

# Train ONLY on x_train
iso_forest.fit(x_train)

# 3. Evaluation (ONLY on test set)
y_test_scores = iso_forest.decision_function(x_test)  # Anomaly scores
y_test_pred = iso_forest.predict(x_test)

# Convert predictions: -1 -> fraud (1), 1 -> normal (0)
y_test_pred_converted = np.where(y_test_pred == -1, 1, 0)

# 4. Proper Metrics
print("Test Set Performance:")
print(f"ROC-AUC: {roc_auc_score(y_test, y_test_scores):.4f}")  # Better for anomaly detection

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred_converted))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred_converted))

# 5. Threshold Tuning (Optional)
# Find optimal threshold based on business needs
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_test, -y_test_scores)  # Note: we negate scores

# Example: Find threshold where recall >= 0.7
optimal_idx = np.argmax(recalls >= 0.7)
optimal_threshold = thresholds[optimal_idx]
y_test_optimized = (-y_test_scores > optimal_threshold).astype(int)

print("\nOptimized Classification Report:")
print(classification_report(y_test, y_test_optimized))


Test Set Performance:
ROC-AUC: 0.2373

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.87      0.93   1906322
           1       0.00      0.47      0.01      2464

    accuracy                           0.87   1908786
   macro avg       0.50      0.67      0.47   1908786
weighted avg       1.00      0.87      0.93   1908786


Confusion Matrix:
[[1661700  244622]
 [   1313    1151]]

Optimized Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00   1906322
           1       0.00      1.00      0.00      2464

    accuracy                           0.00   1908786
   macro avg       0.50      0.50      0.00   1908786
weighted avg       1.00      0.00      0.00   1908786



## One Class-Svm

In [10]:
import pandas as pd
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE  # For balanced training

# 1. Balanced Data Preparation
fraud = df[df['isFraud'] == 1]
normal = df[df['isFraud'] == 0].sample(len(fraud) * 10, random_state=42)  # 10:1 ratio
df_balanced = pd.concat([fraud, normal])

# 2. Enhanced Feature Engineering
def create_features(df):
    df = df.copy()
    # Transaction patterns
    df['balance_drop_pct'] = (df['oldbalanceOrg'] - df['newbalanceOrig']) / (df['oldbalanceOrg'] + 1e-6)
    df['recipient_risk'] = df.groupby('nameDest')['amount'].transform('count') / len(df)
    # Time dynamics
    df['time_since_last'] = df.groupby('nameOrig')['step'].diff().fillna(24)
    return df

X = create_features(df_balanced).drop(['isFraud','nameOrig','nameDest'], axis=1)
y = df_balanced['isFraud']

# 3. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 4. SMOTE Oversampling (Only on training!)
sm = SMOTE(sampling_strategy=0.3, random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# 5. Optimized One-Class SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_res[y_res == 0])  # Train ONLY on normal

svm = OneClassSVM(
    kernel='rbf',
    nu=0.15,  # More sensitive to outliers
    gamma=0.001,  # Tighter decision boundary
    cache_size=2000
)
svm.fit(X_train_scaled)

# 6. Dynamic Thresholding
X_test_scaled = scaler.transform(X_test)
scores = svm.decision_function(X_test_scaled)

# Find threshold where 80% of fraud is caught
fraud_scores = scores[y_test == 1]
threshold = np.percentile(fraud_scores, 20)  # Bottom 20% of fraud scores
y_pred = (scores < threshold).astype(int)

# 7. Evaluation
print("🔍 Final Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))

# Feature Analysis
print("\n💡 Top Fraud Indicators:")
print(f"1. Balance Drop >50%: {(X_test[y_test==1]['balance_drop_pct'] > 0.5).mean():.1%}")
print(f"2. High-Risk Recipients: {(X_test[y_test==1]['recipient_risk'] > 0.01).mean():.1%}")



🔍 Final Confusion Matrix:
[[23504  1135]
 [ 1971   493]]

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.94     24639
           1       0.30      0.20      0.24      2464

    accuracy                           0.89     27103
   macro avg       0.61      0.58      0.59     27103
weighted avg       0.87      0.89      0.87     27103


💡 Top Fraud Indicators:
1. Balance Drop >50%: 98.6%
2. High-Risk Recipients: 0.0%


## Autoencoder

In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam

y = df.isFraud
x = df.drop(columns=['isFraud'])
# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)

# Train autoencoder only on normal (non-fraud) data
X_train = X_scaled[y == 0]  # Only normal transactions

# Autoencoder architecture
input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim,))
encoded = Dense(16, activation="relu", activity_regularizer=regularizers.l1(1e-5))(input_layer)
encoded = Dense(8, activation="relu")(encoded)
decoded = Dense(16, activation='relu')(encoded)
output_layer = Dense(input_dim, activation='linear')(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train the autoencoder
autoencoder.fit(X_train, X_train,
                epochs=20,
                batch_size=256,
                shuffle=True,
                validation_split=0.2,
                verbose=1)

# Reconstruct full dataset
X_reconstructed = autoencoder.predict(X_scaled)

# Calculate reconstruction error
reconstruction_error = np.mean(np.power(X_scaled - X_reconstructed, 2), axis=1)

# Set threshold (mean + 3*std of normal errors is common)
threshold = np.mean(reconstruction_error[y == 0]) + 3 * np.std(reconstruction_error[y == 0])

# Predict: 1 = fraud if error > threshold
y_pred = np.where(reconstruction_error > threshold, 1, 0)

# Accuracy and Evaluation
accuracy = accuracy_score(y, y_pred)
print(f"Autoencoder Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y, y_pred))


Epoch 1/20
[1m19858/19858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2ms/step - loss: 0.0759 - val_loss: 0.0874
Epoch 2/20
[1m19858/19858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2ms/step - loss: 0.0021 - val_loss: 0.0835
Epoch 3/20
[1m19858/19858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2ms/step - loss: 0.0016 - val_loss: 0.0847
Epoch 4/20
[1m19858/19858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 2ms/step - loss: 0.0014 - val_loss: 0.0843
Epoch 5/20
[1m19858/19858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 2ms/step - loss: 0.0015 - val_loss: 0.0841
Epoch 6/20
[1m19858/19858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2ms/step - loss: 0.0013 - val_loss: 0.0835
Epoch 7/20
[1m19858/19858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2ms/step - loss: 0.0013 - val_loss: 0.0844
Epoch 8/20
[1m19858/19858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 2ms/step - loss: 0.0013 - val_loss: 0.0841


In [15]:
import pickle

with open('models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('models/autoencoder.pkl', 'wb') as f:
    pickle.dump(autoencoder, f)

with open('models/oc_svm.pkl', 'wb') as f:
    pickle.dump(svm, f)  

with open('models/iso_forest.pkl', 'wb') as f:
    pickle.dump(iso_forest, f)          
    