In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import joblib
import os

In [2]:
# Paths

DATA_PATH = '/content/drive/MyDrive/Fraud Detection/bank_transactions_featured.csv'
OUTPUT_DIR = '/content/drive/MyDrive/Fraud Detection/model_training'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Load Dataset

df = pd.read_csv(DATA_PATH, parse_dates=['TransactionDate', 'PreviousTransactionDate'])


In [4]:
# Feature Selection: drop identifiers and timestamps

drop_cols = [
    'TransactionID', 'AccountID', 'TransactionDate', 'PreviousTransactionDate',
    'DeviceID', 'IP Address', 'MerchantID'
]
X = df.drop(columns=drop_cols)


# Unsupervised Labeling with Isolation Forest

In [7]:
#   - Scale numerical features
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
X_numeric = pd.DataFrame(scaler.fit_transform(X[numeric_cols]), columns=numeric_cols)

#   - Keep binary flags and time features as-is
#   - One-hot encode categorical features
other_cols = [c for c in X.columns if c not in numeric_cols]
X_other = X[other_cols].reset_index(drop=True)
categorical_cols = X_other.select_dtypes(include=['object']).columns.tolist()  # Identify categorical columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')  # Create OneHotEncoder instance
encoded_data = encoder.fit_transform(X_other[categorical_cols])  # Fit and transform categorical features
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))  # Create DataFrame from encoded data
X_other = X_other.drop(columns=categorical_cols)  # Drop original categorical columns
X_other = pd.concat([X_other, encoded_df], axis=1)  # Concatenate encoded features

#   - Combine
X_if = pd.concat([X_numeric, X_other], axis=1)

#   - Train Isolation Forest
iso = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.01, random_state=42)
iso.fit(X_if)
scores = -iso.decision_function(X_if)

In [8]:
#   - Pseudo-label: top 5% anomalies
threshold = np.percentile(scores, 95)
df['is_fraud'] = (scores >= threshold).astype(int)
joblib.dump(iso, os.path.join(OUTPUT_DIR, 'isolation_forest.pkl'))

['/content/drive/MyDrive/Fraud Detection/model_training/isolation_forest.pkl']

In [9]:
# Prepare for Supervised Training
y = df['is_fraud']
X = X.drop(columns=[])  # already cleaned of IDs

In [11]:
# One-Hot Encode Categorical Features
cat_cols = ['TransactionType', 'Location', 'Channel', 'CustomerOccupation', 'user_primary_location']
# Remove the 'sparse' argument, or upgrade scikit-learn to version 1.2 or later
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # sparse is deprecated, use sparse_output
X_cat = pd.DataFrame(ohe.fit_transform(X[cat_cols]), index=X.index)
joblib.dump(ohe, os.path.join(OUTPUT_DIR, 'onehot_encoder.pkl'))

['/content/drive/MyDrive/Fraud Detection/model_training/onehot_encoder.pkl']

In [13]:
# Combine encoded and numeric
X_final = pd.concat([X_numeric, X_other, X_cat], axis=1)

In [14]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.3, stratify=y, random_state=42
)

In [15]:
# Train XGBoost Classifier with imbalance handling
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
scale_pos_weight = neg / pos
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    scale_pos_weight=scale_pos_weight,
    random_state=42
)
model.fit(X_train, y_train)
joblib.dump(model, os.path.join(OUTPUT_DIR, 'xgb_model.pkl'))

Parameters: { "use_label_encoder" } are not used.



['/content/drive/MyDrive/Fraud Detection/model_training/xgb_model.pkl']

In [16]:
# Evaluation
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Classification metrics
report = classification_report(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

# Print to console
print("Classification Report:\n", report)
print(f"ROC-AUC Score: {auc:.4f}")

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       676
           1       0.68      0.36      0.47        36

    accuracy                           0.96       712
   macro avg       0.83      0.68      0.73       712
weighted avg       0.95      0.96      0.95       712

ROC-AUC Score: 0.9183


In [17]:
# Save report
with open(os.path.join(OUTPUT_DIR, 'classification_report.txt'), 'w') as f:
    f.write(report + f"\nROC-AUC: {auc:.4f}\n")

In [18]:
# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure()
plt.plot(fpr, tpr, label=f'XGBoost (AUC = {auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.savefig(os.path.join(OUTPUT_DIR, 'roc_curve.png'))
plt.close()

print(f"Models and artifacts saved to {OUTPUT_DIR}")

Models and artifacts saved to /content/drive/MyDrive/Fraud Detection/model_training
