In [None]:
import kagglehub

# Kaggle handle ("owner/dataset-slug") of the Bank Account Fraud (BAF) dataset.
baf_dataset_handle = "sgpjesus/bank-account-fraud-dataset-neurips-2022"

# dataset_download returns the local directory holding the dataset files;
# later cells read Base.csv from this `path`.
path = kagglehub.dataset_download(baf_dataset_handle)
print("Path to BAF dataset files:", path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/sgpjesus/bank-account-fraud-dataset-neurips-2022?dataset_version_number=2...


100%|██████████| 532M/532M [00:25<00:00, 21.6MB/s]

Extracting files...





Path to BAF dataset files: /root/.cache/kagglehub/datasets/sgpjesus/bank-account-fraud-dataset-neurips-2022/versions/2


In [None]:
!pip install xgboost imbalanced-learn plotly streamlit shap -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the base variant of the BAF dataset from the directory downloaded above.
base_csv_path = '{}/Base.csv'.format(path)
df_baf = pd.read_csv(base_csv_path)

# Quick sanity check: dimensions, a preview of the rows, and the class balance
# of the `fraud_bool` target.
print("BAF Dataset shape:", df_baf.shape)
print("\nFirst few rows:", df_baf.head(), sep="\n")
print("\nTarget distribution:", df_baf['fraud_bool'].value_counts(), sep="\n")


BAF Dataset shape: (1000000, 32)

First few rows:
   fraud_bool  income  name_email_similarity  prev_address_months_count  \
0           0     0.3               0.986506                         -1   
1           0     0.8               0.617426                         -1   
2           0     0.8               0.996707                          9   
3           0     0.6               0.475100                         11   
4           0     0.9               0.842307                         -1   

   current_address_months_count  customer_age  days_since_request  \
0                            25            40            0.006735   
1                            89            20            0.010095   
2                            14            40            0.012316   
3                            14            30            0.006991   
4                            29            40            5.742626   

   intended_balcon_amount payment_type  zip_count_4w  ...  has_other_cards  \
0     

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# --- 1) Impute missing values ------------------------------------------------
# Object columns get the literal 'Unknown'; every other column gets its median.
# The filled Series is assigned back to the frame instead of calling
# `df_baf[col].fillna(..., inplace=True)`: that chained in-place form operates
# on an intermediate object and, under pandas copy-on-write, silently leaves
# `df_baf` unchanged (the deprecation warning is hidden by the global
# warnings filter set earlier in this notebook).
# NOTE(review): the preview of the data shows -1 in prev_address_months_count,
# which looks like a missing-value sentinel; fillna() does not touch it —
# confirm against the dataset documentation whether it should be handled here.
for col in df_baf.columns:
    if df_baf[col].dtype == 'object':
        df_baf[col] = df_baf[col].fillna('Unknown')
    else:
        df_baf[col] = df_baf[col].fillna(df_baf[col].median())

# --- 2) Magnitude features for amount/balance-like columns -------------------
# log1p and sqrt are applied to the absolute value, so the sign of the source
# column is not reflected in the derived features.
derived_suffixes = ('_log', '_sqrt')
numerical_cols = df_baf.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    # Skip columns produced by a previous run of this cell; without this guard
    # a re-run would stack `*_log_log`, `*_sqrt_log`, ... on top of them.
    if col.endswith(derived_suffixes):
        continue
    if 'amount' in col.lower() or 'balance' in col.lower():
        magnitude = df_baf[col].abs()
        df_baf[f'{col}_log'] = np.log1p(magnitude)
        df_baf[f'{col}_sqrt'] = np.sqrt(magnitude)

# --- 3) Integer-encode categorical columns -----------------------------------
cat_cols = df_baf.select_dtypes(include=['object']).columns
cat_cols = [col for col in cat_cols if col != 'fraud_bool']

# One fitted encoder per source column, keyed by the original column name.
# Re-fitting a single shared encoder would overwrite its classes on every
# iteration and keep only the last column's mapping, making it impossible to
# apply the same encoding to new data later.
label_encoders_baf = {}
le = LabelEncoder()  # name kept defined even when there are no categorical columns
for col in cat_cols:
    le = LabelEncoder()
    df_baf[col + '_enc'] = le.fit_transform(df_baf[col].astype(str))
    label_encoders_baf[col] = le
    df_baf.drop(columns=[col], inplace=True)

print('BAF Shape after encoding:', df_baf.shape)


BAF Shape after encoding: (1000000, 34)


In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the `fraud_bool` target.
X_baf = df_baf.drop(['fraud_bool'], axis=1, errors='ignore')
y_baf = df_baf['fraud_bool']

# Use a 0/1 integer target if the column was loaded as booleans.
if y_baf.dtype == 'bool':
    y_baf = y_baf.astype(int)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# held-out rows contribute to the mean/variance used to transform the training
# data, leaking test-set information into training and into the reported
# metrics. The split is stratified on the target and seeded, so the same rows
# land in train/test as when splitting an already-scaled array.
X_train_baf_raw, X_test_baf_raw, y_train_baf, y_test_baf = train_test_split(
    X_baf, y_baf,
    test_size=0.2,
    stratify=y_baf,
    random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the test rows.
scaler_baf = StandardScaler()
X_train_baf = scaler_baf.fit_transform(X_train_baf_raw)
X_test_baf = scaler_baf.transform(X_test_baf_raw)

# Kept for any later cell that still refers to the fully scaled matrix; it is
# produced with the train-fitted scaler and is not used for training here.
X_baf_scaled = scaler_baf.transform(X_baf)

print("BAF Train shape:", X_train_baf.shape)
print("BAF Test shape:", X_test_baf.shape)


BAF Train shape: (800000, 33)
BAF Test shape: (200000, 33)


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (seeded for reproducibility).
# Only X_train_baf / y_train_baf are resampled; the test split is left as is.
smote_baf = SMOTE(random_state=42)
resampled_baf = smote_baf.fit_resample(X_train_baf, y_train_baf)
X_train_baf_sm, y_train_baf_sm = resampled_baf

# Report the share of positive (fraud) labels before and after resampling.
for ratio_label, fraud_ratio in (
    ("BAF Original fraud ratio:", y_train_baf.mean()),
    ("BAF Post-SMOTE fraud ratio:", y_train_baf_sm.mean()),
):
    print(ratio_label, fraud_ratio)
print("BAF SMOTE applied: X_train_baf_sm shape:", X_train_baf_sm.shape)


BAF Original fraud ratio: 0.01102875
BAF Post-SMOTE fraud ratio: 0.5
BAF SMOTE applied: X_train_baf_sm shape: (1582354, 33)


In [None]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Hyper-parameters for the three BAF base models, gathered in one place so they
# can be read and tuned without hunting through constructor calls.
xgb_params_baf = {
    'n_estimators': 300,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
rf_params_baf = {
    'n_estimators': 200,
    'max_depth': 10,
    'random_state': 42,
    'n_jobs': -1,
}
lr_params_baf = {
    'max_iter': 1000,
    'random_state': 42,
}

# 1) XGBoost, 2) Random Forest, 3) Logistic Regression for BAF
xgb_model_baf = xgb.XGBClassifier(**xgb_params_baf)
rf_model_baf = RandomForestClassifier(**rf_params_baf)
lr_model_baf = LogisticRegression(**lr_params_baf)

# Fit every model on the SMOTE-resampled training data, in the order above.
for estimator_baf in (xgb_model_baf, rf_model_baf, lr_model_baf):
    estimator_baf.fit(X_train_baf_sm, y_train_baf_sm)

print("BAF Models trained successfully.")


BAF Models trained successfully.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Hard 0/1 predictions of each base model on the held-out test split.
xgb_pred_baf = xgb_model_baf.predict(X_test_baf)
rf_pred_baf = rf_model_baf.predict(X_test_baf)
lr_pred_baf = lr_model_baf.predict(X_test_baf)

# Weighted vote over the hard predictions. With weights 0.4/0.3/0.3 the
# weighted sum can only be 0, 0.3, 0.4, 0.6, 0.7 or 1.0, so rounding yields 1
# exactly when at least two of the three models predict fraud.
ensemble_pred_baf = (
    0.4 * xgb_pred_baf +
    0.3 * rf_pred_baf +
    0.3 * lr_pred_baf
).round().astype(int)

# Continuous ensemble score: the same weights applied to each model's
# predicted probability of the positive class (column 1 of predict_proba).
ensemble_proba_baf = (
    0.4 * xgb_model_baf.predict_proba(X_test_baf)[:, 1] +
    0.3 * rf_model_baf.predict_proba(X_test_baf)[:, 1] +
    0.3 * lr_model_baf.predict_proba(X_test_baf)[:, 1]
)

print('BAF Ensemble Performance:')
print('Accuracy:', accuracy_score(y_test_baf, ensemble_pred_baf))
print('Precision:', precision_score(y_test_baf, ensemble_pred_baf))
print('Recall:', recall_score(y_test_baf, ensemble_pred_baf))
print('F1-score:', f1_score(y_test_baf, ensemble_pred_baf))
# AUC-ROC measures ranking quality and must be computed from continuous
# scores. Passing the thresholded 0/1 labels collapses the ROC curve to a
# single operating point and misreports the ensemble's discrimination.
print('AUC-ROC:', roc_auc_score(y_test_baf, ensemble_proba_baf))


BAF Ensemble Performance:
Accuracy: 0.92884
Precision: 0.08427820796460177
Recall: 0.5525838621940163
F1-score: 0.14625074985003
AUC-ROC: 0.7428101267955634


In [None]:
import pickle

# Bundle everything needed to score new data: the fitted scaler, the three
# fitted models, and the feature column order the scaler/models were fit on.
# NOTE(review): the encoders used for the categorical columns are not part of
# this package, so raw categorical values cannot be re-encoded from the pickle
# alone — confirm how inference code is expected to obtain that mapping.
model_package_baf = {
    'scaler': scaler_baf,
    'xgb': xgb_model_baf,
    'rf': rf_model_baf,
    'lr': lr_model_baf,
    'feature_names': list(X_baf.columns)
}

# SECURITY: pickle files execute arbitrary code when loaded; only unpickle
# this artifact from a trusted source.
with open('baf_fraud_detection_model.pkl', 'wb') as f:
    pickle.dump(model_package_baf, f)

print("BAF Model saved to baf_fraud_detection_model.pkl")

# The browser download helper only exists on Google Colab. Guard the import so
# the cell still completes (with the model saved to disk) in any other
# Jupyter environment instead of failing with ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; skipping browser download.")
else:
    files.download('baf_fraud_detection_model.pkl')


BAF Model saved to baf_fraud_detection_model.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>