In [1]:
# The following code will only execute
# successfully when compression is complete

import kagglehub

# Download latest version
path = kagglehub.dataset_download("iamsantoshsoni/ieee-cis-preprocessed-data-set")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'ieee-cis-preprocessed-data-set' dataset.
Path to dataset files: /kaggle/input/ieee-cis-preprocessed-data-set


In [2]:
import os
import pandas as pd

os.listdir(path)

Preprocessed_Train_df = pd.read_csv(path + "/Preprocessed_Train_df.csv")
Preprocessed_Test_df = pd.read_csv(path + "/Preprocessed_Test_df.csv")

In [3]:
Train_df = pd.concat([Preprocessed_Train_df,Preprocessed_Test_df], axis=0, ignore_index=True)
Train_df.shape

(590540, 414)

In [12]:
Train_df.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,addr1,addr2,dist1,dist2,C1,C2,C3,...,pca94,pca95,pca96,pca97,pca98,pca99,pca100,pca101,pca102,isFraud
0,3355246.0,9152120.0,44.5,226.0,87.0,-999.0,-999.0,1.0,1.0,0.0,...,0.01503,-0.01468,-0.039767,-0.012655,0.013793,0.012857,0.007238,-0.022553,-0.090133,0.0
1,3041843.0,1281492.0,339.95,476.0,87.0,1.0,-999.0,1.0,1.0,0.0,...,-0.009748,0.047296,0.075811,-0.078722,-0.044692,-0.058389,-0.022595,-0.046734,-0.011106,0.0
2,3104188.0,2281164.0,58.95,123.0,87.0,-999.0,-999.0,4.0,4.0,0.0,...,0.10602,-0.044436,-0.383793,0.247934,0.391686,0.211806,-0.107036,0.103106,-0.215461,0.0
3,3448091.0,11841531.0,67.95,204.0,87.0,13.0,-999.0,6.0,4.0,0.0,...,0.059155,-0.364216,-0.09634,0.324661,0.224111,0.273505,-0.199379,-0.088903,0.168881,0.0
4,3053846.0,1525539.0,150.0,325.0,87.0,-999.0,-999.0,2.0,2.0,0.0,...,0.871724,-0.15622,-0.398141,0.590618,0.496588,0.509823,-0.303707,-0.350005,0.094626,0.0


In [4]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train,y_test = train_test_split(Train_df.iloc[:,:-1], Train_df.iloc[:,-1], test_size=0.18, shuffle=True, random_state=6)

X_test,X_val, y_test,y_val = train_test_split(X_test,y_test, test_size=0.5, shuffle=True, random_state=6)


In [5]:
%pip install mlflow dagshub lightgbm catboost



In [6]:
import dagshub  
import mlflow


mlflow.set_tracking_uri('https://dagshub.com/santosh4thmarch/IEEE-CIS-Fraud-detection.mlflow')
dagshub.init(repo_owner='santosh4thmarch', repo_name='IEEE-CIS-Fraud-detection',mlflow=True)

In [None]:
import os
import mlflow
import mlflow.sklearn
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (precision_score, f1_score, roc_auc_score, 
                             accuracy_score, recall_score, average_precision_score)

# =============================================================================
# 1. EXPERIMENT SETUP
# =============================================================================
mlflow.set_experiment("Models_comparison_v4")

# =============================================================================
# 2. HYPERPARAMETERS
# =============================================================================

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',
    'scale_pos_weight': 27,
    'max_depth': 10,
    'min_child_weight': 5,
    'learning_rate': 0.02,
    'n_estimators': 3000,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'reg_alpha': 1,
    'reg_lambda': 2,
    'early_stopping_rounds': 80   
}

lgbm_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'is_unbalance': True,
    'num_leaves': 256,
    'min_data_in_leaf': 40,
    'max_depth': -1,
    'learning_rate': 0.01,
    'n_estimators': 2000,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'device': 'gpu',
    'n_jobs': -1,
}

cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'task_type': 'GPU',
    'depth': 8,
    'learning_rate': 0.03,
    'iterations': 2000,
    'auto_class_weights': 'Balanced',
    'l2_leaf_reg': 3,
    'border_count': 254,
    'random_seed': 42,
}

RandomF_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    'class_weight': 'balanced',
    'n_jobs': -1,
    'random_state': 6,
    'verbose': 0
}

# =============================================================================
# 3. MODEL CONFIGURATION
# =============================================================================

model_configs = [
    {
        "name": "XGBoost",
        "model": XGBClassifier(**xgb_params),
        "params": xgb_params
    },
    {
        "name": "LGBM",
        "model": LGBMClassifier(**lgbm_params),
        "params": lgbm_params
    },
    {
        "name": "CatBoost",
        "model": CatBoostClassifier(**cat_params),
        "params": cat_params
    },
    {
        "name": "RandomForest",
        "model": RandomForestClassifier(**RandomF_params),
        "params": RandomF_params
    }
]

# =============================================================================
# 4. TRAINING LOOP
# =============================================================================

print("Starting training loop...")

for config in model_configs:
    name, model, params = config['name'], config['model'], config['params']

    # Start mlflow run
    with mlflow.start_run(run_name=name):
        
        # --- A. Log Features (New Requirement) ---
        # Logs the list of column names to a file 'features.json' in MLflow
        if hasattr(X_train, 'columns'):
            mlflow.log_dict({"features": X_train.columns.tolist()}, "features.json")
        else:
            print(f"Warning: X_train does not have .columns attribute. Skipping feature logging for {name}.")

        # --- B. Log Code Artifact ---
        if os.path.exists('exp1.ipynb'):
            mlflow.log_artifact('exp1.ipynb')
        else:
            print("Warning: exp1.ipynb not found. Skipping artifact logging.")

        # --- C. Model Training ---
        if name == 'XGBoost':
            print(f"Training {name}...")
            model.fit(
                X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                verbose=False
            )
            print(f'Trained {name}!')

        elif name == 'LGBM':
            print(f"Training {name}...")
            call_backs = [early_stopping(stopping_rounds=80), log_evaluation(period=0)]
            model.fit(
                X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                callbacks=call_backs
            )

        elif name == 'CatBoost':
            continue 
            print(f"Training {name}...")
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                early_stopping_rounds=80,
                verbose=False
            )

        elif name == 'RandomForest':
            print(f"Training {name}...")
            model.fit(X_train, y_train)
            print(f'Trained {name}!')

        # --- D. Log Best Iteration ---
        if hasattr(model, 'best_iteration_'):
            mlflow.log_param("best_iteration", model.best_iteration_)
        elif hasattr(model, 'best_iteration'):
            mlflow.log_param("best_iteration", model.best_iteration)

        # --- E. Evaluation & Metrics ---
        y_pred = model.predict(X_val)
        y_pred_proba = model.predict_proba(X_val)[:, 1]

        precison = precision_score(y_val, y_pred)
        recall = recall_score(y_val, y_pred)
        F1 = f1_score(y_val, y_pred)
        roc = roc_auc_score(y_val, y_pred_proba)
        prauc = average_precision_score(y_val, y_pred_proba)

        # Log params
        mlflow.log_params(params)

        # Log metrics
        metrics = {
            "Precision": precison,
            'Recall': recall,
            'F1': F1,
            'AUC_ROC': roc,
            'PR-AUC': prauc
        }
        mlflow.log_metrics(metrics)

        # --- F. Log Model ---
        mlflow.sklearn.log_model(model, name)

        print(f"Finished {name} - Best Iteration: {model.best_iteration if hasattr(model, 'best_iteration') else getattr(model, 'best_iteration_', 'N/A')}")
        print(f"Model trained! with F1:{F1:.3f}\nAUC-ROC:{roc}\nPR-AUC:{prauc:.3f}")

2026/01/27 12:18:36 INFO mlflow.tracking.fluent: Experiment with name 'Models_comparison_v5' does not exist. Creating a new experiment.


Starting training loop...
Training XGBoost with Live Tracking...
Finished Training XGBoost


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


Logged XGBoost: F1: 0.823 | AUC: 0.974
üèÉ View run XGBoost at: https://dagshub.com/santosh4thmarch/IEEE-CIS-Fraud-detection.mlflow/#/experiments/5/runs/f88872e2603c4a83b8f1acd15223160d
üß™ View experiment at: https://dagshub.com/santosh4thmarch/IEEE-CIS-Fraud-detection.mlflow/#/experiments/5
üèÉ View run LGBM at: https://dagshub.com/santosh4thmarch/IEEE-CIS-Fraud-detection.mlflow/#/experiments/5/runs/e2b9309e17574a0cb69552e74ba4a477
üß™ View experiment at: https://dagshub.com/santosh4thmarch/IEEE-CIS-Fraud-detection.mlflow/#/experiments/5
Training CatBoost...


Default metric period is 5 because AUC is/are not implemented for GPU


Finished Training CatBoost




Logged CatBoost: F1: 0.580 | AUC: 0.964
üèÉ View run CatBoost at: https://dagshub.com/santosh4thmarch/IEEE-CIS-Fraud-detection.mlflow/#/experiments/5/runs/55124cd38ff44a3096a644d5907a6e61
üß™ View experiment at: https://dagshub.com/santosh4thmarch/IEEE-CIS-Fraud-detection.mlflow/#/experiments/5
Training RandomForest...
