In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [None]:
pip install mlflow dagshub

In [None]:
import dagshub
# Initialise the DagsHub client for this repository with its MLflow option enabled.
# NOTE(review): presumably this also points MLflow tracking at the DagsHub repo - confirm.
dagshub.init(
    repo_owner='tvani2',
    repo_name='IEEE-CIS-Fraud-Detection',
    mlflow=True,
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class FullPreprocessor(BaseEstimator, TransformerMixin):
    """Filter sparse columns, join identity data, impute, and encode categoricals.

    ``fit`` learns, in order:

    1. which transaction / identity columns to keep (fraction of missing
       values strictly below the respective threshold),
    2. a mean imputer for numeric columns and a most-frequent imputer for
       object-dtype columns,
    3. the encoding plan for object-dtype columns: more than three distinct
       values -> weight of evidence (WOE), otherwise one-hot,
    4. the per-category WOE values, computed from ``y``.

    ``transform`` replays those steps on a frame.

    Parameters
    ----------
    target_column : str, default 'isFraud'
        Name of the label column. Only stored; this class never reads it.
    transaction_thresh : float, default 0.6
        Transaction columns whose missing-value fraction is >= this value
        are dropped.
    identity_thresh : float, default 0.9
        Same rule, applied to the columns of ``identity_df``.
    identity_df : pandas.DataFrame or None, default None
        Optional identity table, left-joined on ``TransactionID``. The table
        given here is used by both ``fit`` and ``transform``, so rows of any
        frame passed to ``transform`` are looked up in this same table.
    """

    def __init__(self,
                 target_column='isFraud',
                 transaction_thresh=0.6,
                 identity_thresh=0.9,
                 identity_df=None):
        self.target_column = target_column
        self.transaction_thresh = transaction_thresh
        self.identity_thresh = identity_thresh
        self.identity_df = identity_df  # identity will be passed during initialization

    def _select_and_merge(self, X):
        """Return a new frame with the kept columns and, if configured, the identity join."""
        kept = X[self.transaction_cols_to_keep]
        if self.identity_df is None:
            # Copy so later column assignments never write into the caller's frame.
            return kept.copy()
        identity_filtered = self.identity_df[self.identity_cols_to_keep]
        return kept.merge(identity_filtered, how='left', on='TransactionID')

    def fit(self, X, y=None):
        """Learn column selection, imputers and categorical encodings.

        Parameters
        ----------
        X : pandas.DataFrame
            Transaction features. Must contain ``TransactionID`` when an
            identity table was supplied.
        y : array-like or None
            Binary target, positionally aligned with the rows of ``X``.
            Required whenever at least one column is WOE-encoded.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If WOE columns exist but ``y`` is missing, or if ``y`` does not
            have one entry per row of the (merged) frame.
        """
        # 1. Drop columns with too many missing values
        self.transaction_cols_to_keep = X.columns[X.isnull().mean() < self.transaction_thresh].tolist()
        if self.identity_df is not None:
            self.identity_cols_to_keep = self.identity_df.columns[
                self.identity_df.isnull().mean() < self.identity_thresh
            ].tolist()
        else:
            self.identity_cols_to_keep = []

        # 2. Merge
        X = self._select_and_merge(X)

        # 3. Separate numeric and categorical columns
        self.numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        self.categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

        # 4. Imputers (skipped for an empty column group: there is nothing to fit)
        self.numeric_imputer = SimpleImputer(strategy='mean')
        self.categorical_imputer = SimpleImputer(strategy='most_frequent')
        if self.numeric_cols:
            self.numeric_imputer.fit(X[self.numeric_cols])
        if self.categorical_cols:
            self.categorical_imputer.fit(X[self.categorical_cols])

        # 5. Determine WOE and one-hot columns from the number of distinct values
        cardinality = X[self.categorical_cols].nunique()
        self.woe_columns = list(cardinality[cardinality > 3].index)
        self.one_hot_columns = list(cardinality[cardinality <= 3].index)

        # 6. Fit WOE mappings
        self.woe_mappings = {}
        self.woe_columns_fillna = {}
        if not self.woe_columns:
            return self

        if y is None:
            raise ValueError(
                "y is required to fit WOE encodings for columns: "
                f"{self.woe_columns}"
            )

        # Attach the target by position: the merge above resets the row index,
        # while the no-identity path keeps the caller's index, so label-based
        # alignment would not be reliable in both cases.
        target = np.asarray(y)
        if len(target) != len(X):
            raise ValueError(
                f"y has {len(target)} entries but the prepared frame has {len(X)} rows"
            )
        df_woe = X[self.woe_columns].copy()
        df_woe['target'] = target

        for col in self.woe_columns:
            # Rows where `col` is missing are left out by groupby.
            groups = df_woe.groupby(col)['target'].agg(['count', 'sum'])
            n_pos = groups['sum']
            n_neg = groups['count'] - n_pos

            # WOE = ln( share of positives in category / share of negatives in category )
            with np.errstate(divide='ignore', invalid='ignore'):
                woe = np.log((n_pos / n_pos.sum()) / (n_neg / n_neg.sum()))

            # Categories with an undefined WOE (no positives or no negatives) get 0.
            woe = woe.replace([np.inf, -np.inf], 0).fillna(0)
            self.woe_mappings[col] = woe.to_dict()

            # Most frequent category; its WOE is the fallback for unseen values.
            self.woe_columns_fillna[col] = df_woe[col].mode().iloc[0]

        return self

    def transform(self, X):
        """Apply the fitted preprocessing and return a new, encoded frame.

        The input frame is not modified. When an identity table is configured
        the result comes from a merge and therefore has a fresh row index.
        """
        # 1. Keep the fitted columns and join identity data
        X = self._select_and_merge(X)

        # 2. Impute missing values
        if self.numeric_cols:
            X[self.numeric_cols] = self.numeric_imputer.transform(X[self.numeric_cols])
        if self.categorical_cols:
            X[self.categorical_cols] = self.categorical_imputer.transform(X[self.categorical_cols])

        # 3. Apply WOE encoding; categories not seen during fit fall back to
        #    the WOE of the most frequent training category (0 if unavailable)
        for col in self.woe_columns:
            mapping = self.woe_mappings[col]
            fallback = mapping.get(self.woe_columns_fillna[col], 0)
            X[f'{col}_woe'] = X[col].map(mapping).fillna(fallback)

        # 4. One-hot encode
        X = pd.get_dummies(X, columns=self.one_hot_columns, drop_first=True, dummy_na=True)

        # 5. Drop original WOE and one-hot columns that are still present
        cols_to_drop = [col for col in (self.woe_columns + self.one_hot_columns) if col in X.columns]
        return X.drop(columns=cols_to_drop)

In [None]:
from sklearn.impute import SimpleImputer

# Read the two training tables from the Kaggle input directory.
transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')

# Split the label column off from the features.
target_column = 'isFraud'
X = transaction.drop(columns=[target_column])
y = transaction[target_column]

# Configure the preprocessor; the identity table is handed over at construction time.
preprocessor = FullPreprocessor(
    identity_df=identity,
    target_column=target_column,
    transaction_thresh=0.6,
    identity_thresh=0.9,
)

# Fit on the whole table and transform it in the same step.
# NOTE(review): the fit uses y for every row, including rows that the later
# train_test_split holds out, so the target-based encodings have already seen
# the held-out labels. Consider splitting before fitting.
X_processed = preprocessor.fit_transform(X, y)

print(X_processed.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)

# Training

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

def build_adaboost_pipeline(param_grid=None, cv=2, verbose=0):
    """Build a grid search over a one-step AdaBoost pipeline.

    Called without arguments this is the fast configuration: a single
    hyperparameter combination evaluated with 2-fold cross-validation.

    Parameters
    ----------
    param_grid : dict or None
        Grid handed to ``GridSearchCV``; keys address the pipeline step, e.g.
        ``'model__n_estimators'``. ``None`` selects the single-combination
        grid ``n_estimators=30, learning_rate=1.0``.
    cv : int, default 2
        Cross-validation setting forwarded to ``GridSearchCV``.
    verbose : int, default 0
        Verbosity forwarded to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        Unfitted search object scored with ROC-AUC.
    """
    pipeline = Pipeline([
        ('model', AdaBoostClassifier(random_state=42))
    ])

    if param_grid is None:
        # Minimal hyperparameter grid (single values only)
        param_grid = {
            'model__n_estimators': [30],
            'model__learning_rate': [1.0],   # 1.0 is also the estimator default
        }

    return GridSearchCV(
        pipeline,
        param_grid=param_grid,
        scoring='roc_auc',
        cv=cv,
        verbose=verbose,
        n_jobs=-1,      # use all cores
        refit=True,     # refit the best combination on all the data passed to fit
    )

# Build the (unfitted) grid-search object.
model = build_adaboost_pipeline()

# Select the MLflow experiment that receives the run.
mlflow.set_experiment("AdaBoost_Training")

with mlflow.start_run(run_name="AdaBoost_Training"):
    # Run the cross-validated search on the training split.
    model.fit(X_train, y_train)

    # Evaluate on the held-out split only, using the probability in column 1.
    test_probs = model.predict_proba(X_test)[:, 1]
    test_roc_auc = roc_auc_score(y_test, test_probs)

    # Record the selected hyperparameters and the hold-out score.
    mlflow.log_params(model.best_params_)
    mlflow.log_metric("test_roc_auc", test_roc_auc)

    # Log the refitted best pipeline to the run under the name "model".
    mlflow.sklearn.log_model(
        model.best_estimator_,
        "model",
        serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
    )

    print(f"Test ROC-AUC: {test_roc_auc:.4f}")
    print(f"Best params: {model.best_params_}")

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

def build_adaboost_pipeline(param_grid=None, cv=3, verbose=1):
    """Build a grid search over a one-step AdaBoost pipeline (extended grid).

    Called without arguments this searches 3 x 4 combinations of
    ``n_estimators`` and ``learning_rate`` with 3-fold cross-validation.

    Parameters
    ----------
    param_grid : dict or None
        Grid handed to ``GridSearchCV``; keys address the pipeline step, e.g.
        ``'model__n_estimators'``. ``None`` selects the extended grid below.
    cv : int, default 3
        Cross-validation setting forwarded to ``GridSearchCV``.
    verbose : int, default 1
        Verbosity forwarded to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        Unfitted search object scored with ROC-AUC.
    """
    pipeline = Pipeline([
        ('model', AdaBoostClassifier(random_state=42))
    ])

    if param_grid is None:
        # Extended hyperparameter grid
        param_grid = {
            'model__n_estimators': [30, 50, 100],          # number of boosting rounds
            'model__learning_rate': [0.01, 0.1, 0.5, 1.0],  # learning rates
        }

    return GridSearchCV(
        pipeline,
        param_grid=param_grid,
        scoring='roc_auc',
        cv=cv,          # default 3: tradeoff between speed and reliability
        verbose=verbose,
        n_jobs=-1,      # use all cores
        refit=True,     # refit the best combination on all the data passed to fit
    )

# Create the search object from build_adaboost_pipeline.
model = build_adaboost_pipeline()

# Runs are grouped under this MLflow experiment.
mlflow.set_experiment("AdaBoost_Training")

with mlflow.start_run(run_name="AdaBoost_Training"):
    # Cross-validated hyperparameter search on the training split.
    model.fit(X_train, y_train)

    # Hold-out evaluation: ROC-AUC of the column-1 probabilities.
    test_probs = model.predict_proba(X_test)[:, 1]
    test_roc_auc = roc_auc_score(y_test, test_probs)

    # Log the winning hyperparameters together with the hold-out metric.
    mlflow.log_params(model.best_params_)
    mlflow.log_metric("test_roc_auc", test_roc_auc)

    # Store the best refitted pipeline with the run, under the name "model".
    mlflow.sklearn.log_model(
        model.best_estimator_,
        "model",
        serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
    )

    print(f"Test ROC-AUC: {test_roc_auc:.4f}")
    print(f"Best params: {model.best_params_}")