In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [None]:
pip install mlflow dagshub

In [None]:
import dagshub
# Connect this notebook to the DagsHub repository named below.
# NOTE(review): mlflow=True presumably points the MLflow runs logged later in this
# notebook at that repository's tracking server — confirm against the dagshub docs.
dagshub.init(repo_owner='tvani2', repo_name='IEEE-CIS-Fraud-Detection', mlflow=True)

# Cleaning and engineering

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class FullPreprocessor(BaseEstimator, TransformerMixin):
    """Filter, merge, impute and encode the transaction / identity tables.

    ``fit`` learns, from the frame it is given:

    * which transaction columns have a missing-value share below
      ``transaction_thresh`` and which identity columns have one below
      ``identity_thresh`` (only those columns are kept);
    * a mean imputer for numeric columns and a most-frequent imputer for
      object columns;
    * a weight-of-evidence (WOE) mapping for every object column with more
      than three distinct values, where
      ``woe = ln(share of positives in level / share of negatives in level)``;
    * the levels of the remaining object columns, so that ``transform``
      always emits the same one-hot columns regardless of which levels the
      incoming frame happens to contain.

    ``transform`` replays those steps. WOE columns are emitted as
    ``<col>_woe`` and the original object columns are removed.

    Parameters
    ----------
    target_column : str, default 'isFraud'
        Name of the label column. Stored only; neither ``fit`` nor
        ``transform`` reads it.
    transaction_thresh : float, default 0.6
        Transaction columns are kept when their fraction of nulls is
        strictly below this value.
    identity_thresh : float, default 0.9
        Same rule, applied to the columns of ``identity_df``.
    identity_df : pandas.DataFrame or None, default None
        Identity table that is left-joined on ``'TransactionID'``. The same
        table is used by both ``fit`` and ``transform``, so rows of a frame
        passed to ``transform`` only receive identity values if their
        ``TransactionID`` is present in this table.
    """

    def __init__(self,
                 target_column='isFraud',
                 transaction_thresh=0.6,
                 identity_thresh=0.9,
                 identity_df=None):
        self.target_column = target_column
        self.transaction_thresh = transaction_thresh
        self.identity_thresh = identity_thresh
        self.identity_df = identity_df  # identity will be passed during initialization

    def _select_and_merge(self, X):
        """Keep the columns chosen in ``fit`` and left-join the identity table.

        Always returns a new frame, so later column assignments never write
        into (a view of) the caller's data.
        """
        kept = X[self.transaction_cols_to_keep]
        if self.identity_df is None:
            return kept.copy()
        identity_filtered = self.identity_df[self.identity_cols_to_keep]
        return kept.merge(identity_filtered, how='left', on='TransactionID')

    def fit(self, X, y=None):
        """Learn column filters, imputers, WOE mappings and one-hot levels.

        Parameters
        ----------
        X : pandas.DataFrame
            Transaction table (without the target column).
        y : array-like, optional
            Binary target, one value per row of ``X``. Required as soon as
            at least one column is WOE-encoded.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If WOE columns exist but ``y`` is missing, or if ``y`` does not
            have one entry per (merged) row.
        """
        # 1. Drop columns with too many missing values
        self.transaction_cols_to_keep = X.columns[X.isnull().mean() < self.transaction_thresh].tolist()
        if self.identity_df is not None:
            self.identity_cols_to_keep = self.identity_df.columns[
                self.identity_df.isnull().mean() < self.identity_thresh
            ].tolist()
        else:
            self.identity_cols_to_keep = []

        # 2. Merge
        X = self._select_and_merge(X)

        # 3. Separate numeric and categorical columns
        self.numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        self.categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

        # 4. Imputers (only fitted when there is something to fit on;
        #    SimpleImputer rejects a frame with zero columns)
        self.numeric_imputer = SimpleImputer(strategy='mean')
        self.categorical_imputer = SimpleImputer(strategy='most_frequent')
        if self.numeric_cols:
            self.numeric_imputer.fit(X[self.numeric_cols])
        if self.categorical_cols:
            self.categorical_imputer.fit(X[self.categorical_cols])

        # 5. Determine WOE and one-hot columns
        s = X[self.categorical_cols].nunique()
        self.woe_columns = list(s[s > 3].index)
        self.one_hot_columns = list(s[s <= 3].index)

        # Freeze the one-hot levels seen during fit. pd.Categorical derives the
        # levels the same way pd.get_dummies does for an object column, so the
        # training output is unchanged while later frames get identical columns.
        self.one_hot_categories = {
            col: pd.Categorical(X[col]).categories for col in self.one_hot_columns
        }

        # 6. Fit WOE mappings
        self.woe_mappings = {}
        self.woe_columns_fillna = {}
        if not self.woe_columns:
            return self
        if y is None:
            raise ValueError(
                "FullPreprocessor.fit needs y to compute WOE mappings for columns: "
                f"{self.woe_columns}"
            )

        # Attach the target by position: X may carry a non-default index (no
        # identity merge), in which case index alignment would scramble labels.
        target = np.asarray(y)
        if len(target) != len(X):
            raise ValueError(
                f"y has {len(target)} rows but the merged frame has {len(X)} rows"
            )
        df_woe = X[self.woe_columns].copy()
        df_woe['target'] = target

        for col in self.woe_columns:
            # Most frequent level; its WOE is the fallback for unseen levels.
            self.woe_columns_fillna[col] = df_woe[col].mode().iloc[0]

            # groupby skips rows where the level itself is missing.
            groups = df_woe.groupby(col)['target'].agg(['count', 'sum'])
            n_pos = groups['sum']
            n_neg = groups['count'] - n_pos

            prop_pos = n_pos / n_pos.sum()
            prop_neg = n_neg / n_neg.sum()

            # Levels with no positives or no negatives give +-inf / NaN; they
            # are mapped to a neutral WOE of 0 below.
            with np.errstate(divide='ignore', invalid='ignore'):
                woe = np.log(prop_pos / prop_neg)
            woe = woe.replace([np.inf, -np.inf], np.nan).fillna(0)
            self.woe_mappings[col] = woe.to_dict()

        return self

    def transform(self, X):
        """Apply the fitted filters, imputers and encoders to ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Transaction table containing at least the columns kept in ``fit``.

        Returns
        -------
        pandas.DataFrame
            New frame; the input is not modified.
        """
        # 1. Drop columns with too many missing values (and merge identity)
        X = self._select_and_merge(X)

        # 2. Impute missing values
        if self.numeric_cols:
            X[self.numeric_cols] = self.numeric_imputer.transform(X[self.numeric_cols])
        if self.categorical_cols:
            X[self.categorical_cols] = self.categorical_imputer.transform(X[self.categorical_cols])

        # 3. Apply WOE encoding; levels unseen in fit fall back to the WOE of
        #    the most frequent training level.
        for col in self.woe_columns:
            mapping = self.woe_mappings[col]
            fallback = mapping.get(self.woe_columns_fillna[col], 0)
            X[f'{col}_woe'] = X[col].map(mapping).fillna(fallback)

        # 4. One-hot encode against the levels frozen in fit. Levels unseen in
        #    fit become missing and are flagged in the `<col>_nan` indicator.
        if self.one_hot_columns:
            for col in self.one_hot_columns:
                X[col] = pd.Categorical(X[col], categories=self.one_hot_categories[col])
            X = pd.get_dummies(X, columns=self.one_hot_columns, drop_first=True, dummy_na=True)

        # 5. Drop original WOE and one-hot columns
        cols_to_drop = [col for col in (self.woe_columns + self.one_hot_columns) if col in X.columns]
        X = X.drop(columns=cols_to_drop)

        return X

In [None]:
from sklearn.impute import SimpleImputer

# Load data
# (both files appear in the /kaggle/input listing printed by the first cell)
transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')

# Separate target
target_column = 'isFraud'
y = transaction[target_column]
X = transaction.drop(columns=[target_column])

# Initialize preprocessor
# Thresholds are upper bounds on the fraction of nulls a column may have and still be kept.
# The identity table is handed over here; FullPreprocessor left-joins it onto the
# transactions on 'TransactionID' inside both fit and transform.
preprocessor = FullPreprocessor(
    target_column=target_column,
    transaction_thresh=0.6,
    identity_thresh=0.9,
    identity_df=identity
)

# Fit-transform
# NOTE(review): the preprocessor is fitted on every labelled row, and the train/test
# split only happens in the next cell. The WOE encodings and imputation statistics are
# therefore learned from rows that later land in the test split, which leaks target
# information into the hold-out metrics. Consider splitting first and fitting on the
# training rows only.
X_processed = preprocessor.fit_transform(X, y)

print(X_processed.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# share of fraud cases identical in both splits, which matters when the positive
# class is rare; random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y
)

# Feature selection

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

class CorrelationDropper(BaseEstimator, TransformerMixin):
    """Drop one feature out of every highly correlated pair.

    ``fit`` computes the absolute pairwise correlation of the numeric and
    boolean columns of ``X`` and marks a column for removal when it correlates
    above ``threshold`` with any column that appears *earlier* in ``X``. Of
    each correlated pair, the earlier column is therefore the one that stays.

    Parameters
    ----------
    threshold : float, default 0.9
        Absolute correlation strictly above which the later column of a
        pair is dropped.

    Attributes
    ----------
    to_drop_ : list of str
        Column names selected for removal. Only exists after ``fit``.
    """

    def __init__(self, threshold=0.9):
        # Only hyper-parameters are set here. Fitted state (trailing underscore)
        # is created in fit, so an unfitted instance is distinguishable.
        self.threshold = threshold

    def fit(self, X, y=None):
        """Find the columns to drop.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame. Non-numeric columns are ignored (never dropped).
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # 1. Calculate correlation matrix (numeric/bool columns only, so that
        #    stray object columns do not make DataFrame.corr raise)
        corr_matrix = X.select_dtypes(include=['number', 'bool']).corr().abs()

        # 2. Upper triangle of the correlation matrix (diagonal excluded), so
        #    every pair is looked at once
        upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
        upper = corr_matrix.where(upper_mask)

        # 3. Find features with correlation greater than threshold
        #    (NaN correlations compare as False and are never flagged)
        too_correlated = (upper > self.threshold).any(axis=0)
        self.to_drop_ = upper.columns[too_correlated].tolist()

        print(f"Columns to drop due to high correlation ({len(self.to_drop_)}): {self.to_drop_}")

        return self

    def transform(self, X):
        """Return ``X`` without the columns selected in ``fit``.

        Columns in ``to_drop_`` that are absent from ``X`` are skipped.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if getattr(self, 'to_drop_', None) is None:
            # Local import: the cell defining this class only imports sklearn.base.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This CorrelationDropper instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        # 4. Drop them
        return X.drop(columns=self.to_drop_, errors='ignore')

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the reduced frame."""
        return self.fit(X, y).transform(X)

In [None]:
# Learn the redundant columns from the training split only, then remove the same
# columns from both splits so they end up with identical features.
dropper = CorrelationDropper(threshold=0.9)
dropper.fit(X_train)

X_train_new = dropper.transform(X_train)
X_test_new = dropper.transform(X_test)

# Report the resulting shapes
for split_label, split_frame in (("Train set:", X_train_new), ("Test set:", X_test_new)):
    print(split_label, split_frame.shape)

# Training

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# 1. Build a simple pipeline
pipeline = Pipeline([
    ('model', DecisionTreeClassifier(max_depth=5, random_state=42, class_weight='balanced'))
])

# 2. Train on the correlation-filtered features from the feature-selection step.
#    Fitting on X_train instead would silently ignore that step.
pipeline.fit(X_train_new, y_train)

# 3. Predict (hard labels for the reports, fraud probability for ROC-AUC)
train_preds = pipeline.predict(X_train_new)
train_probs = pipeline.predict_proba(X_train_new)[:, 1]

test_preds = pipeline.predict(X_test_new)
test_probs = pipeline.predict_proba(X_test_new)[:, 1]

# 4. Evaluate
print("Train ROC-AUC:", roc_auc_score(y_train, train_probs))
print(classification_report(y_train, train_preds))
print(confusion_matrix(y_train, train_preds))

print("\nTest ROC-AUC:", roc_auc_score(y_test, test_probs))
print(classification_report(y_test, test_preds))
print(confusion_matrix(y_test, test_preds))

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# 1. Build the pipeline
# The hyper-parameters live in one dict so that the values logged to MLflow are
# guaranteed to be the ones the model was built with.
tree_params = {"max_depth": 5, "class_weight": "balanced", "random_state": 42}
pipeline = Pipeline([
    ('model', DecisionTreeClassifier(**tree_params))
])

# 2. Start an MLflow run
mlflow.set_experiment("Decision_Tree_Training")

with mlflow.start_run(run_name="Decision_Tree_Training_Run") as run:
    # 3. Train the model on the correlation-filtered features from the
    #    feature-selection step (X_train would ignore that step)
    pipeline.fit(X_train_new, y_train)

    # 4. Predict (hard labels for the reports, fraud probability for ROC-AUC)
    train_preds = pipeline.predict(X_train_new)
    train_probs = pipeline.predict_proba(X_train_new)[:, 1]

    test_preds = pipeline.predict(X_test_new)
    test_probs = pipeline.predict_proba(X_test_new)[:, 1]

    # 5. Metrics
    train_roc_auc = roc_auc_score(y_train, train_probs)
    test_roc_auc = roc_auc_score(y_test, test_probs)

    train_report = classification_report(y_train, train_preds, output_dict=True)
    test_report = classification_report(y_test, test_preds, output_dict=True)

    train_conf_matrix = confusion_matrix(y_train, train_preds)
    test_conf_matrix = confusion_matrix(y_test, test_preds)

    # 6. Log parameters
    mlflow.log_param("model_type", "DecisionTreeClassifier")
    for param_name, param_value in tree_params.items():
        mlflow.log_param(param_name, param_value)

    # 7. Log metrics (ROC-AUC first)
    mlflow.log_metric("train_roc_auc", train_roc_auc)
    mlflow.log_metric("test_roc_auc", test_roc_auc)

    # 8. Log classification report metrics for both splits
    for split_name, report in (("train", train_report), ("test", test_report)):
        for label, metrics in report.items():
            if isinstance(metrics, dict):  # Skip 'accuracy' which is a float
                mlflow.log_metric(f"{split_name}_precision_{label}", metrics['precision'])
                mlflow.log_metric(f"{split_name}_recall_{label}", metrics['recall'])
                mlflow.log_metric(f"{split_name}_f1_score_{label}", metrics['f1-score'])

    # 9. Save confusion matrices as .npy files and log them as artifacts
    for split_name, matrix in (("train", train_conf_matrix), ("test", test_conf_matrix)):
        artifact_file = f"{split_name}_confusion_matrix.npy"
        np.save(artifact_file, matrix)
        mlflow.log_artifact(artifact_file)

    # 10. Save the model itself
    mlflow.sklearn.log_model(pipeline, "model")

print("✅ Finished training and logging to MLflow!")