In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
!pip install dagshub mlflow

Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1.6.0 (from dagshub)
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting gql[requests] (from dagshub)
  Downloading gql-3.5.2-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting treelib>=1.6.4 (from dagshub)
  Downloading treelib-1.7.1-py3-none-any.whl.metadata (1.4 kB)
Collecting pathvalidate>=3.0.0 (from dagshub)
  Downloading pathvalidate-3.2.3-py3-none-any.whl.metadata (12 kB)
Collecting dagshub-annotation-converter>=0.1.5 (from dagshub)
  Downloading dagshub_annotation_converter-0.1.9-py3-none-any.whl.metadata (2.5 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloadin

In [3]:
import dagshub
# Initialise the DagsHub client for the project repository with its MLflow integration enabled.
dagshub.init(
    repo_name='ML_Fraud-Detection',
    repo_owner='zeliz22',
    mlflow=True,
)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=57349f39-5612-471e-b287-a02c51749bb9&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=175d294b60223e7b27051057c5ee95ec5072f3500cc877b2eb65331e5eafebfd




Output()

In [4]:
# Load the two training tables (transactions and identity) from the competition input directory.
train_transaction, train_identity = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ieee-fraud-detection/train_transaction.csv',
        '/kaggle/input/ieee-fraud-detection/train_identity.csv',
    )
)

In [5]:
# Left join on TransactionID: every transaction row is kept, and identity columns
# are NaN for transactions that have no matching identity row.
train_merged = train_transaction.merge(train_identity, how="left", on="TransactionID")

print(train_merged.shape)

(590540, 434)


In [6]:
from sklearn.model_selection import train_test_split
def prepare_data(df, target='isFraud', test_size=0.15, random_state=42):
    """Split ``df`` into stratified train / validation / test features and labels.

    The test part is split off first using ``test_size``. The remainder is then
    split again with a rescaled fraction so that the validation part is also
    ``test_size`` of the original number of rows. Both splits are stratified on
    ``target`` and use the same ``random_state``.

    The ``target`` column and the ``'TransactionID'`` column are removed from
    every returned feature frame.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    remainder, test_part = train_test_split(
        df, test_size=test_size, stratify=df[target], random_state=random_state
    )

    # Express the validation share relative to what is left after the test split.
    val_fraction = test_size / (1 - test_size)
    train_part, val_part = train_test_split(
        remainder,
        test_size=val_fraction,
        stratify=remainder[target],
        random_state=random_state,
    )

    parts = (train_part, val_part, test_part)
    non_feature_cols = [target, 'TransactionID']
    features = [part.drop(columns=non_feature_cols) for part in parts]
    labels = [part[target] for part in parts]

    return (*features, *labels)

X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(train_merged)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
from typing import Dict, Optional, Union

class DataCleaner(BaseEstimator, TransformerMixin):
    """Drop mostly-missing columns, then impute (or drop rows for) the remaining NaNs.

    Parameters
    ----------
    numeric_strategy : str, default 'mean'
        One of 'mean', 'median', 'constant' (uses ``numeric_fill_value``) or
        'drop' (rows with a NaN in a numeric column are removed in ``transform``).
    categorical_strategy : str, default 'most_frequent'
        One of 'most_frequent', 'constant' (uses ``categorical_fill_value``) or
        'drop' (rows with a NaN in a non-numeric column are removed in ``transform``).
    numeric_fill_value : int or float, optional
        Fill value for ``numeric_strategy='constant'``.
    categorical_fill_value : str, optional
        Fill value for ``categorical_strategy='constant'``. Also used as the
        fallback for 'most_frequent' when a column has no non-null value.
    drop_threshold : float, default 0.8
        Columns whose fraction of nulls is strictly greater than this are dropped.

    Attributes
    ----------
    columns_to_drop_ : list
        Columns removed because of ``drop_threshold`` (learned in ``fit``).
    numeric_impute_values_ : dict
        Column -> fill value for numeric columns (learned in ``fit``).
    categorical_impute_values_ : dict
        Column -> fill value for non-numeric columns (learned in ``fit``).

    Notes
    -----
    The 'drop' strategies remove rows in ``transform`` while ``y`` is not touched,
    so the output no longer has the same number of rows as the labels.
    """

    def __init__(self, 
                 numeric_strategy: str = 'mean',
                 categorical_strategy: str = 'most_frequent',
                 numeric_fill_value: Optional[Union[int, float]] = None,
                 categorical_fill_value: Optional[str] = None,
                 drop_threshold: float = 0.8):

        self.numeric_strategy = numeric_strategy
        self.categorical_strategy = categorical_strategy
        self.numeric_fill_value = numeric_fill_value
        self.categorical_fill_value = categorical_fill_value
        self.drop_threshold = drop_threshold
        # Learned state. The attributes are created here so they exist before
        # fitting; fit() rebuilds all of them from scratch.
        self.numeric_impute_values_ = {}
        self.categorical_impute_values_ = {}
        self.columns_to_drop_ = []

    def fit(self, X: pd.DataFrame, y=None):
        """Learn which columns to drop and the imputation value of every other column.

        Raises
        ------
        ValueError
            If a strategy name is unknown, or a 'constant' strategy is selected
            without its fill value (only checked when a column of that kind exists).
        """
        # Start from a clean slate: without this, values learned by a previous
        # fit() would survive for columns that are now dropped or handled differently.
        self.numeric_impute_values_ = {}
        self.categorical_impute_values_ = {}

        # Identify columns to drop
        null_ratios = X.isnull().mean()
        self.columns_to_drop_ = list(null_ratios[null_ratios > self.drop_threshold].index)
        X_clean = X.drop(columns=self.columns_to_drop_)

        # Separate numeric and categorical columns
        numeric_cols = X_clean.select_dtypes(include=np.number).columns
        categorical_cols = X_clean.select_dtypes(exclude=np.number).columns

        # Calculate numeric imputation values ('drop' learns nothing)
        for col in numeric_cols:
            if self.numeric_strategy == 'mean':
                self.numeric_impute_values_[col] = X_clean[col].mean()
            elif self.numeric_strategy == 'median':
                self.numeric_impute_values_[col] = X_clean[col].median()
            elif self.numeric_strategy == 'constant':
                if self.numeric_fill_value is None:
                    raise ValueError("numeric_fill_value must be specified for constant strategy")
                self.numeric_impute_values_[col] = self.numeric_fill_value
            elif self.numeric_strategy != 'drop':
                raise ValueError(f"Unknown numeric strategy: {self.numeric_strategy}")

        # Calculate categorical imputation values ('drop' learns nothing)
        for col in categorical_cols:
            if self.categorical_strategy == 'most_frequent':
                modes = X_clean[col].mode()
                if not modes.empty:
                    self.categorical_impute_values_[col] = modes.iloc[0]
                elif self.categorical_fill_value is not None:
                    # Column has no non-null value at all (possible when
                    # drop_threshold >= 1): fall back to the explicit fill value
                    # instead of failing on an empty mode.
                    self.categorical_impute_values_[col] = self.categorical_fill_value
            elif self.categorical_strategy == 'constant':
                if self.categorical_fill_value is None:
                    raise ValueError("categorical_fill_value must be specified for constant strategy")
                self.categorical_impute_values_[col] = self.categorical_fill_value
            elif self.categorical_strategy != 'drop':
                raise ValueError(f"Unknown categorical strategy: {self.categorical_strategy}")

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the learned column drops and imputation to ``X``; ``X`` is not modified."""

        # Drop high-null columns (drop() returns a new frame, so the caller's X is untouched)
        X_clean = X.drop(columns=self.columns_to_drop_)

        # Column kinds are re-detected on the incoming data
        numeric_cols = X_clean.select_dtypes(include=np.number).columns
        categorical_cols = X_clean.select_dtypes(exclude=np.number).columns

        column_groups = (
            (numeric_cols, self.numeric_impute_values_, self.numeric_strategy),
            (categorical_cols, self.categorical_impute_values_, self.categorical_strategy),
        )
        for cols, learned_values, strategy in column_groups:
            for col in cols:
                if col in learned_values:
                    X_clean[col] = X_clean[col].fillna(learned_values[col])
                elif strategy == 'drop':
                    # No fill value for this column: remove the rows where it is missing
                    X_clean = X_clean.dropna(subset=[col])

        return X_clean

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit and transform in one step"""
        return self.fit(X, y).transform(X)

In [8]:
import mlflow
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class AdvancedDataCleaner(BaseEstimator, TransformerMixin):
    """Drop columns above a missing-value ratio and fill every remaining NaN with a sentinel.

    Numeric columns are filled with ``-999`` and all other columns with ``"MISSING"``.
    A column is dropped only when its missing ratio is strictly greater than
    ``missing_threshold``; with the default of 1 no ratio can exceed it, so no
    column is dropped.
    """

    def __init__(self, missing_threshold=1):
        self.missing_threshold = missing_threshold
        self.numeric_fill = -999
        self.categorical_fill = "MISSING"
        self.columns_dropped = []
        self.missing_stats = {}

    def fit(self, X, y=None):
        """Record the per-column missing ratio and decide which columns to drop."""
        ratios = X.isnull().mean()
        self.missing_stats = ratios.to_dict()

        above_threshold = ratios > self.missing_threshold
        self.columns_dropped = list(ratios[above_threshold].index)
        self.columns_kept = [name for name in X.columns if name not in self.columns_dropped]

        return self

    def transform(self, X):
        """Return a copy of ``X`` without the dropped columns and with NaNs filled."""
        cleaned = X.copy().drop(columns=self.columns_dropped)

        numeric = cleaned.select_dtypes(include=['number']).columns
        non_numeric = cleaned.select_dtypes(exclude=['number']).columns

        cleaned[numeric] = cleaned[numeric].fillna(self.numeric_fill)
        cleaned[non_numeric] = cleaned[non_numeric].fillna(self.categorical_fill)

        return cleaned

In [9]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class MissingValueHandler(BaseEstimator, TransformerMixin):
    """Impute missing values and optionally add per-column missing-indicator flags."""

    def __init__(self, num_strategy='median', cat_strategy='most_frequent', 
                 create_flags=True, flag_threshold=0.01, flag_only=False):
        """
        Parameters
        ----------
        num_strategy : 'median' or 'mean'; any other value fills numeric columns with 0.
        cat_strategy : 'most_frequent'; any other value fills with the string 'missing'.
        create_flags : add a ``<col>_missing_flag`` int8 column for flagged columns.
        flag_threshold : minimum missing ratio (inclusive) for a column to get a flag.
        flag_only : when True, only flags are produced and no value is imputed.
        """
        self.num_strategy = num_strategy
        self.cat_strategy = cat_strategy
        self.create_flags = create_flags
        self.flag_threshold = flag_threshold
        self.flag_only = flag_only

    def _numeric_fill(self, series):
        """Fill value for a numeric column according to ``num_strategy``."""
        if self.num_strategy == 'median':
            return series.median()
        if self.num_strategy == 'mean':
            return series.mean()
        return 0  # constant

    def _categorical_fill(self, series):
        """Fill value for an object/category column according to ``cat_strategy``."""
        if self.cat_strategy != 'most_frequent':
            return 'missing'
        modes = series.mode()
        # mode() is empty when the column has no non-null value
        return modes[0] if not modes.empty else 'missing'

    def fit(self, X, y=None):
        """Learn which columns get a flag and which value fills each column with NaNs."""
        self.num_cols_ = X.select_dtypes(include=np.number).columns.tolist()
        self.cat_cols_ = X.select_dtypes(include=['object', 'category']).columns.tolist()

        self.impute_values_ = {}
        self.flag_cols_ = []

        for col in X.columns:
            series = X[col]
            missing_ratio = series.isna().mean()

            # Only columns that actually contain missing values are processed
            # (small epsilon instead of an exact comparison with 0).
            if not (missing_ratio > 1e-6):
                continue

            if self.create_flags and missing_ratio >= self.flag_threshold:
                self.flag_cols_.append(col)

            if self.flag_only:
                continue

            if col in self.num_cols_:
                self.impute_values_[col] = self._numeric_fill(series)
            elif col in self.cat_cols_:
                self.impute_values_[col] = self._categorical_fill(series)

        return self

    def transform(self, X):
        """Return a copy of ``X`` with flag columns appended and learned values imputed."""
        result = X.copy()

        if self.create_flags and self.flag_cols_:
            # All flags are built first and attached with a single concat
            flags = pd.DataFrame({
                f'{col}_missing_flag': result[col].isna().astype(np.int8)
                for col in self.flag_cols_
            })
            result = pd.concat([result, flags], axis=1)

        if not self.flag_only:
            for col, value in self.impute_values_.items():
                if col in result.columns:  # skip columns absent from this frame
                    result[col] = result[col].fillna(value)

        return result

In [10]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

class CustomEncoder:
    """Encode object-dtype columns: one-hot for low cardinality, ordinal otherwise.

    Columns with at most ``threshold`` distinct non-null values are one-hot
    encoded (the new columns are appended at the end and the source columns are
    removed). Columns with more distinct values are ordinal-encoded in place;
    categories not seen during ``fit`` are encoded as -1.

    ``get_params`` / ``set_params`` are provided so that the encoder can be
    cloned by scikit-learn (needed by cross-validation and parameter search on
    a pipeline that contains it).
    """

    def __init__(self, threshold = 3):
        self.threshold = threshold
        
        # Initialize encoders
        self.one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        
        # Store feature names for one-hot encoding
        self.one_hot_feature_names = None

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (scikit-learn estimator protocol)."""
        return {'threshold': self.threshold}

    def set_params(self, **params):
        """Set constructor parameters; returns ``self``.

        Raises
        ------
        ValueError
            If a name other than ``threshold`` is passed.
        """
        for name, value in params.items():
            if name != 'threshold':
                raise ValueError(
                    f"Invalid parameter {name!r} for CustomEncoder. "
                    "Valid parameters are: ['threshold']."
                )
            setattr(self, name, value)
        return self

    def fit(self, X, y=None):
        """Choose the encoding of every object-dtype column and fit the two encoders."""
        cat_cols = [col for col in X.columns if X[col].dtype == 'object']
        unique_counts = X[cat_cols].nunique()

        self.ordinal_cols = list(unique_counts[unique_counts > self.threshold].index)
        self.one_hot_cols = list(unique_counts[unique_counts <= self.threshold].index)

        # Forget the names learned by a previous fit so they cannot go stale
        self.one_hot_feature_names = None

        if self.one_hot_cols:
            self.one_hot_encoder.fit(X[self.one_hot_cols])
            self.one_hot_feature_names = self.one_hot_encoder.get_feature_names_out(self.one_hot_cols)
        
        if self.ordinal_cols:
            self.ordinal_encoder.fit(X[self.ordinal_cols])
        
        return self
    
    def transform(self, X):
        """Return an encoded copy of ``X``; ``X`` itself is not modified."""
        X_transformed = X.copy()
        
        # Apply One-Hot Encoding: replace the source columns by the indicator columns
        if self.one_hot_cols:
            one_hot_encoded = self.one_hot_encoder.transform(X[self.one_hot_cols])
            one_hot_df = pd.DataFrame(one_hot_encoded, columns=self.one_hot_feature_names, index=X.index)
            X_transformed = pd.concat(
                [X_transformed.drop(columns=self.one_hot_cols), one_hot_df], axis=1
            )
        
        # Apply Ordinal Encoding: overwrite the columns with their integer codes
        if self.ordinal_cols:
            ordinal_encoded = self.ordinal_encoder.transform(X[self.ordinal_cols])
            ordinal_df = pd.DataFrame(ordinal_encoded, columns=self.ordinal_cols, index=X.index)
            X_transformed[self.ordinal_cols] = ordinal_df
        
        return X_transformed
    
    def fit_transform(self, X, y = None):
        """Fit on ``X`` and return its encoded copy."""
        return self.fit(X, y).transform(X)

In [11]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class CorrelationFeatureDropper(BaseEstimator, TransformerMixin):
    """Drop one feature out of every pair of highly correlated features.

    For each pair of features whose absolute correlation is strictly greater
    than ``threshold``, the feature with the smaller absolute correlation to the
    target is marked for removal (the second feature of the pair on ties or when
    a correlation is NaN).

    Attributes
    ----------
    high_corr_pairs : list of (feature_a, feature_b, abs_correlation)
        Feature pairs above the threshold found by the last ``fit``.
    features_to_drop : list
        Features removed by ``transform``, in the column order of the fitted data.
    """

    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.features_to_drop = []
        self.high_corr_pairs = []
        
    def fit(self, X, y): 
        """Find highly correlated feature pairs in ``X`` and choose which feature to drop."""
        # Rebuild the learned state on every fit; otherwise pairs and drops from a
        # previous fit would accumulate.
        self.features_to_drop = []
        self.high_corr_pairs = []

        # The target is added as a temporary column so one correlation matrix holds
        # both feature-feature and feature-target correlations. Its name is made
        # unique so an existing column is never overwritten.
        target_key = 'isFraud'
        while target_key in X.columns:
            target_key = f'_{target_key}'
        X_corr = X.copy()
        X_corr[target_key] = y
        corr_matrix = X_corr.corr().abs()

        # Keep the target out of the pair search: it is not a feature and cannot
        # be dropped from X.
        target_corr = corr_matrix[target_key].drop(target_key)
        feature_corr = corr_matrix.drop(index=target_key, columns=target_key)
        feature_names = feature_corr.columns
        corr_values = feature_corr.to_numpy()

        # Strict upper triangle: each unordered pair once, no self-pairs.
        # Comparisons with NaN are False, so NaN correlations are never selected.
        above_threshold = np.triu(corr_values > self.threshold, k=1)
        for i, j in zip(*np.nonzero(above_threshold)):
            self.high_corr_pairs.append((feature_names[i], feature_names[j], corr_values[i, j]))

        to_drop = set()
        for feat1, feat2, _ in self.high_corr_pairs:
            if target_corr[feat1] < target_corr[feat2]:
                to_drop.add(feat1)
            else:
                to_drop.add(feat2)

        # Deterministic order (column order) instead of arbitrary set order
        self.features_to_drop = [name for name in feature_names if name in to_drop]
        return self

    
    def transform(self, X):
        """Return ``X`` without the features selected during ``fit``."""
        return X.drop(columns=self.features_to_drop)
        
    def fit_transform(self, X, y):
        """Fit on ``X``/``y`` and return ``X`` without the dropped features."""
        return self.fit(X, y).transform(X)
    

In [12]:
pip uninstall scikit-learn imbalanced-learn -y

Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Found existing installation: imbalanced-learn 0.13.0
Uninstalling imbalanced-learn-0.13.0:
  Successfully uninstalled imbalanced-learn-0.13.0
Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1

Collecting scikit-learn==1.2.2
  Downloading scikit_learn-1.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.10.1
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Downloading scikit_learn-1.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, imbalanced-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
nilearn 0.11.1 requires scikit-learn>=1.4.0, but you have scikit-learn 1.2.2 whi

In [15]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# Model pipeline: imputation -> categorical encoding -> correlation pruning -> scaling
# -> AdaBoost over depth-1 decision trees.
pipeline = Pipeline(steps=[
    ('missing', DataCleaner()),
    ('encoding', CustomEncoder()),
    ('correlation_drop', CorrelationFeatureDropper(threshold=0.8)),
    ('scaler', StandardScaler()),
    (
        'adaboost',
        AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=1),
            random_state=42,
            learning_rate=1.0,
            n_estimators=50,
        ),
    ),
])

In [17]:
import mlflow
from sklearn.metrics import (roc_auc_score, f1_score, recall_score, 
                             precision_score, accuracy_score, 
                             average_precision_score, confusion_matrix, 
                             classification_report)

# Set up MLflow experiment
mlflow.set_experiment("AdaBoost_transactions+identity")

# Start a new run
with mlflow.start_run(run_name="Basic version using AdaBoost"):
    # Read the configuration back from the pipeline steps instead of retyping it,
    # so the logged parameters cannot drift from what is actually trained.
    pipeline_steps = pipeline.named_steps
    adaboost_step = pipeline_steps["adaboost"]
    encoder_step = pipeline_steps["encoding"]
    corr_step = pipeline_steps["correlation_drop"]

    # Log parameters
    mlflow.log_params({
        "model_type": "AdaBoost",
        "missing_values": "Mean/most frequent",
        "correlation_drop": f"CorrelationFeatureDropper(threshold={corr_step.threshold})",
        # The encoder one-hot encodes columns with unique <= threshold (not < threshold)
        "encoding": f"ordinal encoding + one_hot_encoding(columns with unique<={encoder_step.threshold})",
        "scaler": "StandardScaler",
        "n_estimators": str(adaboost_step.n_estimators),
        "learning_rate": str(adaboost_step.learning_rate),
    })

    # Fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # Splits to evaluate; the test split is left out here (uncomment to include it)
    datasets = {
        'train': (X_train, y_train),
        'val': (X_val, y_val),
        #'test': (X_test, y_test)
    }
    
    metrics = {}
    
    for dataset_name, (X, y) in datasets.items():
        # Get probabilities (positive class) and hard predictions
        y_pred_proba = pipeline.predict_proba(X)[:, 1]
        y_pred = pipeline.predict(X)
        
        # Threshold-free metrics use the probabilities, the others the hard predictions
        metrics[f"{dataset_name}_roc_auc"] = roc_auc_score(y, y_pred_proba)
        metrics[f"{dataset_name}_f1"] = f1_score(y, y_pred)
        metrics[f"{dataset_name}_recall"] = recall_score(y, y_pred)
        metrics[f"{dataset_name}_precision"] = precision_score(y, y_pred)
        metrics[f"{dataset_name}_accuracy"] = accuracy_score(y, y_pred)
        metrics[f"{dataset_name}_average_precision"] = average_precision_score(y, y_pred_proba)
        
        # Explicit class-1 variants, kept so the logged metric names stay the same
        metrics[f"{dataset_name}_f1_class1"] = f1_score(y, y_pred, pos_label=1)
        metrics[f"{dataset_name}_recall_class1"] = recall_score(y, y_pred, pos_label=1)
        metrics[f"{dataset_name}_precision_class1"] = precision_score(y, y_pred, pos_label=1)
        
        # Print some key metrics
        print(f"\n{dataset_name.upper()} Metrics:")
        print(f"ROC AUC: {metrics[f'{dataset_name}_roc_auc']:.4f}")
    
    
    # Log all metrics to MLflow
    mlflow.log_metrics(metrics)
    
    # Log the model
    mlflow.sklearn.log_model(pipeline, "model")
    
    # Add a tag to identify this as baseline
    mlflow.set_tag("stage", "baseline")


TRAIN Metrics:
ROC AUC: 0.8608

VAL Metrics:
ROC AUC: 0.8586




🏃 View run Basic version using  at: https://dagshub.com/zeliz22/ML_Fraud-Detection.mlflow/#/experiments/3/runs/a87656459f0f4060930b393dea04901e
🧪 View experiment at: https://dagshub.com/zeliz22/ML_Fraud-Detection.mlflow/#/experiments/3
