In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [28]:
!pip install dagshub mlflow



In [29]:
import dagshub
# Connect this session to the DagsHub repository and enable its MLflow integration.
dagshub.init(
    mlflow=True,
    repo_name='ML_Fraud-Detection',
    repo_owner='zeliz22',
)

In [30]:
# Load the training transactions table from the competition input directory.
train_transaction = pd.read_csv(
    '/kaggle/input/ieee-fraud-detection/train_transaction.csv'
)

In [31]:
from sklearn.model_selection import train_test_split

def prepare_data(df, target='isFraud', test_size=0.15, random_state=42):
    """Split ``df`` into stratified train / validation / test feature and label sets.

    The test set takes ``test_size`` of all rows. The validation set is then
    carved out of the remainder with a rescaled fraction so that it also ends
    up at ``test_size`` of the full frame. Both splits are stratified on
    ``target`` and use the same ``random_state``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``target`` column and a ``'TransactionID'`` column.
    target : str
        Name of the label column.
    test_size : float
        Fraction of all rows used for the test set (and for the validation set).
    random_state : int
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the feature
        frames exclude ``target`` and ``'TransactionID'``.
    """
    # First cut: hold out the test partition.
    remainder, test_part = train_test_split(
        df,
        test_size=test_size,
        stratify=df[target],
        random_state=random_state,
    )

    # Second cut: the validation share is expressed relative to what is left.
    val_fraction = test_size / (1 - test_size)
    train_part, val_part = train_test_split(
        remainder,
        test_size=val_fraction,
        stratify=remainder[target],
        random_state=random_state,
    )

    partitions = (train_part, val_part, test_part)
    non_feature_cols = [target, 'TransactionID']
    features = [part.drop(columns=non_feature_cols) for part in partitions]
    labels = [part[target] for part in partitions]

    return (*features, *labels)

# Default settings: 15% test, 15% validation, remainder for training.
(X_train, X_val, X_test,
 y_train, y_val, y_test) = prepare_data(train_transaction)

In [32]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
from typing import Dict, Optional, Union

class DataCleaner(BaseEstimator, TransformerMixin):
    """Drop mostly-missing columns and impute the remaining missing values.

    Parameters
    ----------
    numeric_strategy : {'mean', 'median', 'constant', 'drop'}
        How missing values in numeric columns are handled. ``'drop'`` removes
        the affected rows at transform time instead of filling them.
    categorical_strategy : {'most_frequent', 'constant', 'drop'}
        How missing values in non-numeric columns are handled.
    numeric_fill_value : int or float, optional
        Fill value used when ``numeric_strategy='constant'``.
    categorical_fill_value : str, optional
        Fill value used when ``categorical_strategy='constant'``.
    drop_threshold : float
        Columns whose share of missing values is strictly greater than this
        are removed entirely.

    Notes
    -----
    The ``'drop'`` strategies remove rows in ``transform``; the caller is
    responsible for keeping any separately held labels aligned.
    """

    def __init__(self, 
                 numeric_strategy: str = 'mean',
                 categorical_strategy: str = 'most_frequent',
                 numeric_fill_value: Optional[Union[int, float]] = None,
                 categorical_fill_value: Optional[str] = None,
                 drop_threshold: float = 0.8):

        self.numeric_strategy = numeric_strategy
        self.categorical_strategy = categorical_strategy
        self.numeric_fill_value = numeric_fill_value
        self.categorical_fill_value = categorical_fill_value
        self.drop_threshold = drop_threshold
        # Learned state; initialised here so transform() before fit() is a pass-through.
        self.numeric_impute_values_ = {}
        self.categorical_impute_values_ = {}
        self.columns_to_drop_ = []

    def fit(self, X: pd.DataFrame, y=None):
        """Learn which columns to drop and which value to impute per column.

        Raises
        ------
        ValueError
            If a strategy name is unknown, or a ``'constant'`` strategy is
            selected without the matching fill value (raised when the first
            column of that kind is processed).
        """
        # Reset learned values so a refit (e.g. after changing the strategy)
        # never reuses imputation values from a previous fit.
        self.numeric_impute_values_ = {}
        self.categorical_impute_values_ = {}

        # Identify columns to drop
        null_ratios = X.isnull().mean()
        self.columns_to_drop_ = list(null_ratios[null_ratios > self.drop_threshold].index)
        X_clean = X.drop(columns=self.columns_to_drop_)

        # Separate numeric and categorical columns
        numeric_cols = X_clean.select_dtypes(include=np.number).columns
        categorical_cols = X_clean.select_dtypes(exclude=np.number).columns

        # Calculate numeric imputation values
        for col in numeric_cols:
            if self.numeric_strategy == 'mean':
                self.numeric_impute_values_[col] = X_clean[col].mean()
            elif self.numeric_strategy == 'median':
                self.numeric_impute_values_[col] = X_clean[col].median()
            elif self.numeric_strategy == 'constant':
                if self.numeric_fill_value is None:
                    raise ValueError("numeric_fill_value must be specified for constant strategy")
                self.numeric_impute_values_[col] = self.numeric_fill_value
            elif self.numeric_strategy != 'drop':
                raise ValueError(f"Unknown numeric strategy: {self.numeric_strategy}")

        # Calculate categorical imputation values
        for col in categorical_cols:
            if self.categorical_strategy == 'most_frequent':
                modes = X_clean[col].mode()
                if modes.empty:
                    # Column has no observed values, so there is no most
                    # frequent value to learn; leave it un-imputed.
                    continue
                self.categorical_impute_values_[col] = modes.iloc[0]
            elif self.categorical_strategy == 'constant':
                if self.categorical_fill_value is None:
                    raise ValueError("categorical_fill_value must be specified for constant strategy")
                self.categorical_impute_values_[col] = self.categorical_fill_value
            elif self.categorical_strategy != 'drop':
                raise ValueError(f"Unknown categorical strategy: {self.categorical_strategy}")

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a new frame with the learned drops and imputations applied.

        Columns for which no imputation value was learned are left untouched,
        unless the corresponding strategy is ``'drop'``, in which case rows
        with a missing value in that column are removed.
        """
        # Drop high-null columns (drop() returns a new frame; X is not modified)
        X_clean = X.drop(columns=self.columns_to_drop_)

        # Separate numeric and categorical columns
        numeric_cols = X_clean.select_dtypes(include=np.number).columns
        categorical_cols = X_clean.select_dtypes(exclude=np.number).columns

        # Apply numeric imputation
        for col in numeric_cols:
            if col in self.numeric_impute_values_:
                X_clean[col] = X_clean[col].fillna(self.numeric_impute_values_[col])
            elif self.numeric_strategy == 'drop':
                X_clean = X_clean.dropna(subset=[col])

        # Apply categorical imputation
        for col in categorical_cols:
            if col in self.categorical_impute_values_:
                X_clean[col] = X_clean[col].fillna(self.categorical_impute_values_[col])
            elif self.categorical_strategy == 'drop':
                X_clean = X_clean.dropna(subset=[col])

        return X_clean

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit on ``X`` and return the transformed ``X`` in one step."""
        return self.fit(X, y).transform(X)

In [33]:
import mlflow
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class AdvancedDataCleaner(BaseEstimator, TransformerMixin):
    """Drop columns with too many missing values and fill the rest with sentinels.

    Numeric columns are filled with ``-999`` and all other columns with the
    string ``"MISSING"``. A column is dropped only when its share of missing
    values is strictly greater than ``missing_threshold``.
    """

    def __init__(self, missing_threshold=1):
        self.missing_threshold = missing_threshold
        # Sentinel fill values (fixed, not constructor parameters).
        self.numeric_fill = -999
        self.categorical_fill = "MISSING"
        # Populated by fit().
        self.columns_dropped = []
        self.missing_stats = {}

    def fit(self, X, y=None):
        """Record per-column missing ratios and decide which columns to drop."""
        null_share = X.isnull().mean()
        self.missing_stats = null_share.to_dict()

        too_sparse = null_share > self.missing_threshold
        self.columns_dropped = null_share.index[too_sparse].tolist()
        self.columns_kept = [name for name in X.columns if name not in self.columns_dropped]

        return self

    def transform(self, X):
        """Return a copy of ``X`` without the dropped columns and without missing values."""
        cleaned = X.copy().drop(columns=self.columns_dropped)

        # Resolve both column groups before filling anything.
        numeric_names = cleaned.select_dtypes(include=['number']).columns
        other_names = cleaned.select_dtypes(exclude=['number']).columns

        cleaned[numeric_names] = cleaned[numeric_names].fillna(self.numeric_fill)
        cleaned[other_names] = cleaned[other_names].fillna(self.categorical_fill)

        return cleaned

In [34]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

class CustomEncoder:
    """Encode object-dtype columns, choosing the encoder by cardinality.

    Columns with at most ``threshold`` distinct values are one-hot encoded;
    columns with more distinct values are ordinal encoded. All other columns
    pass through unchanged.
    """

    def __init__(self, threshold = 3):
        self.threshold = threshold

        # Unknown categories at transform time: all-zero row for one-hot, -1 for ordinal.
        self.one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

        # Output column names of the one-hot encoder, filled in by fit().
        self.one_hot_feature_names = None

    def fit(self, X, y=None):
        """Assign each object column to an encoder and fit the encoders."""
        object_cols = [name for name in X.columns if X[name].dtype == 'object']
        cardinality = X[object_cols].nunique()

        high, low = [], []
        for name, n_unique in cardinality.items():
            (high if n_unique > self.threshold else low).append(name)
        self.ordinal_cols = high
        self.one_hot_cols = low

        if self.one_hot_cols:
            self.one_hot_encoder.fit(X[self.one_hot_cols])
            self.one_hot_feature_names = self.one_hot_encoder.get_feature_names_out(self.one_hot_cols)

        if self.ordinal_cols:
            self.ordinal_encoder.fit(X[self.ordinal_cols])

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted encodings applied."""
        result = X.copy()

        # One-hot columns are appended at the end and their sources removed.
        if self.one_hot_cols:
            dummies = pd.DataFrame(
                self.one_hot_encoder.transform(X[self.one_hot_cols]),
                columns=self.one_hot_feature_names,
                index=X.index,
            )
            result = pd.concat([result, dummies], axis=1).drop(self.one_hot_cols, axis=1)

        # Ordinal columns are overwritten in place, keeping their position.
        if self.ordinal_cols:
            result[self.ordinal_cols] = pd.DataFrame(
                self.ordinal_encoder.transform(X[self.ordinal_cols]),
                columns=self.ordinal_cols,
                index=X.index,
            )

        return result

    def fit_transform(self, X, y = None):
        """Fit on ``X`` and return the encoded ``X``; ``y`` is ignored."""
        self.fit(X)
        return self.transform(X)

In [35]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class CorrelationFeatureDropper(BaseEstimator, TransformerMixin):
    """Remove one feature from every pair of highly correlated features.

    For each pair of columns whose absolute correlation exceeds ``threshold``,
    the column with the weaker absolute correlation to the target is dropped
    (on a tie, or when the comparison involves NaN, the second column of the
    pair is dropped).

    Attributes
    ----------
    high_corr_pairs : list of (str, str, float)
        Pairs found by the most recent ``fit`` with their absolute correlation.
    features_to_drop : list
        Columns removed by ``transform``, in order of first detection.
    """

    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.features_to_drop = []
        self.high_corr_pairs = []

    def fit(self, X, y):
        """Find correlated feature pairs in ``X`` and decide which columns to drop.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame; correlations are computed with ``X.corr()``.
        y : pandas.Series or array-like
            Target values. Array-likes are aligned positionally with ``X``.
        """
        if not isinstance(y, pd.Series):
            # Series.corr needs an index to align on.
            y = pd.Series(np.asarray(y), index=X.index)

        # Only feature-to-feature correlations belong here. The target must
        # not be part of the matrix, otherwise it can show up in a pair and
        # end up being looked up in (or dropped from) X.
        corr_matrix = X.corr().abs()
        names = corr_matrix.columns
        values = corr_matrix.to_numpy()

        # Strict upper triangle: each unordered pair once, no self-pairs.
        upper = np.triu(np.ones(values.shape, dtype=bool), k=1)
        # Rebuilt from scratch so refitting does not accumulate old pairs.
        self.high_corr_pairs = [
            (names[i], names[j], values[i, j])
            for i, j in np.argwhere(upper & (values > self.threshold))
        ]

        # Cache |corr(feature, target)| so it is computed once per feature.
        target_corr = {}

        def _abs_target_corr(feature):
            if feature not in target_corr:
                target_corr[feature] = abs(X[feature].corr(y))
            return target_corr[feature]

        # dict keys give de-duplication with a deterministic order.
        to_drop = {}
        for feat1, feat2, _ in self.high_corr_pairs:
            if _abs_target_corr(feat1) < _abs_target_corr(feat2):
                to_drop[feat1] = None
            else:
                to_drop[feat2] = None

        self.features_to_drop = list(to_drop)
        return self

    def transform(self, X):
        """Return ``X`` without the columns selected during ``fit``."""
        return X.drop(columns=self.features_to_drop)

    def fit_transform(self, X, y):
        """Fit on ``X``/``y`` and return the reduced ``X``."""
        return self.fit(X, y).transform(X)
    

In [36]:
pip uninstall scikit-learn imbalanced-learn -y

Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Found existing installation: imbalanced-learn 0.10.1
Uninstalling imbalanced-learn-0.10.1:
  Successfully uninstalled imbalanced-learn-0.10.1
Note: you may need to restart the kernel to use updated packages.


In [37]:
pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1

Collecting scikit-learn==1.2.2
  Using cached scikit_learn-1.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.10.1
  Using cached imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Using cached scikit_learn-1.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
Using cached imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
Installing collected packages: scikit-learn, imbalanced-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
nilearn 0.11.1 requires scikit-learn>=1.4.0, but you have scikit-learn 1.2.2 which is incompatible.
bigframes 1.36.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
mlxtend 0.23.4 requires scikit-learn>=1.3.1, but you have scikit-learn 1.2.2 which is incompatible.[0m[31m
[0mSuccessfully installed imbalanced-learn

In [38]:
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from imblearn.under_sampling import RandomUnderSampler


# Baseline: sentinel-fill missing values -> encode object columns -> standardise -> logistic regression.
pipeline = ImbPipeline(steps=[
    ('cleaner', AdvancedDataCleaner()),
    ('encoder', CustomEncoder()),
    # Standardised inputs matter for a regularised linear model.
    ('scaler', StandardScaler()),
    # Disabled L1-based feature selection step:
    # ('feature_selector', SelectFromModel(
    #     LogisticRegression(class_weight='balanced', penalty='l1', solver='liblinear'),
    #     threshold="median")),
    (
        'classifier',
        LogisticRegression(
            penalty='l2',
            C=0.1,
            solver='lbfgs',
            class_weight='balanced',
            max_iter=1000,
            random_state=42,
        ),
    ),
])


In [39]:
import mlflow
from sklearn.metrics import roc_auc_score

# Group this run under the baseline logistic-regression experiment.
mlflow.set_experiment("Basic_log_regression")

# Everything logged inside the context manager is attached to a single run.
with mlflow.start_run(run_name="saving all columns and its missing values + encoder"):
    # Describe the pipeline as it is actually built in this notebook:
    # CustomEncoder ordinal-encodes high-cardinality object columns and
    # one-hot encodes those with at most 3 distinct values.
    mlflow.log_params({
        "model_type": "LogisticRegression",
        "missing_values": "-999 for numeric,'MISSING' for categorical",
        "encoding": "ordinal + one_hot_encoding(columns with unique<=3)",
        "scaler": "StandardScaler"
    })

    # Train on the training split, then score both training and validation splits.
    pipeline.fit(X_train, y_train)
    train_preds = pipeline.predict_proba(X_train)[:, 1]
    val_preds = pipeline.predict_proba(X_val)[:, 1]
    roc_auc_train = roc_auc_score(y_train, train_preds)
    roc_auc = roc_auc_score(y_val, val_preds)

    # Metric keys are kept as-is so runs stay comparable within the experiment.
    # NOTE(review): "val_roc_auc_train" holds the TRAINING ROC-AUC despite its "val_" prefix.
    mlflow.log_metric("val_roc_auc_val", roc_auc)
    mlflow.log_metric("val_roc_auc_train", roc_auc_train)
    print(f"Logged ROC-AUC_VAL: {roc_auc:.4f}")
    print(f"Logged ROC-AUC_TRAIN: {roc_auc_train:.4f}")

    # Persist the fitted pipeline (preprocessing + classifier) as a run artifact.
    mlflow.sklearn.log_model(pipeline, "model")

    # Tag the run so it can be filtered as the baseline.
    mlflow.set_tag("stage", "baseline")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logged ROC-AUC_VAL: 0.8446
Logged ROC-AUC_TRAIN: 0.8457




🏃 View run saving all columns and its missing values + encoder at: https://dagshub.com/zeliz22/ML_Fraud-Detection.mlflow/#/experiments/0/runs/08b4eb47315b49dfa7ff74fa4daa8eed
🧪 View experiment at: https://dagshub.com/zeliz22/ML_Fraud-Detection.mlflow/#/experiments/0
