In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [37]:
pip install mlflow dagshub

Note: you may need to restart the kernel to use updated packages.


In [38]:
import dagshub
dagshub.init(repo_owner='tvani2', repo_name='IEEE-CIS-Fraud-Detection', mlflow=True)

In [32]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class FullPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, 
                 target_column='isFraud',
                 transaction_thresh=0.6,
                 identity_thresh=0.9,
                 identity_df=None):
        self.target_column = target_column
        self.transaction_thresh = transaction_thresh
        self.identity_thresh = identity_thresh
        self.identity_df = identity_df  # identity will be passed during initialization

    def fit(self, X, y=None):
        # 1. Drop columns with too many missing values
        self.transaction_cols_to_keep = X.columns[X.isnull().mean() < self.transaction_thresh].tolist()
        if self.identity_df is not None:
            self.identity_cols_to_keep = self.identity_df.columns[self.identity_df.isnull().mean() < self.identity_thresh].tolist()
        else:
            self.identity_cols_to_keep = []

        # 2. Merge
        if self.identity_df is not None:
            identity_filtered = self.identity_df[self.identity_cols_to_keep]
            X = X[self.transaction_cols_to_keep].merge(identity_filtered, how='left', on='TransactionID')
        else:
            X = X[self.transaction_cols_to_keep]

        # 3. Separate numeric and categorical columns
        self.numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        self.categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

        # 4. Imputers
        self.numeric_imputer = SimpleImputer(strategy='mean')
        self.categorical_imputer = SimpleImputer(strategy='most_frequent')

        # Fit imputers
        self.numeric_imputer.fit(X[self.numeric_cols])
        self.categorical_imputer.fit(X[self.categorical_cols])

        # 5. Determine WOE and one-hot columns
        s = X[self.categorical_cols].nunique()
        self.woe_columns = list(s[s > 3].index)
        self.one_hot_columns = list(s[s <= 3].index)

        # 6. Fit WOE mappings
        if y is not None:
            df_woe = X[self.woe_columns].copy()
            df_woe['target'] = y.reset_index(drop=True)

            self.woe_mappings = {}
            self.woe_columns_fillna = df_woe[self.woe_columns].mode().T[0].to_dict()

            for col in self.woe_columns:
                groups = df_woe.groupby(col)['target'].agg(['count', 'mean'])
                groups['n_pos'] = groups['mean'] * groups['count']
                groups['n_neg'] = groups['count'] - groups['n_pos']

                total_pos = groups['n_pos'].sum()
                total_neg = groups['n_neg'].sum()

                groups['prop_pos'] = groups['n_pos'] / total_pos
                groups['prop_neg'] = groups['n_neg'] / total_neg

                groups['woe'] = np.log(groups['prop_pos'] / groups['prop_neg'])

                groups.replace([np.inf, -np.inf, np.nan], 0, inplace=True)
                self.woe_mappings[col] = groups['woe'].to_dict()

        return self

    def transform(self, X):
        # 1. Drop columns with too many missing values
        if self.identity_df is not None:
            identity_filtered = self.identity_df[self.identity_cols_to_keep]
            X = X[self.transaction_cols_to_keep].merge(identity_filtered, how='left', on='TransactionID')
        else:
            X = X[self.transaction_cols_to_keep]

        # 2. Impute missing values
        X[self.numeric_cols] = self.numeric_imputer.transform(X[self.numeric_cols])
        X[self.categorical_cols] = self.categorical_imputer.transform(X[self.categorical_cols])

        # 3. Apply WOE encoding
        for col in self.woe_columns:
            new_col = f'{col}_woe'
            X[new_col] = (
                X[col]
                .map(self.woe_mappings[col])
                .fillna(self.woe_mappings[col].get(self.woe_columns_fillna[col], 0))
            )

        # 4. One-hot encode
        X = pd.get_dummies(X, columns=self.one_hot_columns, drop_first=True, dummy_na=True)

        # 5. Drop original WOE and one-hot columns
        cols_to_drop = [col for col in (self.woe_columns + self.one_hot_columns) if col in X.columns]
        X = X.drop(columns=cols_to_drop)

        return X

In [33]:
from sklearn.impute import SimpleImputer

# Load data
transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')

# Separate target
target_column = 'isFraud'
y = transaction[target_column]
X = transaction.drop(columns=[target_column])

# Initialize preprocessor
preprocessor = FullPreprocessor(
    target_column=target_column,
    transaction_thresh=0.6,
    identity_thresh=0.9,
    identity_df=identity
)

# Fit-transform
X_processed = preprocessor.fit_transform(X, y)

print(X_processed.shape)

  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (


(590540, 277)


In [None]:
import mlflow
import mlflow.sklearn

# 1. Set experiment
mlflow.set_experiment('XGBoost_Training')

# 2. Start the run
with mlflow.start_run(run_name="XGBoost_Cleaning") as run:
    # 3. Initialize and fit your preprocessor
    preprocessor = FullPreprocessor(identity_df=identity)
    preprocessor.fit(X, y)  # Only pass X and y (identity is now inside preprocessor)

    # 4. Transform your data
    X_processed = preprocessor.transform(X)

    # 5. Log the preprocessor model
    mlflow.sklearn.log_model(preprocessor, "full_preprocessor")

    # 6. Optionally, log some metadata
    mlflow.log_param("transaction_thresh", preprocessor.transaction_thresh)
    mlflow.log_param("identity_thresh", preprocessor.identity_thresh)
    mlflow.log_metric("num_features_after_cleaning", X_processed.shape[1])

    print(f"Run ID: {run.info.run_id}")

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [None]:
# from sklearn.base import BaseEstimator, TransformerMixin
# import numpy as np
# import pandas as pd

# class CorrelationDropper(BaseEstimator, TransformerMixin):
#     def __init__(self, threshold=0.9):
#         self.threshold = threshold
#         self.to_drop_ = None

#     def fit(self, X, y=None):
#         # 1. Calculate correlation matrix
#         corr_matrix = X.corr().abs()
        
#         # 2. Upper triangle of the correlation matrix
#         upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        
#         # 3. Find features with correlation greater than threshold
#         self.to_drop_ = [column for column in upper.columns if any(upper[column] > self.threshold)]
        
#         print(f"Columns to drop due to high correlation ({len(self.to_drop_)}): {self.to_drop_}")
        
#         return self

#     def transform(self, X):
#         # 4. Drop them
#         X_dropped = X.drop(columns=self.to_drop_, errors='ignore')
#         return X_dropped

#     def fit_transform(self, X, y=None):
#         return self.fit(X, y).transform(X)

In [12]:
!pip uninstall scikit-learn imbalanced-learn -y
!pip install scikit-learn imbalanced-learn --upgrade

Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Found existing installation: imbalanced-learn 0.13.0
Uninstalling imbalanced-learn-0.13.0:
  Successfully uninstalled imbalanced-learn-0.13.0
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hDownloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, imbalanced-learn
[31

In [14]:
!pip uninstall scikit-learn imbalanced-learn -y
!pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1

Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Found existing installation: imbalanced-learn 0.13.0
Uninstalling imbalanced-learn-0.13.0:
  Successfully uninstalled imbalanced-learn-0.13.0
Collecting scikit-learn==1.2.2
  Downloading scikit_learn-1.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.10.1
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Downloading scikit_learn-1.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, imbalanc

In [17]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler

# ====================== 1. CorrelationDropper ======================
class CorrelationDropper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.9):
        self.threshold = threshold
        self.to_drop_ = []
        self.kept_features_ = []

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_ = X.columns.tolist()
            df = X
        else:
            self.feature_names_ = [f"f{i}" for i in range(X.shape[1])]
            df = pd.DataFrame(X, columns=self.feature_names_)
            
        corr_matrix = df.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        self.to_drop_ = [column for column in upper.columns if any(upper[column] > self.threshold)]
        self.kept_features_ = [f for f in self.feature_names_ if f not in self.to_drop_]
        print(f"Dropped {len(self.to_drop_)}/{len(self.feature_names_)} features due to high correlation")
        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            return X[self.kept_features_]
        else:
            kept_indices = [i for i, f in enumerate(self.feature_names_) if f in self.kept_features_]
            return X[:, kept_indices]

# ====================== 2. XGBFeatureSelector ======================
class XGBFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, threshold="mean", random_state=42):
        self.threshold = threshold
        self.random_state = random_state
        self.feature_mask_ = None
        self.selected_features_ = None

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            self.feature_names_ = X.columns.tolist()
        else:
            self.feature_names_ = [f"f{i}" for i in range(X.shape[1])]
            
        model = xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='auc',
            random_state=self.random_state,
            use_label_encoder=False
        )
        model.fit(X, y)
        
        importances = model.feature_importances_
        if self.threshold == "mean":
            thresh_value = importances.mean()
        elif isinstance(self.threshold, float):
            thresh_value = self.threshold
        else:
            raise ValueError("Unsupported threshold value")

        self.feature_mask_ = importances >= thresh_value
        self.selected_features_ = np.array(self.feature_names_)[self.feature_mask_]
        print(f"Selected {self.feature_mask_.sum()} / {len(importances)} features")
        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            return X[self.selected_features_]
        return X[:, self.feature_mask_]

# ====================== 3. CustomPipeline ======================
class CustomPipeline:
    def __init__(self):
        self.undersampler = RandomUnderSampler(random_state=42, sampling_strategy=0.2)
        self.correlation_dropper = CorrelationDropper(threshold=0.9)
        self.feature_selector = XGBFeatureSelector(threshold="mean", random_state=42)
        self.scaler = StandardScaler()
        self.classifier = xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='auc',
            random_state=42,
            use_label_encoder=False,
            n_estimators=100,
            max_depth=5,
            tree_method='hist'
        )

    def fit(self, X, y):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        
        # Step 1: Undersample
        X_resampled, y_resampled = self.undersampler.fit_resample(X, y)
        
        # Step 2: Drop correlated features
        X_corr = self.correlation_dropper.fit_transform(X_resampled)
        
        # Step 3: Select important features
        X_selected = self.feature_selector.fit_transform(X_corr, y_resampled)
        
        # Step 4: Scale features
        X_scaled = self.scaler.fit_transform(X_selected)
        
        # Step 5: Train classifier
        self.classifier.fit(X_scaled, y_resampled)
        
        # Store intermediate results for access
        self.X_train_new_ = X_selected  # Or X_scaled if you want the final transformed version
        self.y_train_new_ = y_resampled
        
        return self


    def predict(self, X):
        X_ready = self.transform(X)
        return self.classifier.predict(X_ready)

    def predict_proba(self, X):
        X_ready = self.transform(X)
        return self.classifier.predict_proba(X_ready)

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
            
        X_corr = self.correlation_dropper.transform(X)
        X_selected = self.feature_selector.transform(X_corr)
        X_scaled = self.scaler.transform(X_selected)
        return X_scaled

In [21]:
# Create an instance of the pipeline
pipeline = CustomPipeline()

# Train the pipeline on training data
pipeline.fit(X_train, y_train)

# Predict using test data
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Evaluate
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# Access transformed training data (after undersampling, correlation drop, and feature selection)
X_train_new = pipeline.X_train_new_
y_train_new = pipeline.y_train_new_

# Access transformed test data (processed using fitted pipeline)
X_test_new = pipeline.transform(X_test)
y_test_new = y_test.values

  return op(a, b)


Dropped 91/277 features due to high correlation
Selected 30 / 186 features
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    113866
           1       0.56      0.55      0.56      4242

    accuracy                           0.97    118108
   macro avg       0.77      0.77      0.77    118108
weighted avg       0.97      0.97      0.97    118108

ROC AUC: 0.9040297532705361


In [None]:
# import mlflow
# import mlflow.xgboost
# from datetime import datetime
# import pandas as pd
# import numpy as np
# import xgboost as xgb
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
# from imblearn.under_sampling import RandomUnderSampler

# # Initialize MLflow
# mlflow.set_experiment("XGBoost_Training1")

# # ====================== 1. CorrelationDropper ======================
# class CorrelationDropper(BaseEstimator, TransformerMixin):
#     def __init__(self, threshold=0.9):
#         self.threshold = threshold
#         self.to_drop_ = []
#         self.kept_features_ = []

#     def fit(self, X, y=None):
#         if isinstance(X, pd.DataFrame):
#             self.feature_names_ = X.columns.tolist()
#             df = X
#         else:
#             self.feature_names_ = [f"f{i}" for i in range(X.shape[1])]
#             df = pd.DataFrame(X, columns=self.feature_names_)
            
#         corr_matrix = df.corr().abs()
#         upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
#         self.to_drop_ = [column for column in upper.columns if any(upper[column] > self.threshold)]
#         self.kept_features_ = [f for f in self.feature_names_ if f not in self.to_drop_]
#         print(f"Dropped {len(self.to_drop_)}/{len(self.feature_names_)} features due to high correlation")
#         return self

#     def transform(self, X):
#         if isinstance(X, pd.DataFrame):
#             return X[self.kept_features_]
#         else:
#             kept_indices = [i for i, f in enumerate(self.feature_names_) if f in self.kept_features_]
#             return X[:, kept_indices]

# # ====================== 2. XGBFeatureSelector ======================
# class XGBFeatureSelector(BaseEstimator, TransformerMixin):
#     def __init__(self, threshold="mean", random_state=42):
#         self.threshold = threshold
#         self.random_state = random_state
#         self.feature_mask_ = None
#         self.selected_features_ = None

#     def fit(self, X, y):
#         if isinstance(X, pd.DataFrame):
#             self.feature_names_ = X.columns.tolist()
#         else:
#             self.feature_names_ = [f"f{i}" for i in range(X.shape[1])]
            
#         model = xgb.XGBClassifier(
#             objective='binary:logistic',
#             eval_metric='auc',
#             random_state=self.random_state,
#             use_label_encoder=False
#         )
#         model.fit(X, y)
        
#         importances = model.feature_importances_
#         if self.threshold == "mean":
#             thresh_value = importances.mean()
#         elif isinstance(self.threshold, float):
#             thresh_value = self.threshold
#         else:
#             raise ValueError("Unsupported threshold value")

#         self.feature_mask_ = importances >= thresh_value
#         self.selected_features_ = np.array(self.feature_names_)[self.feature_mask_]
#         print(f"Selected {self.feature_mask_.sum()} / {len(importances)} features")
#         return self

#     def transform(self, X):
#         if isinstance(X, pd.DataFrame):
#             return X[self.selected_features_]
#         return X[:, self.feature_mask_]

# # ====================== 3. CustomPipeline ======================
# class CustomPipeline:
#     def __init__(self):
#         self.undersampler = RandomUnderSampler(random_state=42, sampling_strategy=0.2)
#         self.correlation_dropper = CorrelationDropper(threshold=0.9)
#         self.feature_selector = XGBFeatureSelector(threshold="mean", random_state=42)
#         self.scaler = StandardScaler()
#         self.classifier = xgb.XGBClassifier(
#             objective='binary:logistic',
#             eval_metric='auc',
#             random_state=42,
#             use_label_encoder=False,
#             n_estimators=100,
#             max_depth=5,
#             tree_method='hist'
#         )
#         self.run_id = None

#     def fit(self, X, y):
#         with mlflow.start_run(run_name="XGBoost_Preprocessing") as run:
#             self.run_id = run.info.run_id
            
#             if not isinstance(X, pd.DataFrame):
#                 X = pd.DataFrame(X)
            
#             # Log initial dataset stats
#             mlflow.log_metric("initial_samples", X.shape[0])
#             mlflow.log_metric("initial_features", X.shape[1])
#             mlflow.log_metric("class_ratio", np.mean(y))
            
#             # Step 1: Undersample
#             X_resampled, y_resampled = self.undersampler.fit_resample(X, y)
#             mlflow.log_metric("undersampled_samples", X_resampled.shape[0])
#             mlflow.log_metric("new_class_ratio", np.mean(y_resampled))
            
#             # Step 2: Drop correlated features
#             self.correlation_dropper.fit(X_resampled)
#             X_corr = self.correlation_dropper.transform(X_resampled)
#             mlflow.log_metric("features_after_correlation_drop", X_corr.shape[1])
#             mlflow.log_param("correlation_threshold", self.correlation_dropper.threshold)
#             mlflow.log_text("\n".join(self.correlation_dropper.to_drop_), "dropped_features.txt")
            
#             # Step 3: Select important features
#             self.feature_selector.fit(X_corr, y_resampled)
#             X_selected = self.feature_selector.transform(X_corr)
#             mlflow.log_metric("final_features", X_selected.shape[1])
#             mlflow.log_text("\n".join(self.feature_selector.selected_features_), "selected_features.txt")
            
#             # Step 4: Scale features
#             X_scaled = self.scaler.fit_transform(X_selected)
            
#             # Step 5: Train classifier
#             self.classifier.fit(X_scaled, y_resampled)
            
#             # Store transformed data
#             self.X_train_new_ = X_selected
#             self.y_train_new_ = y_resampled
            
#             # Log model
#             mlflow.xgboost.log_model(self.classifier, "xgboost_model")
            
#             return self

#     def predict(self, X):
#         X_ready = self.transform(X)
#         return self.classifier.predict(X_ready)

#     def predict_proba(self, X):
#         X_ready = self.transform(X)
#         return self.classifier.predict_proba(X_ready)

#     def transform(self, X):
#         if not isinstance(X, pd.DataFrame):
#             X = pd.DataFrame(X)
            
#         X_corr = self.correlation_dropper.transform(X)
#         X_selected = self.feature_selector.transform(X_corr)
#         X_scaled = self.scaler.transform(X_selected)
#         return X_scaled

#     def evaluate(self, X_test, y_test):
#         with mlflow.start_run(run_id=self.run_id):
#             y_pred = self.predict(X_test)
#             y_proba = self.predict_proba(X_test)[:, 1]
            
#             # Calculate metrics
#             metrics = {
#                 "accuracy": accuracy_score(y_test, y_pred),
#                 "roc_auc": roc_auc_score(y_test, y_proba),
#                 "f1_score": f1_score(y_test, y_pred),
#                 "precision": precision_score(y_test, y_pred),
#                 "recall": recall_score(y_test, y_pred)
#             }
            
#             # Log metrics
#             mlflow.log_metrics(metrics)
            
#             # Log classification report
#             report = classification_report(y_test, y_pred, output_dict=True)
#             mlflow.log_dict(report, "classification_report.json")
            
#             # Log feature importance plot
#             import matplotlib.pyplot as plt
#             fig, ax = plt.subplots(figsize=(10, 6))
#             xgb.plot_importance(self.classifier, ax=ax)
#             plt.tight_layout()
#             mlflow.log_figure(fig, "feature_importance.png")
#             plt.close()
            
#             return metrics

# # ====================== 4. Example Usage ======================
# if __name__ == "__main__":
#     # Assuming you have X_train, y_train, X_test, y_test
#     pipeline = CustomPipeline()
    
#     # Train and track
#     pipeline.fit(X_train, y_train)
    
#     # Evaluate and log metrics
#     metrics = pipeline.evaluate(X_test, y_test)
    
#     # Get transformed data
#     X_train_new = pipeline.X_train_new_
#     y_train_new = pipeline.y_train_new_
    
#     print("Training complete! Metrics:")
#     print(metrics)
#     print(f"\nView results in MLflow UI with run ID: {pipeline.run_id}")

In [None]:
# DZVELIA ES KODI



# import mlflow
# import mlflow.sklearn

# from sklearn.base import BaseEstimator, TransformerMixin
# import numpy as np
# import pandas as pd

# # Your CorrelationDropper class
# class CorrelationDropper(BaseEstimator, TransformerMixin):
#     def __init__(self, threshold=0.9):
#         self.threshold = threshold
#         self.to_drop_ = None

#     def fit(self, X, y=None):
#         corr_matrix = X.corr().abs()
#         upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
#         self.to_drop_ = [column for column in upper.columns if any(upper[column] > self.threshold)]
#         print(f"Columns to drop due to high correlation ({len(self.to_drop_)}): {self.to_drop_}")
#         return self

#     def transform(self, X):
#         X_dropped = X.drop(columns=self.to_drop_, errors='ignore')
#         return X_dropped

#     def fit_transform(self, X, y=None):
#         return self.fit(X, y).transform(X)

# # --- Now MLflow logging ---

# # Start the experiment
# mlflow.set_experiment("XGBoost_Training")

# with mlflow.start_run(run_name="XGBoost_Feature_Selection") as run:
#     dropper = CorrelationDropper(threshold=0.80)

#     # Fit and transform train
#     X_train_new = dropper.fit_transform(X_train)
    
#     # Transform test
#     X_test_new = dropper.transform(X_test)

#     # Log parameters
#     mlflow.log_param("correlation_threshold", dropper.threshold)
#     mlflow.log_param("num_features_dropped", len(dropper.to_drop_))
    
#     # Optionally log the dropped features list
#     dropped_features_str = ",".join(dropper.to_drop_)
#     mlflow.log_text(dropped_features_str, "dropped_features.txt")

#     # Log final shapes
#     mlflow.log_param("X_train_shape", X_train_new.shape)
#     mlflow.log_param("X_test_shape", X_test_new.shape)
    
#     print("Train set:", X_train_new.shape)
#     print("Test set:", X_test_new.shape)

# print("MLflow logging complete!")

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
import xgboost as xgb
import numpy as np

# Assume these exist already:
X = pipeline.X_train_new_  # after undersampling + feature selection
y = pipeline.y_train_new_

# List of parameter combinations to try
param_grid = [
    {"n_estimators": 100, "max_depth": 3, "learning_rate": 0.1},
    {"n_estimators": 200, "max_depth": 5, "learning_rate": 0.05},
    {"n_estimators": 300, "max_depth": 6, "learning_rate": 0.01},
    {"n_estimators": 100, "max_depth": 4, "learning_rate": 0.2, "subsample": 0.8, "colsample_bytree": 0.8},
    {"n_estimators": 150, "max_depth": 3, "learning_rate": 0.1, "scale_pos_weight": 5},  # account for imbalance
]

# K-Fold CV with stratification
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Try each param set
for i, params in enumerate(param_grid):
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        tree_method='hist',
        **params
    )
    
    aucs = []
    f1s = []
    precisions = []
    recalls = []
    
    for train_idx, val_idx in skf.split(X, y):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train_fold, y_train_fold)
        y_val_pred_proba = model.predict_proba(X_val_fold)[:, 1]
        y_val_pred = (y_val_pred_proba > 0.5).astype(int)
        
        auc = roc_auc_score(y_val_fold, y_val_pred_proba)
        f1 = f1_score(y_val_fold, y_val_pred)
        precision = precision_score(y_val_fold, y_val_pred, zero_division=0)
        recall = recall_score(y_val_fold, y_val_pred)
        
        aucs.append(auc)
        f1s.append(f1)
        precisions.append(precision)
        recalls.append(recall)
    
    print(f"\nModel {i+1} with params: {params}")
    print(f"Mean AUC: {np.mean(aucs):.4f}, Std AUC: {np.std(aucs):.4f}")
    print(f"Mean F1: {np.mean(f1s):.4f}, Std F1: {np.std(f1s):.4f}")
    print(f"Mean Precision: {np.mean(precisions):.4f}, Std Precision: {np.std(precisions):.4f}")
    print(f"Mean Recall: {np.mean(recalls):.4f}, Std Recall: {np.std(recalls):.4f}")

In [None]:
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline  
final_pipeline = ImbPipeline(steps=[
    ('preprocessor', FullPreprocessor(identity_df=identity)),
    ('correlation_dropper', CorrelationDropper(threshold=0.9)),
    ('feature_selector', XGBFeatureSelector(threshold="mean", random_state=42)),
    ('scaler', StandardScaler()),
    ('classifier', xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        use_label_encoder=False,
        n_estimators=100,
        max_depth=5,
        tree_method='hist'
    ))
])

In [None]:
import mlflow
import mlflow.xgboost
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
import xgboost as xgb
import numpy as np

# Initialize MLflow experiment
mlflow.set_experiment("XGBoost_Training1")

# Assume these exist already:
X = pipeline.X_train_new_  # after undersampling + feature selection
y = pipeline.y_train_new_

# List of parameter combinations to try
param_grid = [
    {"n_estimators": 100, "max_depth": 3, "learning_rate": 0.1},
    {"n_estimators": 200, "max_depth": 5, "learning_rate": 0.05},
    {"n_estimators": 300, "max_depth": 6, "learning_rate": 0.01},
    {"n_estimators": 100, "max_depth": 4, "learning_rate": 0.2, "subsample": 0.8, "colsample_bytree": 0.8},
    {"n_estimators": 150, "max_depth": 3, "learning_rate": 0.1, "scale_pos_weight": 5},
]

# Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, params in enumerate(param_grid):
    with mlflow.start_run(run_name=f"XGBoost_Training_{i+1}"):
        # Log hyperparameters
        mlflow.log_params(params)

        model = xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='auc',
            use_label_encoder=False,
            random_state=42,
            tree_method='hist',
            **params
        )

        # Store fold-wise metrics
        cv_metrics = {'auc': [], 'f1': [], 'precision': [], 'recall': []}

        # Cross-validation
        for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

            model.fit(X_train_fold, y_train_fold)
            y_val_proba = model.predict_proba(X_val_fold)[:, 1]
            y_val_pred = (y_val_proba > 0.5).astype(int)

            # Calculate metrics
            auc = roc_auc_score(y_val_fold, y_val_proba)
            f1 = f1_score(y_val_fold, y_val_pred)
            precision = precision_score(y_val_fold, y_val_pred, zero_division=0)
            recall = recall_score(y_val_fold, y_val_pred)

            # Log each fold's metrics
            mlflow.log_metric(f'fold{fold}_auc', auc)
            mlflow.log_metric(f'fold{fold}_f1', f1)
            mlflow.log_metric(f'fold{fold}_precision', precision)
            mlflow.log_metric(f'fold{fold}_recall', recall)

            # Save for aggregation
            cv_metrics['auc'].append(auc)
            cv_metrics['f1'].append(f1)
            cv_metrics['precision'].append(precision)
            cv_metrics['recall'].append(recall)

        # Log average and std of each metric
        for metric in cv_metrics:
            mean_val = np.mean(cv_metrics[metric])
            std_val = np.std(cv_metrics[metric])
            mlflow.log_metric(f'mean_{metric}', mean_val)
            mlflow.log_metric(f'std_{metric}', std_val)

        # Log model
        mlflow.xgboost.log_model(model, artifact_path="model")

        # Console output
        print(f"\nModel {i+1} with params: {params}")
        for metric in cv_metrics:
            print(f"Mean {metric.upper()}: {np.mean(cv_metrics[metric]):.4f}, "
                  f"STD: {np.std(cv_metrics[metric]):.4f}")

print("\n✅ All experiments logged to MLflow with fold-wise and aggregated metrics.")

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
import xgboost as xgb
import numpy as np
import itertools

# Grid of hyperparameters
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [3, 4, 5],
    "learning_rate": [0.01, 0.05, 0.1],
    "scale_pos_weight": [1, 5, 10],  # Try 1 if no imbalance
    "subsample": [0.8],
    "colsample_bytree": [0.8]
}

# Generate all combinations
all_params = list(itertools.product(
    param_grid['n_estimators'],
    param_grid['max_depth'],
    param_grid['learning_rate'],
    param_grid['scale_pos_weight'],
    param_grid['subsample'],
    param_grid['colsample_bytree']
))

best_f1 = 0
best_config = None

# K-Fold CV with stratification
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Loop through each combination
for i, (n, d, lr, spw, subs, colsample) in enumerate(all_params):
    params = {
        "n_estimators": n,
        "max_depth": d,
        "learning_rate": lr,
        "scale_pos_weight": spw,
        "subsample": subs,
        "colsample_bytree": colsample
    }

    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        tree_method='hist',
        **params
    )

    aucs, f1s, precisions, recalls = [], [], [], []

    for train_idx, val_idx in skf.split(X, y):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train_fold, y_train_fold)
        y_proba = model.predict_proba(X_val_fold)[:, 1]
        y_pred = (y_proba > 0.5).astype(int)  # Can try 0.4/0.6 later

        aucs.append(roc_auc_score(y_val_fold, y_proba))
        f1s.append(f1_score(y_val_fold, y_pred))
        precisions.append(precision_score(y_val_fold, y_pred, zero_division=0))
        recalls.append(recall_score(y_val_fold, y_pred))

    mean_f1 = np.mean(f1s)
    if mean_f1 > best_f1:
        best_f1 = mean_f1
        best_config = params

    print(f"\nModel {i+1} with params: {params}")
    print(f"Mean AUC: {np.mean(aucs):.4f}, Std AUC: {np.std(aucs):.4f}")
    print(f"Mean F1: {mean_f1:.4f}, Std F1: {np.std(f1s):.4f}")
    print(f"Mean Precision: {np.mean(precisions):.4f}, Mean Recall: {np.mean(recalls):.4f}")

print("\n✅ Best config based on F1:")
print(best_config)
print(f"Best F1 Score: {best_f1:.4f}")

In [None]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score
)
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import numpy as np

# 1. Auto-calculate scale_pos_weight
scale_pos_weight = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
print(f"Auto-calculated scale_pos_weight: {scale_pos_weight:.2f}")

# 2. Set model parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': ['aucpr', 'auc'],  # prioritize aucpr
    'tree_method': 'hist',
    'random_state': 42,
    'n_estimators': 300,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'scale_pos_weight': scale_pos_weight,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0
}

# 3. Cross-validation
cv_metrics = {'auc': [], 'aucpr': [], 'f1': [], 'precision': [], 'recall': []}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, val_idx in skf.split(X_train, y_train):
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    model = xgb.XGBClassifier(**params)
    model.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_val, y_fold_val)],
        early_stopping_rounds=20,
        verbose=False
    )
    
    y_pred_proba = model.predict_proba(X_fold_val)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)
    
    cv_metrics['auc'].append(roc_auc_score(y_fold_val, y_pred_proba))
    cv_metrics['aucpr'].append(average_precision_score(y_fold_val, y_pred_proba))
    cv_metrics['f1'].append(f1_score(y_fold_val, y_pred))
    cv_metrics['precision'].append(precision_score(y_fold_val, y_pred))
    cv_metrics['recall'].append(recall_score(y_fold_val, y_pred))

# 4. Print CV results
print("\nCross-Validation Results:")
for metric, values in cv_metrics.items():
    print(f"{metric.upper():<10} Mean: {np.mean(values):.4f} ± {np.std(values):.4f}")

# 5. Train final model
final_model = xgb.XGBClassifier(**params)
final_model.fit(
    X_train_new, y_train_new,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=20,
    verbose=True
)

# 6. Evaluate on test set (default threshold)
y_test_proba = final_model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba > 0.5).astype(int)

print("\nTest Set Performance (Default Threshold=0.5):")
print(classification_report(y_test, y_test_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

# 7. Threshold tuning to improve recall
precisions, recalls, thresholds = precision_recall_curve(y_test, y_test_proba)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-9)

# Find best threshold with recall ≥ 75%
try:
    idx = np.where(recalls >= 0.75)[0][0]
    optimal_threshold = thresholds[idx]
    print(f"\nOptimal Threshold for Recall ≥ 75%: {optimal_threshold:.4f}")
except IndexError:
    optimal_threshold = 0.5
    print("\nNo threshold found for Recall ≥ 75%. Using default 0.5.")

# 8. Evaluate with adjusted threshold
y_test_adj = (y_test_proba >= optimal_threshold).astype(int)
print("\nTest Set Performance (Adjusted Threshold):")
print(classification_report(y_test, y_test_adj))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_adj))

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from imblearn.pipeline import Pipeline as ImbPipeline  # For using RandomUnderSampler inside pipeline

# Final pipeline definition
final_pipeline = ImbPipeline(steps=[
    ('preprocessor', FullPreprocessor(identity_df=identity)),   # Accepts raw test set
    ('undersample', RandomUnderSampler(random_state=42, sampling_strategy=0.2)),
    ('correlation', CorrelationDropper(threshold=0.9)),
    ('feature_selector', XGBFeatureSelector(threshold="mean", random_state=42)),
    ('scaler', StandardScaler()),
    ('model', xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        use_label_encoder=False,
        n_estimators=100,
        max_depth=5,
        tree_method='hist'
    ))
])

In [41]:
import mlflow
import mlflow.xgboost
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
import xgboost as xgb
import numpy as np

with mlflow.start_run(run_name="XGBoost_FullPipeline") as run:
    final_pipeline.fit(X, y)  # Only X, not X_processed!
    
    mlflow.sklearn.log_model(final_pipeline, "xgb_full_pipeline")
    mlflow.log_param("model", "XGBoost")
    mlflow.log_param("feature_selector", "SelectKBest")


  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  return op(a, b)


Dropped 92/277 features due to high correlation
Selected 36 / 185 features




🏃 View run XGBoost_FullPipeline at: https://dagshub.com/tvani2/IEEE-CIS-Fraud-Detection.mlflow/#/experiments/0/runs/1044c3529ec94fd19b53c5f3b53deb32
🧪 View experiment at: https://dagshub.com/tvani2/IEEE-CIS-Fraud-Detection.mlflow/#/experiments/0
