In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
pip install mlflow dagshub

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.50.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1

In [3]:
import dagshub
dagshub.init(repo_owner='tvani2', repo_name='IEEE-CIS-Fraud-Detection', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=f64590a8-a54e-4311-938b-2132fe72f40c&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=635b2cea6cc5e0cf7e451a4c9d415833cfd1cb92a85d1a1261f4ae0ef66c0d32




Output()

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class FullPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, 
                 target_column='isFraud',
                 transaction_thresh=0.6,
                 identity_thresh=0.9,
                 identity_df=None):
        self.target_column = target_column
        self.transaction_thresh = transaction_thresh
        self.identity_thresh = identity_thresh
        self.identity_df = identity_df  # identity will be passed during initialization

    def fit(self, X, y=None):
        # 1. Drop columns with too many missing values
        self.transaction_cols_to_keep = X.columns[X.isnull().mean() < self.transaction_thresh].tolist()
        if self.identity_df is not None:
            self.identity_cols_to_keep = self.identity_df.columns[self.identity_df.isnull().mean() < self.identity_thresh].tolist()
        else:
            self.identity_cols_to_keep = []

        # 2. Merge
        if self.identity_df is not None:
            identity_filtered = self.identity_df[self.identity_cols_to_keep]
            X = X[self.transaction_cols_to_keep].merge(identity_filtered, how='left', on='TransactionID')
        else:
            X = X[self.transaction_cols_to_keep]

        # 3. Separate numeric and categorical columns
        self.numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        self.categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

        # 4. Imputers
        self.numeric_imputer = SimpleImputer(strategy='mean')
        self.categorical_imputer = SimpleImputer(strategy='most_frequent')

        # Fit imputers
        self.numeric_imputer.fit(X[self.numeric_cols])
        self.categorical_imputer.fit(X[self.categorical_cols])

        # 5. Determine WOE and one-hot columns
        s = X[self.categorical_cols].nunique()
        self.woe_columns = list(s[s > 3].index)
        self.one_hot_columns = list(s[s <= 3].index)

        # 6. Fit WOE mappings
        if y is not None:
            df_woe = X[self.woe_columns].copy()
            df_woe['target'] = y.reset_index(drop=True)

            self.woe_mappings = {}
            self.woe_columns_fillna = df_woe[self.woe_columns].mode().T[0].to_dict()

            for col in self.woe_columns:
                groups = df_woe.groupby(col)['target'].agg(['count', 'mean'])
                groups['n_pos'] = groups['mean'] * groups['count']
                groups['n_neg'] = groups['count'] - groups['n_pos']

                total_pos = groups['n_pos'].sum()
                total_neg = groups['n_neg'].sum()

                groups['prop_pos'] = groups['n_pos'] / total_pos
                groups['prop_neg'] = groups['n_neg'] / total_neg

                groups['woe'] = np.log(groups['prop_pos'] / groups['prop_neg'])

                groups.replace([np.inf, -np.inf, np.nan], 0, inplace=True)
                self.woe_mappings[col] = groups['woe'].to_dict()

        return self

    def transform(self, X):
        # 1. Drop columns with too many missing values
        if self.identity_df is not None:
            identity_filtered = self.identity_df[self.identity_cols_to_keep]
            X = X[self.transaction_cols_to_keep].merge(identity_filtered, how='left', on='TransactionID')
        else:
            X = X[self.transaction_cols_to_keep]

        # 2. Impute missing values
        X[self.numeric_cols] = self.numeric_imputer.transform(X[self.numeric_cols])
        X[self.categorical_cols] = self.categorical_imputer.transform(X[self.categorical_cols])

        # 3. Apply WOE encoding
        for col in self.woe_columns:
            new_col = f'{col}_woe'
            X[new_col] = (
                X[col]
                .map(self.woe_mappings[col])
                .fillna(self.woe_mappings[col].get(self.woe_columns_fillna[col], 0))
            )

        # 4. One-hot encode
        X = pd.get_dummies(X, columns=self.one_hot_columns, drop_first=True, dummy_na=True)

        # 5. Drop original WOE and one-hot columns
        cols_to_drop = [col for col in (self.woe_columns + self.one_hot_columns) if col in X.columns]
        X = X.drop(columns=cols_to_drop)

        return X

In [5]:
from sklearn.impute import SimpleImputer

# Load data
transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')

# Separate target
target_column = 'isFraud'
y = transaction[target_column]
X = transaction.drop(columns=[target_column])

# Initialize preprocessor
preprocessor = FullPreprocessor(
    target_column=target_column,
    transaction_thresh=0.6,
    identity_thresh=0.9,
    identity_df=identity
)

# Fit-transform
X_processed = preprocessor.fit_transform(X, y)

print(X_processed.shape)

  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (
  X[new_col] = (


(590540, 277)


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

class CorrelationDropper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.9):
        self.threshold = threshold
        self.to_drop_ = None

    def fit(self, X, y=None):
        # 1. Calculate correlation matrix
        corr_matrix = X.corr().abs()
        
        # 2. Upper triangle of the correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        
        # 3. Find features with correlation greater than threshold
        self.to_drop_ = [column for column in upper.columns if any(upper[column] > self.threshold)]
        
        print(f"Columns to drop due to high correlation ({len(self.to_drop_)}): {self.to_drop_}")
        
        return self

    def transform(self, X):
        # 4. Drop them
        X_dropped = X.drop(columns=self.to_drop_, errors='ignore')
        return X_dropped

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

In [8]:
dropper = CorrelationDropper(threshold=0.9)

# Fit on training data and transform it
X_train_new = dropper.fit_transform(X_train)

# Only transform test data
X_test_new = dropper.transform(X_test)

# Done!
print("Train set:", X_train_new.shape)
print("Test set:", X_test_new.shape)

  return op(a, b)


Columns to drop due to high correlation (94): ['TransactionDT', 'C2', 'C4', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C14', 'V5', 'V11', 'V13', 'V16', 'V18', 'V20', 'V21', 'V22', 'V28', 'V30', 'V31', 'V32', 'V33', 'V34', 'V36', 'V40', 'V43', 'V45', 'V49', 'V50', 'V51', 'V52', 'V54', 'V57', 'V58', 'V60', 'V63', 'V64', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V76', 'V79', 'V81', 'V84', 'V85', 'V90', 'V91', 'V92', 'V93', 'V94', 'V96', 'V97', 'V101', 'V102', 'V103', 'V105', 'V106', 'V113', 'V126', 'V127', 'V128', 'V132', 'V133', 'V134', 'V137', 'V279', 'V280', 'V292', 'V293', 'V294', 'V295', 'V296', 'V297', 'V298', 'V299', 'V301', 'V304', 'V306', 'V307', 'V308', 'V309', 'V311', 'V315', 'V316', 'V317', 'V318', 'V321', 'id_16_NotFound', 'id_29_NotFound']
Train set: (472432, 183)
Test set: (118108, 183)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 1) Initialize the classifier with class‐weight to help imbalance
clf = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',  # penalize misclassifying the minority class
    n_jobs=-1
)

# 2) Fit on training data
clf.fit(X_train_new, y_train)

from sklearn.metrics import roc_auc_score

# get predicted probabilities for the positive class
train_proba = clf.predict_proba(X_train_new)[:, 1]  
train_roc_auc = roc_auc_score(y_train, train_proba)
print(f"ROC–AUC on Train Set: {train_roc_auc:.4f}")


# 3) Predict on hold‐out test set
y_pred = clf.predict(X_test_new)
y_proba = clf.predict_proba(X_test_new)[:, 1]

# 4) Metrics on test set
print("=== Classification Report on Test Set ===")
print(classification_report(y_test, y_pred, digits=4))

roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC–AUC on Test Set: {roc_auc:.4f}")

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix (TN, FP; FN, TP):")
print(cm)

# 5) Stratified K-Fold cross‐validation (ROC–AUC)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    clf, X_train_new, y_train,
    cv=skf,
    scoring='roc_auc',
    n_jobs=-1
)
print(f"5-Fold CV ROC–AUC scores: {cv_scores}")
print(f"Mean CV ROC–AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

ROC–AUC on Train Set: 0.6535
=== Classification Report on Test Set ===
              precision    recall  f1-score   support

           0     0.9787    0.5196    0.6788    113866
           1     0.0512    0.6959    0.0954      4242

    accuracy                         0.5259    118108
   macro avg     0.5149    0.6077    0.3871    118108
weighted avg     0.9453    0.5259    0.6578    118108

ROC–AUC on Test Set: 0.6548
Confusion Matrix (TN, FP; FN, TP):
[[59160 54706]
 [ 1290  2952]]
5-Fold CV ROC–AUC scores: [0.65848842 0.65278147 0.70132648 0.69874297 0.65216779]
Mean CV ROC–AUC: 0.6727 ± 0.0224


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score

clf = LogisticRegression(
    solver='saga',
    penalty='elasticnet',
    l1_ratio=0.5,
    C=0.05,  # stronger regularization
    max_iter=2000,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)

# 2) Fit on training data
clf.fit(X_train_new, y_train)

from sklearn.metrics import roc_auc_score

# get predicted probabilities for the positive class
train_proba = clf.predict_proba(X_train_new)[:, 1]  
train_roc_auc = roc_auc_score(y_train, train_proba)
print(f"ROC–AUC on Train Set: {train_roc_auc:.4f}")


# 3) Predict on hold‐out test set
y_pred = clf.predict(X_test_new)
y_proba = clf.predict_proba(X_test_new)[:, 1]

# 4) Metrics on test set
print("=== Classification Report on Test Set ===")
print(classification_report(y_test, y_pred, digits=4))

roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC–AUC on Test Set: {roc_auc:.4f}")

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix (TN, FP; FN, TP):")
print(cm)

# 5) Stratified K-Fold cross‐validation (ROC–AUC)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    clf, X_train_new, y_train,
    cv=skf,
    scoring='roc_auc',
    n_jobs=-1
)
print(f"5-Fold CV ROC–AUC scores: {cv_scores}")
print(f"Mean CV ROC–AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

KeyboardInterrupt: 

In [None]:
# import mlflow
# import mlflow.sklearn
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
# from sklearn.model_selection import StratifiedKFold, cross_val_score

# # 0) Set experiment
# mlflow.set_experiment("Logistic_regressio_Training")

# # 1) Start a new MLflow run
# with mlflow.start_run(run_name="Logistic_regression_Training1"):

#     # 2) Initialize the classifier
#     clf = LogisticRegression(
#         solver='lbfgs',
#         max_iter=1000,
#         class_weight='balanced',
#         n_jobs=-1
#     )

#     # 3) Fit on training data
#     clf.fit(X_train_rfe, y_train)

#     # 4) Evaluate on training data
#     train_proba = clf.predict_proba(X_train_rfe)[:, 1]
#     train_roc_auc = roc_auc_score(y_train, train_proba)
#     mlflow.log_metric("train_roc_auc", train_roc_auc)

#     # 5) Predict on test set
#     y_pred = clf.predict(X_test_rfe)
#     y_proba = clf.predict_proba(X_test_rfe)[:, 1]

#     test_roc_auc = roc_auc_score(y_test, y_proba)
#     mlflow.log_metric("test_roc_auc", test_roc_auc)

#     # 6) Classification Report
#     report = classification_report(y_test, y_pred, output_dict=True)
#     for label, metrics in report.items():
#         if isinstance(metrics, dict):
#             for metric_name, value in metrics.items():
#                 mlflow.log_metric(f"{label}_{metric_name}", value)

#     # 7) Confusion Matrix
#     cm = confusion_matrix(y_test, y_pred)
#     mlflow.log_metric("tn", cm[0, 0])
#     mlflow.log_metric("fp", cm[0, 1])
#     mlflow.log_metric("fn", cm[1, 0])
#     mlflow.log_metric("tp", cm[1, 1])

#     # 8) Stratified K-Fold Cross Validation (ROC-AUC)
#     skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     cv_scores = cross_val_score(
#         clf, X_train_rfe, y_train,
#         cv=skf,
#         scoring='roc_auc',
#         n_jobs=-1
#     )
#     mlflow.log_metric("cv_mean_roc_auc", cv_scores.mean())
#     mlflow.log_metric("cv_std_roc_auc", cv_scores.std())

#     # 9) Log model itself
#     mlflow.sklearn.log_model(clf, artifact_path="logistic_regression_model")

#     # 10) Optional: Log params
#     mlflow.log_params({
#         "solver": "lbfgs",
#         "max_iter": 1000,
#         "class_weight": "balanced",
#         "n_jobs": -1
#     })

# print("MLflow run completed and logged!")