In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(parent_dir, file_name)
    for parent_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns backed by a plain NumPy signed-integer, unsigned-integer or
    float dtype are touched. Object, category, bool, datetime and pandas
    extension-typed columns are left alone, as is every column whose label
    contains ``'ID'``. A column is never widened: if no narrower type fits
    (including all-NaN or infinite-valued float columns) it keeps its dtype.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        allow_float16: When True (the default, matching the historical
            behaviour) float columns whose min/max fit are stored as float16.
            The check is range-only, and float16 cannot represent every
            integer above 2048 or more than roughly three significant decimal
            digits, so pass False to stop at float32 when precision matters.

    Returns:
        The same DataFrame object, with narrower column dtypes where possible.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    # dtype.kind -> (narrower candidates in ascending width, limits lookup).
    # Signed and unsigned integers stay within their own family so a uint8
    # column is never "shrunk" into a wider signed type.
    candidates_by_kind = {
        'i': ((np.int8, np.int16, np.int32), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32), np.iinfo),
        'f': ((np.float16, np.float32) if allow_float16 else (np.float32,), np.finfo),
    }

    for col in df.columns:
        # str() keeps the identifier filter working for non-string labels.
        if 'ID' in str(col):
            continue

        col_type = df[col].dtype
        # Bool ('b'), datetime ('M'), object ('O') and extension dtypes must
        # not be routed through the numeric casts below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        candidates, limits_of = candidates_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered by width; once one is not strictly
            # narrower than the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value exactly at the type's limit still fits.
            # NaN min/max (empty or all-NaN column) fail both comparisons, so
            # such columns are left untouched.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    # Guard the percentage against a zero-byte starting size.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f'Decreased by {reduction:.1f}%')

    return df

In [None]:
# %pip installs into the environment of the running kernel; a bare !pip runs
# whichever pip happens to be first on the shell PATH.
%pip install dagshub

In [None]:
# %pip installs into the environment of the running kernel; a bare !pip runs
# whichever pip happens to be first on the shell PATH.
%pip install mlflow

In [None]:
import mlflow
import dagshub
# Attach this session to the DagsHub repository used for experiment tracking.
# NOTE(review): mlflow=True presumably points MLflow at that repository's
# tracking server — confirm against the dagshub client documentation.
dagshub.init(
    repo_name='ML_ASS_2',
    repo_owner='dshan21',
    mlflow=True,
)

In [None]:
# Locations of the two training tables.
transaction_train_file_path = "/kaggle/input/ieee-fraud-detection/train_transaction.csv"
identity_train_file_path = "/kaggle/input/ieee-fraud-detection/train_identity.csv"

# Read both CSV files fully into memory as DataFrames.
transaction_df = pd.read_csv(filepath_or_buffer=transaction_train_file_path)
identity_df = pd.read_csv(filepath_or_buffer=identity_train_file_path)

In [None]:
# Make "XGBoost_Training" the active MLflow experiment for the runs started below.
mlflow.set_experiment(experiment_name="XGBoost_Training")

# FEATURE CLEANING

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
# Outputs of this cell, consumed by the later feature-engineering,
# feature-selection and training cells.
merged_df = None
y = None

with mlflow.start_run(run_name="XGBoost_Cleaning"):
    # Columns whose share of missing values exceeds this cutoff are dropped.
    missing_cutoff = 0.9

    # Left join keeps every transaction row, with or without identity data.
    merged_df = transaction_df.merge(identity_df, on='TransactionID', how='left')

    # Split off the target and remove the row identifier from the features.
    y = merged_df['isFraud'].copy()
    merged_df.drop(['isFraud', 'TransactionID'], axis=1, inplace=True)

    missing_rate = merged_df.isnull().mean()
    high_missing_cols = missing_rate[missing_rate > missing_cutoff].index.tolist()
    mlflow.log_param("high_missing_threshold", missing_cutoff)

    # Record which columns were removed (and how sparse they were) as an artifact.
    high_missing_df = pd.DataFrame({
        'column_name': high_missing_cols,
        'missing_rate': [missing_rate[col] for col in high_missing_cols]
    }).sort_values('missing_rate', ascending=False)

    high_missing_file = "high_missing_columns.csv"
    high_missing_df.to_csv(high_missing_file, index=False)

    mlflow.log_artifact(high_missing_file)

    merged_df.drop(high_missing_cols, axis=1, inplace=True)
    mlflow.log_param("removed_columns", len(high_missing_cols))

    categorical_cols = merged_df.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = merged_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Impute: column median for numeric features, a sentinel label for categoricals.
    for col in numerical_cols:
        if merged_df[col].isnull().sum() > 0:
            merged_df[col] = merged_df[col].fillna(merged_df[col].median())

    for col in categorical_cols:
        if merged_df[col].isnull().sum() > 0:
            merged_df[col] = merged_df[col].fillna('Unknown')

    # Swap this instance for LabelEncoder() to switch the encoding strategy.
    categorical_encoder = OneHotEncoder()
    mlflow.log_param("categorical_col_repl_method", categorical_encoder)
    if isinstance(categorical_encoder, LabelEncoder):
        for col in categorical_cols:
            merged_df[col] = categorical_encoder.fit_transform(merged_df[col].astype(str))
    elif isinstance(categorical_encoder, OneHotEncoder):
        encoded_frames = []
        for col in categorical_cols:
            encoded_array = categorical_encoder.fit_transform(
                merged_df[col].astype(str).to_numpy().reshape(-1, 1)
            )
            encoded_cols = categorical_encoder.get_feature_names_out([col])
            # Indicators are only 0/1: shrink the sparse result to uint8
            # *before* densifying, so each dummy column costs one byte per
            # row instead of the eight a float64 array would take.
            encoded_frames.append(pd.DataFrame(
                encoded_array.astype(np.uint8).toarray(),
                columns=encoded_cols,
                index=merged_df.index,
            ))

        # One drop and one concat, instead of re-copying the ever-growing
        # frame once per categorical column. Column order is unchanged:
        # remaining features first, then each column's indicators in turn.
        merged_df = pd.concat(
            [merged_df.drop(columns=categorical_cols)] + encoded_frames,
            axis=1,
        )
    mlflow.log_param("final_feature_count",  merged_df.shape[1])

# FEATURE ENGINEERING

In [None]:
# Continue on a fresh copy of the cleaned frame.
# NOTE(review): presumably the copy is there to consolidate the frame before
# new columns are added — confirm.
newframe = merged_df.copy()
merged_df = newframe

seconds_per_hour = 60 * 60
seconds_per_day = 24 * seconds_per_hour

# TransactionDT is consumed (dropped) at the end of this branch, so the time
# features are derived only while it is still present. Without the guard a
# second execution of this cell fails with KeyError on 'TransactionDT'.
if 'TransactionDT' in merged_df.columns:
    transaction_dt = merged_df['TransactionDT']
    seconds_into_day = transaction_dt % seconds_per_day

    merged_df['transaction_day'] = transaction_dt // seconds_per_day
    merged_df['transaction_hour'] = seconds_into_day // seconds_per_hour
    merged_df['transaction_minute'] = (seconds_into_day % seconds_per_hour) // 60
    merged_df['transaction_second'] = transaction_dt % 60

    # NOTE(review): day_of_week is counted from whatever day TransactionDT's
    # zero falls on; treating 0 and 6 as the weekend assumes that origin —
    # confirm.
    merged_df['day_of_week'] = merged_df['transaction_day'] % 7
    merged_df['is_weekend'] = merged_df['day_of_week'].isin([0, 6]).astype(int)

    merged_df.drop('TransactionDT', axis=1, inplace=True)

# Amount features: log1p compresses the scale; the fractional part is kept
# as its own feature.
merged_df['amount_log'] = np.log1p(merged_df['TransactionAmt'])
merged_df['amount_decimal'] = merged_df['TransactionAmt'] - np.floor(merged_df['TransactionAmt'])

# FEATURE SELECTION

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import xgboost as xgb

# Output of this cell: the feature matrix restricted to the selected columns.
X_selected = None
with mlflow.start_run(run_name="XGBoost_Feature_Selection"):
    # Stratified 80/20 split; the validation part is passed as the eval set
    # that early stopping monitors.
    X_train, X_val, y_train, y_val = train_test_split(
        merged_df,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    # Hyper-parameters of the classifier used only to rank features.
    selector_params = dict(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=1,
        random_state=42,
        tree_method='hist',
        eval_metric='auc',
        early_stopping_rounds=50,
    )
    model = xgb.XGBClassifier(**selector_params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Pair every column with the importance the fitted model reports for it,
    # most important first.
    feature_importances = model.feature_importances_
    feature_importance_df = pd.DataFrame(
        {'Feature': merged_df.columns, 'Importance': feature_importances}
    ).sort_values('Importance', ascending=False)

    # Keep the highest-ranked columns (all of them if fewer are available).
    top_n_features = 200
    top_features = feature_importance_df['Feature'].head(top_n_features).tolist()
    X_selected = merged_df[top_features].copy()

    mlflow.log_param("feature_selection_method", "XGBoost_Importance")
    mlflow.log_param("top_n_features", top_n_features)

    # Write the full ranking and the selected names, then attach both to the run.
    feature_importance_df.to_csv('feature_importance.csv', index=False)
    with open('top_features.txt', 'w') as f:
        f.writelines(f"{feature}\n" for feature in top_features)

    mlflow.log_artifact('feature_importance.csv')
    mlflow.log_artifact('top_features.txt')

# MODEL TRAINING

In [None]:
import mlflow
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, confusion_matrix

import json

with mlflow.start_run(run_name="XGBoost_Training"):
    run_id = mlflow.active_run().info.run_id
    print(f"MLflow Run ID: {run_id}")

    # Booster parameters for the final model.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': 0.05,
        'max_depth': 5,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'tree_method': 'hist',
        'random_state': 42
    }

    # Log the whole parameter dict in one call.
    mlflow.log_params(params)

    # Stratified 80/20 split of the selected features.
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.2, random_state=42, stratify=y
    )

    mlflow.log_param("num_features", X_selected.shape[1])
    mlflow.log_param("dataset_size", X_train.shape)
    mlflow.log_param("test_size", 0.2)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Early stopping watches the last entry of `evals`, i.e. the test split.
    # NOTE(review): the same split is scored below, so the reported AUC is
    # optimistic; a separate validation split would give a cleaner estimate.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=[(dtrain, 'train'), (dtest, 'test')],
        early_stopping_rounds=50,
        verbose_eval=100
    )

    # xgb.train returns the booster from the last round, not the best one.
    # Pin the prediction to the best iteration explicitly so the score does
    # not depend on how a given XGBoost version treats early stopping.
    y_pred_proba = model.predict(
        dtest, iteration_range=(0, model.best_iteration + 1)
    )
    auc_score = roc_auc_score(y_test, y_pred_proba)
    y_pred_binary = (y_pred_proba > 0.5).astype(int)

    mlflow.log_metric("auc", auc_score)
    mlflow.log_metric("best_iteration", model.best_iteration)

    mlflow.xgboost.log_model(model,
        "xgboost_model",
        registered_model_name="FraudXgboostModel")

    model_path = "xgboost_model.json"
    model.save_model(model_path)
    mlflow.log_artifact(model_path)

    # Gain-based importances of the trained booster, highest first.
    feature_importance = model.get_score(importance_type='gain')
    importance_df = pd.DataFrame({
        'Feature': list(feature_importance.keys()),
        'Importance': list(feature_importance.values())
    }).sort_values('Importance', ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
    plt.title('Top 20 Features by Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

    mlflow.log_artifact('feature_importance.png')

    # Describe the categorical encoding that the cleaning cell actually used,
    # rather than hard-coding a label that can drift out of sync with it.
    if isinstance(categorical_encoder, OneHotEncoder):
        categorical_encoding = 'One-hot encoding'
    else:
        categorical_encoding = 'Label encoding'

    pipeline_config = {
        'model_type': 'XGBoost',
        'feature_selection': 'importance_based',
        'top_features': list(X_selected.columns),
        'preprocessing': f'{categorical_encoding} for categoricals, median imputation for numericals',
        'model_params': params,
        'auc_score': float(auc_score)  # Convert numpy float to Python float for JSON serialization
    }

    with open('pipeline_config.json', 'w') as f:
        json.dump(pipeline_config, f, indent=4)

    mlflow.log_artifact('pipeline_config.json')

    print(f"AUC Score: {auc_score:.4f}")
    print(f"Best Iteration: {model.best_iteration}")