In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


In [2]:
# Read the transaction-level table produced by the EDA step
df = pd.read_csv("../data/processed/eda_data.csv")

# Feature engineering: collapse the transactions into one row per customer.
# Amount and Value get summary statistics; the time parts are reduced to the
# number of distinct hours / days / months in which the customer transacted.
customer_aggregations = {
    'Amount': ['sum', 'mean', 'std', 'max', 'min', 'count'],
    'Value': ['sum', 'mean', 'std', 'max', 'min'],
    'TransactionHour': 'nunique',
    'TransactionDay': 'nunique',
    'TransactionMonth': 'nunique'
}
agg_df = df.groupby('CustomerId').agg(customer_aggregations).reset_index()

# The aggregation yields two-level column labels; join each (source, stat)
# pair into a single "<source>_<stat>" name and keep the key column as-is.
flat_names = [f"{source}_{stat}".strip() for source, stat in agg_df.columns[1:]]
agg_df.columns = ['CustomerId'] + flat_names
customer_ids = agg_df['CustomerId']

# Names of the int64 / float64 feature columns (the key column is excluded)
num_features = [
    name
    for name, dtype in agg_df.dtypes.items()
    if name != 'CustomerId' and dtype in ['int64', 'float64']
]

# Feature matrix, plus an all-zero placeholder target with one entry per customer
X = agg_df.drop(columns=['CustomerId'])
y = np.zeros(len(X))

print("X columns:", X.columns.tolist())

# Median-impute missing values (e.g. the std columns, which pandas leaves as
# NaN for a customer with a single transaction), then standardise each column.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Fit the preprocessing on every customer row and transform in one pass
X_scaled = numeric_pipeline.fit_transform(X)

print("Transformed X shape:", X_scaled.shape)

print("Feature extraction complete. Processed data ready for modeling.")

X columns: ['Amount_sum', 'Amount_mean', 'Amount_std', 'Amount_max', 'Amount_min', 'Amount_count', 'Value_sum', 'Value_mean', 'Value_std', 'Value_max', 'Value_min', 'TransactionHour_nunique', 'TransactionDay_nunique', 'TransactionMonth_nunique']
Transformed X shape: (3742, 14)
Feature extraction complete. Processed data ready for modeling.


In [3]:
# Task 4 : proxy labels creation
from sklearn.cluster import KMeans
import seaborn as sns

# Parse timestamps; errors='coerce' turns unparseable values into NaT instead of raising
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'], errors='coerce')

# Snapshot date for Recency: one day after the latest transaction in the data,
# so the most recently active customer gets Recency == 1 rather than 0
snapshot_date = df['TransactionStartTime'].max() + pd.Timedelta(days=1)

# RFM per CustomerId:
#   Recency   - days between the snapshot date and the customer's last transaction
#   Frequency - number of transactions (non-null TransactionId values)
#   Monetary  - total Value
rfm = df.groupby('CustomerId').agg({
    'TransactionStartTime': lambda x: (snapshot_date - x.max()).days,
    'TransactionId': 'count',
    'Value': 'sum'
}).reset_index()
rfm.columns = ['CustomerId', 'Recency', 'Frequency', 'Monetary']

# Standardise RFM so no single dimension dominates the distance metric
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[['Recency', 'Frequency', 'Monetary']])

# KMeans clustering (3 segments); random_state fixes the initialisation seed
kmeans = KMeans(n_clusters=3, random_state=42)
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

# Per-cluster RFM means, ordered by mean Frequency (ascending)
cluster_summary = rfm.groupby('Cluster').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': 'mean'
}).sort_values(by='Frequency')

# The high-risk cluster is chosen on mean Frequency ONLY: the least frequent
# segment is assumed to be the riskiest. Recency and Monetary are summarised
# above for inspection but do not take part in the selection.
high_risk_cluster = cluster_summary.index[0]
rfm['is_high_risk'] = (rfm['Cluster'] == high_risk_cluster).astype(int)

# Keep only the key and the label, then attach the label to every transaction row.
# Any 'is_high_risk' column left over from a previous run of this cell is dropped
# first; otherwise the merge would emit is_high_risk_x / is_high_risk_y and the
# saved CSV would carry the wrong columns. validate= asserts one label per customer.
rfm = rfm[['CustomerId', 'is_high_risk']]
df = (
    df.drop(columns=['is_high_risk'], errors='ignore')
      .merge(rfm, on='CustomerId', how='left', validate='many_to_one')
)

# Save the processed data with proxy labels
PROCESSED_DATA_PATH = "../data/processed/eda_data_with_proxy_labels.csv"
df.to_csv(PROCESSED_DATA_PATH, index=False)

print("Proxy labels created. High-risk cluster:", high_risk_cluster)
print(rfm['is_high_risk'].value_counts())

Proxy labels created. High-risk cluster: 0
is_high_risk
0    2307
1    1435
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import mlflow
import mlflow.sklearn

# Rebuild a labelled modelling table from the scaled feature matrix.
# The original column names are passed through so the fitted (and registered)
# models carry real feature names instead of the positional integers 0..13.
df_final = pd.DataFrame(X_scaled, columns=X.columns)
df_final['CustomerId'] = customer_ids.values

# One label per customer; validate= fails loudly if either side has duplicate
# CustomerIds, which would silently duplicate training rows.
df_final = df_final.merge(
    rfm[['CustomerId', 'is_high_risk']],
    on='CustomerId',
    validate='one_to_one',
)

y = df_final['is_high_risk']
X_final = df_final.drop(columns=['CustomerId', 'is_high_risk'])

# NOTE(review): X_scaled was produced by fitting the imputer/scaler on ALL
# customers before this split, so test-set statistics leak into preprocessing.
# NOTE(review): the label comes from clustering Recency/Frequency/Monetary, and
# Monetary is the same per-customer sum of Value as the Value_sum feature
# (Frequency presumably matches Amount_count as well - confirm), so the scores
# below mostly measure how well the models recover the clustering.

# Train/test split (stratified so both parts keep the class balance)
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, stratify=y, random_state=42)

# Train and Evaluate (Task 5)
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

mlflow.set_experiment("credit-risk-model")

for name, model in models.items():
    # One MLflow run per baseline model
    with mlflow.start_run(run_name=name):
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        # Probability of the positive (high-risk) class, needed for ROC AUC
        probs = model.predict_proba(X_test)[:, 1]

        acc = accuracy_score(y_test, preds)
        prec = precision_score(y_test, preds)
        rec = recall_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        roc = roc_auc_score(y_test, probs)

        mlflow.log_param("model", name)
        mlflow.log_metrics({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1,
            "roc_auc": roc
        })

        # Log the fitted estimator and register it under a space-free name
        mlflow.sklearn.log_model(model, "model", registered_model_name=name.replace(" ", "_"))

        print(f"✅ {name} Results")
        print(classification_report(y_test, preds))

2025/07/01 17:59:07 INFO mlflow.tracking.fluent: Experiment with name 'credit-risk-model' does not exist. Creating a new experiment.
Successfully registered model 'Logistic_Regression'.
Created version '1' of model 'Logistic_Regression'.


✅ Logistic Regression Results
              precision    recall  f1-score   support

           0       0.83      0.60      0.70       462
           1       0.56      0.81      0.66       287

    accuracy                           0.68       749
   macro avg       0.70      0.70      0.68       749
weighted avg       0.73      0.68      0.68       749





✅ Random Forest Results
              precision    recall  f1-score   support

           0       0.81      0.78      0.80       462
           1       0.67      0.70      0.68       287

    accuracy                           0.75       749
   macro avg       0.74      0.74      0.74       749
weighted avg       0.75      0.75      0.75       749



Successfully registered model 'Random_Forest'.
Created version '1' of model 'Random_Forest'.


In [5]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning
# Search space per model; the keys must match the keys of `models`
param_grids = {
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['lbfgs']
    },
    "Random Forest": {
        'n_estimators': [100, 200],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
}

mlflow.set_experiment("credit-risk-model-tuned")

for name, model in models.items():
    print(f"\n🔍 Running GridSearch for {name}")
    # 3-fold cross-validated search on the training split, selecting on F1
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        cv=3,
        scoring='f1',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)
    # Best configuration, refit on the whole training split by GridSearchCV
    tuned_model = grid_search.best_estimator_
    preds = tuned_model.predict(X_test)
    probs = tuned_model.predict_proba(X_test)[:, 1]

    # Hold-out metrics, same set as logged for the untuned baselines
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    roc = roc_auc_score(y_test, probs)

    with mlflow.start_run(run_name=f"{name} - Tuned"):
        # Log the model name too, matching what the baseline runs record
        mlflow.log_param("model", name)
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metrics({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1,
            "roc_auc": roc,
            # Mean cross-validated F1 of the winning configuration: the number
            # the search actually selected on, kept next to the hold-out scores
            "cv_best_f1": grid_search.best_score_
        })
        mlflow.sklearn.log_model(tuned_model, "model", registered_model_name=name.replace(" ", "_") + "_Tuned")
        print(f"✅ {name} tuned and logged.\n")
        print(classification_report(y_test, preds))

2025/07/01 17:59:17 INFO mlflow.tracking.fluent: Experiment with name 'credit-risk-model-tuned' does not exist. Creating a new experiment.



🔍 Running GridSearch for Logistic Regression
Fitting 3 folds for each of 4 candidates, totalling 12 fits


Successfully registered model 'Logistic_Regression_Tuned'.
Created version '1' of model 'Logistic_Regression_Tuned'.


✅ Logistic Regression tuned and logged.

              precision    recall  f1-score   support

           0       0.83      0.60      0.70       462
           1       0.56      0.81      0.66       287

    accuracy                           0.68       749
   macro avg       0.70      0.70      0.68       749
weighted avg       0.73      0.68      0.68       749


🔍 Running GridSearch for Random Forest
Fitting 3 folds for each of 24 candidates, totalling 72 fits




✅ Random Forest tuned and logged.

              precision    recall  f1-score   support

           0       0.83      0.77      0.80       462
           1       0.67      0.74      0.70       287

    accuracy                           0.76       749
   macro avg       0.75      0.76      0.75       749
weighted avg       0.77      0.76      0.76       749



Successfully registered model 'Random_Forest_Tuned'.
Created version '1' of model 'Random_Forest_Tuned'.


In [6]:
import shutil

# Export the tuned logistic regression from the MLflow registry to a local folder.
# NOTE(review): in the recorded run above, the tuned Random Forest scored higher
# on the hold-out set than this model - confirm Logistic Regression is the
# intended export.
EXPORT_DIR = "exported_model"

# "latest" resolves to the newest registered version. A hard-coded version 1
# would keep exporting the first model ever registered, even after re-running
# the notebook has registered newer versions.
best_model = mlflow.sklearn.load_model("models:/Logistic_Regression_Tuned/latest")

# save_model refuses to write into an existing non-empty directory, so clear
# the previous export first to keep this cell re-runnable.
shutil.rmtree(EXPORT_DIR, ignore_errors=True)
mlflow.sklearn.save_model(best_model, EXPORT_DIR)
