In [2]:
import pandas as pd 
import numpy as np
import mlflow
import mlflow.sklearn
import pickle
from sklearn.metrics import roc_auc_score
from datetime import datetime
import os
from sklearn.model_selection import train_test_split, KFold
from optbinning import BinningProcess, Scorecard  # Ensure you import the necessary classes
from sklearn.linear_model import LogisticRegression  # Ensure this is imported

In [3]:
# Location of the processed credit dataset. The default is the original local
# path; set the CREDIT_DATA_PATH environment variable to load the workbook from
# another location without editing this cell.
data_path = os.environ.get(
    "CREDIT_DATA_PATH",
    '/Users/macbookpro/Documents/Applications/Fairmoney/Credit Scoring Model/data/Processed/processed_credit_data.xlsx',
)
dfs = pd.read_excel(data_path)

# Quick look at the data: first rows, schema, and summary statistics.
display(dfs.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only appends a stray "None" to the cell output.
dfs.info()
display(dfs.describe())


Unnamed: 0.1,Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,installment_rate,personal_status,other_debtors,...,installment_plan,housing,existing_credits,default,dependents,foreign_worker,job,gender,employment_length_years,residence_history_years
0,0,-43.0,6,critical,radio/tv,1169,781.570379,4,single,none,...,none,own,2,0,1,yes,skilled employee,male,13.0,6.0
1,1,75.0,48,repaid,radio/tv,5951,89.0,2,Unknown,none,...,none,own,1,1,1,yes,skilled employee,female,2.0,0.416667
2,2,97.245875,12,critical,education,2096,24.0,2,single,none,...,none,own,1,0,2,yes,unskilled resident,male,5.0,4.0
3,3,-32.0,42,repaid,furniture,7882,9.0,2,single,guarantor,...,none,for free,1,0,2,yes,skilled employee,male,5.0,13.0
4,4,-23.0,24,delayed,car (new),4870,43.0,3,single,none,...,none,for free,2,1,2,yes,skilled employee,male,3.0,13.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               1000 non-null   int64  
 1   checking_balance         1000 non-null   float64
 2   months_loan_duration     1000 non-null   int64  
 3   credit_history           1000 non-null   object 
 4   purpose                  1000 non-null   object 
 5   amount                   1000 non-null   int64  
 6   savings_balance          1000 non-null   float64
 7   installment_rate         1000 non-null   int64  
 8   personal_status          1000 non-null   object 
 9   other_debtors            1000 non-null   object 
 10  property                 1000 non-null   object 
 11  age                      1000 non-null   int64  
 12  installment_plan         1000 non-null   object 
 13  housing                  1000 non-null   object 
 14  existing_credits         

None

Unnamed: 0.1,Unnamed: 0,checking_balance,months_loan_duration,amount,savings_balance,installment_rate,age,existing_credits,default,dependents,employment_length_years,residence_history_years
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,97.245875,20.903,3271.258,781.570379,2.973,35.546,1.407,0.3,1.155,5.129583,7.085833
std,288.819436,161.02931,12.058814,2822.736876,2726.687149,1.118715,11.375469,0.577654,0.458487,0.362086,5.15184,7.473123
min,0.0,-50.0,4.0,250.0,0.0,1.0,19.0,1.0,0.0,1.0,0.0,0.0
25%,249.75,-5.25,12.0,1365.5,38.0,2.0,27.0,1.0,0.0,1.0,1.0,0.75
50%,499.5,97.245875,18.0,2319.5,82.5,3.0,33.0,1.0,0.0,1.0,3.0,4.0
75%,749.25,97.245875,24.0,3972.25,781.570379,4.0,42.0,2.0,1.0,1.0,7.0,13.0
max,999.0,999.0,72.0,18424.0,19972.0,4.0,75.0,4.0,1.0,2.0,19.0,24.0


In [4]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
df = dfs.copy(deep=True)

In [5]:


# Variable-selection thresholds, keyed by metric name. Each inner mapping holds
# the bounds for that metric; the IV entry additionally carries a ranking
# strategy and a cap on the number of variables.
selection_criteria = dict(
    iv=dict(min=0.02, max=0.5, strategy="highest", top=10),
    quality_score=dict(min=0.01),
    gini=dict(min=0.02),
)

# Share of rows used for training; the remainder (30%) is the hold-out test set.
train_test_ratio = 0.7

# Group every run started from this notebook under one MLflow experiment.
mlflow.set_experiment("Credit_Scoring_Model_Optimization")

# Gini coefficient derived from the ROC AUC
def calculate_gini(y_true, y_pred_proba):
    """Return the Gini coefficient, computed as ``2 * AUC - 1``.

    Parameters
    ----------
    y_true
        True labels, forwarded unchanged to ``roc_auc_score``.
    y_pred_proba
        Predicted scores/probabilities, forwarded unchanged to
        ``roc_auc_score``.

    Returns
    -------
    The value ``2 * roc_auc_score(y_true, y_pred_proba) - 1``.
    """
    return 2 * roc_auc_score(y_true, y_pred_proba) - 1

# Cross-validation configuration: 5 shuffled folds with a fixed seed.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold Gini scores collected during cross-validation
gini_scores = []

# Columns named "Unnamed: ..." are row counters (in the loaded data
# "Unnamed: 0" simply runs 0..999), presumably left over from saving the frame
# with its index. They carry no credit information, so keep them out of the
# feature matrix instead of letting them be binned as numeric predictors.
index_artifact_columns = [col for col in df.columns if str(col).startswith("Unnamed")]

X = df.drop(columns=['default'] + index_artifact_columns)  # Exclude target and index artefacts
y = df['default']  # Use the encoded target variable

# Train/test split.
# Pass train_size directly: (1 - 0.7) evaluates to 0.30000000000000004, and
# scikit-learn rounds a fractional test_size up, which on 1000 rows produces a
# 699/301 split instead of the intended 700/300.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_test_ratio, random_state=42
)

# Extract categorical and numerical features by dtype
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Bounds on the number of bins per variable
max_n_bins = 8
min_n_bins = 2

# Initialize the Binning Process
binning_process = BinningProcess(
    variable_names=numerical_features + categorical_features,  # Ensure to include all features
    categorical_variables=categorical_features,
    special_codes=[],
    selection_criteria=selection_criteria,
    min_n_bins=min_n_bins,
    max_n_bins=max_n_bins
)

# Initialize the Logistic Regression estimator
estimator = LogisticRegression(solver="lbfgs", class_weight='balanced')

# Initialize the Scorecard; points are scaled with the "min_max" method
# using the 350-800 bounds given below.
scorecard = Scorecard(
    binning_process=binning_process,
    estimator=estimator,
    scaling_method="min_max",  # You can adjust the scaling method here
    scaling_method_params={"min": 350, "max": 800},
    intercept_based=False,
    reverse_scorecard=False,
    verbose=True
)

# Start an MLflow run: cross-validate, fit the final scorecard, and log
# parameters, metrics and the serialized model.
with mlflow.start_run():

    # Log selection criteria parameters as flat "<criterion>_<key>" entries
    for criterion, params in selection_criteria.items():
        for key, value in params.items():
            mlflow.log_param(f"{criterion}_{key}", value)

    mlflow.log_param("train_test_ratio", train_test_ratio)
    mlflow.log_param("max_n_bins", max_n_bins)
    mlflow.log_param("min_n_bins", min_n_bins)
    mlflow.log_param("n_splits", n_splits)

    # K-Fold Cross-Validation on the training partition only, so the hold-out
    # rows in X_test never contribute to avg_gini_cv and gini_test remains an
    # independent estimate.
    for fold_number, (train_index, test_index) in enumerate(kf.split(X_train), start=1):
        X_fold_train, X_fold_test = X_train.iloc[train_index], X_train.iloc[test_index]
        y_fold_train, y_fold_test = y_train.iloc[train_index], y_train.iloc[test_index]

        # Fit the Scorecard on this fold's training rows
        scorecard.fit(X_fold_train, y_fold_train)

        # Calculate and log Gini for this fold; the step keeps the five values
        # distinguishable in the metric history instead of all sharing one step.
        fold_pred_proba = scorecard.predict_proba(X_fold_test)[:, 1]
        gini_fold = calculate_gini(y_fold_test, fold_pred_proba)
        gini_scores.append(gini_fold)
        mlflow.log_metric("gini_fold", gini_fold, step=fold_number)

    # Log average Gini score across folds
    avg_gini = np.mean(gini_scores)
    mlflow.log_metric("avg_gini_cv", avg_gini)

    # Fit the final model on the entire training data
    scorecard.fit(X_train, y_train)

    # Count unique variables from the scorecard
    unique_variables_count = scorecard.table(style="detailed")["Variable"].unique().size

    # Log the count of unique variables
    mlflow.log_param("unique_variables_count", unique_variables_count)

    # Log the fitted model using MLflow
    mlflow.sklearn.log_model(scorecard, "credit_scoring_model")

    # Calculate and log Gini for train
    train_pred_proba = scorecard.predict_proba(X_train)[:, 1]
    gini_train = calculate_gini(y_train, train_pred_proba)
    mlflow.log_metric("gini_train", gini_train)

    # Calculate and log Gini for test
    test_pred_proba = scorecard.predict_proba(X_test)[:, 1]
    gini_test = calculate_gini(y_test, test_pred_proba)
    mlflow.log_metric("gini_test", gini_test)

    # Generate today's date
    today_date = datetime.today().strftime('%Y-%m-%d')

    # Directory path to save the model
    directory = "Model Registry/models"
    os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist

    # File path with Gini and date
    file_name = f"{directory}/logit_model_{gini_test:.2f}_Gini_{today_date}.pkl"

    # Save the fitted scorecard as a .pkl file
    with open(file_name, 'wb') as file:
        pickle.dump(scorecard, file)  # Ensure you're saving the fitted scorecard

    # Log the .pkl model file as artifact
    mlflow.log_artifact(file_name)

print("Model and metrics logged successfully!")

2024-10-09 16:28:27,973 | INFO : Scorecard building process started.
2024-10-09 16:28:27,974 | INFO : Options: check parameters.
2024-10-09 16:28:27,975 | INFO : Dataset: binary target.
2024-10-09 16:28:27,975 | INFO : Binning process started.
2024-10-09 16:28:28,568 | INFO : Binning process terminated. Time: 0.5925s
2024-10-09 16:28:28,568 | INFO : Fitting estimator.
2024-10-09 16:28:28,573 | INFO : Fitting terminated. Time 0.0051s
2024-10-09 16:28:28,574 | INFO : Scorecard table building started.
2024-10-09 16:28:28,588 | INFO : Scorecard table terminated. Time: 0.0146s
2024-10-09 16:28:28,589 | INFO : Scorecard building process terminated. Time: 0.6152s
2024-10-09 16:28:28,596 | INFO : Scorecard building process started.
2024-10-09 16:28:28,596 | INFO : Options: check parameters.
2024-10-09 16:28:28,597 | INFO : Dataset: binary target.
2024-10-09 16:28:28,597 | INFO : Binning process started.
2024-10-09 16:28:29,183 | INFO : Binning process terminated. Time: 0.5862s
2024-10-09 16:28



In [7]:
# Start the MLflow tracking UI on port 5004; the trailing "&" asks the shell to
# run it as a background job.
mlflow_ui_command = "mlflow ui --port 5004 &"
get_ipython().system_raw(mlflow_ui_command)


[2024-10-09 16:28:48 +0500] [69924] [INFO] Starting gunicorn 20.1.0
[2024-10-09 16:28:48 +0500] [69924] [INFO] Listening at: http://127.0.0.1:5004 (69924)
[2024-10-09 16:28:48 +0500] [69924] [INFO] Using worker: sync
[2024-10-09 16:28:48 +0500] [69925] [INFO] Booting worker with pid: 69925
[2024-10-09 16:28:48 +0500] [69926] [INFO] Booting worker with pid: 69926
[2024-10-09 16:28:48 +0500] [69927] [INFO] Booting worker with pid: 69927
[2024-10-09 16:28:48 +0500] [69928] [INFO] Booting worker with pid: 69928
[2024-10-09 16:57:33 +0500] [69924] [CRITICAL] WORKER TIMEOUT (pid:69925)
[2024-10-09 16:57:33 +0500] [69924] [CRITICAL] WORKER TIMEOUT (pid:69926)
[2024-10-09 16:57:33 +0500] [69924] [CRITICAL] WORKER TIMEOUT (pid:69927)
[2024-10-09 16:57:33 +0500] [69924] [CRITICAL] WORKER TIMEOUT (pid:69928)
[2024-10-09 16:57:33 +0500] [69925] [INFO] Worker exiting (pid: 69925)
[2024-10-09 16:57:33 +0500] [69927] [INFO] Worker exiting (pid: 69927)
[2024-10-09 16:57:33 +0500] [69926] [INFO] Worker