<h1 align='center' style='color:blue'>Credit Risk Modeling Project</h1>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import seaborn as sns

pd.set_option('display.float_format',lambda x : "{:.2f}".format(x))
np.set_printoptions(suppress=True)
import warnings
warnings.filterwarnings('ignore')

### Load Data

In [None]:
df_customers = pd.read_csv("dataset/customers.csv")
df_loans = pd.read_csv("dataset/loans.csv")
df_bureau = pd.read_csv("dataset/bureau_data.csv")

In [None]:
df_customers.shape, df_loans.shape, df_bureau.shape

In [None]:
df_customers.head()

In [None]:
df_loans.head()

In [None]:
df_bureau.head()

In [None]:
df= pd.merge(df_customers,df_loans,on='cust_id')
df.head()

In [None]:
df = pd.merge(df, df_bureau, on='cust_id')
df.head()

In [None]:
df.info()

In [None]:
df['default']=df['default'].astype(int)
df['default'].value_counts()

`default` is the target variable, and we can see some class imbalance here. We will address it later on.

### Train Test split

We perform train test split before doing EDA to prevent data leakage. We don't want test set (which is part of the entire dataset) to influence EDA or any decisions on feature engineering. 

In [None]:
# Separate the target from the predictors.
y = df['default']
X = df.drop(columns='default')

# Hold out 25% of the rows, stratified on the target so both splits keep the
# same class proportions; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Re-attach the target so cleaning and EDA can work on one frame per split.
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

df_train.head(3)


<h3 align="center" style="color:blue">Data Cleaning</h3>

### Handle Missing & Duplicate Values

In [None]:
df_train.shape

In [None]:
df_train.isna().sum()

In [None]:
df_train['residence_type'].unique()

In [None]:
mode_residence = df_train['residence_type'].mode()[0]
mode_residence

In [None]:
# Impute missing residence_type in both splits with the mode learned on the
# training split only.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated in pandas 2.x and does
# not modify the parent frame under Copy-on-Write, which would silently leave
# the NaNs in place.
df_train['residence_type'] = df_train['residence_type'].fillna(mode_residence)
df_test['residence_type'] = df_test['residence_type'].fillna(mode_residence)
df_train['residence_type'].unique()

In [None]:
df_train.duplicated().sum()

In [None]:
df_train.describe()

In [None]:
df_train.columns

In [None]:
# Numeric columns that are plotted one-per-panel in the box plot, histogram
# and KDE grids further down.
columns_continuous = ['age', 'income', 'number_of_dependants', 'years_at_current_address', 
                      'sanction_amount', 'loan_amount', 'processing_fee', 'gst', 'net_disbursement', 
                      'loan_tenure_months','principal_outstanding', 'bank_balance_at_application',
                      'number_of_open_accounts','number_of_closed_accounts', 'total_loan_months', 'delinquent_months',
                       'total_dpd', 'enquiry_count', 'credit_utilization_ratio']

# Columns treated as discrete labels; their unique values are printed later.
# Note that this list also contains the target column 'default'.
columns_categorical = ['gender', 'marital_status', 'employment_status', 'residence_type', 'city', 
                       'state', 'zipcode', 'loan_purpose', 'loan_type', 'default']

In [None]:
# Box plot of every continuous column on a 4-wide grid to eyeball outliers.
num_plots = len(columns_continuous)
num_col = 4
num_rows = -(-num_plots // num_col)  # ceiling division

fig, axes = plt.subplots(nrows=num_rows, ncols=num_col, figsize=(5 * num_col, 5 * num_rows))
axes = axes.flatten()

for ax, col in zip(axes, columns_continuous):
    sns.boxplot(x=df_train[col], ax=ax)
    ax.set_title(col)

# Hide the grid cells left over after the last column.
for ax in axes[num_plots:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Histogram of every continuous column on a 4-wide grid to inspect the shape
# of each distribution.
num_plots = len(columns_continuous)
num_col = 4
num_rows = -(-num_plots // num_col)  # ceiling division

fig, axes = plt.subplots(nrows=num_rows, ncols=num_col, figsize=(5 * num_col, 5 * num_rows))
axes = axes.flatten()

for ax, col in zip(axes, columns_continuous):
    sns.histplot(x=df_train[col], ax=ax)
    ax.set_title(col)

# Hide the grid cells left over after the last column.
for ax in axes[num_plots:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

### Outlier Removal: Processing Fee

In [None]:
df_train['processing_fee'].describe()

In [None]:
df_train['processing_fee'].max()

In [None]:
df_train[df_train['processing_fee']==df_train['processing_fee'].max()][['loan_amount','processing_fee']]

In [None]:
df_train[df_train['processing_fee']>df_train['loan_amount']][['loan_amount','processing_fee']]

We can see that there are five entries where the processing fee is greater than the loan amount, which is not possible in practice. The processing fee should be around 3% of the loan amount (the exact rate varies from bank to bank), so these rows are outliers.

In [None]:
df_train[df_train['processing_fee']/df_train['loan_amount']>0.03][['loan_amount','processing_fee']]

We can see that these five rows are outliers. Since they are few in number, we can simply drop them from the data.

In [None]:
# Keep only rows whose processing fee is below 3% of the loan amount.
# NOTE(review): the outlier check uses `> 0.03` while this filter keeps
# `< 0.03`, so a row at exactly 0.03 (or with a NaN ratio) is dropped too.
# Confirm that is intended.
train_fee_ratio = df_train['processing_fee'] / df_train['loan_amount']
df_train_1 = df_train.loc[train_fee_ratio < 0.03].copy()
df_train_1.shape

In [None]:
df_train_1['residence_type'].isna().sum()

In [None]:
# Apply the same processing-fee rule (fee below 3% of the loan amount) to the
# test split.
test_fee_ratio = df_test['processing_fee'] / df_test['loan_amount']
df_test = df_test.loc[test_fee_ratio < 0.03].copy()
df_test.shape

In [None]:
# Use other business rules for data validation
# Rule 1: GST should not be more than 20%

df_train_1[(df_train_1['gst']/df_train_1['loan_amount'])>0.2].shape

In [None]:
# Rule 2: Net disbursement should not be higher than loan_amount
df_train_1[df_train_1['net_disbursement']>df_train_1['loan_amount']].shape

No rows found that breaks rule 1 and rule 2

### Analyze Categorical Columns

In [None]:
columns_categorical

In [None]:
for col in columns_categorical:
    print(col ,"-->" ,df_train_1[col].unique())

### Fix Errors in Loan Purpose Column

In [None]:
df_train_1['loan_purpose'] = df_train_1['loan_purpose'].replace('Personaal','Personal')
df_train_1['loan_purpose'].unique()

In [None]:
df_test['loan_purpose'] = df_test['loan_purpose'].replace('Personaal','Personal')
df_test['loan_purpose'].unique()

<h3 style="color:blue" align="center">Exploratory Data Analysis</h3>

In [None]:
columns_continuous

### Age Column

In [None]:
df_train_1['age'].describe()

In [None]:
df_train_1.groupby('default')['age'].describe()

**Insights**

1. The average age in the default group (37.12) is a little lower than the average age (39.7) of the group that did not default
1. Variability (standard deviation) is mostly similar in both the groups
1. Both the groups have similar min and max ages

In [None]:
df_train_1['age'][df_train_1['default']==1]

In [None]:
# Overlay the age density of each target class on one axes.
for target_value in (0, 1):
    age_for_class = df_train_1.loc[df_train_1['default'] == target_value, 'age']
    sns.kdeplot(age_for_class, fill=True, label=f'default={target_value}')
plt.title("Age KDE Plot with Hue by default")
plt.legend()
plt.show()

In [None]:
# KDE of every continuous column split by target class, to spot features whose
# distribution shifts between defaulters and non-defaulters.
kde_cols = 4
kde_rows = -(-len(columns_continuous) // kde_cols)  # ceiling division, no hard-coded grid

fig, axes = plt.subplots(kde_rows, kde_cols, figsize=(24, 20))  # width, height in inches
axes = axes.flatten()

for ax, col in zip(axes, columns_continuous):
    sns.kdeplot(df_train_1.loc[df_train_1['default'] == 0, col], fill=True, label='default=0', ax=ax)
    sns.kdeplot(df_train_1.loc[df_train_1['default'] == 1, col], fill=True, label='default=1', ax=ax)
    ax.set_title(col)
    ax.set_xlabel('')
    # Legend on every panel: a single plt.legend() after the loop only
    # labels the last subplot.
    ax.legend()

# Hide the grid cells left over after the last column.
for ax in axes[len(columns_continuous):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

**Insights**

1. In the columns loan_tenure_months, delinquent_months, total_dpd and credit_utilization_ratio, higher values indicate a higher likelihood of default. Hence these 4 look like strong predictors
1. In remaining columns the distributions do not give any obvious insights
1. Why loan_amount and income did not give any signs of being strong predictors? May be when we combine these two and get loan to income ratio (LTI), that may have influence on the target variable. We will explore more later

<h3 style="color:blue" align="center">Feature Engineering, Feature Selection</h3>

#### Generate Loan to Income (LTI) Ratio

In [None]:
df_train_1[['loan_amount','income']].head(3)

In [None]:
df_train_1['loan_to_income'] = round(df_train_1['loan_amount']/df_train_1['income'],2)
df_train_1['loan_to_income'].describe()

In [None]:
df_test['loan_to_income'] = round(df_test['loan_amount']/df_test['income'],2)

In [None]:
sns.kdeplot(df_train_1['loan_to_income'][df_train_1['default']==0],fill=True,label='default=0')
sns.kdeplot(df_train_1['loan_to_income'][df_train_1['default']==1],fill=True,label='default=1')
plt.title(f"Loan to Income Ratio (LTI) KDE Plot with Hue by default")
plt.legend()
plt.show()

**Insights**
1. Blue graph has majority of its values on lower side of LTI
2. Orange graph has many values when LTI is higher indicating that higher LTI means high risk loan

#### Generate Delinquency Ratio

In [None]:
# Percentage of loan months that were delinquent, rounded to 2 decimals,
# computed the same way for both splits.
for frame in (df_train_1, df_test):
    frame['delinquency_ratio'] = (
        frame['delinquent_months'] * 100 / frame['total_loan_months']
    ).round(2)

In [None]:
# Overlay the delinquency-ratio density of each target class.
sns.kdeplot(df_train_1['delinquency_ratio'][df_train_1['default']==0],fill=True,label='default=0')
sns.kdeplot(df_train_1['delinquency_ratio'][df_train_1['default']==1],fill=True,label='default=1')
# Title previously read "elinquency Ratio ..." (missing leading "D").
plt.title("Delinquency Ratio KDE Plot with Hue by default")
plt.legend()
plt.show()

**Insights**
1. Blue graph has majority of its values on lower side of delinquency ratio
2. Orange graph has many values when delinquency ratio is higher indicating some correlation on default

#### Generate Avg DPD Per Delinquency

In [None]:
# Average days-past-due per delinquent month for both splits. Rows with zero
# delinquent months get 0 instead of the result of dividing by zero.
for frame in (df_train_1, df_test):
    has_delinquency = frame['delinquent_months'] != 0
    dpd_per_delinquent_month = (frame['total_dpd'] / frame['delinquent_months']).round(2)
    frame['avg_dpd_per_delinquency'] = np.where(has_delinquency, dpd_per_delinquent_month, 0)


In [None]:
plt.figure(figsize=(8, 4))
sns.kdeplot(df_train_1['avg_dpd_per_delinquency'][df_train_1['default'] == 0], fill=True, label='default=0')
sns.kdeplot(df_train_1['avg_dpd_per_delinquency'][df_train_1['default'] == 1], fill=True, label='default=1')
plt.title(f"Avg DPD Per Delinquency Ratio KDE Plot with Hue by default")
plt.legend()
plt.show()

**Insights**

1. Graph clearly shows more occurrences of default cases when avg_dpd_per_delinquency is high. This means this column is a strong predictor

### Remove columns that are just unique ids and don't have influence on target

In [None]:
df_train_1.columns

In [None]:
df_train_2 = df_train_1.drop(['cust_id','loan_id'] ,axis=1)
df_test = df_test.drop(['cust_id','loan_id'],axis=1)

### Remove columns that business contact person asked us to remove

In [None]:
# Columns the business contact asked us to exclude; the same set is removed
# from both the training and the test frame.
business_excluded_cols = [
    'disbursal_date',
    'delinquent_months',
    'total_dpd',
    'installment_start_dt',
    'income',
    'loan_amount',
    'total_loan_months',
]
df_train_3 = df_train_2.drop(columns=business_excluded_cols)
df_test = df_test.drop(columns=business_excluded_cols)
df_train_3.columns

In [None]:
df_train_3.select_dtypes(['int64','float64']).columns

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Split the cleaned training frame into target and predictors.
y_train = df_train_3['default']
X_train = df_train_3.drop(columns=['default'])

# Every int64/float64 predictor except zipcode is min-max scaled.
cols_to_scale = X_train.select_dtypes(['int64', 'float64']).columns.drop('zipcode')

# Fit the scaler on the training split only and overwrite those columns.
scaler = MinMaxScaler()
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])

X_train.describe()



In [None]:
X_test = df_test.drop(['default'],axis=1)
y_test = df_test['default']


X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

X_test.describe()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(data):
    """Return a two-column frame with the variance inflation factor of each column.

    Args:
        data: DataFrame whose columns are the regressors to evaluate.

    Returns:
        DataFrame with 'Column' (the column name) and 'VIF' (its variance
        inflation factor against all other columns of ``data``).
    """
    design_matrix = data.values
    vif_per_column = [
        variance_inflation_factor(design_matrix, position)
        for position in range(data.shape[1])
    ]
    return pd.DataFrame({'Column': data.columns, 'VIF': vif_per_column})

In [None]:
X_train.head()

In [None]:
calculate_vif(X_train[cols_to_scale])

In [None]:
features_to_drop_vif = ['sanction_amount', 'processing_fee', 'gst', 'net_disbursement','principal_outstanding']

X_train_1 = X_train.drop(features_to_drop_vif,axis=1)
numeric_columns = X_train_1.select_dtypes(['int64', 'float64']).columns.drop('zipcode')
numeric_columns

In [None]:
vif_df = calculate_vif(X_train_1[numeric_columns])
vif_df

In [None]:
selected_numeric_features_vif = vif_df.Column.values
selected_numeric_features_vif

In [None]:
numeric_columns

In [None]:
df_train_3[numeric_columns]

In [None]:
df_train_3[numeric_columns.append(pd.Index(['default']))]

In [None]:
plt.figure(figsize=(12,12))
cm = df_train_3[numeric_columns.append(pd.Index(['default']))].corr()
sns.heatmap(cm, fmt="0.2f",annot=True)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

### Feature Selection: Categorical Features

In [None]:
columns_categorical

In [None]:
X_train_1.head()

In [None]:
pd.concat([X_train_1, y_train],axis=1).groupby('loan_purpose')['default'].agg(['count','sum'])

### Calculate WOE and IV

In [None]:
def calculate_woe_iv(df, feature,target):
    """Compute Weight of Evidence per category and the total Information Value.

    Args:
        df: DataFrame holding both ``feature`` and ``target``.
        feature: Name of the column to group by (categorical or already binned).
        target: Name of a 0/1 column where 1 marks the event (a default).

    Returns:
        Tuple ``(grouped, total_iv)``. ``grouped`` has one row per category
        with the columns total, good, bad, good_pct, bad_pct, woe and iv;
        ``total_iv`` is the sum of the per-category iv values.
    """
    grouped = df.groupby(feature)[target].agg(['count','sum'])
    # The sum of a 0/1 target counts the events (defaults), i.e. the "bad"
    # customers; everything else is "good". The previous labelling had these
    # two swapped, which inverted the sign of WOE (total IV is symmetric and
    # is unchanged by the fix).
    grouped = grouped.rename(columns= {'count': 'total', 'sum': 'bad'})
    grouped['good'] = grouped['total'] - grouped['bad']
    # Keep the column order callers already see: total, good, bad.
    grouped = grouped[['total', 'good', 'bad']]

    total_good = grouped['good'].sum()
    total_bad = grouped['bad'].sum()

    grouped['good_pct'] = grouped['good']/total_good
    grouped['bad_pct'] = grouped['bad']/total_bad
    grouped['woe'] = np.log(grouped['good_pct']/grouped['bad_pct'])
    grouped['iv'] = grouped['woe']*(grouped['good_pct'] - grouped['bad_pct'])

    # A category with no goods or no bads yields +/-inf; treat it as carrying
    # no evidence rather than letting it dominate the total.
    grouped['woe'] = grouped['woe'].replace([np.inf, -np.inf],0)
    grouped['iv'] =  grouped['iv'].replace([np.inf, -np.inf],0)

    total_iv = grouped['iv'].sum()

    return grouped, total_iv
    

In [None]:
grouped, total_iv = calculate_woe_iv(pd.concat([X_train_1, y_train],axis=1),'loan_purpose','default')
grouped

In [None]:
X_train_1.info()

In [None]:
pd.cut(X_train_1['loan_to_income'],bins=10)

In [None]:
# Information Value of every candidate feature. Object-typed columns are
# grouped as they are; every other column is first cut into 10 equal-width
# bins (integer bin labels).
train_with_target = pd.concat([X_train_1, y_train], axis=1)

iv_values = {}
for feature in X_train_1.columns:
    if X_train_1[feature].dtype == 'object':
        woe_input = train_with_target
    else:
        X_binned = pd.cut(X_train_1[feature], bins=10, labels=False)
        woe_input = pd.concat([X_binned, y_train], axis=1)

    _, iv = calculate_woe_iv(woe_input, feature, 'default')
    iv_values[feature] = iv

iv_values


        

In [None]:
iv_df = pd.DataFrame(list(iv_values.items()),columns=['Feature','IV'])
iv_df = iv_df.sort_values(by='IV',ascending=False)
iv_df

In [None]:
iv_values.items()

In [None]:
selected_features_iv = [feature for feature, iv, in iv_values.items() if iv >0.02]
selected_features_iv

### Feature Encoding

In [None]:
X_train_reduced = X_train_1[selected_features_iv]
X_test_reduced = X_test[selected_features_iv]

In [None]:
X_train_encoded = pd.get_dummies(X_train_reduced, drop_first=True,dtype='int')
X_train_encoded.head(3)

In [None]:
# One-hot encode the test split and align it to the training design matrix.
# Encoding the two splits independently can produce different columns (a
# category missing from one split, or a different "first" level being dropped
# by drop_first). Encoding every level and then reindexing to the training
# columns guarantees the same columns in the same order; a dummy that never
# occurs in the test split is filled with 0.
X_test_encoded = (
    pd.get_dummies(X_test_reduced, dtype='int')
    .reindex(columns=X_train_encoded.columns, fill_value=0)
)
X_test_encoded.head(3)

<h3 align="center" style="color:blue">Model Training</h3>

### Attempt 1

1. Logistic Regression, RandomForest & XGB
1. No handling of class imbalance

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Baseline: default-parameter logistic regression, no class-imbalance handling.
model = LogisticRegression().fit(X_train_encoded, y_train)
y_pred = model.predict(X_test_encoded)

report = classification_report(y_test, y_pred)
print(report)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest, no class-imbalance handling. The forest is
# randomised (bootstrap samples, feature sub-sampling), so pin random_state to
# make the printed report reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_encoded, y_train)

y_pred = model.predict(X_test_encoded)

report = classification_report(y_test, y_pred)
print(report)

In [None]:
from xgboost import XGBClassifier

# Baseline: default-parameter XGBoost classifier, no class-imbalance handling.
model = XGBClassifier().fit(X_train_encoded, y_train)
y_pred = model.predict(X_test_encoded)

report = classification_report(y_test, y_pred)
print(report)

Since there is not much difference between XGB and Logistic Regression, we will choose LogisticRegression as the candidate for RandomizedSearchCV because it is easier to interpret.

#### RandomizedSearch CV for Attempt 1: Logistic Regression

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space: 20 log-spaced regularisation strengths x 4 solvers.
param_dist = {
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs', 'saga', 'liblinear', 'newton-cg']
    
}

# Create the Logistic Regression model
log_reg = LogisticRegression(max_iter=10000)  # Increased max_iter for convergence

# 3-fold randomised search scored on F1 of the positive (default) class.
random_search = RandomizedSearchCV(
    estimator = log_reg,
    param_distributions = param_dist,
    cv= 3,
    verbose=2,
    scoring='f1',
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train_encoded,y_train)

print(f"Best Parameters : {random_search.best_params_}")
# Label previously read "Nest score".
print(f"Best score : {random_search.best_score_} ")

# Evaluate the refit best estimator on the held-out test split.
best_model = random_search.best_estimator_
y_pred  = best_model.predict(X_test_encoded)
report = classification_report(y_test, y_pred)
print(report)

#### RandomizedSearch CV for Attempt 1: XGBoost

In [None]:


# Discrete search space for the XGBoost hyper-parameters.
param_dist = {
    'n_estimators' : [100, 150, 200, 250, 300],
    'max_depth' : [3, 4, 5, 6, 7, 8, 9, 10 ],
    'learning_rate' : [0.01, 0.03, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    'subsample' : [0.6, 0.7, 0.8, 0.9, 1.0 ],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'scale_pos_weight': [1, 2, 3, 5, 7, 10],
    'reg_alpha': [0.01, 0.1, 0.5, 1.0, 5.0, 10.0],  # L1 regularization term
    'reg_lambda': [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]  # L2 regularization term
}

# Create the XGB Classifier  model
xgb = XGBClassifier()

# 100 sampled configurations, 3-fold CV, scored on F1 of the positive class.
random_search = RandomizedSearchCV(
    estimator = xgb,
    param_distributions = param_dist,
    n_iter=100,
    cv= 3,
    verbose=1,
    scoring='f1',
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train_encoded,y_train)

print(f"Best Parameters : {random_search.best_params_}")
# Label previously read "Nest score".
print(f"Best score : {random_search.best_score_} ")

# Evaluate the refit best estimator on the held-out test split.
best_model = random_search.best_estimator_
y_pred  = best_model.predict(X_test_encoded)
report = classification_report(y_test, y_pred)
print(report)

### Attempt 2

1. Logistic Regression & XGB
1. Handle Class Imbalance Using Under Sampling

In [None]:
y_train.value_counts()

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by randomly discarding majority-class rows. The rows
# kept are chosen at random, so pin random_state; otherwise every run trains
# the following models on a different subset.
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus =  rus.fit_resample(X_train_encoded, y_train)
y_train_rus.value_counts()

In [None]:
model = LogisticRegression()
model.fit(X_train_rus, y_train_rus)

y_pred = model.predict(X_test_encoded)

report = classification_report(y_test, y_pred)
print(report)

In [None]:
model = XGBClassifier(**random_search.best_params_)
model.fit(X_train_rus, y_train_rus)

y_pred = model.predict(X_test_encoded)

report = classification_report(y_test, y_pred)
print(report)

### Attempt 3

1. Logistic Regression
1. Handle Class Imbalance Using SMOTE Tomek
1. Parameter tuning using Optuna

In [None]:
from imblearn.combine import SMOTETomek

sm = SMOTETomek(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train_encoded, y_train)
y_train_sm.value_counts()

In [None]:
model = LogisticRegression()
model.fit(X_train_sm, y_train_sm)

y_pred = model.predict(X_test_encoded)

report = classification_report(y_test, y_pred)
print(report)

In [None]:
import optuna
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import cross_val_score

In [None]:
def objective(trials):
    """Optuna objective: mean 3-fold macro-F1 of a LogisticRegression.

    Args:
        trials: The Optuna trial used to sample C, solver, tol and class_weight.

    Returns:
        Mean macro-averaged F1 over 3 cross-validation folds of the
        SMOTE-Tomek resampled training data (X_train_sm, y_train_sm).
    """
    params={
    'C' : trials.suggest_float("C", 1e-4, 1e4, log=True),
    'solver' : trials.suggest_categorical('solver',["newton-cg", "lbfgs", "liblinear", "sag", "saga"]),
    'tol': trials.suggest_float('tol', 1e-6, 1e-1, log=True),  # Logarithmically spaced values for tolerance
    'class_weight': trials.suggest_categorical('class_weight', [None, 'balanced'])  # Class weights      
    }
    # random_state makes the stochastic solvers (sag, saga, liblinear)
    # repeatable, so a trial's score depends only on its parameters.
    model = LogisticRegression(**params, max_iter=10000, random_state=42)
    f1_scorer = make_scorer(f1_score, average= 'macro')
    # NOTE(review): the folds are cut from already-resampled data, so synthetic
    # points derived from a validation row can sit in the training folds and the
    # CV score is likely optimistic. Consider resampling inside each fold.
    scores  = cross_val_score(model,X_train_sm,y_train_sm, cv= 3,scoring= f1_scorer,n_jobs=-1)
    return np.mean(scores)

# Seed the sampler so the sequence of suggested parameters, and therefore the
# selected best trial, is the same on every Restart & Run All.
study_logistic = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_logistic.optimize(objective, n_trials=50)

In [None]:
# Report the best logistic-regression trial found by Optuna.
print('Best_trial: ')
trial = study_logistic.best_trial
print('  F1-score: {}'.format(trial.value))
print("prams")
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

# Refit with the tuned parameters. max_iter must match the value used inside
# the objective (10000): best_params only holds the sampled parameters, so
# without it the final model falls back to the default iteration cap and may
# stop before converging, i.e. it would not be the model that was tuned.
best_model_logistic = LogisticRegression(**study_logistic.best_params, max_iter=10000)
best_model_logistic.fit(X_train_sm,y_train_sm)

y_pred = best_model_logistic.predict(X_test_encoded)

report = classification_report(y_test, y_pred)
print(report)


In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold macro-F1 of an XGBClassifier.

    Args:
        trial: The Optuna trial used to sample the tunable parameters below.

    Returns:
        Mean macro-averaged F1 over 3 cross-validation folds of the
        SMOTE-Tomek resampled training data (X_train_sm, y_train_sm).
    """
    params={
        # Fixed settings (not tuned).
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'verbosity': 0,
        'booster': 'gbtree',
        # Tuned settings.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10)    
    }
    model = XGBClassifier(**params)
    f1_scorer = make_scorer(f1_score, average= 'macro')
    scores  = cross_val_score(model,X_train_sm,y_train_sm, cv= 3,scoring= f1_scorer,n_jobs=-1)
    return np.mean(scores)

# Seed the sampler so the sequence of suggested parameters, and therefore the
# selected best trial, is the same on every Restart & Run All.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective, n_trials=50)

In [None]:
# Report the best XGBoost trial found by Optuna.
trial = study_xgb.best_trial
print('Best_trial: ')
print(f'  F1-score: {trial.value}')
print("prams")
for param_name, param_value in trial.params.items():
    print(f'    {param_name}: {param_value}')

# Refit on the resampled training data with the tuned parameters and score the
# held-out test split.
best_params = study_xgb.best_params
best_model_xgb = XGBClassifier(**best_params)
best_model_xgb.fit(X_train_sm, y_train_sm)

y_pred = best_model_xgb.predict(X_test_encoded)

report = classification_report(y_test, y_pred)
print(report)


## Model Evaluation : ROC/AUC

In [None]:
y_pred = best_model_logistic.predict(X_test_encoded)
report = classification_report(y_test,y_pred)
print(report)

In [None]:
from sklearn.metrics import roc_curve

probabilities = best_model_logistic.predict_proba(X_test_encoded)[:,1]

fpr, tpr, threshold = roc_curve(y_test, probabilities)
fpr[:5],tpr[:5],threshold[:5]

    

In [None]:
from sklearn.metrics import auc

area = auc(fpr, tpr)
print(area)

In [None]:
plt.figure()
plt.plot(fpr, tpr, color = "darkorange", lw= 2, label = "ROC curve (area = %0.2f ) "%area)
plt.plot([0, 1], [0, 1], color = 'navy', lw= 2, linestyle = '--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
df_eval = pd.DataFrame({
    'Default Truth': y_test,
    'Default Probability': probabilities
})
df_eval.head()

In [None]:
df_eval['Decile'] = pd.qcut(df_eval['Default Probability'],10,labels=False,duplicates='drop')
df_eval.head()

In [None]:
df_eval.groupby('Decile')['Default Probability'].min()

In [None]:
# One row per decile: probability range plus counts of events (defaults) and
# non-events. 'Decile' comes back as a regular column via reset_index().
df_decile = (
    df_eval
    .groupby('Decile')
    .apply(lambda grp: pd.Series({
        'Minimum Probability': grp['Default Probability'].min(),
        'Maximum Probability': grp['Default Probability'].max(),
        'Events': grp['Default Truth'].sum(),
        'Non Events': grp['Default Truth'].count() - grp['Default Truth'].sum()
    }))
    .reset_index()
)
df_decile

In [None]:
df_decile['Events'].sum()

In [None]:
df_decile['Event Rate'] = (df_decile['Events']*100)/(df_decile['Events'] + df_decile['Non Events'])
df_decile['Non Event Rate'] = (df_decile['Non Events']*100)/(df_decile['Events'] + df_decile['Non Events'])
df_decile

In [None]:
df_decile = df_decile.sort_values(by='Decile',ascending=False).reset_index(drop=True)
df_decile

In [None]:
df_decile['Cum Events'] = df_decile['Events'].cumsum()
df_decile['Cum Non Events'] = df_decile['Non Events'].cumsum()
df_decile

In [None]:
df_decile['Cum Event Rate'] = (df_decile['Cum Events']*100)/df_decile['Events'].sum()
df_decile['Cum Non Event Rate'] = (df_decile['Cum Non Events']*100)/df_decile['Non Events'].sum()
df_decile

In [None]:
df_decile['KS'] = df_decile['Cum Event Rate'] - df_decile['Cum Non Event Rate']
df_decile

To assess whether rank ordering is followed, we should look at whether higher deciles (those with higher predicted probabilities) have higher event rates compared to lower deciles. Rank ordering means that as you move from the top decile to the bottom decile, the event rate should generally decrease.

Non-Events - termed as good (customers) who do not default. Events - termed as bad (customers) who default.

Events and Non-Events terms are interchanged based on usecase to usecase.

eg: for marketing usecase, which customer to reach out (who will take loan based on offers) will be events - here it will termed as good, and the customers who will not take loans will be non-events (bad).

**Insights from the Decile Table**

1. Top Deciles

* The first decile (Decile 9) has a high event rate of 71.84% and a non-event rate of 28.16%. This indicates that the model is highly confident in predicting events in this decile.
* The second decile (Decile 8) also shows a significant event rate of 12.96%, with a cumulative event rate reaching 98.6%.

2. Middle Deciles:

* Deciles 7 and 6 show a significant drop in event rates

3. Lower Deciles:

* Deciles 5 to 0 show zero events, with all predictions being non-events. These deciles collectively have a non-event rate of 100%.

4. KS Statistic:

* The KS statistic, which is the maximum difference between cumulative event rates and cumulative non-event rates, is highest at Decile 8 with a value of 86.09%. This suggests that the model performs best at distinguishing between events and non-events up to this decile.

* The KS value gradually decreases in the following deciles, indicating a decrease in model performance for distinguishing between events and non-events.

**KS Value**

The highest KS value is 85.98%, found at Decile 8. This indicates that the model's performance in distinguishing between events and non-events is most significant at this decile. (If KS is in top 3 decile and score above 40, it is considered a good predictive model.)

In [None]:
gini_coefficient = 2 * area -1
print("AUC:",area)
print("Gini Coefficient:",gini_coefficient)

AUC of 0.98: The model is very good at distinguishing between events and non-events.

Gini coefficient of 0.96: This further confirms that the model is highly effective in its predictions, with almost perfect rank ordering capability.

The Gini coefficient ranges from -1 to 1, where a value closer to 1 signifies a perfect model, 0 indicates a model with no discriminative power, and -1 signifies a perfectly incorrect model.

#### Finalize The Model and Visualize Feature Importance

In [None]:
final_model = best_model_logistic

# coef_[0] holds one coefficient per encoded feature.
feature_importance = final_model.coef_[0]

# Sort ascending so the horizontal bars run from most negative to most positive.
coef_df = (
    pd.DataFrame({'Coefficients': feature_importance}, index=X_train_encoded.columns)
    .sort_values(by='Coefficients')
)

plt.figure(figsize=(8, 4))
plt.barh(coef_df.index, coef_df['Coefficients'])
plt.xlabel('Coefficient Value')
plt.title('Feature Importance in Logistic Regression')
plt.show()

### Save the Model

In [None]:
X_test_encoded.head(2)

In [None]:
X_test_encoded.columns

In [None]:
cols_to_scale

In [None]:
X_train_encoded.columns

In [None]:
from joblib import dump

# Bundle the fitted model with the encoded feature columns, the fitted scaler
# and the list of columns that scaler was fitted on, then write it to disk.
model_data = dict(
    model=final_model,
    features=X_train_encoded.columns,
    scaler=scaler,
    cols_to_scale=cols_to_scale,
)
dump(model_data, '../artifacts/model_data.joblib')

In [None]:
final_model.coef_, final_model.intercept_