In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

In [7]:
df = pd.read_csv('loan_clean.csv')
df.head()

Unnamed: 0,loan_amnt,int_rate,term,dti,annual_inc,delinq_2yrs,open_acc,grade,home_ownership,collections_12_mths_ex_med,revol_bal,total_acc,loan_status
0,2400.0,8.9,1,19.14,127000.0,2.0,16.0,1,2,0.0,34227.0,47.0,Normal
1,16000.0,6.99,1,6.36,80000.0,0.0,7.0,1,1,0.0,19716.0,8.0,Normal
2,15000.0,13.11,1,11.09,106000.0,0.0,12.0,2,1,0.0,7715.0,18.0,Normal
3,10500.0,12.12,1,18.62,115000.0,3.0,10.0,2,2,0.0,2662.0,47.0,Normal
4,12000.0,15.61,1,18.37,65000.0,0.0,7.0,4,2,0.0,4930.0,21.0,Normal


In [8]:
def labesencoder(text):
    if text == "Normal":
        return 0
    elif text in ['Default', 'Delinquent', 'Not Compliant']:
        return 1
    else:
        return 2
    
df['loan_status'] = df['loan_status'].apply(labesencoder)

In [9]:
df['loan_status'].value_counts()

loan_status
0    33301
1     9599
Name: count, dtype: int64

In [10]:
X = df.drop('grade', axis=1)
y = df['grade']

X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)

In [33]:
from sklearn.ensemble import RandomForestClassifier

rb = RandomForestClassifier(random_state=42)

rb.fit(X_train, y_train)

y_pred = rb.predict(X_test)

y_prob = rb.predict_proba(X_test)


auc_score = roc_auc_score(y_test, y_prob, multi_class='ovo')

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("AUC-ROC Score:", auc_score)


Classification Report:
              precision    recall  f1-score   support

           1       0.98      0.98      0.98      1750
           2       0.92      0.93      0.92      3080
           3       0.81      0.87      0.84      2739
           4       0.77      0.77      0.77      1848
           5       0.83      0.69      0.75       880
           6       0.89      0.80      0.84       323
           7       0.94      0.64      0.76       105

    accuracy                           0.87     10725
   macro avg       0.88      0.81      0.84     10725
weighted avg       0.87      0.87      0.87     10725

Confusion Matrix:
[[1722   28    0    0    0    0    0]
 [  41 2854  185    0    0    0    0]
 [   0  198 2374  167    0    0    0]
 [   1   11  334 1430   72    0    0]
 [   2    0   21  238  606   13    0]
 [   0    0    5   13   42  259    4]
 [   0    0    0    7   12   19   67]]
AUC-ROC Score: 0.9681967624058989


In [31]:
auc_score = roc_auc_score(y_test, y_prob, multi_class='ovo')

In [32]:
print("AUC-ROC Score:", auc_score)

AUC-ROC Score: 0.9681967624058989


In [43]:
import zipfile

with zipfile.ZipFile("credit-risk-dataset.zip") as myzip:
    data = myzip.open("loan/loan.csv")

df1 = pd.read_csv(data)
df1.head()

  df1 = pd.read_csv(data)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077501,1296599,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,
1,1077430,1314167,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,
2,1077175,1313524,2400.0,2400.0,2400.0,36 months,15.96,84.33,C,C5,...,,,,,,,,,,
3,1076863,1277178,10000.0,10000.0,10000.0,36 months,13.49,339.31,C,C1,...,,,,,,,,,,
4,1075358,1311748,3000.0,3000.0,3000.0,60 months,12.69,67.79,B,B5,...,,,,,,,,,,


In [44]:
df_sample = df1.sample(frac=0.15)
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 133107 entries, 238432 to 394667
Data columns (total 74 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           133107 non-null  int64  
 1   member_id                    133107 non-null  int64  
 2   loan_amnt                    133107 non-null  float64
 3   funded_amnt                  133107 non-null  float64
 4   funded_amnt_inv              133107 non-null  float64
 5   term                         133107 non-null  object 
 6   int_rate                     133107 non-null  float64
 7   installment                  133107 non-null  float64
 8   grade                        133107 non-null  object 
 9   sub_grade                    133107 non-null  object 
 10  emp_title                    125437 non-null  object 
 11  emp_length                   126416 non-null  object 
 12  home_ownership               133107 non-null  object 
 13 

In [45]:
columns = ['loan_amnt', 'int_rate', 'term', 'dti', 'annual_inc', 'delinq_2yrs', 'open_acc',
           'grade', 'home_ownership', 'collections_12_mths_ex_med', 'revol_bal', 'total_acc',
           'loan_status']

df_clean = df_sample[columns]
df_clean.head()

Unnamed: 0,loan_amnt,int_rate,term,dti,annual_inc,delinq_2yrs,open_acc,grade,home_ownership,collections_12_mths_ex_med,revol_bal,total_acc,loan_status
238432,5000.0,11.99,36 months,13.88,58000.0,0.0,4.0,B,RENT,1.0,3548.0,10.0,Current
35121,12000.0,8.94,36 months,18.99,57000.0,0.0,13.0,A,RENT,0.0,25577.0,26.0,Fully Paid
613461,3600.0,16.55,36 months,36.2,56000.0,1.0,16.0,D,RENT,0.0,13964.0,23.0,Current
317582,10000.0,14.99,60 months,15.74,95000.0,1.0,10.0,C,RENT,0.0,4731.0,17.0,Current
448546,4000.0,13.98,36 months,17.97,40130.0,0.0,5.0,C,MORTGAGE,0.0,3528.0,15.0,Current


In [46]:
def categorize_loan_status(status):
    if status in ['Fully Paid', 'In Grace Period', 'Issued']:
        return 'Normal'
    elif status in ['Late (16-30 days)', 'Late (31-120 days)']:
        return 'Delinquent'
    elif status in ['Charged Off', 'Default']:
        return 'Default'
    elif 'Does not meet the credit policy' in status:
        return 'Not Compliant'
    elif 'Current' in status:
        return 'Current'
    else:
        return 'Unknown'
    
df_clean['loan_status'] = df_clean['loan_status'].apply(categorize_loan_status)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['loan_status'] = df_clean['loan_status'].apply(categorize_loan_status)


In [47]:
def apply_label_encoding(column):
    labelencoder = LabelEncoder()
    return labelencoder.fit_transform(column)

columns_to_encode = ['term', 'grade', 'home_ownership']
df_clean[columns_to_encode] = df_clean[columns_to_encode].apply(apply_label_encoding)
df_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[columns_to_encode] = df_clean[columns_to_encode].apply(apply_label_encoding)


Unnamed: 0,loan_amnt,int_rate,term,dti,annual_inc,delinq_2yrs,open_acc,grade,home_ownership,collections_12_mths_ex_med,revol_bal,total_acc,loan_status
238432,5000.0,11.99,0,13.88,58000.0,0.0,4.0,1,4,1.0,3548.0,10.0,Current
35121,12000.0,8.94,0,18.99,57000.0,0.0,13.0,0,4,0.0,25577.0,26.0,Normal
613461,3600.0,16.55,0,36.2,56000.0,1.0,16.0,3,4,0.0,13964.0,23.0,Current
317582,10000.0,14.99,1,15.74,95000.0,1.0,10.0,2,4,0.0,4731.0,17.0,Current
448546,4000.0,13.98,0,17.97,40130.0,0.0,5.0,2,0,0.0,3528.0,15.0,Current


In [48]:
def labesencoder(text):
    if text == "Normal":
        return 0
    elif text in ['Default', 'Delinquent', 'Not Compliant']:
        return 1
    else:
        return 2
    
df_clean['loan_status'] = df_clean['loan_status'].apply(labesencoder)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['loan_status'] = df_clean['loan_status'].apply(labesencoder)


In [49]:
df_clean.head()

Unnamed: 0,loan_amnt,int_rate,term,dti,annual_inc,delinq_2yrs,open_acc,grade,home_ownership,collections_12_mths_ex_med,revol_bal,total_acc,loan_status
238432,5000.0,11.99,0,13.88,58000.0,0.0,4.0,1,4,1.0,3548.0,10.0,2
35121,12000.0,8.94,0,18.99,57000.0,0.0,13.0,0,4,0.0,25577.0,26.0,0
613461,3600.0,16.55,0,36.2,56000.0,1.0,16.0,3,4,0.0,13964.0,23.0,2
317582,10000.0,14.99,1,15.74,95000.0,1.0,10.0,2,4,0.0,4731.0,17.0,2
448546,4000.0,13.98,0,17.97,40130.0,0.0,5.0,2,0,0.0,3528.0,15.0,2


In [55]:
import xgboost as xgb

X = df_clean.drop('grade', axis=1)
y = df_clean['grade']

X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)


xgb_model = xgb.XGBClassifier(use_label_encoder=True, n_estimators=500, learning_rate=0.01,
                              subsample=0.8, colsample_bytree=0.8)

In [52]:
y_train.shape

(32175,)

In [56]:
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [59]:
y_pred = xgb_model.predict(X_test)

y_prob = xgb_model.predict_proba(X_test)

auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr')

# Classification Report and Confusion Matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("AUC-ROC Score:", auc_score)

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5507
           1       0.97      0.93      0.95      9539
           2       0.90      0.92      0.91      9252
           3       0.85      0.88      0.87      5202
           4       0.83      0.88      0.85      2686
           5       0.90      0.83      0.86       861
           6       0.96      0.83      0.89       230

    accuracy                           0.92     33277
   macro avg       0.91      0.89      0.90     33277
weighted avg       0.92      0.92      0.92     33277

Confusion Matrix:
[[5477   30    0    0    0    0    0]
 [ 105 8830  604    0    0    0    0]
 [   1  241 8485  525    0    0    0]
 [   2    8  266 4586  340    0    0]
 [   0    0   41  241 2353   51    0]
 [   1    0    6   19  115  712    8]
 [   0    0    0    5   11   24  190]]
AUC-ROC Score: 0.9947846152718519


In [60]:
from imblearn.over_sampling import RandomOverSampler

oversampler = RandomOverSampler(random_state=42)
X_oversampled, y_oversampled = oversampler.fit_resample(X_train, y_train)

print("After Oversampling:", y_oversampled.value_counts())

After Oversampling: grade
0    28610
3    28610
2    28610
1    28610
4    28610
5    28610
6    28610
Name: count, dtype: int64


In [62]:
xgb_model_oversampled = xgb_model.fit(X_oversampled, y_oversampled)

y_pred = xgb_model_oversampled.predict(X_test)

y_prob = xgb_model_oversampled.predict_proba(X_test)

auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr')

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("AUC-ROC Score:", auc_score)

Parameters: { "use_label_encoder" } are not used.



Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      5507
           1       0.96      0.89      0.93      9539
           2       0.90      0.89      0.90      9252
           3       0.84      0.88      0.86      5202
           4       0.82      0.92      0.86      2686
           5       0.91      0.92      0.92       861
           6       0.94      0.91      0.92       230

    accuracy                           0.91     33277
   macro avg       0.90      0.92      0.91     33277
weighted avg       0.91      0.91      0.91     33277

Confusion Matrix:
[[5477   30    0    0    0    0    0]
 [ 254 8498  787    0    0    0    0]
 [   1  277 8274  700    0    0    0]
 [   2    9  114 4567  509    0    1]
 [   0    0   19  135 2460   70    2]
 [   1    0    4   13   37  796   10]
 [   0    0    0    3   11    7  209]]
AUC-ROC Score: 0.993949756334379


In [63]:
from imblearn.under_sampling import RandomUnderSampler

undersampler = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = undersampler.fit_resample(X_train, y_train)

print("After Undersampling:", y_undersampled.value_counts())

After Undersampling: grade
0    612
1    612
2    612
3    612
4    612
5    612
6    612
Name: count, dtype: int64


In [64]:
xgb_model_undersampled = xgb_model.fit(X_undersampled, y_undersampled)

y_pred = xgb_model_undersampled.predict(X_test)

y_prob = xgb_model_undersampled.predict_proba(X_test)

auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr')

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("AUC-ROC Score:", auc_score)

Parameters: { "use_label_encoder" } are not used.



Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5507
           1       0.96      0.91      0.93      9539
           2       0.89      0.89      0.89      9252
           3       0.84      0.80      0.82      5202
           4       0.73      0.91      0.81      2686
           5       0.88      0.93      0.90       861
           6       0.88      0.91      0.90       230

    accuracy                           0.90     33277
   macro avg       0.88      0.90      0.89     33277
weighted avg       0.90      0.90      0.90     33277

Confusion Matrix:
[[5477   30    0    0    0    0    0]
 [  99 8658  782    0    0    0    0]
 [   1  332 8225  693    1    0    0]
 [   2   12  195 4148  833    7    5]
 [   0    0   31  111 2442   93    9]
 [   1    0    5    9   34  798   14]
 [   0    0    0    2   13    6  209]]
AUC-ROC Score: 0.9910451980420839


Doing Hyper Parameter Tunning

In [65]:
xgb_model = xgb.XGBClassifier(use_label_encoder=True, n_estimators=400, learning_rate=0.05,
                              subsample=0.8, colsample_bytree=0.8)

In [66]:
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [67]:
y_pred = xgb_model.predict(X_test)

y_prob = xgb_model.predict_proba(X_test)

auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr')

# Classification Report and Confusion Matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("AUC-ROC Score:", auc_score)

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      5507
           1       0.96      0.94      0.95      9539
           2       0.91      0.92      0.91      9252
           3       0.86      0.89      0.88      5202
           4       0.86      0.87      0.87      2686
           5       0.88      0.85      0.87       861
           6       0.96      0.83      0.89       230

    accuracy                           0.93     33277
   macro avg       0.92      0.90      0.91     33277
weighted avg       0.93      0.93      0.93     33277

Confusion Matrix:
[[5468   39    0    0    0    0    0]
 [  93 8970  476    0    0    0    0]
 [   1  295 8469  487    0    0    0]
 [   1   10  278 4633  280    0    0]
 [   0    0   34  227 2348   77    0]
 [   1    0    6   21   93  732    8]
 [   0    0    0    5   11   22  192]]
AUC-ROC Score: 0.9958448476752462
