In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import pickle

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

In [3]:
model_data = pd.read_csv('/kaggle/input/loan-data/Modeling dataset.csv', encoding='utf-8')
bureau_data = pd.read_csv('/kaggle/input/loan-data/BureauData.csv')

In [4]:
model_data.head()

Unnamed: 0,id,loan_amnt,emp_length,annual_inc,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,earliest_cr_line,purpose,desc,loan_status
0,1002603,30000,1,93000.0,0,1,120,130,10,0,43209,0.41,32,7/18/1990 3:54,debt_consolidation,,Current
1,1024806,22000,6,70000.0,1,0,10,130,11,0,38928,0.85,36,10/7/1983 8:46,debt_consolidation,Borrower added on 11/09/11 > This loan is fo...,Fully Paid
2,663413,3700,10,117000.0,1,1,10,130,9,0,19382,0.86,24,4/7/1992 11:32,credit_card,Borrower added on 01/28/11 > pay off credit ...,Fully Paid
3,810093,7600,1,33996.0,0,0,120,130,10,0,1962,0.48,18,1/21/2003 2:40,other,Borrower added on 07/11/11 > I have been con...,Current
4,865474,12000,7,75000.0,0,1,24,130,13,0,13528,0.51,23,9/8/1997 7:19,debt_consolidation,,Fully Paid


In [5]:
model_data.dtypes

id                          int64
loan_amnt                   int64
emp_length                  int64
annual_inc                float64
delinq_2yrs                 int64
inq_last_6mths              int64
mths_since_last_delinq      int64
mths_since_last_record      int64
open_acc                    int64
pub_rec                     int64
revol_bal                   int64
revol_util                float64
total_acc                   int64
earliest_cr_line           object
purpose                    object
desc                       object
loan_status                object
dtype: object

In [6]:
model_data = model_data.drop_duplicates()
model_data.shape

(12491, 17)

In [7]:
model_data = model_data.drop(columns=['id'])

In [8]:
model_data.isna().sum()

loan_amnt                    0
emp_length                   0
annual_inc                   0
delinq_2yrs                  0
inq_last_6mths               0
mths_since_last_delinq       0
mths_since_last_record       0
open_acc                     0
pub_rec                      0
revol_bal                    0
revol_util                   0
total_acc                    0
earliest_cr_line             0
purpose                      0
desc                      4114
loan_status                  0
dtype: int64

In [9]:
model_data.nunique()

loan_amnt                   648
emp_length                   10
annual_inc                 2227
delinq_2yrs                   8
inq_last_6mths                9
mths_since_last_delinq       88
mths_since_last_record       95
open_acc                     35
pub_rec                       5
revol_bal                  9775
revol_util                  101
total_acc                    73
earliest_cr_line          12455
purpose                      14
desc                       8297
loan_status                   7
dtype: int64

In [10]:
model_data['loan_status'].unique()

array(['Current', 'Fully Paid', 'Charged Off', 'Late (31-120 days)',
       'In Grace Period', 'Late (16-30 days)', 'Default'], dtype=object)

In [11]:
category_mapping = {
    'Current':1,
    'In Grace Period':1,
    'Fully Paid':1,
    'Charged Off':0,
    'Late (31-120 days)':0,
    'Late (16-30 days)':1,
    'Default':0
}
model_data['loan_status']=model_data['loan_status'].map(category_mapping)

In [12]:
purpose_mapping = {
        'debt_consolidation': 'Personal_Loan',
        'credit_card': 'Personal_Loan',
        'other': 'Debt',
        'home_improvement': 'Personal_Loan',
        'small_business': 'Debt',
        'major_purchase': 'Personal_Loan',
        'car': 'Personal_Loan',
        'wedding': 'Personal_Loan', 
        'medical': 'Personal_Loan',
        'house' : 'Home_Loan',
        'moving': 'Personal_Loan',
        'vacation': 'Personal_Loan',
        'educational': 'Educational_Loan',
        'renewable_energy': 'Debt'
    }

model_data['purpose'] = model_data['purpose'].map(purpose_mapping)

In [13]:
positive_percentage = model_data.groupby('purpose')['loan_status'].mean()
purpose_mapping = positive_percentage.to_dict()
print(purpose_mapping)

{'Debt': 0.8191841234840133, 'Educational_Loan': 0.7980769230769231, 'Home_Loan': 0.8660714285714286, 'Personal_Loan': 0.8805085555874199}


In [14]:
model_data.describe(exclude=['object'])

Unnamed: 0,loan_amnt,emp_length,annual_inc,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,loan_status
count,12491.0,12491.0,12491.0,12491.0,12491.0,12491.0,12491.0,12491.0,12491.0,12491.0,12491.0,12491.0,12491.0
mean,11228.384437,5.047234,68245.58,0.145625,0.862621,90.174366,125.910015,9.322392,0.053959,13515.845409,0.490956,22.268914,0.870787
std,7418.857146,3.363823,51600.06,0.482198,1.057902,42.339998,18.985737,4.431606,0.234978,16118.26173,0.282016,11.521747,0.335449
min,500.0,1.0,6000.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0
25%,5500.0,2.0,40320.0,0.0,0.0,47.0,130.0,6.0,0.0,3752.0,0.26,14.0,1.0
50%,10000.0,4.0,59000.0,0.0,1.0,120.0,130.0,9.0,0.0,8943.0,0.5,21.0,1.0
75%,15000.0,9.0,82000.0,0.0,1.0,120.0,130.0,12.0,0.0,17100.0,0.72,29.0,1.0
max,35000.0,10.0,2039784.0,8.0,8.0,120.0,130.0,42.0,4.0,149527.0,1.0,81.0,1.0


In [15]:
model_data.isna().sum()

loan_amnt                    0
emp_length                   0
annual_inc                   0
delinq_2yrs                  0
inq_last_6mths               0
mths_since_last_delinq       0
mths_since_last_record       0
open_acc                     0
pub_rec                      0
revol_bal                    0
revol_util                   0
total_acc                    0
earliest_cr_line             0
purpose                      0
desc                      4114
loan_status                  0
dtype: int64

In [16]:
model_data.shape

(12491, 16)

In [17]:
X, y = model_data.drop(columns=['loan_status']), model_data['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
X.columns

Index(['loan_amnt', 'emp_length', 'annual_inc', 'delinq_2yrs',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'earliest_cr_line', 'purpose', 'desc'],
      dtype='object')

<h1>Input preprocessing</h1>
Below are a few classes to preprocess the input variables before feeding them into the model for training and inference

<h2>Leverage Ratio</h2>
We generate a new variable 'leverage_ratio' by calculating the debt-to-annual income ratio from the given inputs for loan amount and annual income

In [19]:
class LeverageCalculator(BaseEstimator, TransformerMixin):
    def __init__(self, loan_amount, annual_income):
        self.loan_amount = loan_amount
        self.annual_income = annual_income
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X['leverage_ratio'] = X[self.loan_amount]/X[self.annual_income]
        X.insert(1, 'leverage_ratio', X.pop('leverage_ratio'))
        return X        

<h2>Earliest Credit Line</h2>
We change the 'earliest_cr_line' variable into an age variable denoting how many years ago the person's first credit line was issued

In [20]:
class AgeCalculator(BaseEstimator, TransformerMixin):
    def __init__(self, date_column,):
        self.date_column = date_column
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X[self.date_column] = X[self.date_column].apply(self.calculate_age)
        return X
        
    def calculate_age(self, date):
        if(pd.isna(date)):
            return 0
        
        date = pd.to_datetime(date, format='%m/%d/%Y %H:%M', exact=False)
        today = datetime.today()
        age = today.year - date.year - ((today.month, today.day) < (date.month, date.day))
        return age

<h2>Category Encoding</h2>
We map the different values for 'purpose' present in the training data to the percentage of positive samples for that purpose. We also concatenate the 'purpose' and 'desc' columns into a single 'text' column used for further text analysis

In [21]:
class CategoryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, purpose, description):
        self.purpose = purpose
        self.description = description
        
        
        self.percentage_mapping = {
            'Debt': 81.91841234840133,
            'Educational_Loan': 79.8076923076923,
            'Home_Loan': 86.60714285714286,
            'Personal_Loan': 88.05085555874199
        }
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X[self.purpose] = X[self.purpose].fillna('')
        X[self.description] = X[self.description].fillna('')
        X['text'] = X[self.purpose]+ X[self.description]
        X = X.drop(columns = [self.description])
        
        X[self.purpose] = X[self.purpose].map(self.percentage_mapping)
        return X
    

<h2>Text Processing</h2>
The two classes below are used to preprocess the 'text' column. The TextPreprocessor converts all text to lowercase, removes stopwords from the text, and lemmatizes all words to their root form. Then we use the TfidfConcatenator to find the TF-IDF vector representations for the 'text' column and concatenate those to the original dataframe while dropping the 'text' column

In [22]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.text_column] = X[self.text_column].apply(self._preprocess_text)
        return X
    
    def _preprocess_text(self, text):
        stop_words = set(stopwords.words('english'))
        
        
        text = text.lower()
        # Tokenize into words
        words = word_tokenize(text)
        # Remove stopwords
        words = [word for word in words if word.isalpha() and word not in self.stop_words]
        # Lemmatize words
        words = [self.lemmatizer.lemmatize(word) for word in words]
        
        return ' '.join(words)
    

In [23]:
class TfidfConcatenator(BaseEstimator, TransformerMixin):
    def __init__(self, text_column):
        self.text_column = text_column
        self.vectorizer = TfidfVectorizer(max_features=30)

    def fit(self, X, y=None):
        self.vectorizer.fit(X[self.text_column])
        return self

    def transform(self, X):
        X = X.copy()
        tfidf_matrix = self.vectorizer.transform(X[self.text_column])
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=self.vectorizer.get_feature_names_out())
        X = X.drop(columns=[self.text_column])
        return pd.concat([X.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

Finally we chain all the preprocessing classes together into a pipeline. The same pipeline will be used to for inference

In [24]:
model_pipeline = Pipeline(steps=[
    ('leverage_calculator', LeverageCalculator(loan_amount='loan_amnt', annual_income='annual_inc')),
    ('age_calculator', AgeCalculator(date_column='earliest_cr_line')),
    ('category_encoder', CategoryEncoder(purpose='purpose', description='desc')),
    ('text_preprocessor', TextPreprocessor(text_column='text')),
    ('tfidf_concat', TfidfConcatenator(text_column='text')),
    ('scaler', StandardScaler(with_mean=False)),  # with_mean=False because TF-IDF output is sparse
    ('smote', SMOTE(k_neighbors=5, random_state=42)),
    ('model', LogisticRegression(max_iter=1000))
])

In [25]:
model_pipeline.fit(X_train, y_train)

In [26]:
y_pred = model_pipeline.predict(X_test)

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.19      0.58      0.28       323
           1       0.91      0.63      0.75      2176

    accuracy                           0.62      2499
   macro avg       0.55      0.60      0.51      2499
weighted avg       0.82      0.62      0.69      2499



In [28]:
default_rate = np.mean(y == 0)
print(f'Default rate in modelling data: {default_rate:.2f}')

Default rate in modelling data: 0.13


In [29]:
y_pred_prob = model_pipeline.predict_proba(X)[: ,1]
from sklearn.metrics import roc_curve, roc_auc_score
fpr, tpr, thresholds = roc_curve(y, y_pred_prob)

In [30]:
# Find the threshold where the decline rate matches the default rate
decline_rate_diffs = []
for threshold in thresholds:
    decline_rate = np.mean(y_pred_prob < threshold)
    decline_rate_diffs.append(abs(decline_rate - default_rate))

optimal_idx = np.argmin(decline_rate_diffs)
optimal_threshold = thresholds[optimal_idx]
print(f'Optimal threshold where decline rate matches default rate: {optimal_threshold}')

Optimal threshold where decline rate matches default rate: 0.3402045739512086


In [31]:
default_rate_optim = np.mean(y_pred_prob < optimal_threshold)
print(f'Default rate according to threshold: {default_rate_optim:.2f}')

Default rate according to threshold: 0.13


In [32]:
with open("loan_model_pipeline.pkl", "wb") as f:
    pickle.dump(model_pipeline, f)