In [1]:
# importing packages
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# notebook-wide display, plotting and warning settings
pd.options.display.max_columns = None  # never truncate columns when a frame is displayed
plt.rc("figure", figsize = (20, 5))  # default figure size as (width, height)
sns.set_theme(style = "whitegrid")
warnings.filterwarnings(action = "ignore")  # NOTE: silences every warning for the whole session

In [3]:
# read the loan records from a CSV file in the working directory and preview the first rows
df = pd.read_csv("loan_tap.csv")
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,title,dti,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address
0,10000.0,36 months,11.44,329.48,B,B4,Marketing,10+ years,RENT,117000.0,Not Verified,Jan-2015,Fully Paid,vacation,Vacation,26.24,Jun-1990,16.0,0.0,36369.0,41.8,25.0,w,INDIVIDUAL,0.0,0.0,"0174 Michelle Gateway\r\nMendozaberg, OK 22690"
1,8000.0,36 months,11.99,265.68,B,B5,Credit analyst,4 years,MORTGAGE,65000.0,Not Verified,Jan-2015,Fully Paid,debt_consolidation,Debt consolidation,22.05,Jul-2004,17.0,0.0,20131.0,53.3,27.0,f,INDIVIDUAL,3.0,0.0,"1076 Carney Fort Apt. 347\r\nLoganmouth, SD 05113"
2,15600.0,36 months,10.49,506.97,B,B3,Statistician,< 1 year,RENT,43057.0,Source Verified,Jan-2015,Fully Paid,credit_card,Credit card refinancing,12.79,Aug-2007,13.0,0.0,11987.0,92.2,26.0,f,INDIVIDUAL,0.0,0.0,"87025 Mark Dale Apt. 269\r\nNew Sabrina, WV 05113"
3,7200.0,36 months,6.49,220.65,A,A2,Client Advocate,6 years,RENT,54000.0,Not Verified,Nov-2014,Fully Paid,credit_card,Credit card refinancing,2.6,Sep-2006,6.0,0.0,5472.0,21.5,13.0,f,INDIVIDUAL,0.0,0.0,"823 Reid Ford\r\nDelacruzside, MA 00813"
4,24375.0,60 months,17.27,609.33,C,C5,Destiny Management Inc.,9 years,MORTGAGE,55000.0,Verified,Apr-2013,Charged Off,credit_card,Credit Card Refinance,33.95,Mar-1999,13.0,0.0,24584.0,69.8,43.0,f,INDIVIDUAL,1.0,0.0,"679 Luna Roads\r\nGreggshire, VA 11650"


In [4]:
# list every column with its non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396030 entries, 0 to 396029
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   loan_amnt             396030 non-null  float64
 1   term                  396030 non-null  object 
 2   int_rate              396030 non-null  float64
 3   installment           396030 non-null  float64
 4   grade                 396030 non-null  object 
 5   sub_grade             396030 non-null  object 
 6   emp_title             373103 non-null  object 
 7   emp_length            377729 non-null  object 
 8   home_ownership        396030 non-null  object 
 9   annual_inc            396030 non-null  float64
 10  verification_status   396030 non-null  object 
 11  issue_d               396030 non-null  object 
 12  loan_status           396030 non-null  object 
 13  purpose               396030 non-null  object 
 14  title                 394274 non-null  object 
 15  

In [5]:
# re-type every object column as pandas "category" in a single astype call
object_columns = df.select_dtypes(include = ["object"]).columns.tolist()
df = df.astype({column: "category" for column in object_columns})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396030 entries, 0 to 396029
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   loan_amnt             396030 non-null  float64 
 1   term                  396030 non-null  category
 2   int_rate              396030 non-null  float64 
 3   installment           396030 non-null  float64 
 4   grade                 396030 non-null  category
 5   sub_grade             396030 non-null  category
 6   emp_title             373103 non-null  category
 7   emp_length            377729 non-null  category
 8   home_ownership        396030 non-null  category
 9   annual_inc            396030 non-null  float64 
 10  verification_status   396030 non-null  category
 11  issue_d               396030 non-null  category
 12  loan_status           396030 non-null  category
 13  purpose               396030 non-null  category
 14  title                 394274 non-nul

In [6]:
# (number of rows, number of columns) of the frame at this point
df.shape

(396030, 27)

In [7]:
# checking for duplicates: counts rows that are exact copies of an earlier row
df.duplicated().sum()

0

In [8]:
# treating missing values: keep only rows that have no missing value in any column
# NOTE(review): this also discards rows whose only gaps are in columns that a later
# cell removes (emp_title, emp_length, title) - confirm that this is intended
df = df.dropna()

In [9]:
# outlier detection and removal (IQR rule applied to every numeric column)
numerical_columns = df.select_dtypes(include = ["int", "float"]).columns.tolist()
df_num = df[numerical_columns]

# per-column quartiles and inter-quartile range
q1 = df_num.quantile(0.25)
q3 = df_num.quantile(0.75)
iqr = q3 - q1

# set the threshold (k = 1.5)
k = 1.5
lower_bound = q1 - k * iqr
upper_bound = q3 + k * iqr

# a row is a potential outlier if ANY of its numeric values lies outside that column's fences
# NOTE(review): when a column has q1 == q3 its IQR is 0, both fences collapse onto that value
# and every row holding a different value is removed - worth checking for count-like columns
# such as pub_rec and pub_rec_bankruptcies before they are turned into flags later on
outliers = ((df_num < lower_bound) | (df_num > upper_bound)).any(axis=1)

# take an explicit copy: later cells assign new columns on this frame, and assigning on a
# filtered selection is what triggers pandas' SettingWithCopyWarning
df = df[~ outliers].copy()
df.shape

(240757, 27)

In [10]:
# binary encoding of the "term" column (36 months -> 0, 60 months -> 1)
def ohe_term(x):
    """Map a loan-term label to a 0/1 flag.

    Surrounding whitespace is ignored, so both " 36 months" and "36 months"
    are recognised.

    Parameters
    ----------
    x : str
        Term label such as " 36 months" or " 60 months".

    Returns
    -------
    int or None
        0 for a 36-month term, 1 for a 60-month term, None for any other value.
    """
    # only strings can be stripped; anything else is compared as-is and falls through to None
    label = x.strip() if isinstance(x, str) else x
    if label == "36 months":
        return 0
    if label == "60 months":
        return 1
    return None
# replace the term labels with their 0/1 encoding, stored as int64
term_encoded = df["term"].apply(ohe_term)
df["term"] = term_encoded.astype("int64")

In [11]:
# binary encoding of the "initial_list_status" column ("w" -> 0, "f" -> 1)
def ohe_inital_list_status(x):
    """Return 0 for "w", 1 for "f" and None for any other value."""
    if x == "w":
        return 0
    return 1 if x == "f" else None
# replace the listing-status codes with their 0/1 encoding, stored as int64
list_status_encoded = df["initial_list_status"].apply(ohe_inital_list_status)
df["initial_list_status"] = list_status_encoded.astype("int64")

In [12]:
# fetch zip-code from the "address" column
def fetch_zip(x):
    """Return the last five characters of ``x`` (all of ``x`` if it is shorter)."""
    last_five = slice(-5, None)
    return x[last_five]
# store the trailing five characters of each address as a categorical "zip_code" column
zip_codes = df["address"].apply(fetch_zip)
df["zip_code"] = zip_codes.astype("category")
# dropping the address column
df = df.drop(columns = ["address"])

In [13]:
# creating flags for "pub_rec", "mort_acc" and "pub_rec_bankruptcies"
def _positive_flag(x):
    """Return 1 when ``x`` is strictly greater than zero, otherwise 0.

    Any value for which ``x > 0.0`` is false (zero, negatives, NaN) yields 0.
    """
    return 1 if x > 0.0 else 0


def create_flag_pub_rec(x):
    """Flag for "pub_rec": 1 if the value is greater than 0, else 0."""
    return _positive_flag(x)


def create_flag_mort_acc(x):
    """Flag for "mort_acc": 1 if the value is greater than 0, else 0."""
    return _positive_flag(x)


def create_flag_pub_rec_bankruptcies(x):
    """Flag for "pub_rec_bankruptcies": 1 if the value is greater than 0, else 0."""
    return _positive_flag(x)
    
# overwrite each count column with its 0/1 flag, stored as int64
flag_builders = {
    "pub_rec": create_flag_pub_rec,
    "mort_acc": create_flag_mort_acc,
    "pub_rec_bankruptcies": create_flag_pub_rec_bankruptcies,
}
for flag_column, build_flag in flag_builders.items():
    df[flag_column] = df[flag_column].apply(build_flag).astype("int64")

In [14]:
# binary encoding of the target column "loan_status" ("Fully Paid" -> 0, "Charged Off" -> 1)
def ohe_loan_status(x):
    """Return 0 for "Fully Paid", 1 for "Charged Off" and None for any other value."""
    if x == "Fully Paid":
        return 0
    return 1 if x == "Charged Off" else None
# replace the loan-status labels with their 0/1 encoding, stored as int64
loan_status_encoded = df["loan_status"].apply(ohe_loan_status)
df["loan_status"] = loan_status_encoded.astype("int64")

In [15]:
# remove columns judged to have little importance, printing the shape before and after
print(df.shape)
low_importance_columns = [
    "sub_grade",
    "emp_title",
    "emp_length",
    "issue_d",
    "title",
    "earliest_cr_line",
    "installment",
]
df = df.drop(columns = low_importance_columns)
print(df.shape)

(240757, 27)
(240757, 20)


In [16]:
# one-hot (dummy) encoding of categorical columns with pd.get_dummies; drop_first = True
# omits the first level of each column. This is not target encoding, although the
# printed labels below still use that wording.
print(f"Number of columns BEFORE target encoding of categorical columns {df.shape}")
# columns to expand into indicator columns
dummies = ["grade", "home_ownership", "verification_status", "purpose", "application_type", "zip_code"]
df = pd.get_dummies(df, columns = dummies, drop_first = True)
print(f"Number of columns AFTER target encoding of categorical columns {df.shape}")

Number of columns BEFORE target encoding of categorical columns (240757, 20)
Number of columns AFTER target encoding of categorical columns (240757, 51)


In [17]:
# cast every column whose dtype is "bool" to "int64", leaving all other columns untouched
bool_to_int = {name: "int64" for name in df.columns if df[name].dtypes == "bool"}
df = df.astype(bool_to_int)

In [18]:
# preview the first rows of the fully encoded frame
df.head()

Unnamed: 0,loan_amnt,term,int_rate,annual_inc,loan_status,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,mort_acc,pub_rec_bankruptcies,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,application_type_INDIVIDUAL,application_type_JOINT,zip_code_05113,zip_code_11650,zip_code_22690,zip_code_29597,zip_code_30723,zip_code_48052,zip_code_70466,zip_code_86630,zip_code_93700
0,10000.0,0,11.44,117000.0,0,26.24,16.0,0,36369.0,41.8,25.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0
1,8000.0,0,11.99,65000.0,0,22.05,17.0,0,20131.0,53.3,27.0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,15600.0,0,10.49,43057.0,0,12.79,13.0,0,11987.0,92.2,26.0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
3,7200.0,0,6.49,54000.0,0,2.6,6.0,0,5472.0,21.5,13.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,24375.0,1,17.27,55000.0,1,33.95,13.0,0,24584.0,69.8,43.0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0


In [19]:
# preparing data for the model: "loan_status" is the target, every other column is a feature
y = df["loan_status"]
x = df.drop(columns = ["loan_status"])

In [20]:
# 80/20 train/test split, stratified on the target, with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.20,
    stratify = y,
    random_state = 42,
)
# shapes of the four resulting pieces
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((192605, 50), (48152, 50), (192605,), (48152,))

In [21]:
# write the unscaled feature splits (no target column, no index) to CSV files
for features, csv_path in ((x_train, "train.csv"), (x_test, "test.csv")):
    features.to_csv(csv_path, index = False)

In [22]:
# fit the scaler on the training features only, then apply that same fitted
# transformation to both splits
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [23]:
# logistic regression classifier with all default hyperparameters; the bare name displays it
model = LogisticRegression()
model

In [24]:
# train on the scaled training features and their loan_status labels
model.fit(x_train, y_train)

In [25]:
# saving the model
# open(..., "wb") does not create missing folders, so make sure the target directory exists
os.makedirs("./artefacts", exist_ok = True)
# NOTE(review): the model was fitted on StandardScaler output, and the cells visible here
# do not persist `scaler` - confirm it is saved elsewhere, otherwise the pickled model
# cannot be fed identically scaled inputs later
with open("./artefacts/model.pkl", "wb") as file:
    pickle.dump(model, file)