# Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn
import plotly.express as px

# loading the data

In [2]:
df = pd.read_csv("C:/Users/vikas/Downloads/credit_risk_dataset.csv")

##### Info about dataset

In [3]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [22]:
df.shape

(32581, 11)

In [6]:
df.size

390972

### Data cleaning 

In [7]:
df["loan_int_rate"].fillna(df["loan_int_rate"].median(),inplace=True)

In [8]:
df['person_emp_length'].fillna(4.0,inplace=True)

In [9]:
df.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [19]:
df.drop('cb_person_cred_hist_length', axis = 1, inplace=True)

# Identify the target variable

In [20]:
x = df[['person_age', 'person_income', 'person_home_ownership','person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt','loan_int_rate', 'loan_percent_income','cb_person_default_on_file']]

y = df["loan_status"]

In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.75,random_state=100)

In [23]:
x_train.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file
2009,26,37000,RENT,0.0,PERSONAL,B,14000,9.88,0.38,N
24210,35,70000,OWN,6.0,VENTURE,B,17000,9.88,0.24,N
22659,29,42000,RENT,0.0,EDUCATION,C,7500,12.87,0.18,Y
15764,22,166000,MORTGAGE,1.0,EDUCATION,B,23450,11.36,0.14,N
11378,26,57800,MORTGAGE,0.0,MEDICAL,D,15000,10.99,0.22,Y


In [24]:
x_test.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file
10775,25,40000,RENT,0.0,EDUCATION,B,10000,12.18,0.25,N
28127,30,125000,MORTGAGE,14.0,VENTURE,A,5750,7.43,0.05,N
17835,29,70500,RENT,5.0,PERSONAL,A,35000,8.9,0.5,N
15419,26,135000,MORTGAGE,11.0,EDUCATION,A,6700,6.03,0.05,N
20158,30,34000,RENT,0.0,VENTURE,B,4000,10.95,0.12,N


# Data preparation on train data

In [28]:
x_train_num = x_train.select_dtypes(include=["float64","int64"])
x_train_cat = x_train.select_dtypes(include="object")

In [29]:
le = x_train_cat["loan_grade"]

ohe = x_train_cat[["person_home_ownership","loan_intent"]]

In [30]:
grade_encoder = {'A':1,'B':2,'C':3,'D':4,"E":5,"F":6,"G":7}
x_train_cat_le = pd.DataFrame(index=x_train_cat.index)
x_train_cat_le["loan_grade"] = x_train_cat["loan_grade"].apply(lambda x:grade_encoder[x])

In [31]:
x_train_cat_le.head()

Unnamed: 0,loan_grade
2009,2
24210,2
22659,3
15764,2
11378,4


### scaling the numerical features

In [32]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

x_train_num_rescaled = pd.DataFrame(scaler.fit_transform(x_train_num),
                                   columns = x_train_num.columns,
                                   index = x_train_num.index)

## one hot encoding on categorical features

In [33]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop = "first",sparse=False)

x_train_cat_ohe = pd.DataFrame(encoder.fit_transform(ohe),
                              columns = encoder.get_feature_names_out(ohe.columns),
                              index = ohe.index)
x_train_cat_ohe.head()

Unnamed: 0,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
2009,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
24210,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
22659,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
15764,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
11378,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [34]:
x_trai_cat_transformed = pd.concat([x_train_cat_ohe,x_train_cat_le],axis = 1)

In [35]:
x_trai_cat_transformed.head()

Unnamed: 0,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade
2009,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2
24210,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2
22659,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,3
15764,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
11378,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4


In [36]:
x_train_transformed = pd.concat([x_train_num,x_trai_cat_transformed], axis=1)

In [37]:
x_train_transformed

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade
2009,26,37000,0.0,14000,9.88,0.38,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2
24210,35,70000,6.0,17000,9.88,0.24,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2
22659,29,42000,0.0,7500,12.87,0.18,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,3
15764,22,166000,1.0,23450,11.36,0.14,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2
11378,26,57800,0.0,15000,10.99,0.22,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12119,24,80000,8.0,18000,7.90,0.23,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
24423,29,66000,4.0,8200,17.04,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
14147,23,100000,0.0,7000,8.59,0.07,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
23320,32,66000,1.0,8000,7.29,0.12,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1


## Data preparation on test data

In [38]:
x_test_num = x_test.select_dtypes(include=["int64","float64"])
x_test_cat = x_test.select_dtypes(include=["object"])

In [39]:
x_test_num_scaler = pd.DataFrame(scaler.transform(x_test_num),
                                columns=x_test_num.columns,
                                index = x_test_num.index)

In [None]:
test_le = x_test_cat[["loan_grade"]]
test_ohe = x_test_cat[["person_home_ownership","loan_intent","cb_person_default_on_file"]]

In [None]:
x_test_cat_le = pd.DataFrame(index=x_test_cat.index)
x_test_cat_le["loan_grand"] = x_test_cat["loan_grade"].apply(lambda x: grade_encoder[x])
x_test_cat_le.head()

In [None]:
x_test_cat_ohe = pd.DataFrame(encoder.transform(test_ohe),
                             columns=encoder.get_feature_names_out(test_ohe.columns),
                             index = test_ohe.index)
x_test_cat_ohe.head()

In [None]:
x_test_cat_transformed = pd.concat([x_test_cat_ohe,x_test_cat_le], axis = 1)
x_test_cat_transformed.head()

In [None]:
x_test_transformed = pd.concat([x_test_num_scaler,x_test_cat_transformed],axis=1)
x_test_transformed

## K-Neareast Neighbour

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

knn.fit(x_train_transformed,y_train)

In [None]:
y_test_pred_knn = knn.predict(x_test_transformed)

In [None]:
temp_df = pd.DataFrame({"Actual":y_test,"Predicted":y_test_pred_knn})

temp_df.head()

In [None]:
from sklearn import metrics
metrics.accuracy_score(y_test,y_test_pred_knn)

# logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

regressor = LogisticRegression()

regressor.fit(x_train_transformed,y_train)

In [None]:
y_test_pred_lr = regressor.predict(x_test_transformed)

In [None]:
metrics.accuracy_score(y_test,y_test_pred_lr)

## support vector Machine

In [None]:
from sklearn.svm import SVC

sv = SVC()

sv.fit(x_train_transformed,y_train)

In [None]:
y_test_pred_svc = sv.predict(x_test_transformed)

In [None]:
metrics.accuracy_score(y_test,y_test_pred_svc)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier()

classifier.fit(x_train_transformed,y_train)

In [None]:
y_test_pred_dt = classifier.predict(x_test_transformed)

In [None]:
metrics.precision_score(y_test,y_test_pred_dt)

## random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

rfc.fit(x_train_transformed,y_train)

In [None]:
y_test_pred_rf = rfc.predict(x_test_transformed)

In [None]:
metrics.accuracy_score(y_test,y_test_pred_rf)

## visualization

In [None]:
import plotly.express as px

In [None]:
data = {"Algorithm":["LogisticRegression","KNeighborsClassifier","DecisionTreeClassifier","RandomForestClassifier","SVC"],
       "Prediction_precentage":[58.8,21.4,34.4,65.6,21.4]}

In [None]:
visualiztion_data = pd.DataFrame(data)
fig = px.line(visualiztion_data, x = "Algorithm", y = "Prediction_precentage")
fig.show()