In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('loan.csv')
df.head()

Unnamed: 0,Loan_ID,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
0,LP001002,0,Graduate,0.0,5849,,1.0,1
1,LP001003,1,Graduate,0.0,4583,128.0,1.0,0
2,LP001005,0,Graduate,,3000,66.0,1.0,1
3,LP001006,0,0t Graduate,0.0,2583,120.0,1.0,1
4,LP001008,0,Graduate,0.0,6000,141.0,1.0,1


Remove loan id column (irrelevant)

In [4]:
df.drop(["Loan_ID"], axis="columns", inplace=True)
df.dropna(inplace=True)

In [5]:
df

Unnamed: 0,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
1,1,Graduate,0.0,4583,128.0,1.0,0
3,0,0t Graduate,0.0,2583,120.0,1.0,1
4,0,Graduate,0.0,6000,141.0,1.0,1
6,0,0t Graduate,0.0,2333,95.0,1.0,1
7,3+,Graduate,0.0,3036,158.0,0.0,0
...,...,...,...,...,...,...,...
608,0,Graduate,0.0,3232,108.0,1.0,1
609,0,Graduate,0.0,2900,71.0,1.0,1
610,3+,Graduate,0.0,4106,40.0,1.0,1
611,1,Graduate,0.0,8072,253.0,1.0,1


In [6]:
df['Education'] = df['Education'].replace({'Graduate': 1, "Not Graduate": 0})
df["Credit_History"] = pd.to_numeric(df['Credit_History'], errors='coerce').astype(int)
df["LoanAmount"] = pd.to_numeric(df['LoanAmount'], errors='coerce').astype(int)

base on dataset source, loan amount are written in thousands so we will use the real number

In [7]:
df["LoanAmount"] = df.LoanAmount*1000

look at features correlation and remove unperformed features

In [8]:
df

Unnamed: 0,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
1,1,1,0.0,4583,128000,1,0
3,0,0t Graduate,0.0,2583,120000,1,1
4,0,1,0.0,6000,141000,1,1
6,0,0t Graduate,0.0,2333,95000,1,1
7,3+,1,0.0,3036,158000,0,0
...,...,...,...,...,...,...,...
608,0,1,0.0,3232,108000,1,1
609,0,1,0.0,2900,71000,1,1
610,3+,1,0.0,4106,40000,1,1
611,1,1,0.0,8072,253000,1,1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 434 entries, 1 to 612
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Dependents       434 non-null    object 
 1   Education        434 non-null    object 
 2   Self_Employed    434 non-null    float64
 3   ApplicantIncome  434 non-null    int64  
 4   LoanAmount       434 non-null    int64  
 5   Credit_History   434 non-null    int64  
 6   Loan_Status      434 non-null    int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 27.1+ KB


In [10]:
df['Dependents'] = df['Dependents'].str.replace('3+', '3')
df['Education'] = df['Education'].str.replace('0t Graduate', '0')
df['Education'].fillna(1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Education'].fillna(1, inplace=True)


In [11]:
df

Unnamed: 0,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
1,1,1,0.0,4583,128000,1,0
3,0,0,0.0,2583,120000,1,1
4,0,1,0.0,6000,141000,1,1
6,0,0,0.0,2333,95000,1,1
7,3,1,0.0,3036,158000,0,0
...,...,...,...,...,...,...,...
608,0,1,0.0,3232,108000,1,1
609,0,1,0.0,2900,71000,1,1
610,3,1,0.0,4106,40000,1,1
611,1,1,0.0,8072,253000,1,1


In [12]:
df.drop(columns=["Self_Employed"], inplace=True)

In [13]:
df

Unnamed: 0,Dependents,Education,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
1,1,1,4583,128000,1,0
3,0,0,2583,120000,1,1
4,0,1,6000,141000,1,1
6,0,0,2333,95000,1,1
7,3,1,3036,158000,0,0
...,...,...,...,...,...,...
608,0,1,3232,108000,1,1
609,0,1,2900,71000,1,1
610,3,1,4106,40000,1,1
611,1,1,8072,253000,1,1


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 434 entries, 1 to 612
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Dependents       434 non-null    object
 1   Education        434 non-null    object
 2   ApplicantIncome  434 non-null    int64 
 3   LoanAmount       434 non-null    int64 
 4   Credit_History   434 non-null    int64 
 5   Loan_Status      434 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 23.7+ KB


In [15]:
df["Dependents"] = pd.to_numeric(df['Dependents'], errors='coerce').astype(int)
df["Education"] = pd.to_numeric(df['Education'], errors='coerce').astype(int)
df

Unnamed: 0,Dependents,Education,ApplicantIncome,LoanAmount,Credit_History,Loan_Status
1,1,1,4583,128000,1,0
3,0,0,2583,120000,1,1
4,0,1,6000,141000,1,1
6,0,0,2333,95000,1,1
7,3,1,3036,158000,0,0
...,...,...,...,...,...,...
608,0,1,3232,108000,1,1
609,0,1,2900,71000,1,1
610,3,1,4106,40000,1,1
611,1,1,8072,253000,1,1


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 434 entries, 1 to 612
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Dependents       434 non-null    int64
 1   Education        434 non-null    int64
 2   ApplicantIncome  434 non-null    int64
 3   LoanAmount       434 non-null    int64
 4   Credit_History   434 non-null    int64
 5   Loan_Status      434 non-null    int64
dtypes: int64(6)
memory usage: 23.7 KB


In [17]:
scaler = StandardScaler()
df[["ApplicantIncome", "LoanAmount"]] = scaler.fit_transform(df[["ApplicantIncome", "LoanAmount"]])

each column impact on loan status visualization

In [18]:
df[df.Loan_Status == 1].shape

(298, 6)

In [19]:
df[df.Loan_Status == 0].shape

(136, 6)

<b> 4. Data Preparation

In [20]:
#Extracting Independent and dependent Variable  
X = df.drop(["Loan_Status"], axis=1)
y = df['Loan_Status']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=64)

<b> 5. Create LOGISTIC Regression model

#Fitting Logistic Regression to the training set  
from sklearn.linear_model import LogisticRegression  
classifier= LogisticRegression(random_state=0)  
classifier.fit(X_train, y_train)  


In [22]:
classifier = LogisticRegression(C=1, penalty='l1', solver='liblinear')
classifier.fit(X_train, y_train)
classifier.score(X_test, y_test)

0.8473282442748091

In [23]:
#Predicting the test set result  
y_pred= classifier.predict(X_test)  

In [24]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm=confusion_matrix(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)

In [25]:
cm

array([[21, 18],
       [ 2, 90]])

In [26]:
acc

0.8473282442748091

In [27]:
pickle.dump(classifier, open('logisticloannew.pkl','wb'))

In [28]:
pickle.dump(scaler, open('scalerloan.pkl','wb'))

In [29]:
X_train[:10]

Unnamed: 0,Dependents,Education,ApplicantIncome,LoanAmount,Credit_History
67,1,1,0.95108,2.118637,1
266,2,1,-0.068052,0.10115,1
17,0,1,-0.270124,-0.820418,0
272,0,1,-0.384316,-0.197737,1
281,0,1,-0.199786,-0.372088,1
431,0,1,0.234213,-0.459263,0
298,0,1,-0.018799,0.113603,1
375,0,1,-0.049666,-0.272459,1
177,3,1,0.068237,4.397649,0
368,1,1,0.204695,0.41249,1


In [30]:
y_train[:10]

67     1
266    1
17     0
272    1
281    1
431    0
298    0
375    1
177    0
368    1
Name: Loan_Status, dtype: int64

In [31]:
data = scaler.transform([[3500, 450000]])
p1 = data[0,0]
p2 = data[0,1]
classifier.predict([[1,1,p1,p2,0]])  



array([0])

In [32]:
# Step 3: Define models and parameter grids
# We define 3 different models with their respective hyperparameters for tuning.
models = {
    "RandomForest": (Pipeline([
        ('clf', RandomForestClassifier())
    ]), {
        'clf__n_estimators': [50, 100, 200],
        'clf__max_depth': [None, 10, 20]
    }),
    "SVM": (Pipeline([
        ('scaler', StandardScaler()),  # StandardScaler is used to normalize features for SVM
        ('clf', SVC())
    ]), {
        'clf__C': [0.1, 1, 10],
        'clf__kernel': ['linear', 'rbf']
    }),
    "GradientBoosting": (Pipeline([
        ('clf', GradientBoostingClassifier())
    ]), {
        'clf__n_estimators': [50, 100, 200],
        'clf__learning_rate': [0.01, 0.1, 0.2]
    })
}

best_models = {}

In [33]:
# Step 4: Perform Grid Search to find the best parameters
# This step will take some time as it runs cross-validation on multiple hyperparameters.
for model_name, (pipeline, param_grid) in models.items():
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

Best parameters for RandomForest: {'clf__max_depth': 20, 'clf__n_estimators': 100}
Best parameters for SVM: {'clf__C': 1, 'clf__kernel': 'rbf'}
Best parameters for GradientBoosting: {'clf__learning_rate': 0.01, 'clf__n_estimators': 200}


In [34]:
# Step 5: Evaluate all models to find the best performing one
best_model = None
best_score = 0

for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy of {model_name}: {acc:.4f}")
    if acc > best_score:
        best_score = acc
        best_model = (model_name, model)

print(f"\nBest Model: {best_model[0]} with accuracy {best_score:.4f}")

Accuracy of RandomForest: 0.7557
Accuracy of SVM: 0.8321
Accuracy of GradientBoosting: 0.8321

Best Model: SVM with accuracy 0.8321


In [35]:
# Step 6: Save the best model to a pickle file
# This pickle file is used for model inference in Flask API.
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model[1], file)