# Loan Prediction


In [1]:
# 1. Project Setup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Notebook-wide display defaults: seaborn's white-grid theme for plots, and no
# cap on the number of DataFrame columns pandas will show.
sns.set(style='whitegrid')
pd.set_option('display.max_columns', None)

In [3]:
# 2. Load Data
# Read the training CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
df = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [8]:
# Column dtypes and non-null counts.
# NOTE(review): the saved output reports 614 non-null values for Gender, Married,
# Dependents and Self_Employed because the 'unknown' fill cell further down was
# executed first (In [7] vs. In [8]); a fresh top-to-bottom run may differ.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             614 non-null    object 
 2   Married            614 non-null    object 
 3   Dependents         614 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      614 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [7]:
# Missing values in these categorical fields become an explicit 'unknown' level.
columns_tofill = ['Gender', 'Married', 'Self_Employed', 'Dependents']
for column in columns_tofill:
    df[column] = df[column].fillna('unknown')

In [6]:
# List the column names of the loaded frame.
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [18]:
# 3. Initial Checks and data cleaning
# Count missing values per column.
# NOTE(review): the saved output is all zeros because this cell was last run
# (In [18]) after the imputation cells below (In [12]-In [14]). On a fresh
# top-to-bottom run LoanAmount, Loan_Amount_Term and Credit_History would still
# contain missing values at this point.
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [12]:
# Impute missing loan amounts with the column median.
loan_amount_median = df['LoanAmount'].median()
df['LoanAmount'] = df['LoanAmount'].fillna(loan_amount_median)


In [13]:
# Impute missing loan terms with the most frequent term
# (mode() can return several values; the first one is used).
most_common_term = df['Loan_Amount_Term'].mode()[0]
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(most_common_term)


In [14]:
# Flag missing credit history with the sentinel value -1.
# NOTE(review): Credit_History is later standardized and passed to logistic
# regression as a numeric feature, so -1 is treated as a value below 0 rather
# than as a separate "missing" category - confirm this is intended.
missing_credit_history = -1
df['Credit_History'] = df['Credit_History'].fillna(missing_credit_history)


In [16]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [17]:
# Number of distinct values in each column.
df.nunique()

Loan_ID              614
Gender                 3
Married                3
Dependents             5
Education              2
Self_Employed          3
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         3
Property_Area          3
Loan_Status            2
dtype: int64

In [26]:
# Convert Loan_Status to numeric (target variable)
# The mapping also accepts the already-encoded values 1/0, so re-running this
# cell leaves the column unchanged instead of turning every label into NaN.
status_codes = {'Y': 1, 'N': 0, 1: 1, 0: 0}
df['Loan_Status'] = df['Loan_Status'].map(status_codes)
# Any label outside the mapping becomes NaN; fail loudly rather than train on it.
assert df['Loan_Status'].notna().all(), "Loan_Status contains unexpected labels"


In [None]:
# Collect the object-dtype columns to one-hot encode, skipping the row
# identifier and the target.
excluded_columns = ('Loan_ID', 'Loan_Status')
cat_columns = [
    name
    for name in df.select_dtypes(include='object').columns.tolist()
    if name not in excluded_columns
]


In [36]:
# 4. Feature Engineering
# Loan_ID is a row identifier (614 distinct values in 614 rows), not a feature.
# Drop it here so no string column reaches the scaler/model on a fresh run;
# errors='ignore' keeps the cell working if the column is already gone.
df = df.drop(columns=['Loan_ID'], errors='ignore')
# One-hot encode the categorical columns; drop_first removes one level per
# column so the dummies are not perfectly collinear.
df = pd.get_dummies(df, columns=cat_columns, drop_first=True)
df

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Male,Gender_unknown,Married_Yes,Married_unknown,Dependents_1,Dependents_2,Dependents_3+,Dependents_unknown,Education_Not Graduate,Self_Employed_Yes,Self_Employed_unknown,Property_Area_Semiurban,Property_Area_Urban
0,5849,0.0,128.0,360.0,1.0,1,True,False,False,False,False,False,False,False,False,False,False,False,True
1,4583,1508.0,128.0,360.0,1.0,0,True,False,True,False,True,False,False,False,False,False,False,False,False
2,3000,0.0,66.0,360.0,1.0,1,True,False,True,False,False,False,False,False,False,True,False,False,True
3,2583,2358.0,120.0,360.0,1.0,1,True,False,True,False,False,False,False,False,True,False,False,False,True
4,6000,0.0,141.0,360.0,1.0,1,True,False,False,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,2900,0.0,71.0,360.0,1.0,1,False,False,False,False,False,False,False,False,False,False,False,False,False
610,4106,0.0,40.0,180.0,1.0,1,True,False,True,False,False,False,True,False,False,False,False,False,False
611,8072,240.0,253.0,360.0,1.0,1,True,False,True,False,True,False,False,False,False,False,False,False,True
612,7583,0.0,187.0,360.0,1.0,1,True,False,True,False,False,True,False,False,False,False,False,False,True


In [37]:
# Inspect the column dtypes after one-hot encoding.
df.dtypes

ApplicantIncome              int64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
Loan_Status                  int64
Gender_Male                   bool
Gender_unknown                bool
Married_Yes                   bool
Married_unknown               bool
Dependents_1                  bool
Dependents_2                  bool
Dependents_3+                 bool
Dependents_unknown            bool
Education_Not Graduate        bool
Self_Employed_Yes             bool
Self_Employed_unknown         bool
Property_Area_Semiurban       bool
Property_Area_Urban           bool
dtype: object

In [39]:
# 5. Train-Test Split

# The target is the encoded Loan_Status column; every other column is a feature.
y = df['Loan_Status']
X = df.drop(columns=['Loan_Status'])

In [45]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [46]:
# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so no test-set statistics are used
# during fitting. (StandardScaler is imported in the setup cell.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [47]:
# 6. Modeling
# Fit a logistic regression on the standardized training features (fit()
# returns the fitted estimator), then predict labels for the test split.
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [49]:
# 7. Evaluation
# accuracy_score, classification_report and confusion_matrix are already
# imported in the setup cell, so they are not re-imported here.
# In the confusion matrix, rows are actual classes and columns are predicted
# classes, ordered 0 (not approved) then 1 (approved).
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.6991869918699187

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.24      0.33        38
           1       0.73      0.91      0.81        85

    accuracy                           0.70       123
   macro avg       0.63      0.57      0.57       123
weighted avg       0.67      0.70      0.66       123


Confusion Matrix:
 [[ 9 29]
 [ 8 77]]


## 10. Conclusion
- Summarize key findings
- Mention model performance
- Suggest next steps or improvements

Precision: 53% of the loans predicted as 'not approved' were actually not approved, and 73% of the loans predicted as 'approved' were actually approved.

Recall: the model identified only 24% of the actual 'not approved' loans, but it caught 91% of the actual 'approved' loans.

The F1 score for 'not approved' loans is 33%, which is poor. The F1 score for 'approved' loans is 81%, which is good.

Confusion matrix: there were 8 false negatives (predicted as 'not approved' but actually approved).
There were 29 false positives (predicted as 'approved' but actually not approved).

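This means that the model is biased towards predicting 'approved'. The classes are imbalanced (85 approved vs. 38 not approved loans in the test set), so the 70% accuracy is barely better than the 69% obtained by always predicting 'approved'.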