In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
ds_train = pd.read_csv("train.csv")
ds_test = pd.read_csv("test.csv")

In [4]:
ds_train.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
ds_test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


# Data Manipulation Techniques

In [6]:
ds_train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [7]:
ds_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [8]:
ds_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [9]:
ds_train.shape

(614, 13)

In [10]:
ds_test.shape

(367, 12)

In [11]:
ds_train = ds_train.drop(columns='Loan_ID')

In [12]:
ds_test = ds_test.drop(columns='Loan_ID')

## 1. Handling missing values

In [13]:
ds_train.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [14]:
ds_test.isnull().sum()

Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [15]:
# replacing null values from numerical data to mean values 

ds_train['LoanAmount'] = ds_train['LoanAmount'].fillna(ds_train['LoanAmount'].mean())
ds_train['Loan_Amount_Term'] = ds_train['Loan_Amount_Term'].fillna(ds_train['Loan_Amount_Term'].mean())
ds_train['Credit_History'] = ds_train['Credit_History'].fillna(ds_train['Credit_History'].mean())

In [16]:
# replacing null values from numerical data to mode values 

ds_train['Gender'] = ds_train['Gender'].fillna(ds_train['Gender'].mode()[0])
ds_train['Married'] = ds_train['Married'].fillna(ds_train['Married'].mode()[0])
ds_train['Dependents'] = ds_train['Dependents'].fillna(ds_train['Dependents'].mode()[0])
ds_train['Self_Employed'] = ds_train['Self_Employed'].fillna(ds_train['Self_Employed'].mode()[0])


In [17]:
ds_train.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [18]:
# replacing null values from numerical data to mean values 

ds_test['LoanAmount'] = ds_test['LoanAmount'].fillna(ds_test['LoanAmount'].mean())
ds_test['Loan_Amount_Term'] = ds_test['Loan_Amount_Term'].fillna(ds_test['Loan_Amount_Term'].mean())
ds_test['Credit_History'] = ds_test['Credit_History'].fillna(ds_test['Credit_History'].mean())

In [19]:
# replacing null values from numerical data to mode values 

ds_test['Gender'] = ds_test['Gender'].fillna(ds_test['Gender'].mode()[0])
ds_test['Married'] = ds_test['Married'].fillna(ds_test['Married'].mode()[0])
ds_test['Dependents'] = ds_test['Dependents'].fillna(ds_test['Dependents'].mode()[0])
ds_test['Self_Employed'] = ds_test['Self_Employed'].fillna(ds_test['Self_Employed'].mode()[0])


In [20]:
ds_test.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [22]:
ds_train.duplicated().sum()

0

In [23]:
ds_test.duplicated().sum()

1

In [24]:
ds_test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,0.825444,Urban
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [26]:
ds_test = ds_test.drop_duplicates()

In [28]:
ds_test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,0.825444,Urban
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [29]:
ds_test.duplicated().sum()

0

## 2. Encoding categorical values

In [30]:
print(type(ds_train))

<class 'pandas.core.frame.DataFrame'>


In [31]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [32]:
label_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']

# Initialize LabelEncoders
label_encoders = {}

# Apply Label Encoding
for column in label_columns:
    le = LabelEncoder()
    ds_train[column] = le.fit_transform(ds_train[column].astype(str))
    label_encoders[column] = le

In [33]:
ds_train

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
1,1,1,1,0,0,4583,1508.0,128.000000,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.000000,360.0,1.0,2,1
3,1,1,0,1,0,2583,2358.0,120.000000,360.0,1.0,2,1
4,1,0,0,0,0,6000,0.0,141.000000,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,2900,0.0,71.000000,360.0,1.0,0,1
610,1,1,3+,0,0,4106,0.0,40.000000,180.0,1.0,0,1
611,1,1,1,0,0,8072,240.0,253.000000,360.0,1.0,2,1
612,1,1,2,0,0,7583,0.0,187.000000,360.0,1.0,2,1


In [34]:
ds_train = ds_train[~ds_train.apply(lambda row: row.astype(str).str.contains('3+', regex=False).any(), axis=1)]

In [35]:
ds_train

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
1,1,1,1,0,0,4583,1508.0,128.000000,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.000000,360.0,1.0,2,1
3,1,1,0,1,0,2583,2358.0,120.000000,360.0,1.0,2,1
4,1,0,0,0,0,6000,0.0,141.000000,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
608,1,1,0,0,0,3232,1950.0,108.000000,360.0,1.0,0,1
609,0,0,0,0,0,2900,0.0,71.000000,360.0,1.0,0,1
611,1,1,1,0,0,8072,240.0,253.000000,360.0,1.0,2,1
612,1,1,2,0,0,7583,0.0,187.000000,360.0,1.0,2,1


In [36]:
label_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

# Initialize LabelEncoders
label_encoders = {}

# Apply Label Encoding
for column in label_columns:
    le = LabelEncoder()
    ds_test[column] = le.fit_transform(ds_test[column].astype(str))
    label_encoders[column] = le

In [37]:
ds_test

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,0,5720,0,110.0,360.0,1.000000,2
1,1,1,1,0,0,3076,1500,126.0,360.0,1.000000,2
2,1,1,2,0,0,5000,1800,208.0,360.0,1.000000,2
3,1,1,2,0,0,2340,2546,100.0,360.0,0.825444,2
4,1,0,0,1,0,3276,0,78.0,360.0,1.000000,2
...,...,...,...,...,...,...,...,...,...,...,...
362,1,1,3+,1,1,4009,1777,113.0,360.0,1.000000,2
363,1,1,0,0,0,4158,709,115.0,360.0,1.000000,2
364,1,0,0,0,0,3250,1993,126.0,360.0,0.825444,1
365,1,1,0,0,0,5000,2393,158.0,360.0,1.000000,0


In [38]:
ds_test = ds_test[~ds_test.apply(lambda row: row.astype(str).str.contains('3+', regex=False).any(), axis=1)]

In [39]:
ds_test

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,0,5720,0,110.0,360.0,1.000000,2
1,1,1,1,0,0,3076,1500,126.0,360.0,1.000000,2
2,1,1,2,0,0,5000,1800,208.0,360.0,1.000000,2
3,1,1,2,0,0,2340,2546,100.0,360.0,0.825444,2
4,1,0,0,1,0,3276,0,78.0,360.0,1.000000,2
...,...,...,...,...,...,...,...,...,...,...,...
361,1,1,1,0,0,2269,2167,99.0,360.0,1.000000,1
363,1,1,0,0,0,4158,709,115.0,360.0,1.000000,2
364,1,0,0,0,0,3250,1993,126.0,360.0,0.825444,1
365,1,1,0,0,0,5000,2393,158.0,360.0,1.000000,0


## Feature Scaling

In [40]:
from sklearn.preprocessing import StandardScaler

In [41]:
#Select the columns to be scaled
#columns_to_scale = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

# Initialize the scaler
#scaler = StandardScaler()

# Fit the scaler on the selected columns and transform them
#ds_train[columns_to_scale] = scaler.fit_transform(ds_train[columns_to_scale])

In [42]:
# Using Max Min Scaler

#from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
#scaler = MinMaxScaler()

# Fit the scaler on the selected columns and transform them
#ds_train[columns_to_scale] = scaler.fit_transform(ds_train[columns_to_scale])


In [43]:
ds_train

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
1,1,1,1,0,0,4583,1508.0,128.000000,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.000000,360.0,1.0,2,1
3,1,1,0,1,0,2583,2358.0,120.000000,360.0,1.0,2,1
4,1,0,0,0,0,6000,0.0,141.000000,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
608,1,1,0,0,0,3232,1950.0,108.000000,360.0,1.0,0,1
609,0,0,0,0,0,2900,0.0,71.000000,360.0,1.0,0,1
611,1,1,1,0,0,8072,240.0,253.000000,360.0,1.0,2,1
612,1,1,2,0,0,7583,0.0,187.000000,360.0,1.0,2,1


## Split the train dataset into training and validation sets

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
X = ds_train.drop(columns='Loan_Status')
y = ds_train['Loan_Status']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [46]:
X_train

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
354,0,1,0,0,0,2423,505.0,130.000000,360.0,1.0,1
518,1,0,0,0,0,4683,1915.0,185.000000,360.0,1.0,1
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2
12,1,1,2,0,0,3073,8106.0,200.000000,360.0,1.0,2
314,1,1,0,0,0,2473,1843.0,159.000000,360.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
77,1,1,1,0,1,1000,3022.0,110.000000,360.0,1.0,2
115,1,1,1,0,0,14583,0.0,185.000000,180.0,1.0,0
293,0,0,0,0,0,5417,0.0,143.000000,480.0,0.0,2
474,1,0,2,0,0,5532,4648.0,162.000000,360.0,1.0,0


In [47]:
X_val

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
273,1,1,0,0,0,2620,2223.0,150.000000,360.0,1.0,1
566,1,0,0,0,0,3333,0.0,70.000000,360.0,1.0,2
291,1,1,2,0,0,4400,0.0,127.000000,360.0,0.0,1
57,1,1,0,0,0,3366,2200.0,135.000000,360.0,1.0,0
597,1,0,0,0,0,2987,0.0,88.000000,360.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
96,0,1,0,0,0,2484,2302.0,137.000000,360.0,1.0,1
349,1,1,0,0,0,2625,6250.0,187.000000,360.0,1.0,0
389,1,1,0,0,0,5488,0.0,125.000000,360.0,1.0,0
35,1,1,0,0,0,2275,2067.0,146.412162,360.0,1.0,2


## Train a model

In [48]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [49]:
print(X_train.head())
print(X_train.dtypes)
print(y_train.head())
print(y_train.dtypes)

     Gender  Married Dependents  Education  Self_Employed  ApplicantIncome  \
354       0        1          0          0              0             2423   
518       1        0          0          0              0             4683   
0         1        0          0          0              0             5849   
12        1        1          2          0              0             3073   
314       1        1          0          0              0             2473   

     CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
354              505.0  130.000000             360.0             1.0   
518             1915.0  185.000000             360.0             1.0   
0                  0.0  146.412162             360.0             1.0   
12              8106.0  200.000000             360.0             1.0   
314             1843.0  159.000000             360.0             1.0   

     Property_Area  
354              1  
518              1  
0                2  
12            

In [50]:
# Validate the model
from sklearn.metrics import accuracy_score

y_val_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))

Validation Accuracy: 0.7610619469026548


In [51]:
y_test_pred = model.predict(ds_test)

In [52]:
y_test_pred

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [53]:
#y_test = ds_test['Loan_Status']
#print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

In [54]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for RandomForest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Use the best model from GridSearch
best_model = grid_search.best_estimator_


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.8155555555555555


In [55]:
importances = model.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print(importance_df.sort_values(by='Importance', ascending=False))


              Feature  Importance
9      Credit_History    0.266817
5     ApplicantIncome    0.196151
7          LoanAmount    0.188981
6   CoapplicantIncome    0.115852
10      Property_Area    0.055206
8    Loan_Amount_Term    0.044915
2          Dependents    0.043378
3           Education    0.025458
1             Married    0.024064
4       Self_Employed    0.021088
0              Gender    0.018089
