In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

In [2]:
# Load the dataset
data = pd.read_csv('loan_approval_dataset.csv')

In [3]:
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
data.shape

(4269, 13)

# Task 1: Data Preprocessing

### a) Clean the dataset by handling missing values and removing outliers


In [5]:
# Check for missing values
data.isnull().sum()

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

In [6]:
# Removing outliers using z-score
data = data[(np.abs(zscore(data.select_dtypes(include=[np.number]))) < 3).all(axis=1)]

In [7]:
data.shape

(4236, 13)

In [10]:
data.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

### b) Perform feature scaling or normalization


In [11]:
# Feature scaling
scaler = StandardScaler()
numerical_features = [' income_annum', ' loan_amount', ' loan_term', ' cibil_score', 
                      ' residential_assets_value', ' commercial_assets_value', 
                      ' luxury_assets_value', ' bank_asset_value']

In [12]:
data[numerical_features] = scaler.fit_transform(data[numerical_features])

### c) Encode categorical variables appropriately


In [14]:
# Encoding categorical variables
label_encoder = LabelEncoder()
data[' education'] = label_encoder.fit_transform(data[' education'])
data[' self_employed'] = label_encoder.fit_transform(data[' self_employed'])
data[' loan_status'] = label_encoder.fit_transform(data[' loan_status'])

### d) Split the dataset into training and testing sets

In [16]:
# Splitting the dataset
X = data.drop(' loan_status', axis=1)
y = data[' loan_status']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Task 2: Model Building with Hyperparameter Tuning


## Random forest is selected

### b) Implement hyperparameter tuning by conducting a grid search or random search


In [20]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [21]:
# Create the model
rf = RandomForestClassifier(random_state=42)

In [22]:
# Implement grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [23]:
# Display the best parameters
print(f"Best parameters: {grid_search.best_params_}")

Best parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


### c) Build the classification model using the training data


In [24]:
# Build the model with the best parameters
best_rf = grid_search.best_estimator_

In [25]:
# Train the model
best_rf.fit(X_train, y_train)

In [26]:
# Display the model
print(best_rf)

RandomForestClassifier(max_depth=10, random_state=42)


# Task 3: Model Evaluation and Selection


### a) Calculate and analyze the confusion matrix


In [28]:
# Predictions
y_pred = best_rf.predict(X_test)

In [29]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[747  18]
 [ 18 488]]


### b) Evaluate the performance of the classification model using appropriate metrics

In [31]:
# Classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       765
           1       0.96      0.96      0.96       506

    accuracy                           0.97      1271
   macro avg       0.97      0.97      0.97      1271
weighted avg       0.97      0.97      0.97      1271



### c) Implement k-fold cross-validation to assess the model's generalization performance


In [33]:
# K-fold cross-validation
cv_scores = cross_val_score(best_rf, X, y, cv=5)

In [34]:
# Display the cross-validation scores
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

Cross-validation scores: [0.96698113 0.97638725 0.98229044 0.97874852 0.95395514]
Mean cross-validation score: 0.9716724956004544


## d) Select the best-performing classification model


###### Random Forest model will be selected due to its high performance and generalization capabilities.