In [2]:
# ----------------------------------------------------------------------------
# Title: Assignment 9.2
# Author: Surenther Selvaraj
# Date: 05 Nov 2025
# Modified By: Surenther Selvaraj
# Description: Best Model Selection and Hyperparameter Tuning
# Data: https://www.kaggle.com/datasets/granjithkumar/loan-approval-data-set
# ----------------------------------------------------------------------------

### Import the dataset and ensure that it loaded properly.

In [3]:
import pandas as pd
import os

# Name of the CSV to load. A bare file name is resolved against the current working directory,
# so adjust it if the data lives somewhere else.
file_name = 'Loan_Train.csv'

# Start from "nothing loaded"; only a successful read (and preview) leaves a DataFrame in df.
df = None

if os.path.exists(file_name):
    # The file is present: try to parse it and show a quick preview.
    try:
        df = pd.read_csv(file_name)
        print(f"Dataset '{file_name}' imported successfully from local file.")

        # First five rows, as a sanity check that the parse produced the expected columns.
        print("\nHead (First 5 Rows):\n", df.head())

    except Exception as e:
        # Best-effort load: report the problem and fall back to df = None instead of raising.
        print(f"An error occurred while reading the CSV file: {e}")
        df = None
else:
    # The file is missing: tell the reader where it was expected.
    print(f"Error: The file '{file_name}' was not found in the current directory.")
    print("Please ensure the file is in the same location as your script or notebook.")

Dataset 'Loan_Train.csv' imported successfully from local file.

Head (First 5 Rows):
     Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0        

### Data Preparation for Modeling

In [4]:
# Prepare the loaded loan data for modelling: remove the identifier column, discard
# incomplete rows, and one-hot encode the categorical columns into df_model.

# Drop the column "Loan_ID"
# This is an identifier and is not useful for model training.
# errors='ignore' keeps the cell re-runnable: df is reassigned here, so on a second
# execution the column is already gone and an unconditional drop would raise KeyError.
print("Shape before dropping Loan_ID:", df.shape)
df = df.drop(columns=['Loan_ID'], errors='ignore')
print("Shape after dropping Loan_ID:", df.shape)

# Drop any rows with missing data (Imputation is skipped for this exercise)
print("\nNumber of rows before dropping missing values:", len(df))
df = df.dropna()
print("Number of rows after dropping missing values:", len(df))

# Convert the categorical features into dummy variables (One-Hot Encoding)
# pandas get_dummies automatically identifies and converts object/category types.
# drop_first=True removes the first level of every encoded column; the target is therefore
# expected to come out as the single column 'Loan_Status_Y' that the split cell selects.
print("\nShape before creating dummy variables:", df.shape)
df_model = pd.get_dummies(df, drop_first=True)
print("Shape after creating dummy variables:", df_model.shape)

# Display the first few rows of the final prepared DataFrame
print("\n--- Final Prepared DataFrame (df_model) Head ---")
print(df_model.head())

Shape before dropping Loan_ID: (614, 13)
Shape after dropping Loan_ID: (614, 12)

Number of rows before dropping missing values: 614
Number of rows after dropping missing values: 480

Shape before creating dummy variables: (480, 12)
Shape after creating dummy variables: (480, 15)

--- Final Prepared DataFrame (df_model) Head ---
   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   
5             5417             4196.0       267.0             360.0   

   Credit_History  Gender_Male  Married_Yes  Dependents_1  Dependents_2  \
1             1.0         True         True          True         False   
2             1.0         True         True         False         False   
3             1.0

### Split the data into a training and test set, where the “Loan_Status” column is the target.

In [5]:
from sklearn.model_selection import train_test_split

# Name of the one-hot encoded target column in df_model.
target_column = 'Loan_Status_Y'

# Features are every column except the target; the target is that single column.
X = df_model.drop(columns=[target_column])
y = df_model[target_column]

# Hold out 20% of the rows for testing. The fixed random_state makes the split repeatable,
# and stratify=y keeps the class proportions of the target the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report the shapes of the full data and of each split part to verify the split.
print(
    "--- Data Split Verification ---",
    f"Total features (X) shape: {X.shape}",
    f"Total target (y) shape: {y.shape}",
    "-" * 35,
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

--- Data Split Verification ---
Total features (X) shape: (480, 14)
Total target (y) shape: (480,)
-----------------------------------
X_train shape: (384, 14)
X_test shape: (96, 14)
y_train shape: (384,)
y_test shape: (96,)


### Create a pipeline with a min-max scaler and a KNN classifier

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

# Two-stage pipeline, applied in this order:
#   'scaler' -> MinMaxScaler
#   'knn'    -> KNeighborsClassifier (using default k=5)
# The step names matter: later parameter grids address the classifier as 'knn__<param>'.
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier()),
])

# Show the assembled (still unfitted) pipeline so the steps can be confirmed.
print("--- Pipeline Structure ---", pipeline, 28 * "-", sep="\n")
print("Pipeline created successfully. It is ready for training (fitting).")

--- Pipeline Structure ---
Pipeline(steps=[('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier())])
----------------------------
Pipeline created successfully. It is ready for training (fitting).


### Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10

In [7]:
# NOTE(review): this cell sits under the "Create a search space" heading but does not build
# one; it trains and scores the default pipeline. The KNN grid itself is defined in the
# grid-search cell further down.

# Train the scaler + KNN pipeline on the training split.
print("Fitting the KNN Pipeline to the training data (X_train, y_train)...")
pipeline.fit(X_train, y_train)
print("Pipeline fitting complete.")

# Score on the held-out split; for a classifier, score() reports accuracy.
accuracy = pipeline.score(X_test, y_test)

# Report the accuracy, rounded to four decimals.
print(f"\n--- Model Performance on Test Set ---\nModel Accuracy on the Test Set: {accuracy:.4f}")

Fitting the KNN Pipeline to the training data (X_train, y_train)...
Pipeline fitting complete.

--- Model Performance on Test Set ---
Model Accuracy on the Test Set: 0.7083


### Fit a default KNN classifier to the data with this pipeline.

In [8]:
# Baseline: the default KNN pipeline, trained on the training split and scored on the test split.
# NOTE(review): these statements repeat the preceding cell exactly, so the pipeline is simply
# refitted on the same data and the same accuracy is reported again.
print("Fitting the KNN Pipeline to the training data (X_train, y_train)...")
pipeline.fit(X_train, y_train)
print("Pipeline fitting complete.")

# Pipeline.score runs the test rows through the fitted steps; for classification it is accuracy.
accuracy = pipeline.score(X_test, y_test)

# Report the accuracy
print(
    "\n--- Model Performance on Test Set ---",
    f"Model Accuracy on the Test Set: {accuracy:.4f}",
    sep="\n",
)

Fitting the KNN Pipeline to the training data (X_train, y_train)...
Pipeline fitting complete.

--- Model Performance on Test Set ---
Model Accuracy on the Test Set: 0.7083


### Find the accuracy of the grid search best model on the test set.

In [9]:
from sklearn.model_selection import GridSearchCV, ParameterGrid

# Tune the 'knn' step of the pipeline with an exhaustive, cross-validated grid search and
# then score the winning configuration on the held-out test set.

# Define the parameter grid (keys use the '<step name>__<parameter>' convention).
# NOTE(review): the earlier notebook heading asks for n_neighbors from 1 to 10; this grid
# searches 1-20 and also varies the weighting scheme -- confirm that the wider search is intended.
param_grid = {
    'knn__n_neighbors': range(1, 21),  # Test k from 1 to 20
    'knn__weights': ['uniform', 'distance'] # Test uniform and distance weighting
}

# Number of parameter combinations, derived from the grid itself so the progress message
# below cannot go stale when the grid is edited (it used to be a hard-coded 40).
n_candidates = len(ParameterGrid(param_grid))

# Initialize GridSearchCV
# cv=5 means 5-fold cross-validation will be used on the training data.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1  # Use all available cores
)

# Fit the grid search to the training data
# This step performs the cross-validation and hyperparameter tuning.
print(f"Starting Grid Search (testing {n_candidates} different models)...")
grid_search.fit(X_train, y_train)
print("Grid Search complete.")

# Report the Best Parameters found
print("\n--- Grid Search Results ---")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
print(f"Best parameters found: {grid_search.best_params_}")

# Evaluate the best estimator on the test set
# The 'best_estimator_' attribute is the fitted pipeline with the optimal parameters.
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)

# Report the accuracy of the best model on the test set
print("\n--- Best Model Performance on Test Set ---")
print(f"Accuracy of the Grid Search Best Model on the Test Set: {test_accuracy:.4f}")

Starting Grid Search (testing 40 different models)...
Grid Search complete.

--- Grid Search Results ---
Best cross-validation accuracy: 0.7604
Best parameters found: {'knn__n_neighbors': 20, 'knn__weights': 'distance'}

--- Best Model Performance on Test Set ---
Accuracy of the Grid Search Best Model on the Test Set: 0.7188


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# --- Define the Multi-Model Pipeline ---
# LogisticRegression only fills the 'classifier' slot here; every grid below supplies its
# own estimator for that step, so the search swaps the model in and out.
pipeline_multi = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('classifier', LogisticRegression()),
])

# --- Define the Expanded Parameter Grids ---
# One dict per model family. The 'classifier' entry picks the estimator and the
# 'classifier__<param>' entries list the values to try for it.

# Grid 1: K-Nearest Neighbors (KNN) -- k from 1 to 20, both weighting schemes
knn_params = dict(
    classifier=[KNeighborsClassifier()],
    classifier__n_neighbors=range(1, 21),
    classifier__weights=['uniform', 'distance'],
)

# Grid 2: Logistic Regression (LogReg)
# Based on Section 12.3: L1/L2 penalties and 10 log-spaced C values from 0.0001 to 10000
logreg_params = dict(
    classifier=[LogisticRegression(random_state=42, solver='liblinear')],
    classifier__penalty=['l1', 'l2'],
    classifier__C=np.logspace(-4, 4, num=10),
)

# Grid 3: Random Forest (RF)
# Based on Section 12.3: n_estimators, max_features, max_depth
forest_params = dict(
    classifier=[RandomForestClassifier(random_state=42)],
    classifier__n_estimators=[10, 50, 100],
    classifier__max_features=[1, 2, 'sqrt'],
    classifier__max_depth=[None, 5, 10],
)

# A list of grids makes the search explore each dict's combinations independently.
param_grids = [knn_params, logreg_params, forest_params]

# --- Initialize and Run Grid Search ---
# 5-fold cross-validated accuracy over every candidate, using all available cores.
grid_search_multi = GridSearchCV(
    estimator=pipeline_multi,
    param_grid=param_grids,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

print("Starting combined Grid Search across KNN, Logistic Regression, and Random Forest...")
# X_train / y_train come from the earlier train/test split cell.
grid_search_multi.fit(X_train, y_train)
print("Grid Search complete.")

# --- Report Results ---

# Class name of the winning estimator sitting in the 'classifier' step.
winning_classifier = grid_search_multi.best_estimator_.named_steps['classifier']
best_model_name = winning_classifier.__class__.__name__

print(
    "\n--- Combined Grid Search Results ---",
    f"Best Model Found: {best_model_name}",
    f"Best Cross-Validation Accuracy: {grid_search_multi.best_score_:.4f}",
    f"Best Parameters Found: {grid_search_multi.best_params_}",
    sep="\n",
)

# --- Report Accuracy on the Test Set ---
# Calling score() on the fitted search evaluates the test set with the best model it found.
test_accuracy = grid_search_multi.score(X_test, y_test)

print(
    "\n--- Best Model Performance on Test Set ---",
    f"Accuracy of the Grid Search Best Model on the Test Set: {test_accuracy:.4f}",
    sep="\n",
)

Starting combined Grid Search across KNN, Logistic Regression, and Random Forest...
Grid Search complete.

--- Combined Grid Search Results ---
Best Model Found: LogisticRegression
Best Cross-Validation Accuracy: 0.8020
Best Parameters Found: {'classifier': LogisticRegression(random_state=42, solver='liblinear'), 'classifier__C': np.float64(0.046415888336127774), 'classifier__penalty': 'l1'}

--- Best Model Performance on Test Set ---
Accuracy of the Grid Search Best Model on the Test Set: 0.8333


💡 Key Observations

The primary observation is the significant impact of model selection and hyperparameter tuning on predictive performance:

    Scaling and KNN: Scaling is essential for a distance-based model, but on its own it yielded only modest performance with the default KNN model (70.83% test accuracy). Tuning the KNN parameters resulted in only a minor improvement (+1.05 percentage points, to 71.88%), indicating that the KNN algorithm is not the strongest fit for this dataset.

    Superior Performance of Logistic Regression: The expanded grid search highlighted that the Logistic Regression model, even within a vast multi-model search space, was the best choice. It achieved the highest test accuracy of 83.33%, a substantial jump of over 11 percentage points from the tuned KNN model.

    Optimal Hyperparameters: The best model utilized Logistic Regression with penalty='l1' and a specific C value (approximately 0.046), suggesting that a degree of regularization (specifically L1, which helps with feature selection) is crucial for achieving peak performance on this loan approval data. This confirms that selecting the right algorithm is often more important than fine-tuning a sub-optimal one.