In [16]:
import pandas as pd

# Load the dataset
df = pd.read_csv('loan_approval_dataset.csv')

In [17]:
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [18]:
# Clean column names
df.columns = df.columns.str.strip()

In [19]:
df.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [20]:
df.isnull().sum()

loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

In [21]:
df.dtypes

loan_id                      int64
no_of_dependents             int64
education                   object
self_employed               object
income_annum                 int64
loan_amount                  int64
loan_term                    int64
cibil_score                  int64
residential_assets_value     int64
commercial_assets_value      int64
luxury_assets_value          int64
bank_asset_value             int64
loan_status                 object
dtype: object

In [22]:
df.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')

In [23]:
from scipy import stats

# Define the numerical features after cleaning column names
numerical_features = ['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 
                       'cibil_score', 'residential_assets_value', 'commercial_assets_value', 
                       'luxury_assets_value', 'bank_asset_value']

# Calculate Z-scores for numerical features
z_scores = stats.zscore(df[numerical_features])

In [24]:
import numpy as np

# Define the threshold
threshold = 3

# Find rows where any Z-score is greater than the threshold
outliers = np.any(np.abs(z_scores) > threshold, axis=1)

# Filter out the outliers
df_clean = df[~outliers]

# Verify the number of rows before and after outlier removal
print("Original number of rows:", len(df))
print("Number of rows after outlier removal:", len(df_clean))

Original number of rows: 4269
Number of rows after outlier removal: 4236


In [25]:
df_clean = df_clean.drop(columns=['loan_id'])

In [26]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label = LabelEncoder()

# Encode 'self_employed' feature
df_clean['self_employed'] = label.fit_transform(df_clean['self_employed'])
df_clean['education'] = label.fit_transform(df_clean['education'])
df_clean['loan_status'] = label.fit_transform(df_clean['loan_status'])

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

In [28]:
df_clean.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0
1,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1
2,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1
3,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,1
4,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,1


In [29]:
# Define numerical features
numerical_features = ['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 
                       'cibil_score', 'residential_assets_value', 'commercial_assets_value', 
                       'luxury_assets_value', 'bank_asset_value']

# Initialize StandardScaler
scaler = StandardScaler()

# Scale numerical features
df_clean[numerical_features] = scaler.fit_transform(df_clean[numerical_features])

# Separate features and target
X = df_clean.drop(columns=['loan_status'])
y = df_clean['loan_status']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Output the shapes of the train and test sets
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Training set size: 3388
Testing set size: 848


In [30]:
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [31]:
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,-0.293443,0,0,1.641471,1.657663,0.193909,1.032906,-0.780184,2.953092,0.846931,0.94722
1,-1.473344,1,1,-0.331231,-0.314382,-0.506645,-1.062326,-0.733037,-0.628655,-0.687689,-0.508285
2,0.296507,0,0,1.462135,1.63538,1.595019,-0.545773,-0.041541,-0.093719,2.017216,2.433693
3,0.296507,0,0,1.139329,1.746795,-0.506645,-0.772128,1.702912,-0.372816,0.913173,0.916252
4,1.476407,1,1,1.713206,1.022598,1.595019,-1.265465,0.791396,0.766831,1.586639,0.018174


In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the models
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier()
}

In [33]:
# Parameter grid for each model
param_grids = {
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'lbfgs']
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}

In [34]:
# Initialize GridSearchCV for each model
grid_searches = {}
for model_name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        cv=5,  # 5-fold cross-validation
        scoring='accuracy',  # You can use other metrics such as 'f1', 'roc_auc'
        n_jobs=-1,  # Use all available cores
        verbose=1  # Show progress
    )
    grid_searches[model_name] = grid_search

In [35]:
# Fit GridSearchCV for each model
best_estimators = {}
for model_name, grid_search in grid_searches.items():
    print(f"Training {model_name}...")
    grid_search.fit(X_train, y_train)
    best_estimators[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

Training LogisticRegression...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for LogisticRegression: {'C': 0.01, 'solver': 'liblinear'}
Training RandomForestClassifier...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters for RandomForestClassifier: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Training GradientBoostingClassifier...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters for GradientBoostingClassifier: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}


In [36]:
# Find the best model and its score
best_model_name = None
best_model = None
best_score = -float('inf')  # Initialize with a very low value

for model_name, grid_search in grid_searches.items():
    # Get the best score from the grid search
    score = grid_search.best_score_
    print(f"Cross-validation score for {model_name}: {score:.4f}")
    
    # Update the best model if this model has a higher score
    if score > best_score:
        best_score = score
        best_model_name = model_name
        best_model = grid_search.best_estimator_

print(f"Best model: {best_model_name} with cross-validation score: {best_score:.4f}")

Cross-validation score for LogisticRegression: 0.9256
Cross-validation score for RandomForestClassifier: 0.9823
Cross-validation score for GradientBoostingClassifier: 0.9858
Best model: GradientBoostingClassifier with cross-validation score: 0.9858


In [38]:
# Save the best model
if best_model:
    filename = f"{best_model_name.replace(' ', '_')}_best_model.pkl"
    joblib.dump(best_model, filename)
    print(f"Saved best model for {best_model_name} to {filename}")

Saved best model for GradientBoostingClassifier to GradientBoostingClassifier_best_model.pkl
