In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder,OrdinalEncoder
from sklearn.metrics import classification_report, accuracy_score

In [2]:
df=pd.read_csv("loan_approval_data.csv")
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,employment_type,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,12th,No,Salaried,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,12th,No,Salaried,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,8th,No,Salaried,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,8th,No,Salaried,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Graduate,No,Salaried,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
X = df.drop(['loan_id','loan_status'], axis=1)  # Features
y = df['loan_status']  # Target

In [4]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [5]:
Ordinal_Encoder= OrdinalEncoder()
categorical_cols = ['education', 'self_employed', 'employment_type']
X[categorical_cols] = Ordinal_Encoder.fit_transform(X[categorical_cols])


In [6]:
# Feature scaling for numerical columns
scaler = StandardScaler()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [9]:
model_dict={'Logistic Regression': LogisticRegression(max_iter=1000),
            'Random Forest': RandomForestClassifier(),
            'Gradient Boosting': GradientBoostingClassifier(),
            'SVM': SVC(),
            'Decision Tree': DecisionTreeClassifier(),
            'K-Nearest Neighbors': KNeighborsClassifier()
            }

best_model_name = None
best_model = None
best_accuracy = 0

for name,initialised_model in model_dict.items():
    model=initialised_model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    model_accuracy=accuracy_score(y_true=y_test,y_pred=y_pred)

    if model_accuracy>best_accuracy:
        best_accuracy=model_accuracy
        best_model=model
        best_model_name=name

        

In [10]:
hyper_param_dict = {'Logistic Regression': {'C': [0.01, 0.1, 1, 10, 100],
                                               'solver': ['liblinear', 'lbfgs']},

                       'Random Forest': {'n_estimators': [100, 200, 300],
                                         'max_depth': [10, 20, None],
                                         'min_samples_split': [2, 5, 10]},

                       'Gradient Boosting': {'n_estimators': [100, 200, 300],
                                             'learning_rate': [0.01, 0.1, 0.2],
                                             'max_depth': [3, 5, 10]},

                       'SVM': {'C': [0.1, 1, 10],
                               'kernel': ['linear', 'rbf'],
                               'gamma': ['scale', 'auto']},
                               
                       'K-Nearest Neighbors': {'n_neighbors': [3, 5, 7, 9],
                                               'weights': ['uniform', 'distance']},
                                               
                       'Decision Tree': {'max_depth': [10, 20, 30,40],
                                         'min_samples_split': [2, 5, 10]}
                       }

hyper_param=hyper_param_dict.get(best_model_name)

grid_search = GridSearchCV(estimator=best_model, param_grid=hyper_param, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)
tuned_model=grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [11]:
dict={'Best model name':[best_model_name] ,
      'Best accuracy':[best_accuracy]}

pd.DataFrame(dict)

Unnamed: 0,Best model name,Best accuracy
0,Random Forest,0.977752


In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

df=pd.read_csv("loan_approval_data.csv")

X = df.drop(['loan_id','loan_status'], axis=1)  # Features

numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
cat_columns=['education', 'self_employed', 'employment_type']


num_pipeline=Pipeline(steps=[
    ('StandardScalar',StandardScaler())
])



cat_pipeline=Pipeline(steps=[
    ('OrdinalEncoder',OrdinalEncoder()),
    ])


Preprocessing=ColumnTransformer(transformers=[
    ('cat_pipeline',cat_pipeline,cat_columns),
    ('num_pipeline',num_pipeline,numeric_columns)
    ], remainder='drop')

Preprocessing.fit(X)

In [13]:
def predict_file(file_path):
    dataframe=pd.read_csv(file_path)
    df=Preprocessing.transform(dataframe)
    pred=best_model.predict(df)
    pred=label_encoder.inverse_transform(pred)
    dataframe['predicted_loan_status']=pred

    return dataframe


def predict(dataframe):
    df=Preprocessing.transform(dataframe)
    pred=best_model.predict(df)
    pred=label_encoder.inverse_transform(pred)
    dataframe['predicted_loan_status']=pred

    return dataframe


In [14]:
df=predict_file('loan_approval_data.csv')
print("The accuracy score for the file is",accuracy_score(y_true=df['loan_status'],y_pred=df['predicted_loan_status']))
print("\n")
print("[The classification report for the data is]")
print("\n")
print(classification_report(y_true=df['loan_status'],y_pred=df['predicted_loan_status']))



The accuracy score for the file is 0.9878191613961115


[The classification report for the data is]


              precision    recall  f1-score   support

    Approved       0.99      0.99      0.99      2656
    Rejected       0.99      0.98      0.98      1613

    accuracy                           0.99      4269
   macro avg       0.99      0.99      0.99      4269
weighted avg       0.99      0.99      0.99      4269



In [15]:
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,employment_type,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,predicted_loan_status
0,1,2,12th,No,Salaried,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved,Approved
1,2,0,12th,No,Salaried,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected,Rejected
2,3,3,8th,No,Salaried,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected,Rejected
3,4,3,8th,No,Salaried,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected,Rejected
4,5,5,Graduate,No,Salaried,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected,Rejected


In [16]:
import pickle
with open('model.pkl', 'wb') as model_file:
    pickle.dump(tuned_model, model_file)

with open('preprocessing_object.pkl', 'wb') as preprocessor:
    pickle.dump(Preprocessing, preprocessor)

In [18]:
new_data={'loan_id': [1, 2],
 'no_of_dependents': [2, 0],
 'education': ['12th', '12th'],
 'self_employed': ['No', 'No'],
 'employment_type': ['Salaried', 'Salaried'],
 'income_annum': [9600000, 4100000],
 'loan_amount': [29900000, 12200000],
 'loan_term': [12, 8],
 'cibil_score': [778, 417],
 'residential_assets_value': [2400000, 2700000],
 'commercial_assets_value': [17600000, 2200000],
 'luxury_assets_value': [22700000, 8800000],
 'bank_asset_value': [8000000, 3300000],
 'loan_status': ['Approved', 'Rejected'],
 'predicted_loan_status': ['Approved', 'Rejected']}

In [19]:
new_data=pd.DataFrame(new_data)

predict(dataframe=new_data)



Unnamed: 0,loan_id,no_of_dependents,education,self_employed,employment_type,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,predicted_loan_status
0,1,2,12th,No,Salaried,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved,Approved
1,2,0,12th,No,Salaried,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected,Rejected
