In [78]:
# IMPORT LIBRIES
import pandas as pd 
import numpy as np 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.impute import KNNImputer

In [79]:
# IMPORT DATASET
df = pd.read_csv('https://raw.githubusercontent.com/rohitmande-inttrvu/finance_loan_approval/refs/heads/main/Finance.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [80]:
# DROP LOAD_ID
df.drop(columns=['Loan_ID'],inplace=True)

In [81]:
df['Loan_Status'].replace({'Y':1,'N':0},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Loan_Status'].replace({'Y':1,'N':0},inplace=True)
  df['Loan_Status'].replace({'Y':1,'N':0},inplace=True)


In [82]:
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [83]:
# CONFIGURE EXPERIMENT
import mlflow
import dagshub
dagshub.init(repo_owner='yogibaba7', repo_name='loan_approval_prediction', mlflow=True)
# set tracking uri
mlflow.set_tracking_uri('https://dagshub.com/yogibaba7/loan_approval_prediction.mlflow/')
# create a experiment
mlflow.set_experiment('exp2_simpleimpute_vs_advimpute')

<Experiment: artifact_location='mlflow-artifacts:/7edc41d916ad4b2e8559ead4b81900ab', creation_time=1744450779113, experiment_id='1', last_update_time=1744450779113, lifecycle_stage='active', name='exp2_simpleimpute_vs_advimpute', tags={}>

In [84]:
with mlflow.start_run(nested=True,description='on numerical columns knnimputer and on categorical cols most_frequent imputer'):
    X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Loan_Status']),df['Loan_Status'],test_size=0.2,random_state=42)
    # CATEGORICAL COLUMNS AND NUMERICAL COLUMNS
    cat_cols = []
    num_cols = []
    for col in X_train.columns:
        if df[col].nunique()<5:
            cat_cols.append(col)
        else:
            num_cols.append(col)
    print(f"Categorical cols : {cat_cols}")
    print(f"Numerical cols : {num_cols}")

    # numerical imputer
    knnimputer = KNNImputer()
    X_train[num_cols] = knnimputer.fit_transform(X_train[num_cols])
    X_test[num_cols] = knnimputer.transform(X_test[num_cols])
    # categorical imputer
    si = SimpleImputer(strategy='most_frequent')
    X_train[cat_cols] = si.fit_transform(X_train[cat_cols])
    X_test[cat_cols] = si.transform(X_test[cat_cols])   
    # encoding
    oe = OrdinalEncoder()
    X_train[cat_cols] = oe.fit_transform(X_train[cat_cols])
    X_test[cat_cols] = oe.transform(X_test[cat_cols])


    # log imputer 
    mlflow.log_param('imputer','KNNImputer')

    # model training
    lr = LogisticRegression()
    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)

    # log model
    mlflow.sklearn.log_model(lr,'LogisticRegression')

    # model evaluations
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)

    # log metrics 
    mlflow.log_metric('accuracy',accuracy)
    mlflow.log_metric('precision',precision)
    mlflow.log_metric('recall',recall)
    mlflow.log_metric('f1',f1)

    # Save and log the notebook
    import os
    notebook_path = "exp1_baseline_model.ipynb"
    os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
    mlflow.log_artifact(notebook_path)

    
    print(f"accuracy score : {accuracy}")
    print(f"precision score : {precision}")
    print(f"recall score : {recall}")
    print(f"f1 score : {f1}")     
    print("---------------------------------------------------------")



Categorical cols : ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']
Numerical cols : ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy score : 0.7886178861788617
precision score : 0.7596153846153846
recall score : 0.9875
f1 score : 0.8586956521739131
---------------------------------------------------------
🏃 View run incongruous-frog-904 at: https://dagshub.com/yogibaba7/loan_approval_prediction.mlflow/#/experiments/1/runs/546017419bb94cbca765faae98dd841c
🧪 View experiment at: https://dagshub.com/yogibaba7/loan_approval_prediction.mlflow/#/experiments/1
