# Experiment 1 Base line Model

## strategy 
### drop null values
### use label encoder
### train logistic model


In [98]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score


In [99]:
# import dataset
df = pd.read_csv('https://raw.githubusercontent.com/rohitmande-inttrvu/finance_loan_approval/refs/heads/main/Finance.csv')

In [100]:
# shape
df.shape

(614, 13)

In [101]:
# overview
df.sample(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
415,LP002337,Female,No,0,Graduate,No,2995,0.0,60.0,360.0,1.0,Urban,Y
174,LP001603,Male,Yes,0,Not Graduate,Yes,4344,736.0,87.0,360.0,1.0,Semiurban,N
381,LP002229,Male,No,0,Graduate,No,5941,4232.0,296.0,360.0,1.0,Semiurban,Y
242,LP001806,Male,No,0,Graduate,No,2965,5701.0,155.0,60.0,1.0,Urban,Y
397,LP002281,Male,Yes,0,Graduate,No,3033,1459.0,95.0,360.0,1.0,Urban,Y


In [102]:
df.drop(columns=['Loan_ID'],inplace=True)

In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [104]:
# check null values
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [105]:
df.duplicated().sum()

np.int64(0)

In [106]:
# drop null values
print(f"Before drop null values X_train shape : {df.shape}")
df.dropna(inplace=True)
print(f"after drop null values X_train shape : {df.shape}")


Before drop null values X_train shape : (614, 12)
after drop null values X_train shape : (480, 12)


In [107]:
# INPUTS AND OUTPUTS
x = df.drop(columns=['Loan_Status']) # Inputs
y = df['Loan_Status']                # outputs

In [108]:
# output as 0 and 1
y.replace({'Y':1,'N':0},inplace=True)

  y.replace({'Y':1,'N':0},inplace=True)


In [109]:
# check for unique categories in each columns
for col in x.columns:
    print(f"{col} unique values : {x[col].nunique()}")

Gender unique values : 2
Married unique values : 2
Dependents unique values : 4
Education unique values : 2
Self_Employed unique values : 2
ApplicantIncome unique values : 405
CoapplicantIncome unique values : 232
LoanAmount unique values : 186
Loan_Amount_Term unique values : 9
Credit_History unique values : 2
Property_Area unique values : 3


In [110]:
# CATEGORICAL COLUMNS
cat_cols = []
num_cols = []
for col in x.columns:
    if x[col].nunique()<5:
        cat_cols.append(col)
    else:
        num_cols.append(col)
print(f"Categorical cols : {cat_cols}")
print(f"Numerical cols : {num_cols}")

Categorical cols : ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']
Numerical cols : ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']


In [111]:
# train test split
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [112]:
# add experiment tracking
import mlflow
import dagshub
dagshub.init(repo_owner='yogibaba7', repo_name='loan_approval_prediction', mlflow=True)

In [113]:
# set tracking uri
mlflow.set_tracking_uri('https://dagshub.com/yogibaba7/loan_approval_prediction.mlflow/')
# create a experiment
mlflow.set_experiment('exp1_baseline_model')

2025/04/12 09:48:29 INFO mlflow.tracking.fluent: Experiment with name 'exp1_baseline_model' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1d31a41edcf144d1bcd9d8a381841ca1', creation_time=1744431509358, experiment_id='0', last_update_time=1744431509358, lifecycle_stage='active', name='exp1_baseline_model', tags={}>

In [114]:



with mlflow.start_run():
    # Encoding
    oe = OrdinalEncoder()
    X_train[cat_cols] = oe.fit_transform(X_train[cat_cols])
    X_test[cat_cols] = oe.transform(X_test[cat_cols])
    # train model
    lr = LogisticRegression()
    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    
    # model evaluation
    accuracy = accuracy_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)

    print(f"accuracy_score : {accuracy}")
    print(f"f1_score : {f1}")
    print(f"precision_score : {precision}")
    print(f"recall_score : {recall}")

    # log model
    mlflow.sklearn.log_model(lr,'LogisticRegression')
    # Save and log the notebook
    import os
    notebook_path = "exp1_baseline_model.ipynb"
    os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
    mlflow.log_artifact(notebook_path)

    # Log evaluation metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # add tags
    mlflow.set_tag('author','yogesh')

    



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy_score : 0.8229166666666666
f1_score : 0.8888888888888888
precision_score : 0.8
recall_score : 1.0




🏃 View run rumbling-pug-624 at: https://dagshub.com/yogibaba7/loan_approval_prediction.mlflow/#/experiments/0/runs/e5fc856b17a745cf8d860402b42c139c
🧪 View experiment at: https://dagshub.com/yogibaba7/loan_approval_prediction.mlflow/#/experiments/0


In [115]:
# Encoding
oe = OrdinalEncoder()
X_train[cat_cols] = oe.fit_transform(X_train[cat_cols])
X_test[cat_cols] = oe.transform(X_test[cat_cols])

In [116]:
# train model
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [117]:
# model evaluation
print(f"accuracy_score : {accuracy_score(y_test,y_pred)}")
print(f"f1_score : {f1_score(y_test,y_pred)}")
print(f"precision_score : {precision_score(y_test,y_pred)}")
print(f"recall_score : {recall_score(y_test,y_pred)}")

accuracy_score : 0.8229166666666666
f1_score : 0.8888888888888888
precision_score : 0.8
recall_score : 1.0
