In [1]:
# importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
# configuring the notebook
pd.set_option("display.max_columns", None)  # None removes the column limit, so wide frames display every column
sns.set_theme(style = "whitegrid")
warnings.filterwarnings("ignore")  # NOTE(review): hides every warning for the rest of the session, deprecation notices included — consider narrowing the filter

In [3]:
# read the raw loan records from disk and preview the first rows
DATASET_PATH = "dataset.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# (rows, columns) of the frame as loaded
df.shape

(614, 13)

In [5]:
# summary statistics of the numeric columns, transposed so each column becomes one row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ApplicantIncome,614.0,5403.459283,6109.041673,150.0,2877.5,3812.5,5795.0,81000.0
CoapplicantIncome,614.0,1621.245798,2926.248369,0.0,0.0,1188.5,2297.25,41667.0
LoanAmount,592.0,146.412162,85.587325,9.0,100.0,128.0,168.0,700.0
Loan_Amount_Term,600.0,342.0,65.12041,12.0,360.0,360.0,360.0,480.0
Credit_History,564.0,0.842199,0.364878,0.0,1.0,1.0,1.0,1.0


In [6]:
# dtype and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [7]:
# feature engineering
# Encode the binary categorical columns as 0/1. The numeric-dtype guard makes
# this cell safe to re-run: calling .map() a second time on an already-encoded
# column would match none of the string keys and turn every value into NaN.
binary_encodings = {
    "Gender": {"Male": 0, "Female": 1},
    "Married": {"No": 0, "Yes": 1},
    "Loan_Status": {"N": 0, "Y": 1},
}
for column, encoding in binary_encodings.items():
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(encoding)
# Gender and Married come out as float64 because their missing values stay NaN
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    float64
 2   Married            611 non-null    float64
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    int64  
dtypes: float64(6), int64(2), object(5)
memory usage: 62.5+ KB


In [8]:
# checking for missing values in the DataFrame: number of NaN entries per column
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
## dropping all the missing values
# dropna() removes a row if ANY of its columns is NaN. Reassigning the result
# (rather than inplace=True) keeps the step explicit and chainable.
# NOTE(review): this also discards rows whose only gap is in a column the model
# never uses (e.g. Dependents, Self_Employed) — consider dropna(subset=[...]).
df = df.dropna()
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [10]:
# consider that the given data is training data entirely
# work on a copy so that later changes to df_train leave df untouched
df_train = df.copy()
df_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,0.0,1.0,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,0.0,1.0,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,LP001006,0.0,1.0,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,LP001008,0.0,0.0,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1
5,LP001011,0.0,1.0,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,1


In [11]:
# split the frame into the model inputs (x) and the target column (y)
feature_columns = ["Gender", "Married", "ApplicantIncome", "LoanAmount", "Credit_History"]
x = df_train[feature_columns]
y = df_train["Loan_Status"]
x.shape, y.shape

((480, 5), (480,))

In [12]:
# hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)
tuple(part.shape for part in (x_train, x_val, y_train, y_val))

((384, 5), (96, 5), (384,), (96,))

In [13]:
# fit a depth-limited random forest on the training split
from sklearn.ensemble import RandomForestClassifier

forest_params = {"max_depth": 4, "random_state": 5}
model = RandomForestClassifier(**forest_params)
model.fit(x_train, y_train)

In [14]:
# predicting on the validation set
# predict_val holds one predicted Loan_Status label (0 or 1) per validation row
predict_val = model.predict(x_val)
predict_val

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1])

In [15]:
# computing the accuracy
# accuracy = share of validation rows whose predicted label equals the true label
from sklearn.metrics import accuracy_score

accuracy_score(y_val, predict_val)

0.7916666666666666

In [16]:
# computing the accuracy score for training data
# scoring the rows the model was fitted on gives a reference point for the validation accuracy
predict_train = model.predict(x_train)
accuracy_score(y_train, predict_train)

0.8385416666666666

# Experiment Tracking Using MLflow

In [17]:
import mlflow
import sklearn
import psutil
import random

In [18]:
# make "loan_status" the active experiment; runs started afterwards are recorded under it
mlflow.set_experiment("loan_status")

2024/11/19 07:32:22 INFO mlflow.tracking.fluent: Experiment with name 'loan_status' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/vidishsirdesai/Desktop/dev/dsml_end_to_end_reference/mlops/experiment_tracking/mlruns/263997734690016607', creation_time=1731981742832, experiment_id='263997734690016607', last_update_time=1731981742832, lifecycle_stage='active', name='loan_status', tags={}>

In [19]:
# NOTE: superseded by track_experiment() in the next cell; kept commented out
# only as a reference for logging a single run by hand.
# # run experiment
# with mlflow.start_run():
#     max_depth = 4
#     random_state = 5

#     # building the model
#     model = RandomForestClassifier(max_depth = max_depth, random_state = random_state)

#     # training the model
#     model.fit(x_train, y_train)

#     # predicting on the validation data
#     pred_val = model.predict(x_val)
#     val_acc = accuracy_score(y_val, pred_val)
#     pred_train = model.predict(x_train)
#     train_acc = accuracy_score(y_train, pred_train)

#     # logging parameters and metrics
#     mlflow.set_tag("mlflow.runName", "first_run") # set the tag name for the experiment being run
#     mlflow.log_param("max_depth", max_depth)
#     mlflow.log_param("random_state", random_state)
#     mlflow.log_metric("val_acc", val_acc)
#     mlflow.log_metric("train_acc", train_acc)

#     mlflow.sklearn.log_model(model, "model")

In [20]:
# defining a dynamic method to track experiment
def track_experiment(i, max_depth, random_state):
    """Train one random forest and record it as a single MLflow run.

    Fits a ``RandomForestClassifier`` on the notebook-level ``x_train`` /
    ``y_train``, scores it on the training and validation splits, and logs the
    hyperparameters, both accuracies, host CPU/memory usage and the fitted
    model to the active MLflow experiment.

    Parameters
    ----------
    i : int
        Sequence number of the run; the run is named ``experiment_number_{i}``.
    max_depth : int
        Passed to the classifier as ``max_depth``.
    random_state : int
        Passed to the classifier as ``random_state``.
    """
    experiment_name = f"experiment_number_{i}"

    # naming the run at creation time sets the "mlflow.runName" tag for us
    with mlflow.start_run(run_name=experiment_name):
        # Prime psutil's CPU counter. A non-blocking cpu_percent() reports
        # usage since the previous call (a meaningless 0.0 on the very first
        # call), so without this the logged value would not correspond to the
        # work done in this run.
        psutil.cpu_percent()

        # building and training the model
        model = RandomForestClassifier(max_depth=max_depth, random_state=random_state)
        model.fit(x_train, y_train)

        # scoring on the validation and training data
        val_acc = accuracy_score(y_val, model.predict(x_val))
        train_acc = accuracy_score(y_train, model.predict(x_train))

        # logging parameters and metrics
        mlflow.log_params({"max_depth": max_depth, "random_state": random_state})
        mlflow.log_metrics({"val_acc": val_acc, "train_acc": train_acc})

        # logging system metrics: CPU usage since the priming call above,
        # and the current share of physical memory in use
        mlflow.log_metrics(
            {
                "cpu_usage": psutil.cpu_percent(),
                "memory_usage": psutil.virtual_memory().percent,
            }
        )

        mlflow.sklearn.log_model(model, "model")

In [21]:
# Sweep max_depth from 1 to 10, one MLflow run per depth. The random states
# come from a dedicated, seeded generator so that re-running the notebook
# repeats the same sweep instead of drawing fresh values from the unseeded
# global RNG.
rng = random.Random(5)
for i in range(1, 11):
    random_state = rng.randint(5, 90)
    # i doubles as the run number and the tree depth under test
    track_experiment(i, max_depth=i, random_state=random_state)

