Setting up MLflow

In [1]:
# Install the following libraries (preferably inside a dedicated venv or conda virtual environment created beforehand)
#!pip install mlflow
#!pip install --upgrade jinja2
#!pip install --upgrade Flask
#!pip install setuptools

In [2]:
# starts an MLflow server locally.
# !mlflow server --host 127.0.0.1 --port 8080

## Using the MLflow Client API

- Initiate a new Experiment.

- Start Runs within an Experiment.

- Document parameters, metrics, and tags for your Runs.

- Log artifacts linked to runs, such as models, tables, plots, and more.

In [4]:
from mlflow import MlflowClient
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor

In [5]:
# To connect to the tracking server, pass the URI the server was started with
# (host 127.0.0.1, port 8080 in the `mlflow server` command shown earlier).

client = MlflowClient(tracking_uri="http://127.0.0.1:8080")

# `client` is the handle the following cells use to query and create experiments on that tracking server.

We now have a client interface to the tracking server that can both send data to and retrieve data from the tracking server.

In [6]:
# Ask the tracking server for the experiments it currently holds and print them.
all_experiments = client.search_experiments()

print(all_experiments)

[<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1742589806074, experiment_id='0', last_update_time=1742589806074, lifecycle_stage='active', name='Default', tags={}>]


### Create an experiment

In [7]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "This is the loan default prediction project. "
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "loan-default-prediction",
    "project_quarter": "Q1-2025",
    "mlflow.note.content": experiment_description,
}

# Experiment names must be unique on the tracking server: create_experiment()
# raises when the name is already taken, so calling it unconditionally makes
# this cell fail on every re-run. Look the experiment up first and only create
# it when it does not exist yet; either way the variable ends up holding the
# experiment id.
# NOTE(review): if the experiment was soft-deleted on the server, the lookup may
# still return it -- restore or rename it in that case.
experiment_name = "Loan_Default_Prediction_Models_3"
existing_experiment = client.get_experiment_by_name(experiment_name)

if existing_experiment is None:
    produce_loan_default_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
else:
    produce_loan_default_experiment = existing_experiment.experiment_id


In [8]:
# Look experiments up by the value of their `project_name` tag.
project_tag_filter = "tags.`project_name` = 'loan-default-prediction'"
loan_default_experiment_1 = client.search_experiments(filter_string=project_tag_filter)

# Show the attributes of the first experiment that matched the filter.
first_match = loan_default_experiment_1[0]
print(vars(first_match))

{'_experiment_id': '858358373506321674', '_name': 'Loan_Default_Prediction_Models_3', '_artifact_location': 'mlflow-artifacts:/858358373506321674', '_lifecycle_stage': 'active', '_tags': {'mlflow.note.content': 'This is the loan default prediction project. ', 'project_name': 'loan-default-prediction', 'project_quarter': 'Q1-2025'}, '_creation_time': 1742589887844, '_last_update_time': 1742589887844}


### Create a dataset

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('Loan_Data.csv')

# Predictor columns, selected by name rather than by position so that a change
# in the CSV column order cannot silently pull in `customer_id` or the
# `default` target. A missing column raises KeyError immediately.
feature_columns = [
    "credit_lines_outstanding",
    "loan_amt_outstanding",
    "total_debt_outstanding",
    "income",
    "years_employed",
    "fico_score",
]
df_features = df[feature_columns]

# Preview the first rows of the raw data
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [3]:
df.describe()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4974577.0,1.4612,4159.677034,8718.916797,70039.901401,4.5528,637.5577,0.1851
std,2293890.0,1.743846,1421.399078,6627.164762,20072.214143,1.566862,60.657906,0.388398
min,1000324.0,0.0,46.783973,31.652732,1000.0,0.0,408.0,0.0
25%,2977661.0,0.0,3154.235371,4199.83602,56539.867903,3.0,597.0,0.0
50%,4989502.0,1.0,4052.377228,6732.407217,70085.82633,5.0,638.0,0.0
75%,6967210.0,2.0,5052.898103,11272.26374,83429.166133,6.0,679.0,0.0
max,8999789.0,5.0,10750.67781,43688.7841,148412.1805,10.0,850.0,1.0


### Logging the first runs with MLflow

In [11]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [12]:
# Set the tracking URI for the module-level `mlflow.*` calls made in this session
# (set_experiment, start_run and the log_* calls in the cells below), so they
# reach the tracking server without going through a separate client instance.

mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [13]:
# Sets the current active experiment to the "Loan_Default_Prediction_Models_3" experiment and
# returns the Experiment metadata.
# NOTE(review): `loan_default_experiment_1` held the list returned by
# client.search_experiments() in an earlier cell; it is rebound here.
loan_default_experiment_1 = mlflow.set_experiment("Loan_Default_Prediction_Models_3")

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "load_rf_test"

# Define an artifact path that the model will be saved to
# (passed to mlflow.sklearn.log_model when the run is logged).
artifact_path = "rf_loan"

In [14]:
# Features and target: `df_features` holds the predictor columns selected when
# the data was loaded, and "default" is the 0/1 label to predict.
X = df_features
y = df["default"]

# Split the data into training and validation sets (20% held out)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters; the same dict is passed to the model and logged to MLflow
params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "bootstrap": True,
    "oob_score": False,
    "random_state": 888,
}

# Train the RandomForestClassifier
rf = RandomForestClassifier(**params)

# Fit the model on the training data
rf.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf.predict(X_val)

# Calculate classification metrics.
# The F1 value is stored as `f1`, not `f1_score`: rebinding `f1_score` would
# replace the imported sklearn function with a float and make this cell raise
# TypeError the next time it is run.
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Assemble the metrics we're going to write into a collection
metrics = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1_score": f1}

# Initiate the MLflow run context
with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params(params)

    # Log the metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use.
    # Only a handful of rows are stored as the input example; the full
    # validation split is not needed to illustrate the expected input.
    mlflow.sklearn.log_model(
        sk_model=rf, input_example=X_val.head(5), artifact_path=artifact_path
    )
    



🏃 View run load_rf_test at: http://127.0.0.1:8080/#/experiments/858358373506321674/runs/78fa1bcca03d4e2399b6b86f9bcb42cd
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/858358373506321674


### Saving Model

In [16]:
from joblib import dump

# Persist the fitted classifier to a local file, independently of the copy
# logged to MLflow. As the last expression in the cell, the value returned by
# dump() is displayed as the cell output.
dump(rf, "random_forest_model.pkl")


['random_forest_model.pkl']

In [17]:
from joblib import load


# Read the model back from the file written in the previous cell.
# NOTE(review): joblib files are pickle-based -- only load files from a trusted source.
rf_loaded = load("random_forest_model.pkl")

# Check type: confirms the object read back is a RandomForestClassifier
print(type(rf_loaded))


<class 'sklearn.ensemble._forest.RandomForestClassifier'>
