Lab 13

In [8]:
%pip show azure-ai-ml

Name: azure-ai-ml
Version: 1.29.0
Summary: Microsoft Azure Machine Learning Client Library for Python
Home-page: https://github.com/Azure/azure-sdk-for-python
Author: Microsoft Corporation
Author-email: azuresdkengsysadmins@microsoft.com
License: MIT License
Location: /home/tshen/dp-100/mslearn-azure-ml/.venv/lib/python3.12/site-packages
Requires: azure-common, azure-core, azure-mgmt-core, azure-monitor-opentelemetry, azure-storage-blob, azure-storage-file-datalake, azure-storage-file-share, colorama, isodate, jsonschema, marshmallow, pydash, pyjwt, pyyaml, strictyaml, tqdm, typing-extensions
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [10]:
# Step 2 - Set the MLflow tracking URI environment variable
%env MLFLOW_TRACKING_URI=http://localhost:5000
# Verify that it is set
%env MLFLOW_TRACKING_URI

env: MLFLOW_TRACKING_URI=http://localhost:5000


'http://localhost:5000'

Credit card model

Step 1 - Download data and create data and script directories

In [12]:
# Step 1 - make sure the data/ and src/ folders exist, then fetch the
# credit-card dataset into the data folder
import os
import urllib.request

for folder in ("./data", "./src"):
    os.makedirs(folder, exist_ok=True)

CREDIT_CARD_URL = "https://azuremlexamples.blob.core.windows.net/datasets/credit_card/default_of_credit_card_clients.csv"
urllib.request.urlretrieve(CREDIT_CARD_URL, "./data/credit_card.csv")

('./data/credit_card.csv', <http.client.HTTPMessage at 0x720494257470>)

Step 2 - Check and start MLflow experiment

In [16]:
# Step 2 - Select the MLflow experiment that the following runs are logged to.
# As the cell output shows, MLflow creates 'credit_card' when it does not exist yet.
import mlflow
mlflow.set_experiment("credit_card")

2025/09/24 10:44:39 INFO mlflow.tracking.fluent: Experiment with name 'credit_card' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/468553786695061552', creation_time=1758728679644, experiment_id='468553786695061552', last_update_time=1758728679644, lifecycle_stage='active', name='credit_card', tags={}>

Step 3 - Create credit-card-model script

In [17]:
%%writefile ./src/credit-card-model.py
# Step 3 - Create a training script
import os
import argparse
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def main():
    """Train a gradient-boosting classifier on the credit-card CSV and log it to MLflow.

    Command-line arguments:
        --data: path to the input CSV file (required).
        --test_train_ratio: fraction of rows held out for testing (default 0.25).
        --n_estimators: number of boosting stages (default 100).
        --learning_rate: boosting learning rate (default 0.1).
        --registered_model_name: name under which the model is logged and registered.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, required=True, help="Path to input data")
    parser.add_argument("--test_train_ratio", type=float, default=0.25)
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--learning_rate", type=float, default=0.1)
    parser.add_argument("--registered_model_name", type=str, help="Model name")
    args = parser.parse_args()

    mlflow.sklearn.autolog()

    # The context manager closes the run on every exit path and marks it as
    # failed when training raises, instead of leaving that to process shutdown.
    with mlflow.start_run():
        # header=1: column names are read from the second row of the file;
        # index_col=0: the first column becomes the row index.
        credit_df = pd.read_csv(args.data, header=1, index_col=0)
        train_df, test_df = train_test_split(credit_df, test_size=args.test_train_ratio)

        # pop() removes the label column, so what remains are the features
        y_train = train_df.pop("default payment next month")
        X_train = train_df.values
        y_test = test_df.pop("default payment next month")
        X_test = test_df.values

        clf = GradientBoostingClassifier(n_estimators=args.n_estimators, learning_rate=args.learning_rate)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        print(classification_report(y_test, y_pred))

        # `name=` is the same keyword src/diabetes-model.py uses for log_model
        mlflow.sklearn.log_model(
            sk_model=clf,
            name=args.registered_model_name,
            registered_model_name=args.registered_model_name,
        )

if __name__ == "__main__":
    main()

Overwriting ./src/credit-card-model.py


Step 4 - run credit card model script

In [18]:
# Credit card model: run the training script from Step 3 with explicit hyperparameters
!python ./src/credit-card-model.py --data ./data/credit_card.csv --n_estimators 50 --learning_rate 0.2 --registered_model_name credit_card_model

              precision    recall  f1-score   support

           0       0.83      0.95      0.89      5775
           1       0.70      0.37      0.48      1725

    accuracy                           0.82      7500
   macro avg       0.77      0.66      0.69      7500
weighted avg       0.80      0.82      0.80      7500

Successfully registered model 'credit_card_model'.
2025/09/24 10:45:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: credit_card_model, version 1
Created version '1' of model 'credit_card_model'.
🏃 View run bemused-seal-212 at: http://localhost:5000/#/experiments/468553786695061552/runs/2089a56ccf7d43689f794602119397f7
🧪 View experiment at: http://localhost:5000/#/experiments/468553786695061552


Diabetes model

Step 1 - Fix data

In [28]:
%%writefile ./src/diabetes-fix-missing-data.py
# import libraries
import argparse
import glob
from pathlib import Path
import pandas as pd
import mlflow

# get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input_data", type=str, help='Path to input data')
parser.add_argument('--output_data', type=str, help='Path of output data')
args = parser.parse_args()

# load the data from the single CSV file given on the command line
df = pd.read_csv(args.input_data)

# log row count of the input data
mlflow.log_metric('row count input data', len(df))

# remove rows that contain nulls
df = df.dropna()

# log row count after cleaning
mlflow.log_metric('row count output data', len(df))

# write the processed data; index=False keeps the row index (which has gaps
# after dropna) from being written as an extra unnamed column
df.to_csv(args.output_data, index=False)

Writing ./src/diabetes-fix-missing-data.py


Step 2 - Run data fix script

In [None]:
# Drop rows with missing values from the diabetes data and write the result to ./data/diabetes-out.csv
!python ./src/diabetes-fix-missing-data.py --input_data ./data/diabetes.csv --output_data ./data/diabetes-out.csv

Step 3 - Normalize data

In [29]:
%%writefile ./src/diabetes-normalize-data.py
# import libraries
import argparse
import os
import glob
from pathlib import Path
import pandas as pd
import mlflow
from sklearn.preprocessing import MinMaxScaler

# get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input_data", type=str, help='Path to input data')
parser.add_argument('--output_data', type=str, help='Path of output data')
args = parser.parse_args()

# load the data from the single CSV file given on the command line
df = pd.read_csv(args.input_data)

# log row count input data
mlflow.log_metric('row count input data', len(df))

# remove nulls
df = df.dropna()

# normalize the numeric columns (MinMaxScaler rescales each column to [0, 1])
scaler = MinMaxScaler()
num_cols = ['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree']
df[num_cols] = scaler.fit_transform(df[num_cols])

# log processed rows once; scaling does not change the row count
mlflow.log_metric('row count output data', len(df))

# write the processed data a single time, after normalization;
# index=False avoids emitting the row index as an extra unnamed column
df.to_csv(args.output_data, index=False)

Writing ./src/diabetes-normalize-data.py


Step 4 - Run data normalization script

In [None]:
# Clean and min-max normalize the diabetes data; the result is written to ./data/diabetes-out-2.csv
!python ./src/diabetes-normalize-data.py --input_data ./data/diabetes.csv --output_data ./data/diabetes-out-2.csv

Step 5 - Create diabetes model script

In [26]:
%%writefile src/diabetes-model.py
# import libraries
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import argparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--data", type=str, help="Path to input data")
parser.add_argument("--test_size", type=float, default=0.30, help="Proportion of the dataset to include in the test split")
parser.add_argument("--reg", type=float, default=0.01, help="Regularization rate parameter")
parser.add_argument("--registered_model_name", type=str, help="Model name")

args = parser.parse_args()

# load the diabetes dataset
print("Loading Data...")
diabetes = pd.read_csv(args.data)

mlflow.sklearn.autolog()

# The context manager ends the MLflow run on every exit path and marks it as
# failed when training raises.
with mlflow.start_run():
    # separate features and labels
    X, y = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, diabetes['Diabetic'].values

    # split data into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=args.test_size)

    # set regularization hyperparameter
    reg = args.reg

    # train a logistic regression model (C is the inverse of the regularization rate)
    print('Training a logistic regression model with regularization rate of', reg, 'test size of', args.test_size)
    model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

    # calculate accuracy on the held-out split
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)
    print('Accuracy:', acc)

    # calculate AUC from the positive-class probabilities
    y_scores = model.predict_proba(X_test)
    auc = roc_auc_score(y_test,y_scores[:,1])
    print('AUC: ' + str(auc))

    # log the model and register it under the requested name
    mlflow.sklearn.log_model(sk_model=model, registered_model_name=args.registered_model_name, name=args.registered_model_name)


Overwriting src/diabetes-model.py


Step 6 - Run diabetes-model script

In [27]:
# Train the logistic regression script and register the result as 'diabetes_model'
!python ./src/diabetes-model.py --data ./data/diabetes.csv --test_size 0.20 --reg 0.02 --registered_model_name diabetes_model

Loading Data...
Training a logistic regression model with regularization rate of 0.02 test size of 0.2
Accuracy: 0.786
AUC: 0.8607077822102089
Registered model 'diabetes_model' already exists. Creating a new version of this model...
2025/09/24 11:20:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: diabetes_model, version 2
Created version '2' of model 'diabetes_model'.
🏃 View run stately-squid-613 at: http://localhost:5000/#/experiments/468553786695061552/runs/a16011736b144852bc1e27a221fbf4ff
🧪 View experiment at: http://localhost:5000/#/experiments/468553786695061552


Step 7 - Create diabetes decision tree model script

In [30]:
%%writefile ./src/diabetes-decision-tree-model.py
# Import libraries
import argparse
import glob
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# get parameters
parser = argparse.ArgumentParser("train")
parser.add_argument("--training_data", type=str, help="Path to training data")
parser.add_argument("--model_output", type=str, help="Path of output model")

args = parser.parse_args()

training_data = args.training_data
model_output = args.model_output

# load the prepared data from the single CSV file given on the command line
df = pd.read_csv(training_data)

# Separate features and labels
X, y = (
    df[
        [
            "Pregnancies",
            "PlasmaGlucose",
            "DiastolicBloodPressure",
            "TricepsThickness",
            "SerumInsulin",
            "BMI",
            "DiabetesPedigree",
            "Age",
        ]
    ].values,
    df["Diabetic"].values,
)

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

# Train a decision tree model
print('Training a decision tree model...')
model = DecisionTreeClassifier().fit(X_train, y_train)

# Calculate accuracy
y_pred = model.predict(X_test)
acc = np.average(y_pred == y_test)
print("Accuracy:", acc)
mlflow.log_metric("Accuracy", float(acc))

# Calculate AUC
y_pred_proba = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_pred_proba[:, 1])
print("AUC: " + str(auc))
mlflow.log_metric("AUC", float(auc))

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], "k--")
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.savefig("ROCcurve.png")
mlflow.log_artifact("ROCcurve.png")
# Release the figure once it has been saved and logged
plt.close(fig)

# Create confusion matrix
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(
            x=j, y=i, s=conf_matrix[i, j], va="center", ha="center",
            size="xx-large"
        )

plt.xlabel("Predictions", fontsize=18)
plt.ylabel("Actuals", fontsize=18)
plt.title("Confusion Matrix", fontsize=18)
plt.savefig("ConfusionMatrix.png")
mlflow.log_artifact("ConfusionMatrix.png")
plt.close(fig)

# Output the model: --model_output is the path of the pickle file itself.
# Create its parent folder if needed and close the file handle deterministically.
model_path = Path(model_output)
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open("wb") as model_file:
    pickle.dump(model, model_file)

Writing ./src/diabetes-decision-tree-model.py


Step 8 - Run diabetes decision tree model script

In [None]:
!python ./src/diabetes-decision-tree-model.py --training_data ./data/diabetes-out-2.csv --model_output ./model/decision-tree.save

Step 9 - Create diabetes Logistic Regression model script

In [32]:
%%writefile ./src/diabetes-logistic-regression-model.py
# Import libraries
import argparse
import glob
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

# get parameters
parser = argparse.ArgumentParser("train")
parser.add_argument("--training_data", type=str, help="Path to training data")
parser.add_argument("--reg_rate", type=float, default=0.01)
parser.add_argument("--model_output", type=str, help="Path of output model")

args = parser.parse_args()

training_data = args.training_data

# load the prepared data: --training_data may be a folder of CSV files
# (all of them are concatenated) or the path of a single CSV file
print("Loading Data...")
data_path = Path(training_data)
if data_path.is_dir():
    all_files = sorted(glob.glob(str(data_path / "*.csv")))
    if not all_files:
        raise FileNotFoundError(f"No CSV files found in {data_path}")
    df = pd.concat((pd.read_csv(f) for f in all_files), sort=False)
else:
    df = pd.read_csv(data_path)

# Separate features and labels
X, y = (
    df[
        [
            "Pregnancies",
            "PlasmaGlucose",
            "DiastolicBloodPressure",
            "TricepsThickness",
            "SerumInsulin",
            "BMI",
            "DiabetesPedigree",
            "Age",
        ]
    ].values,
    df["Diabetic"].values,
)

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

# Train a logistic regression model (C is the inverse of the regularization rate)
print('Training a logistic regression model...')
model = LogisticRegression(C=1 / args.reg_rate, solver="liblinear").fit(
    X_train, y_train
)

# Calculate accuracy
y_pred = model.predict(X_test)
acc = np.average(y_pred == y_test)
print("Accuracy:", acc)
# built-in float: the np.float alias is no longer available in current NumPy
mlflow.log_metric("Accuracy", float(acc))

# Calculate AUC
y_pred_proba = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_pred_proba[:, 1])
print("AUC: " + str(auc))
mlflow.log_metric("AUC", float(auc))

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], "k--")
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.savefig("ROCcurve.png")
mlflow.log_artifact("ROCcurve.png")
plt.close(fig)

# Create confusion matrix
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(
            x=j, y=i, s=conf_matrix[i, j], va="center", ha="center",
            size="xx-large"
        )

plt.xlabel("Predictions", fontsize=18)
plt.ylabel("Actuals", fontsize=18)
plt.title("Confusion Matrix", fontsize=18)
plt.savefig("ConfusionMatrix.png")
mlflow.log_artifact("ConfusionMatrix.png")
plt.close(fig)

# Output the model. An existing folder keeps the original behavior
# (<folder>/model.sav); any other value is used as the pickle file path.
model_path = Path(args.model_output)
if model_path.is_dir():
    model_path = model_path / "model.sav"
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open("wb") as model_file:
    pickle.dump(model, model_file)

Writing ./src/diabetes-logistic-regression-model.py


Step 10 - Run diabetes logistic regression model script 

In [None]:
!python ./src/diabetes-logistic-regression-model.py --training_data ./data/diabetes-out-2.csv --model_output ./model/log-regression-tree.save

Step 11 - Create diabetes Random Forest model script

In [33]:
%%writefile ./src/diabetes-random-forrest-model.py
# train.py
# import necessary libraries
import pandas as pd
import argparse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# needed only to persist the trained model at the end of the script
import pickle
from pathlib import Path

# get parameters
parser = argparse.ArgumentParser("train")
parser.add_argument("--training_data", type=str, help="Path to training data")
parser.add_argument("--n_estimators", type=int, default=100, help="Number of trees in the forest")
parser.add_argument("--model_output", type=str, help="Path of output model")
parser.add_argument("--test_size", type=float, default=0.30, help="test size")
parser.add_argument("--random_state", type=int, default=None, help="random state")

args = parser.parse_args()
print(args)

training_data = args.training_data
model_output = args.model_output
n_estimators = args.n_estimators
test_size = args.test_size
random_state = args.random_state

# Load the dataset from the CSV file given on the command line
df = pd.read_csv(training_data)

# Separate features and labels
X, y = (
    df[
        [
            "Pregnancies",
            "PlasmaGlucose",
            "DiastolicBloodPressure",
            "TricepsThickness",
            "SerumInsulin",
            "BMI",
            "DiabetesPedigree",
            "Age",
        ]
    ].values,
    df["Diabetic"].values,
)

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

# Initialize and train the classifier
model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Persist the model when --model_output was supplied; the option was
# previously parsed but never used, so nothing was written.
if model_output:
    model_path = Path(model_output)
    model_path.parent.mkdir(parents=True, exist_ok=True)
    with model_path.open("wb") as model_file:
        pickle.dump(model, model_file)

Writing ./src/diabetes-random-forrest-model.py


Step 12 - Run diabetes Random Forest model script

In [None]:
!python ./src/diabetes-random-forrest-model.py --training_data ./data/diabetes.csv --model_output ./model/random-forest.save

Step 13 - Track diabetes Logistic Regression model training with MLflow

In [None]:
# Make "mlflow-experiment-diabetes" the active experiment for the runs started in the Step 13 cells
import mlflow
experiment_name = "mlflow-experiment-diabetes"
mlflow.set_experiment(experiment_name)

Step 13a - Track diabetes Logistic Regression model training with autolog

In [None]:
# Prepare the data
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

print("Reading data...")
df = pd.read_csv('./data/diabetes.csv')

print("Splitting data...")
X, y = df[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']].values, df['Diabetic'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

with mlflow.start_run():
    # autolog records the estimator's parameters, training metrics and model
    mlflow.sklearn.autolog()

    # scikit-learn trains on the CPU; a TensorFlow device scope has no effect
    # on it, so no TensorFlow import or tf.device wrapper is used here.
    model = LogisticRegression(C=1/0.1, solver="liblinear").fit(X_train, y_train)


Step 13b - Track diabetes Logistic Regression model training w/o autolog

In [None]:
# Turn scikit-learn autologging off so the following cells log parameters and metrics explicitly
mlflow.sklearn.autolog(disable=True)

Step 13c - Track diabetes sklearn LR model training with custom logging

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Train inside an MLflow run and log the hyperparameter and test accuracy by hand.
# Uses the X_train/X_test/y_train/y_test split created in Step 13a.
with mlflow.start_run():
    model = LogisticRegression(C=1/0.1, solver="liblinear").fit(X_train, y_train)
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)
    mlflow.log_param("regularization_rate", 0.1)
    mlflow.log_metric("Accuracy", acc)

Step 13d - Track diabetes sklearn LR model training

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# scikit-learn's liblinear solver runs on the CPU, so the former tf.device('/GPU:0')
# wrapper (and its TensorFlow import) had no effect and has been removed.
# Uses the X_train/X_test/y_train/y_test split created in Step 13a.
with mlflow.start_run():
    model = LogisticRegression(C=1/0.1, solver="liblinear").fit(X_train, y_train)
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)
    mlflow.log_param("regularization_rate", 0.1)
    mlflow.log_metric("Accuracy", acc)

Step 13e - Track diabetes sklearn Decision Tree model training

In [None]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Fit a decision tree inside an MLflow run, then record which estimator was
# used together with its accuracy on the held-out split.
with mlflow.start_run():
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

    y_hat = model.predict(X_test)
    # fraction of test rows whose prediction equals the label
    acc = np.mean(y_hat == y_test)

    mlflow.log_param("estimator", "DecisionTreeClassifier")
    mlflow.log_metric("Accuracy", acc)

Step 13f - Track diabetes sklearn DT model training and log artifacts

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import numpy as np

# Same decision-tree run as before, but the ROC curve is also saved to disk
# and attached to the run as an artifact.
with mlflow.start_run():
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

    y_hat = model.predict(X_test)
    acc = np.mean(y_hat == y_test)

    # positive-class probabilities drive the ROC curve
    y_scores = model.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1])

    fig, ax = plt.subplots(figsize=(6, 4))
    # diagonal 50% reference line
    ax.plot([0, 1], [0, 1], 'k--')
    # FPR/TPR achieved by the model
    ax.plot(fpr, tpr)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    fig.savefig("ROC-Curve.png")

    mlflow.log_param("estimator", "DecisionTreeClassifier")
    mlflow.log_metric("Accuracy", acc)
    mlflow.log_artifact("ROC-Curve.png")

Step 13g - Create diabetes MLflow tracking custom log script

In [37]:
%%writefile src/diabetes-model-mlflow-custom-log.py
# import libraries
import mlflow
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

def main(args):
    """Run the pipeline: read the CSV, split it, train the model, evaluate it.

    Args:
        args: parsed arguments providing ``training_data`` and ``reg_rate``.
    """
    diabetes_df = get_data(args.training_data)

    features_train, features_test, labels_train, labels_test = split_data(diabetes_df)

    fitted_model = train_model(
        args.reg_rate, features_train, features_test, labels_train, labels_test
    )

    eval_model(fitted_model, features_test, labels_test)

# function that reads the data
def get_data(path):
    """Announce the read on stdout and return the CSV at ``path`` as a DataFrame."""
    print("Reading data...")
    return pd.read_csv(path)

# function that splits the data
def split_data(df):
    """Split ``df`` into train/test feature and label arrays (70/30, random_state=0).

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    print("Splitting data...")
    feature_columns = [
        'Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
        'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age',
    ]
    features = df[feature_columns].values
    labels = df['Diabetic'].values

    return tuple(train_test_split(features, labels, test_size=0.30, random_state=0))

# function that trains the model
def train_model(reg_rate, X_train, X_test, y_train, y_test):
    """Log the regularization rate to MLflow and fit a logistic regression model.

    Args:
        reg_rate: regularization rate; must be positive because the model is
            built with ``C = 1 / reg_rate``.
        X_train, y_train: training features and labels.
        X_test, y_test: accepted for interface compatibility; not used here.

    Returns:
        The fitted ``LogisticRegression`` estimator.

    Raises:
        ValueError: if ``reg_rate`` is zero or negative.
    """
    # Fail with a clear message instead of a ZeroDivisionError further down
    if reg_rate <= 0:
        raise ValueError(f"reg_rate must be positive, got {reg_rate}")

    mlflow.log_param("Regularization rate", reg_rate)
    print("Training model...")
    model = LogisticRegression(C=1/reg_rate, solver="liblinear").fit(X_train, y_train)

    return model

# function that evaluates the model
def eval_model(model, X_test, y_test):
    """Print and log accuracy and AUC on the test split, and log the ROC curve image.

    Args:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: test features.
        y_test: test labels.
    """
    # accuracy: share of predictions that match the labels
    predictions = model.predict(X_test)
    acc = np.average(predictions == y_test)
    print('Accuracy:', acc)
    mlflow.log_metric("Accuracy", acc)

    # AUC from the positive-class probabilities (column 1)
    positive_scores = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_scores)
    print('AUC: ' + str(auc))
    mlflow.log_metric("AUC", auc)

    # ROC curve, saved to disk and attached to the run
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot([0, 1], [0, 1], 'k--')  # diagonal 50% line
    ax.plot(fpr, tpr)               # FPR/TPR achieved by the model
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    fig.savefig("ROC-Curve.png")
    mlflow.log_artifact("ROC-Curve.png")

def parse_args():
    """Parse ``--training_data`` (str) and ``--reg_rate`` (float, default 0.01).

    Returns:
        The ``argparse.Namespace`` with attributes ``training_data`` and ``reg_rate``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--training_data", dest='training_data', type=str)
    cli.add_argument("--reg_rate", dest='reg_rate', type=float, default=0.01)
    return cli.parse_args()

# run script
if __name__ == "__main__":
    separator = "*" * 60

    # visual spacing in the logs before the run
    print("\n\n")
    print(separator)

    # parse the command line and run the pipeline
    main(parse_args())

    # visual spacing in the logs after the run
    print(separator)
    print("\n\n")


Writing src/diabetes-model-mlflow-custom-log.py


Step 13h - Run diabetes MLflow tracking custom log script

In [None]:
# Run the custom-logging script; it logs the regularization rate, Accuracy, AUC and the ROC curve image
!python ./src/diabetes-model-mlflow-custom-log.py --training_data ./data/diabetes.csv --reg_rate 0.01

Step 13i - Create diabetes MLflow tracking autolog script

In [38]:
%%writefile src/diabetes-model-mlflow-autolog.py
# import libraries
import mlflow
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

def main(args):
    """Enable MLflow autologging, then read, split, train and evaluate.

    Args:
        args: parsed arguments providing ``training_data`` and ``reg_rate``.
    """
    # autologging must be on before the estimator is fitted
    mlflow.autolog()

    frame = get_data(args.training_data)
    train_x, test_x, train_y, test_y = split_data(frame)
    classifier = train_model(args.reg_rate, train_x, test_x, train_y, test_y)

    eval_model(classifier, test_x, test_y)

# function that reads the data
def get_data(path):
    """Read the CSV file at ``path`` and return it as a DataFrame."""
    print("Reading data...")
    frame = pd.read_csv(path)
    return frame

# function that splits the data
def split_data(df):
    """Return ``(X_train, X_test, y_train, y_test)`` from a 70/30 split with random_state=0."""
    print("Splitting data...")
    predictors = df[[
        'Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
        'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age',
    ]].values
    target = df['Diabetic'].values

    parts = train_test_split(predictors, target, test_size=0.30, random_state=0)
    return tuple(parts)

# function that trains the model
def train_model(reg_rate, X_train, X_test, y_train, y_test):
    """Fit a logistic regression model with ``C = 1 / reg_rate``.

    Args:
        reg_rate: regularization rate; must be positive.
        X_train, y_train: training features and labels.
        X_test, y_test: accepted for interface compatibility; not used here.

    Returns:
        The fitted ``LogisticRegression`` estimator.

    Raises:
        ValueError: if ``reg_rate`` is zero or negative.
    """
    # Fail with a clear message instead of a ZeroDivisionError further down
    if reg_rate <= 0:
        raise ValueError(f"reg_rate must be positive, got {reg_rate}")

    print("Training model...")
    model = LogisticRegression(C=1/reg_rate, solver="liblinear").fit(X_train, y_train)

    return model

# function that evaluates the model
def eval_model(model, X_test, y_test):
    """Print accuracy and AUC on the test split and save the ROC curve to ROC-Curve.png.

    Args:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: test features.
        y_test: test labels.
    """
    # accuracy
    predicted = model.predict(X_test)
    acc = np.average(predicted == y_test)
    print('Accuracy:', acc)

    # AUC, computed from the positive-class probabilities (column 1)
    probabilities = model.predict_proba(X_test)
    auc = roc_auc_score(y_test, probabilities[:, 1])
    print('AUC: ' + str(auc))

    # ROC curve written to the working directory
    fpr, tpr, _ = roc_curve(y_test, probabilities[:, 1])
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot([0, 1], [0, 1], 'k--')  # diagonal 50% line
    ax.plot(fpr, tpr)               # FPR/TPR achieved by the model
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    fig.savefig("ROC-Curve.png")

def parse_args():
    """Build the command-line parser and return the parsed arguments.

    Recognized options: ``--training_data`` (str) and ``--reg_rate``
    (float, default 0.01).
    """
    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument(
        "--training_data",
        dest='training_data',
        type=str,
    )
    arg_parser.add_argument(
        "--reg_rate",
        dest='reg_rate',
        type=float,
        default=0.01,
    )

    return arg_parser.parse_args()

# run script
if __name__ == "__main__":
    rule = "*" * 60

    # opening spacer in the logs
    print("\n\n")
    print(rule)

    # parse the command line, then run the pipeline
    cli_args = parse_args()
    main(cli_args)

    # closing spacer in the logs
    print(rule)
    print("\n\n")


Writing src/diabetes-model-mlflow-autolog.py


Step 13j - Run diabetes MLflow tracking autolog script

In [None]:
!python ./src/diabetes-model-mlflow-autolog.py --training_data ./data/diabetes.csv --reg_rate 0.01

Step 13k - Query diabetes MLflow logs

In [40]:
# check if mlflow server is running
!pgrep -a mlflow

import mlflow
experiments = mlflow.search_experiments()
for exp in experiments:
    print(exp.name)
    
experiment_name = "diabetes-training"
exp = mlflow.get_experiment_by_name(experiment_name)
print(exp)

if exp is not None:
    mlflow.search_runs(exp.experiment_id)
    mlflow.search_runs(exp.experiment_id, order_by=["start_time DESC"], max_results=2)

    query = "metrics.AUC > 0.8 and tags.model_type = 'LogisticRegression'"
    mlflow.search_runs(exp.experiment_id, filter_string=query)
else:
    print(f"Experiment '{experiment_name}' not found. Please check the experiment name or create it first.")


3675 /home/tshen/dp-100/mslearn-azure-ml/.venv/bin/python3 /home/tshen/dp-100/mslearn-azure-ml/.venv/bin/mlflow ui
26709 /home/tshen/dp-100/mslearn-azure-ml/.venv/bin/python3 /home/tshen/dp-100/mslearn-azure-ml/.venv/bin/mlflow models serve -m models:/wine-quality-predictor/1 --port 5002 --env-manager local
credit_card
Experiment with 5000
/my-experiment
MLflow Quickstart
wine-quality-optimization
Default
None
Experiment 'diabetes-training' not found. Please check the experiment name or create it first.


Step 14 - Create PyTorch Framework IrisClassifier model script

In [34]:
%%writefile ./src/pytorch_iris.py
# pytorch_iris.py
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import argparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# get parameters
parser = argparse.ArgumentParser("train")
parser.add_argument("--training_data", type=str, default='./iris-data/iris.csv', help="Path to training data")
parser.add_argument("--reg_rate", type=float, default=0.01)
parser.add_argument("--model_output", type=str, default='./model/iris.save', help="Path of output model")
parser.add_argument("--test_size", type=float, default=0.30, help="test size")
parser.add_argument("--random_state", type=int, default=0, help="random state")
parser.add_argument("--n_epoch", type=int, default=50, help="number of epochs")
parser.add_argument("--learning_rate", type=float, default=0.01, help="learning rate")
args = parser.parse_args()
print(args)

training_data = args.training_data
reg_rate = args.reg_rate
model_output = args.model_output
test_size = args.test_size
random_state = args.random_state
n_epoch = args.n_epoch
learning_rate = args.learning_rate

# Load Iris dataset: every column except "species" is a feature,
# and "species" is label-encoded into integer class ids.
df = pd.read_csv(training_data)
X = df.drop("species", axis=1).values
y = LabelEncoder().fit_transform(df["species"])

# Train/test split on the raw NumPy arrays. This happens BEFORE scaling so
# that the held-out rows never contribute to the scaler statistics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

# Preprocess: fit the scaler on the training split only, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to tensors: float32 features, int64 class labels (as CrossEntropyLoss expects)
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# Define model
class IrisClassifier(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Linear.

    The defaults reproduce the original fixed architecture (4 inputs,
    16 hidden units, 3 outputs); the sizes can be overridden for datasets
    with a different number of features or classes.

    Args:
        n_features: Size of the last dimension of the input.
        n_hidden: Width of the hidden layer.
        n_classes: Number of output values per sample.
    """

    def __init__(self, n_features=4, n_hidden=16, n_classes=3):
        super().__init__()
        # Kept as a Sequential named `net` so state_dict keys stay
        # "net.0.*" / "net.2.*" and previously saved weights still load.
        self.net = nn.Sequential(
            nn.Linear(n_features, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_classes),
        )

    def forward(self, x):
        """Return the network output for ``x`` (no softmax is applied here)."""
        return self.net(x)

# Seed PyTorch before the model is created so the weight initialisation
# (and therefore the whole run) is repeatable for a given --random_state.
torch.manual_seed(random_state)

model = IrisClassifier()
criterion = nn.CrossEntropyLoss()
# --reg_rate is passed to Adam as weight_decay
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=reg_rate)

# Training loop: one full-batch update per epoch, for the number of epochs
# requested via --n_epoch (previously hard-coded to 50, ignoring the flag).
model.train()
for epoch in range(n_epoch):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# Save model
import os  # local import: only needed to create the output directory

# torch.save does not create missing parent directories, so make sure the
# destination folder (e.g. ./model for the default path) exists first.
output_dir = os.path.dirname(model_output)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
torch.save(model.state_dict(), model_output)

# Evaluate on the held-out split: predicted class = index of the largest output
model.eval()
with torch.no_grad():
    preds = torch.argmax(model(X_test), dim=1)
    acc = (preds == y_test).float().mean()
    print(f"Test Accuracy: {acc:.4f}")

Writing ./src/pytorch_iris.py


Step 15 - Run PyTorch Framework IrisClassifier model script

In [None]:
# Run the PyTorch Iris training script on ./data/iris.csv; every other option keeps the script's default value
!python ./src/pytorch_iris.py --training_data ./data/iris.csv

Step 16 - Create TensorFlow Framework Iris Keras Sequential model script

In [35]:
%%writefile ./src/tensorflow_iris.py
# tensorflow_iris.py
import tensorflow as tf
import pandas as pd
import argparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# get parameters
parser = argparse.ArgumentParser("train")
parser.add_argument("--training_data", type=str, default='./iris-data/iris.csv', help="Path to training data")
parser.add_argument("--reg_rate", type=float, default=0.01)
parser.add_argument("--model_output", type=str, default='./model/iris_tf.keras', help="Path of output model, file extension .keras instead of .h5 required")
parser.add_argument("--test_size", type=float, default=0.30, help="test size")
parser.add_argument("--random_state", type=int, default=0, help="random state")
parser.add_argument("--n_epoch", type=int, default=50, help="number of epochs")
parser.add_argument("--learning_rate", type=float, default=0.01, help="learning rate")
parser.add_argument("--batch_size", type=int, default=16, help="batch size")
parser.add_argument("--gpu", type=str, default="no", help="if gpu available yes/no")
args = parser.parse_args()
print(args)

training_data = args.training_data
reg_rate = args.reg_rate
model_output = args.model_output
test_size = args.test_size
random_state = args.random_state
n_epoch = args.n_epoch
learning_rate = args.learning_rate
batch_size = args.batch_size
gpu = args.gpu
print(f"GPU enabled: {gpu}")

# Load Iris dataset: every column except "species" is a feature,
# and "species" is label-encoded into integer class ids.
df = pd.read_csv(training_data)
X = df.drop("species", axis=1).values
y = LabelEncoder().fit_transform(df["species"])

# Train/test split first, so the held-out rows never contribute to the
# scaler statistics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

# Preprocess: fit the scaler on the training split only, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable for a given --random_state.
tf.keras.utils.set_random_seed(random_state)

# Build model
# Width of the hidden layer. This used to be `batch_size`, which silently
# changed the architecture whenever --batch_size was tuned; 16 matches the
# previous default behaviour.
HIDDEN_UNITS = 16
model = tf.keras.Sequential([
    # Explicit Input layer instead of the deprecated `input_shape=` layer
    # argument; the feature count is taken from the training matrix.
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(HIDDEN_UNITS, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

# Pass an Adam instance so that --learning_rate is actually applied
# (the string 'adam' always used the optimizer's built-in default rate).
# NOTE(review): --reg_rate is parsed but not applied anywhere in this script;
# confirm whether a kernel regularizer was intended.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Fit the model, inside an explicit GPU device scope only when --gpu is "yes"
use_gpu = gpu.lower() == "yes"
fit_options = dict(epochs=n_epoch, batch_size=batch_size, verbose=1)

if not use_gpu:
    # no device scope: TensorFlow's default placement is used
    print("Training with CPU...")
    model.fit(X_train, y_train, **fit_options)
else:
    print("Training with GPU...")
    with tf.device('/GPU:0'):
        model.fit(X_train, y_train, **fit_options)

# Save model
import os  # local import: only needed to create the output directory

# Make sure the destination folder (e.g. ./model for the default path)
# exists before writing the .keras file.
output_dir = os.path.dirname(model_output)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
model.save(model_output)

# Evaluate on the held-out split
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.4f}")

Writing ./src/tensorflow_iris.py


Step 17 - Run TensorFlow Framework Iris Keras Sequential model script

In [None]:
# Run the TensorFlow Iris training script on ./data/iris.csv with the GPU flag set and 80 epochs (learning rate and batch size are passed explicitly)
!python ./src/tensorflow_iris.py --training_data ./data/iris.csv --gpu yes --n_epoch 80 --learning_rate 0.01 --batch_size 16