# LOAD LIBRARIES

In [1]:
import mlflow
import mlflow.xgboost
import mlflow.data
from mlflow.data.pandas_dataset import PandasDataset

import xgboost as xgb
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import pandas as pd
import os

# SETTINGS

In [2]:
# Tracking-server ARN and experiment name for this notebook.
# Both can be overridden through environment variables so the notebook can be
# pointed at another server/experiment without editing the code; when the
# variables are unset the original development values are used.
mlflow_arn = os.environ.get(
    "MLFLOW_TRACKING_URI",
    "arn:aws:sagemaker:eu-west-1:575618486322:mlflow-tracking-server/dev-mlflow",
)
mlflow_experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME", "02-sample-experiment")

# SET MLFLOW

In [3]:
# Point the MLflow client at the tracking server configured in SETTINGS
mlflow.set_tracking_uri(uri=mlflow_arn)

# Select the experiment that subsequent runs are recorded under.
# Kept as the last expression so the cell displays the returned Experiment.
mlflow.set_experiment(experiment_name=mlflow_experiment_name)

<Experiment: artifact_location='s3://ipf-sds-datalake-dev-data-science-bucket/mlflow/2', creation_time=1730285259326, experiment_id='2', last_update_time=1730285259326, lifecycle_stage='active', name='02-sample-experiment', tags={}>

# LOAD DATA

In [21]:
# Fetch the scikit-learn diabetes dataset and pull out features / target
data = load_diabetes()
X, y = data.data, data.target

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a DataFrame: named feature columns followed by 'target'
df_train = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)
df_test = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)

# BUILD A MODEL

In [22]:
# Configure the gradient-boosted regressor: 100 boosting rounds of depth-1 trees
model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=1,
    random_state=42,
)

# Fit on the training split (raw arrays, not the DataFrames)
model.fit(X_train, y_train)

# CALCULATE QUALITY CHART

In [23]:
# Select the feature columns by NAME rather than by position.
# Positional slicing (iloc[:, :-1] / iloc[:, :-2]) only works the first time the
# cell runs: once 'predictions' has been added, a re-run would feed the
# 'target' column to the model and fail with a feature-count mismatch.
feature_cols = list(data.feature_names)
train_features = df_train[feature_cols]

# Predictions of the full model on the training split
df_train["predictions"] = model.predict(train_features)

# Training MSE when only the first k trees are used, for k = 1..n_trees_to_score
n_trees_to_score = 20
mse_per_tree = []
for n_trees in range(1, n_trees_to_score + 1):
    # iteration_range=(0, n_trees) restricts prediction to the first n_trees boosting rounds
    preds = model.predict(train_features, iteration_range=(0, n_trees))
    mse_per_tree.append(mean_squared_error(y_train, preds))

# LOG CHART

In [25]:
# Start a new MLflow run and record metrics, hyperparameters and the model in it
with mlflow.start_run(run_name="run-with-chart") as run:
    run_id = run.info.run_id
    print(f"Run ID: {run_id}")

    # One "mse_train" point per entry, with step = number of trees used.
    # Iterating over the list itself (instead of a hard-coded range(20)) keeps
    # this cell in sync with however many values were actually computed.
    for step, mse in enumerate(mse_per_tree, start=1):
        mlflow.log_metric("mse_train", mse, step=step)

    # Log every model hyperparameter in a single call instead of one call per parameter
    params = model.get_params()
    mlflow.log_params(params)

    # Log the model
    mlflow.xgboost.log_model(model, "model")

print("Logging completed.")

Run ID: 1087d8e820dc42b8be84ee3156c3d38b


2024/10/30 11:05:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run run-with-chart at: https://eu-west-1.experiments.sagemaker.aws/#/experiments/2/runs/1087d8e820dc42b8be84ee3156c3d38b.
2024/10/30 11:05:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://eu-west-1.experiments.sagemaker.aws/#/experiments/2.


Logging completed.
