# LOAD LIBRARIES

In [21]:
import mlflow
import mlflow.xgboost
import xgboost as xgb

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt
import pandas as pd
import os

# SETTINGS

In [5]:
# Tracking-server ARN and experiment name used by the rest of the notebook.
# Each value can be overridden with an environment variable so the notebook
# can target another tracking server or experiment without editing this cell;
# when the variable is unset, the original hard-coded value is used.
mlflow_arn = os.environ.get(
    "MLFLOW_TRACKING_ARN",
    "arn:aws:sagemaker:eu-west-1:575618486322:mlflow-tracking-server/dev-mlflow",
)
mlflow_experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME", "02-sample-experiment")

# SET MLFLOW

In [6]:
# Point the MLflow client at the configured tracking server, then select the
# experiment by name. The second call is the cell's last expression, so its
# return value is what the notebook displays.
mlflow.set_tracking_uri(uri=mlflow_arn)
mlflow.set_experiment(experiment_name=mlflow_experiment_name)

2024/10/30 10:47:39 INFO mlflow.tracking.fluent: Experiment with name '02-sample-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://ipf-sds-datalake-dev-data-science-bucket/mlflow/2', creation_time=1730285259326, experiment_id='2', last_update_time=1730285259326, lifecycle_stage='active', name='02-sample-experiment', tags={}>

# LOAD DATA

In [7]:
# Load the diabetes dataset; `data` is kept because a later cell reads
# `data.feature_names` from it.
data = load_diabetes()
X, y = data.data, data.target

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# BUILD A MODEL

In [8]:
# Hyperparameters for the regressor, gathered in one place
hyperparams = {"n_estimators": 100, "max_depth": 3, "random_state": 42}

# Fit the regressor on the training split
model = xgb.XGBRegressor(**hyperparams)
model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# REGISTER AN ARTIFACT IN A FOLDER

In [13]:
# Start a new MLflow run and record the model, its parameters and metrics,
# and one sample input stored under the "DATA" artifact folder.
with mlflow.start_run() as run:
    # Kept at notebook scope: the retrieval cell looks the run up by this ID
    run_id = run.info.run_id
    print(f"Run ID: {run_id}")

    # Log all model parameters in a single batch call
    mlflow.log_params(model.get_params())

    # Log metrics
    mlflow.log_metrics({"mse": mse, "r2": r2})

    mlflow.set_tag("model_type", "xgboost")

    # Log the model
    mlflow.xgboost.log_model(model, "mymodel")

    # Log a sample input: the first test row, written to a temporary CSV
    sample_input = X_test[0]
    input_df = pd.DataFrame([sample_input], columns=data.feature_names)
    input_file = "sample_input.csv"
    try:
        input_df.to_csv(input_file, index=False)
        mlflow.log_artifact(input_file, artifact_path="DATA")
    finally:
        # Remove the temporary file even if writing or logging fails.
        # The earlier cleanup of `residuals_plot_file` was dropped: that name
        # is never defined in this notebook, so it raised NameError on a
        # fresh kernel.
        if os.path.exists(input_file):
            os.remove(input_file)

print("Logging completed.")

Run ID: 8195ec1a094349afb580725eddfe47eb


2024/10/30 10:50:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run overjoyed-doe-579 at: https://eu-west-1.experiments.sagemaker.aws/#/experiments/2/runs/8195ec1a094349afb580725eddfe47eb.
2024/10/30 10:50:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://eu-west-1.experiments.sagemaker.aws/#/experiments/2.


Logging completed.


# RETRIEVE AN ARTIFACT

In [39]:
import boto3
from mlflow.tracking import MlflowClient

# Create MLflow client
client = MlflowClient()

# Get the run logged by the previous cell
run = client.get_run(run_id)

# Get S3 path to artifacts
artifact_uri = run.info.artifact_uri
print(artifact_uri)

# Drop the leading "s3://" and split once on "/": the first part is the
# bucket name, the remainder (possibly empty) is the key prefix.
bucket, _sep, prefix = artifact_uri.replace("s3://", "", 1).partition("/")
print(bucket)
print(prefix)

# Initialize boto3 S3 client
s3_client = boto3.client("s3")

# List objects under the run's artifact prefix.
# NOTE: a single list_objects_v2 call returns one page of keys only; use a
# paginator if a run may hold more artifacts than one page.
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
artifact_keys = [obj["Key"] for obj in response.get("Contents", [])]

if artifact_keys:
    print("Files in specified S3 location:")
    for key in artifact_keys:
        print(key)  # Print the full path (key) of each file
else:
    print("No files found in the specified S3 location.")

# Retrieve the sample input logged under the "DATA" artifact folder. The key
# is built explicitly instead of taking whichever object is listed first, and
# the download is skipped when the object is absent (previously an empty
# listing raised KeyError here).
sample_key = f"{prefix}/DATA/sample_input.csv"
if sample_key in artifact_keys:
    response_file = s3_client.get_object(Bucket=bucket, Key=sample_key)

    # Read and decode the file content
    file_content = response_file["Body"].read().decode("utf-8")
    print(file_content)
else:
    print(f"Artifact not found: s3://{bucket}/{sample_key}")


s3://ipf-sds-datalake-dev-data-science-bucket/mlflow/2/97eac6e056ee4f5c9fc5453bb4bbe134/artifacts
ipf-sds-datalake-dev-data-science-bucket
mlflow/2/97eac6e056ee4f5c9fc5453bb4bbe134/artifacts
Files in specified S3 location:
mlflow/2/97eac6e056ee4f5c9fc5453bb4bbe134/artifacts/DATA/sample_input.csv
mlflow/2/97eac6e056ee4f5c9fc5453bb4bbe134/artifacts/model/MLmodel
mlflow/2/97eac6e056ee4f5c9fc5453bb4bbe134/artifacts/model/conda.yaml
mlflow/2/97eac6e056ee4f5c9fc5453bb4bbe134/artifacts/model/model.xgb
mlflow/2/97eac6e056ee4f5c9fc5453bb4bbe134/artifacts/model/python_env.yaml
mlflow/2/97eac6e056ee4f5c9fc5453bb4bbe134/artifacts/model/requirements.txt
mlflow/2/97eac6e056ee4f5c9fc5453bb4bbe134/artifacts/residuals_plot.png
age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0.04534098333546186,-0.044641636506989144,-0.006205954135807083,-0.015998975220305175,0.12501870313429186,0.1251981011367534,0.019186997017453092,0.03430885887772673,0.03243232415655107,-0.005219804415300423

