In [1]:
import polars as pl
from sklearn import datasets
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the scikit-learn diabetes dataset and wrap its feature matrix in a polars frame.
data_dict = datasets.load_diabetes()
df_base = pl.DataFrame(data_dict.get("data"), schema=data_dict.get("feature_names"))

# Attach the regression target, then bucket "age" into four quantile bins
# labelled 'a'..'d' and store the label in a new "age_groups" column.
df = df_base.with_columns(pl.Series("target", data_dict.get("target"))).with_columns(
    pl.col("age").qcut(quantiles=4, labels=['a', 'b', 'c', 'd']).alias("age_groups")
)
# The per-age-group split into `model_df_dict` is done in its own cell further
# down; it used to be repeated verbatim here, so this cell now only loads and
# bins the data.

In [None]:
# Optional: lift the polars display limits so every row and column is printed.
# The settings are scoped to the `with` block and restored on exit.
with pl.Config():
    pl.Config.set_tbl_cols(-1)
    pl.Config.set_tbl_rows(-1)
    print(df)

In [4]:
# Split the dataset into one pandas frame per age group, keyed by the group label.
# `unique()` gives no ordering guarantee, so the labels are sorted to keep the
# dictionary (and every loop over it) in the same order on each run.
age_group_labels = df.get_column("age_groups").unique().sort().to_list()
model_df_dict = {
    str(label): df.filter(pl.col("age_groups") == label).to_pandas()
    for label in age_group_labels
}

In [5]:
# Fit a linear regression on one age group's rows and report its hold-out score.
def build_LinearRegression_model(
    model_df_dict: dict, age_group: str, random_state: int = 0
) -> tuple[LinearRegression, float]:
    """Fit a ``LinearRegression`` on the rows of a single age group.

    Parameters
    ----------
    model_df_dict : dict
        Mapping of age-group label to the pandas DataFrame holding that
        group's rows. Each frame must contain the ten feature columns listed
        below and a ``target`` column.
    age_group : str
        Key of the group to model.
    random_state : int, default 0
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple[LinearRegression, float]
        The fitted model and the value of ``model.score`` on the held-out split.

    Raises
    ------
    KeyError
        If ``age_group`` is not a key of ``model_df_dict``.
    """
    feature_cols = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

    # Index directly so an unknown group fails with a KeyError naming the key,
    # rather than a "'NoneType' object is not subscriptable" error from .get().
    group_df = model_df_dict[age_group]
    X = group_df[feature_cols]
    y = group_df['target'].to_list()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)

    model = LinearRegression()
    model.fit(X_train, y_train)
    model_score = model.score(X_test, y_test)
    return model, model_score
    

In [6]:
# Fit a gradient boosting regressor on one age group's rows and report its hold-out score.
def build_GradienBoosting_model(
    model_df_dict: dict, age_group: str, random_state: int = 0
) -> tuple[GradientBoostingRegressor, float]:
    """Fit a ``GradientBoostingRegressor`` on the rows of a single age group.

    Parameters
    ----------
    model_df_dict : dict
        Mapping of age-group label to the pandas DataFrame holding that
        group's rows. Each frame must contain the ten feature columns listed
        below and a ``target`` column.
    age_group : str
        Key of the group to model.
    random_state : int, default 0
        Seed forwarded to both ``train_test_split`` and the regressor so that
        repeated calls produce the same model and score.

    Returns
    -------
    tuple[GradientBoostingRegressor, float]
        The fitted model and the value of ``model.score`` on the held-out split.

    Raises
    ------
    KeyError
        If ``age_group`` is not a key of ``model_df_dict``.
    """
    feature_cols = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

    # Index directly so an unknown group fails with a KeyError naming the key,
    # rather than a "'NoneType' object is not subscriptable" error from .get().
    group_df = model_df_dict[age_group]
    X = group_df[feature_cols]
    y = group_df['target'].to_list()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)

    # Seed the estimator as well; the original left it unseeded, so the
    # reported score could differ from run to run.
    model = GradientBoostingRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    model_score = model.score(X_test, y_test)
    return model, model_score

In [1]:
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

# Point MLflow at a tracking server and select the experiment the run is recorded under.
# NOTE(review): the tracking URI is hard-coded to a local server — consider making it configurable.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("your-experiment-name2")
# Everything in this block executes inside a single MLflow run.
with mlflow.start_run() as run:
    # Synthetic regression data: 4 features, 2 of them informative.
    X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False)
    # Hold out 20% of the samples for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # The same dict configures the forest and is logged below as the run's parameters.
    params = {"max_depth": 2, "random_state": 42}
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)

    # Infer the model signature from the test inputs and the predictions made on them
    y_pred = model.predict(X_test)
    signature = infer_signature(X_test, y_pred)

    # Log parameters and the test-set mean squared error using the MLflow APIs
    mlflow.log_params(params)
    mlflow.log_metrics({"mse": mean_squared_error(y_test, y_pred)})

    # Log the sklearn model with its signature and register it under the given name
    # (the captured output of this cell shows that this created version 1)
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="sklearn-model",
        signature=signature,
        registered_model_name="sk-learn-random-forest-reg-model",
    )

2024/12/16 21:37:15 INFO mlflow.tracking.fluent: Experiment with name 'your-experiment-name2' does not exist. Creating a new experiment.
Successfully registered model 'sk-learn-random-forest-reg-model'.
2024/12/16 21:37:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sk-learn-random-forest-reg-model, version 1
Created version '1' of model 'sk-learn-random-forest-reg-model'.
2024/12/16 21:37:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run peaceful-doe-952 at: http://localhost:5000/#/experiments/1/runs/da07f33ec241475eb33151583ec46cec.
2024/12/16 21:37:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


In [None]:
# Smoke test: run the gradient-boosting builder on age group 'a' and keep both
# values it returns (the fitted model and its hold-out score).
model, score = build_GradienBoosting_model(model_df_dict=model_df_dict, age_group='a')

(10, -0.078464055859915)


In [7]:
# Loop through every age group, fit a model on its rows and print the hold-out score.
# NOTE(review): this cell used to call `build_model`, which is not defined in any
# visible cell and fails with a NameError on a fresh kernel. The linear-regression
# builder is used here instead — confirm that it is the intended model.
for age_group in model_df_dict.keys():
    _, group_score = build_LinearRegression_model(model_df_dict, age_group)
    print(group_score)

0.3830295939287842
0.4200441469358238
0.4469238120580451
0.2871730424430763
