# Función para realizar las predicciones

In [1]:
from datetime import datetime
#import sklearn
import pickle
import joblib
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature
import mlflow.sklearn
import mlflow.pyfunc
from sklearn.preprocessing import MinMaxScaler
from config.config import MLFLOW_PROJECT, MLFLOW_PWD, MLFLOW_USER, historical_data_1hrfuture
from sklearn.pipeline import Pipeline
#import os
import utils.utils
from utils.utils import *
from sklearn.model_selection import train_test_split
import pandas as pd
import pyspark
#from config.config import DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME, DATABASE_PORT
#from sqlalchemy import create_engine

In [2]:
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
client = MlflowClient()

# Logging the models

In [3]:
station = 'mer'
table_name = 'apicalidadaire_'+station+'_norm'
target = 'O3'
X, y, df, dates = table_data(table_name, target, station)
time_steps = 24
time_future =1
X_seq, y_seq = create_sequences2(X, y, time_steps, time_future)
# Dividir los datos en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

In [4]:
model_dir = 'C:/Users/valer/Documents/CIC/doctorado/Proyecto_Innovacion/air-poll-predict-dev/ML/Modelos/best_model_XGBoost_24timesteps_O3_1timefuture_MER_2024-08-06 12_34.pkl'
loaded_model = joblib.load(model_dir)
dir_scaler = 'C:/Users/valer/Documents/CIC/doctorado/Proyecto_Innovacion/air-poll-predict-dev/ML/Scalers/MER_scaler_O3.pkl'
with open(dir_scaler, 'rb') as file:
    loaded_scaler = pickle.load(file)

predicciones_normalizadas = loaded_model.predict(X_test)
predicciones_normalizadas = predicciones_normalizadas.reshape(-1, 1)
predicciones = loaded_scaler.inverse_transform(predicciones_normalizadas)
y_test_normalizadas = y_test.reshape(-1, 1)
y_test = loaded_scaler.inverse_transform(y_test_normalizadas)
metrics_results = metrics(X_test, y_test, predicciones)

R^2: 0.792765
R^2 ajustado: 1.476641
RMSE 16.178246
MAE: 11.134049


In [7]:
mlflow.set_experiment(MLFLOW_PROJECT)
run_name = f"{target}-{time_future}hr-{station}test"
params = loaded_model.get_params()
# Start an MLflow run
with mlflow.start_run() as run:
    # Log the hyperparameters
    mlflow.log_params(params)
    run_id = run.info.run_id

    with open(dir_scaler, 'rb') as file:
        loaded_scaler = pickle.load(file)
    
    # Loguear el archivo JSON como un artifact en MLflow
    mlflow.log_artifact(dir_scaler, artifact_path="artifacts")


    # Log the loss metric
    for metric_name, value in metrics_results.items():
        mlflow.log_metric(metric_name, value)

    # Set a tag that we can use to remind ourselves what this run was for
    info =  f"XGboost model for {target}-{time_future} hr prediction, with the {station}-station data"
    mlflow.set_tag("Training Info",info)

    # Infer the model signature
    signature = infer_signature(X_train, loaded_model.predict(X_train))
    model_name = f"{target}-{station}_{time_future}hr_forecast_model"
    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=loaded_model,
        artifact_path=f"{station} station model for {target}-{time_future}hr forecasting",
        signature=signature,
        input_example=X_train,
    )




In [8]:
# Create a new version of the rfr model under the registered model name
#model_uri = "runs:/{}/sklearn-model".format(run.info.run_id)
model_uri = f"runs:/{run_id}/mer station model for O3-1hr forecasting".format(run.info.run_id)
mv = client.create_model_version(model_name, model_uri, run.info.run_id)
# Set registered model alias
client.set_registered_model_alias(model_name, "validation_status", mv.version)
# Asignar un tag al modelo registrado
client.set_model_version_tag(
    name=model_name,
    version= mv.version,
    key="historicalData",
    value=historical_data_1hrfuture
)

2024/08/13 21:54:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: O3-MER_1hr_forecast_model, version 13


### Registrar como el mejor modelo, si supero las métricas

In [9]:
client = MlflowClient()
model_name = "O3-mer_1hr_forecast_model"
best_model_alias = "champion"
#best_model = mlflow.pyfunc.load_model(f"models:/{model_name}@{best_model_alias}")
best_model_info = client.get_model_version_by_alias(model_name, best_model_alias)
best_model_version = best_model_info.version
best_model_run_id = best_model_info.run_id
# Acceder a las métricas del mejor modelo
best_metrics = client.get_run(best_model_run_id).data.metrics
# Comparar las métricas del modelo actual con el mejor modelo hasta el momento
print("Métricas del run:", run_id)
if metrics_results["r2adjusted"] > best_metrics["r2adjusted"] and metrics_results["rmse"] < best_metrics["rmse"]:
    #Registra el nuevo modelo como el mejor
    client.set_registered_model_alias(model_name, best_model_alias, mv.version)
    client.delete_registered_model_alias(model_name, "validation_status") 
    client.set_registered_model_alias(model_name, "old_champion", best_model_version)
    

Métricas del run: 96eab81bee814307ba32388fcef4142b


# Loading the model

In [11]:
model_name = "O3-mer_1hr_forecast_model"
alias = "champion"
best_model = mlflow.pyfunc.load_model(f"models:/{model_name}@{alias}")


Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

In [18]:
station = "MER"
X, y, df, dates = table_data(table_name, target, station)
data = ingest(df, target, time_steps)
norm_predictions = best_model.predict(data)
norm_predictions

array([0.11871517], dtype=float32)

In [17]:
artifacts = client.list_artifacts(run_id, path="artifacts")
local_path = mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path="artifacts/MER_scaler_O3.pkl")

# Abrir el archivo .pkl descargado
with open(local_path, "rb") as f:
    scaler = pickle.load(f)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
norm_predictions = norm_predictions.reshape(-1, 1)
predictions = loaded_scaler.inverse_transform(norm_predictions)
ozone_value = round(float(predictions),4)
print("The Ozone value for the next hour is", ozone_value, "ppb")

The Ozone value for the next hour is 19.7067 ppb


In [27]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [29]:
class ozonePredictor():    
    def __init__(self,
                 train_experiment,
                 from_start = False):
        self.from_start = from_start
        self.train_experiment = train_experiment
        
    def save(self,path):
        joblib.dump(self.model, 'model.pkl')
    
    def get_model_from_mlflow(self):
        return mlflow.sklearn.load_model(self.model_uri)
    
    def load(self):
        if self.from_start:
            self.model = Pipeline([
                    ('std_scaler',MinMaxScaler()),
                    ])
            
        else:
            
            # ------------------------
            # Cargar el ultimo modelo registrado
            # ------------------------   
            #self.model = mlflow.pytorch.load_model(model_uri)
            model_name = "O3-mer_1hr_forecast_model"
            alias = "champion"
            self.model = mlflow.pyfunc.load_model(f"models:/{model_name}@{alias}")
            
    def load_historical_data(self, target, time_steps):
        #Conexion con postgress     
        engine = create_engine(f'postgresql://{DATABASE_USER}:{DATABASE_PASSWORD}@{DATABASE_HOST}:{DATABASE_PORT}/{DATABASE_NAME}')
        esquema = 'public'
        # Recuperar los datos y cargar en un DataFrame
        table_name = 'apicalidadaire_'+station+'_norm'
        query = f"SELECT * FROM {esquema}.{table_name};"
        df = pd.read_sql_query(query, engine)
        df = df.tail(time_steps)
        X = df.drop(columns=['idData', 'date', 'year', 'month', 'day', 'hour', 'minutes', 'NOX'])
        X = X.drop(columns=[target])
        array = X.to_numpy()
        vector = array.flatten()
        return np.array([vector])
            
    def load_dataset(table_name, target,station,time_steps, time_future, test_size):
        engine = create_engine(f'postgresql://{DATABASE_USER}:{DATABASE_PASSWORD}@{DATABASE_HOST}:{DATABASE_PORT}/{DATABASE_NAME}')
        esquema = 'public'
        # Recuperar los datos y cargar en un DataFrame
        table_name = 'apicalidadaire_'+station+'_norm'
        query = f"SELECT * FROM {esquema}.{table_name};"
        df = pd.read_sql_query(query, engine)
        dates = df.date
        y = df[target]
        X = df.drop(columns=['idData', 'date', 'year', 'month', 'day', 'hour', 'minutes', 'NOX'])
        X = X.drop(columns=[target])
        Xs, ys = [], []
        for i in range(len(X) - time_steps-time_future):
            df = X[i:(i + time_steps)]
            array = df.to_numpy()
            # Aplanar el array a un vector
            vector = array.flatten()
            Xs.append(vector)
            ys.append(y[i + time_steps+time_future])
        X_seq = np.array(Xs)
        y_seq = np.array(ys)
        # Dividir los datos en conjunto de entrenamiento y prueba
        X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=test_size, random_state=42)
        return X_train, X_test, y_train, y_test

    def predict(self,data):
        #data = [CO, NO, NO2, SO2, PM10, PM25, RH, TMP, TRFC..etc]
        predictions =  self.model.predict(data)
        return predictions
        
                
    def train(self,):
        # ------------------------
        # Acceder a los registros de MLflow
        # ------------------------   
        # Si no existe el experimento, lo crea y configura para registrar los parámetros del modelo
        if not mlflow.get_experiment_by_name(self.train_experiment):
            mlflow.create_experiment(name=self.train_experiment)
        # URL y puerto del servidor MLFLOW
        mlflow.set_experiment(self.train_experiment)
        experiment = mlflow.get_experiment_by_name(self.train_experiment)
        
        # Dividir los datos en conjunto de entrenamiento y prueba
        table_name = 'apicalidadaire_'+station+'_norm'
        target = 'O3'
        station = 'mer'
        time_steps = 24
        time_future = 1
        test_size = 0.2
        X_train, X_test, y_train, y_test = self.load_dataset(table_name, target,station,time_steps, time_future, test_size)
        # ------------------------
        # 5 Entrenamiento
        # ------------------------
        # Definir los parámetros para GridSearchCV
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 4, 5],
            'learning_rate': [0.01, 0.05, 0.1]
        }
        retrain_model = GridSearchCV(estimator=self.model, param_grid=param_grid, cv=5, scoring='r2',return_train_score=True)
        # Entrenar GridSearchCV
        retrain_model.fit(X_train, y_train)

        mlflow.set_experiment(MLFLOW_PROJECT)
        run_name = f"{target}-{time_future}hr-{station}test"
        params = loaded_model.get_params()
        # Start an MLflow run
        with mlflow.start_run() as run:
            # Log the hyperparameters
            mlflow.log_params(params)
            run_id = run.info.run_id

            with open(dir_scaler, 'rb') as file:
                loaded_scaler = pickle.load(file)
            
            # Loguear el archivo JSON como un artifact en MLflow
            mlflow.log_artifact(dir_scaler, artifact_path="artifacts")


            # Log the loss metric
            for metric_name, value in metrics_results.items():
                mlflow.log_metric(metric_name, value)

            # Set a tag that we can use to remind ourselves what this run was for
            info =  f"XGboost model for {target}-{time_future} hr prediction, with the {station}-station data"
            mlflow.set_tag("Training Info",info)

            # Infer the model signature
            signature = infer_signature(X_train, retrain_model.predict(X_train))
            model_name = f"{target}-{station}_{time_future}hr_forecast_model"
            # Log the model
            model_info = mlflow.sklearn.log_model(
                sk_model=retrain_model,
                artifact_path=f"{station} station model for {target}-{time_future}hr forecasting",
                signature=signature,
                input_example=X_train,
            )

            model_uri = f"runs:/{run_id}/mer station model for O3-1hr forecasting".format(run.info.run_id)
            mv = client.create_model_version(model_name, model_uri, run.info.run_id)
            # Set registered model alias
            client.set_registered_model_alias(model_name, "validation_status", mv.version)
            # Asignar un tag al modelo registrado
            client.set_model_version_tag(
                name=model_name,
                version= mv.version,
                key="historicalData",
                value=historical_data_1hrfuture
            )
            #Pregunta si el nuevo modelo es mejor que el último mejor modelo registrado
            #De ser así, registra el nuevo modelo como el mejor bajo el alias de "champion"
            client = MlflowClient()
            model_name = "O3-mer_1hr_forecast_model"
            best_model_alias = "champion"
            best_model_info = client.get_model_version_by_alias(model_name, best_model_alias)
            best_model_version = best_model_info.version
            best_model_run_id = best_model_info.run_id
            # Acceder a las métricas del mejor modelo
            best_metrics = client.get_run(best_model_run_id).data.metrics
            # Comparar las métricas del modelo actual con el mejor modelo hasta el momento
            if metrics_results["r2adjusted"] > best_metrics["r2adjusted"] and metrics_results["rmse"] < best_metrics["rmse"]:
                #Registra el nuevo modelo como el mejor
                client.set_registered_model_alias(model_name, best_model_alias, mv.version)
                client.delete_registered_model_alias(model_name, "validation_status") 
                client.set_registered_model_alias(model_name, "old_champion", best_model_version)
            else:
                print("No se mejoró el modelo")
            

In [None]:
# 128.000:8080/predict
#def predict():
datetime_now = datetime.now()
print(datetime_now)
date = pd.to_datetime(
                datetime_now,
                format='%d/%m/%Y %H:%M:%S',
                dayfirst=True,
                errors='coerce'
            )
date = date.dt.round('H')
print(date)


In [1]:
from mlflow.tracking import MlflowClient

# Crear un cliente de MLflow
client = MlflowClient()

# Borrar todas las versiones de todos los modelos registrados
def delete_all_model_versions():
    registered_models = client.list_registered_models()
    for model in registered_models:
        model_name = model.name
        versions = client.get_latest_versions(model_name, stages=["None", "Staging", "Production", "Archived"])
        for version in versions:
            client.delete_model_version(model_name, version.version)
        # Opcional: eliminar el registro del modelo si no quedan versiones
        client.delete_registered_model(model_name)

# Borrar todos los experimentos
def delete_all_experiments():
    experiments = client.list_experiments()
    for experiment in experiments:
        client.delete_experiment(experiment.experiment_id)

# Ejecutar las funciones
delete_all_model_versions()
delete_all_experiments()


AttributeError: 'MlflowClient' object has no attribute 'list_registered_models'