In [1]:
from datetime import datetime
import pickle
import joblib
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature
import mlflow.sklearn
import mlflow.pyfunc
from sklearn.preprocessing import MinMaxScaler
from config.config import MLFLOW_PROJECT, MLFLOW_PWD, MLFLOW_USER, historical_data_1hrfuture, RUTA_MODELOS
from sklearn.pipeline import Pipeline
import utils.utils
from utils.utils import *
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Point every MLflow call in this notebook at the shared tracking server and
# create the registry client that the cells below reuse.
mlflow.set_tracking_uri("http://geopiig.com:5000")
client = MlflowClient()

# Logging the initial model for each station and forecast horizon (O3, 1 h and 24 h ahead)

In [3]:
# Register the pre-trained XGBoost O3 models in MLflow: one run, one logged
# model and one registry version per (forecast horizon, station) pair.
stations = ['MER', 'UIZ']
times_future = [1, 24]
time_steps = 24
target = 'O3'
# Position of the O3 column inside the fitted MinMaxScaler (data_min_/data_max_).
o3_scaler_index = 4
# A handful of rows is enough for the signature and the stored input example;
# logging the whole training matrix only bloats the run artifacts.
n_example_rows = 5

for time_future in times_future:
    for station_code in stations:
        print(station_code)
        # Load the serialized model and the scaler for this station / horizon.
        model_dir = RUTA_MODELOS+'air-poll-predict-dev/ML/Modelos/best_model_XGBoost_'+str(time_steps)+'timesteps_O3_'+str(time_future)+'timefuture_'+station_code+'.pkl'
        loaded_model = joblib.load(model_dir)
        dir_scaler = RUTA_MODELOS+'air-poll-predict-dev/ML/Scalers/'+station_code+'_scaler.pkl'
        # NOTE: pickle executes arbitrary code on load; only use trusted scaler files.
        with open(dir_scaler, 'rb') as file:
            loaded_scaler = pickle.load(file)

        # Read the station's normalized table, rebuild the train/test split and
        # evaluate the model so its metrics can be logged with the run.
        station = station_code.lower()
        table_name = 'apicalidadaire_'+station+'_norm'
        X, y, df, dates = table_data(table_name, target, station)
        X_seq, y_seq = create_sequences2(X, y, time_steps, time_future)
        X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)
        predicciones_normalizadas = loaded_model.predict(X_test)

        # Undo the MinMax scaling by hand: the scaler was fitted on every
        # column, so inverse_transform cannot be applied to the O3 column alone.
        min_val = loaded_scaler.data_min_[o3_scaler_index]
        max_val = loaded_scaler.data_max_[o3_scaler_index]
        predicciones = predicciones_normalizadas * (max_val - min_val) + min_val
        y_test = y_test * (max_val - min_val) + min_val
        metrics_results = metrics(X_test, y_test, predicciones, printData=False)

        MLFLOW_experiment = f"{target} {time_future}hr forecast {station}"
        mlflow.set_experiment(MLFLOW_experiment)
        params = loaded_model.get_params()

        with mlflow.start_run() as run:
            run_id = run.info.run_id
            mlflow.log_params(params)

            # Keep the scaler next to the model so predictions can be denormalized later.
            mlflow.log_artifact(dir_scaler, artifact_path="artifacts")

            for metric_name, value in metrics_results.items():
                mlflow.log_metric(metric_name, value)

            # Free-text reminder of what this run was for.
            info =  f"XGboost model for {target}-{time_future} hr prediction, with the {station}-station data"
            mlflow.set_tag("Training Info",info)

            # The signature only records the input/output schema, so a small slice suffices.
            input_example = X_train[:n_example_rows]
            signature = infer_signature(input_example, loaded_model.predict(input_example))
            model_name = f"{target}-{station}_{time_future}hr_forecast_model"
            model_info = mlflow.sklearn.log_model(
                sk_model=loaded_model,
                artifact_path=f"{station} station model for {target}-{time_future}hr forecasting",
                signature=signature,
                input_example=input_example,
            )

            # Register explicitly so the version that was actually created is known.
            # Hard-coding '1' left the alias and tag on a stale version whenever
            # the registered model already existed (re-running this cell creates v2, v3, ...).
            model_version = mlflow.register_model(model_uri=model_info.model_uri, name=model_name)
            client.set_registered_model_alias(model_name, "champion", model_version.version)
            # NOTE(review): the 24 h models are tagged with the 1 h historical-data
            # value as well; confirm whether a separate 24 h constant should be used.
            client.set_model_version_tag(
                name=model_name,
                version=model_version.version,
                key="historicalData",
                value=historical_data_1hrfuture
            )

MER
6057
2


Registered model 'O3-mer_1hr_forecast_model' already exists. Creating a new version of this model...
2025/02/05 22:37:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: O3-mer_1hr_forecast_model, version 2
Created version '2' of model 'O3-mer_1hr_forecast_model'.


UIZ
4546
2


Registered model 'O3-uiz_1hr_forecast_model' already exists. Creating a new version of this model...
2025/02/05 22:37:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: O3-uiz_1hr_forecast_model, version 2
Created version '2' of model 'O3-uiz_1hr_forecast_model'.


MER
6057
2


Registered model 'O3-mer_24hr_forecast_model' already exists. Creating a new version of this model...
2025/02/05 22:37:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: O3-mer_24hr_forecast_model, version 2
Created version '2' of model 'O3-mer_24hr_forecast_model'.


UIZ
4546
2


Registered model 'O3-uiz_24hr_forecast_model' already exists. Creating a new version of this model...
2025/02/05 22:37:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: O3-uiz_24hr_forecast_model, version 2
Created version '2' of model 'O3-uiz_24hr_forecast_model'.


In [24]:
# Scratch check: show how many normalized predictions are currently held in the
# kernel. Relies on `predicciones_normalizadas` already existing (it is assigned
# by the registration loop and by the scratch cell that follows).
predicciones_normalizadas.shape

(896,)

In [23]:

# Scratch cell: re-run the prediction with whichever `loaded_model` and `X_test`
# the registration loop left in the kernel, and display the normalized output.
predicciones_normalizadas = loaded_model.predict(X_test)
# Disabled alternative: reshape to a column and call the scaler's inverse_transform.
#predicciones_normalizadas = predicciones_normalizadas.reshape(-1, 1)
#predicciones = loaded_scaler.inverse_transform(predicciones_normalizadas)
predicciones_normalizadas

array([0.5123995 , 0.20661518, 0.0691115 , 0.43910307, 0.5845062 ,
       0.2942079 , 0.28867874, 0.5399862 , 0.44263145, 0.2636149 ,
       0.1331098 , 0.16433848, 0.1286836 , 0.1266099 , 0.11534319,
       0.2945012 , 0.17290457, 0.15259965, 0.3437878 , 0.5677621 ,
       0.10512304, 0.4570045 , 0.5420686 , 0.35755098, 0.119244  ,
       0.01513318, 0.357254  , 0.53718084, 0.4602354 , 0.13168329,
       0.36686084, 0.23268494, 0.16821368, 0.31479388, 0.16613628,
       0.27324602, 0.29387668, 0.34584916, 0.3337541 , 0.44957396,
       0.14511365, 0.03505878, 0.1093286 , 0.3111384 , 0.1251995 ,
       0.32548943, 0.21406625, 0.5865842 , 0.3243709 , 0.05581969,
       0.4311216 , 0.29952562, 0.39735347, 0.51576376, 0.08200467,
       0.09602525, 0.10745889, 0.45755127, 0.6218891 , 0.22120109,
       0.3745377 , 0.04064942, 0.7289362 , 0.32042503, 0.06457216,
       0.45498884, 0.6080833 , 0.02745872, 0.22336665, 0.03060006,
       0.02821485, 0.46386075, 0.3343176 , 0.4737987 , 0.14599

In [11]:
# Scratch cell: view the normalized predictions as a single column (n, 1)
# and display the result.
predicciones_normalizadas2 = predicciones_normalizadas.reshape(-1, 1)
predicciones_normalizadas2

array([[0.5123995 ],
       [0.20661518],
       [0.0691115 ],
       [0.43910307],
       [0.5845062 ],
       [0.2942079 ],
       [0.28867874],
       [0.5399862 ],
       [0.44263145],
       [0.2636149 ],
       [0.1331098 ],
       [0.16433848],
       [0.1286836 ],
       [0.1266099 ],
       [0.11534319],
       [0.2945012 ],
       [0.17290457],
       [0.15259965],
       [0.3437878 ],
       [0.5677621 ],
       [0.10512304],
       [0.4570045 ],
       [0.5420686 ],
       [0.35755098],
       [0.119244  ],
       [0.01513318],
       [0.357254  ],
       [0.53718084],
       [0.4602354 ],
       [0.13168329],
       [0.36686084],
       [0.23268494],
       [0.16821368],
       [0.31479388],
       [0.16613628],
       [0.27324602],
       [0.29387668],
       [0.34584916],
       [0.3337541 ],
       [0.44957396],
       [0.14511365],
       [0.03505878],
       [0.1093286 ],
       [0.3111384 ],
       [0.1251995 ],
       [0.32548943],
       [0.21406625],
       [0.586

In [25]:
# Scratch check: per-column range stored in the scaler currently held in the
# kernel (`loaded_scaler` comes from an earlier cell).
loaded_scaler.data_range_

array([  4.05, 332.  , 331.  , 107.  , 139.  , 147.  ,  84.  ,  89.  ,
        64.  ,  23.  , 360.  ,   6.6 ])

In [27]:
# Column 4 of the scaler holds the O3 statistics: read its minimum and maximum.
min_val, max_val = loaded_scaler.data_min_[4], loaded_scaler.data_max_[4]

# Undo the MinMax scaling by hand to express the predictions in real units.
o3_span = max_val - min_val
predicciones_reales = predicciones_normalizadas * o3_span + min_val
print(predicciones_reales)

[ 71.223526    28.71951      9.606499    61.035328    81.24636
  40.894897    40.126343    75.05808     61.525772    36.64247
  18.502262    22.84305     17.88702     17.598778    16.032703
  40.935665    24.033735    21.211351    47.786503    78.91893
  14.6121025   63.523624    75.347534    49.699585    16.574917
   2.1035123   49.658306    74.66814     63.972717    18.303978
  50.993656    32.34321     23.381702    43.75635     23.092943
  37.981197    40.848858    48.073032    46.39182     62.49078
  20.170797     4.873171    15.196675    43.248238    17.40273
  45.24303     29.755209    81.5352      45.087555     7.758937
  59.9259      41.63406     55.232132    71.69116     11.398649
  13.34751     14.936786    63.59963     86.44259     30.746952
  52.06074      5.6502695  101.32213     44.539078     8.975531
  63.24345     84.52358      3.8167627   31.047964     4.2534084
   3.9218636   64.47665     46.470146    65.85802     20.292637
  46.872395    36.236767    80.869736    14.

# Loading the model

In [None]:
# Load the current "champion" model for one station and forecast the O3 value
# for the next hour from the most recent rows of the station's table.
station = "mer"  # lower-case, as used for the table and registered-model names at registration time
target = 'O3'
time_steps = 24
time_future = 1
model_name = f"{target}-{station}_{time_future}hr_forecast_model"
best_model_alias = "champion"

best_model = mlflow.pyfunc.load_model(f"models:/{model_name}@{best_model_alias}")
best_model_info = client.get_model_version_by_alias(model_name, best_model_alias)
best_model_version = best_model_info.version
best_model_run_id = best_model_info.run_id

# Build the model input from the station's normalized table and predict.
table_name = 'apicalidadaire_'+station+'_norm'
X, y, df, dates = table_data(table_name, target, station)
data = ingest(df, target, time_steps)
norm_predictions = best_model.predict(data)

# Find the scaler that was logged with the champion's run instead of guessing
# its file name: the registration cell stores it under "artifacts/".
artifacts = client.list_artifacts(best_model_run_id, path="artifacts")
scaler_candidates = [item.path for item in artifacts
                     if 'scaler' in item.path and item.path.endswith('.pkl')]
if not scaler_candidates:
    raise FileNotFoundError(
        f"No scaler pickle found under 'artifacts/' for run {best_model_run_id}"
    )
scaler_dir = scaler_candidates[0]
local_path = mlflow.artifacts.download_artifacts(run_id=best_model_run_id, artifact_path=scaler_dir)
# NOTE: pickle executes arbitrary code on load; the tracking server must be trusted.
with open(local_path, "rb") as f:
    loaded_scaler = pickle.load(f)

# Undo the MinMax scaling by hand using the O3 column (index 4) of the scaler.
min_val = loaded_scaler.data_min_[4]
max_val = loaded_scaler.data_max_[4]
# Denormalize the prediction made in THIS cell (not a leftover array from another cell).
predictions = norm_predictions * (max_val - min_val) + min_val

# `ingest` presumably yields a single input window, hence a single prediction;
# .item() raises if that assumption does not hold instead of silently picking one.
ozone_value = round(float(predictions.item()), 4)
print("The Ozone value for the next hour is", ozone_value, "ppb")

# Predictor class definition

In [27]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
class ozonePredictor():
    """Load, use and retrain the ozone forecasting model tracked in MLflow.

    Parameters
    ----------
    train_experiment : str
        Name of the MLflow experiment that receives the retraining runs.
    from_start : bool, default False
        When True, `load` builds a fresh untrained pipeline; otherwise it
        loads the registered model that carries the "champion" alias.
    station, target, time_steps, time_future :
        Station code (lower-case), pollutant column, number of past rows per
        input window and forecast horizon. The defaults reproduce the values
        that were previously hard-coded in the methods.
    """

    # Hyper-parameters explored by `train`.
    PARAM_GRID = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.01, 0.05, 0.1]
    }
    # Columns that are never used as model inputs.
    DROPPED_COLUMNS = ['idData', 'date', 'year', 'month', 'day', 'hour', 'minutes', 'NOX']
    # Position of the O3 column inside the fitted MinMaxScaler.
    TARGET_SCALER_INDEX = 4

    def __init__(self,
                 train_experiment,
                 from_start = False,
                 station = 'mer',
                 target = 'O3',
                 time_steps = 24,
                 time_future = 1):
        self.from_start = from_start
        self.train_experiment = train_experiment
        self.station = station
        self.target = target
        self.time_steps = time_steps
        self.time_future = time_future
        self.model = None
        self.model_alias = "champion"
        self.model_name = f"{target}-{station}_{time_future}hr_forecast_model"
        self.model_uri = f"models:/{self.model_name}@{self.model_alias}"

    def save(self,path):
        """Serialize the current model to `path` with joblib."""
        joblib.dump(self.model, path)

    def get_model_from_mlflow(self):
        """Return the champion model as a native scikit-learn estimator."""
        return mlflow.sklearn.load_model(self.model_uri)

    def load(self):
        """Populate `self.model` with a fresh pipeline or the registered champion."""
        if self.from_start:
            # A pipeline needs a final estimator to be trainable and to predict.
            self.model = Pipeline([
                    ('std_scaler',MinMaxScaler()),
                    ('regressor',XGBRegressor()),
                    ])
        else:
            # Load the sklearn flavour (not pyfunc) so `train` can grid-search it.
            self.model = self.get_model_from_mlflow()

    @staticmethod
    def _read_table(table_name):
        """Read the whole `public.<table_name>` table into a DataFrame."""
        engine = create_engine(f'postgresql://{DATABASE_USER}:{DATABASE_PASSWORD}@{DATABASE_HOST}:{DATABASE_PORT}/{DATABASE_NAME}')
        esquema = 'public'
        query = f"SELECT * FROM {esquema}.{table_name};"
        try:
            return pd.read_sql_query(query, engine)
        finally:
            # Release the pooled connections opened for this one-off query.
            engine.dispose()

    @staticmethod
    def _make_windows(features, targets, time_steps, time_future):
        """Turn row-wise data into flattened sliding windows.

        Window ``i`` is rows ``i .. i+time_steps-1`` of `features`, flattened,
        paired with ``targets[i + time_steps + time_future]``.
        Returns ``(X_seq, y_seq)`` as NumPy arrays.
        """
        Xs, ys = [], []
        for i in range(len(features) - time_steps - time_future):
            Xs.append(features[i:(i + time_steps)].flatten())
            ys.append(targets[i + time_steps + time_future])
        return np.array(Xs), np.array(ys)

    def load_historical_data(self, target, time_steps, station=None):
        """Return the latest `time_steps` rows as one flattened input window.

        `station` defaults to the station given to the constructor. The result
        has shape ``(1, time_steps * n_features)``.
        """
        if station is None:
            station = self.station
        df = self._read_table('apicalidadaire_'+station+'_norm').tail(time_steps)
        X = df.drop(columns=self.DROPPED_COLUMNS + [target])
        return np.array([X.to_numpy().flatten()])

    @staticmethod
    def load_dataset(table_name, target,station,time_steps, time_future, test_size):
        """Build the windowed dataset for a station and split it.

        `table_name` is used when given; when empty it is derived from
        `station`. Returns ``X_train, X_test, y_train, y_test``.
        """
        if not table_name:
            table_name = 'apicalidadaire_'+station+'_norm'
        df = ozonePredictor._read_table(table_name)
        y = df[target]
        X = df.drop(columns=ozonePredictor.DROPPED_COLUMNS + [target])
        X_seq, y_seq = ozonePredictor._make_windows(
            X.to_numpy(), y.to_numpy(), time_steps, time_future
        )
        # Same seed as the registration notebook cell so splits are comparable.
        X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=test_size, random_state=42)
        return X_train, X_test, y_train, y_test

    def predict(self,data):
        """Return the model's (normalized) predictions for `data`."""
        predictions =  self.model.predict(data)
        return predictions

    def train(self,):
        """Grid-search a new model, log it to MLflow and promote it if better.

        The new version gets the "validation_status" alias. It replaces the
        "champion" (the previous one becomes "old_champion") only when both
        its adjusted R2 is higher and its RMSE is lower, or when no champion
        exists yet.
        """
        # Imported here so the class does not depend on another notebook cell.
        from mlflow.exceptions import MlflowException

        client = MlflowClient()
        # Create the experiment on first use, then make it the active one.
        if not mlflow.get_experiment_by_name(self.train_experiment):
            mlflow.create_experiment(name=self.train_experiment)
        mlflow.set_experiment(self.train_experiment)

        if self.model is None:
            self.load()

        station = self.station
        target = self.target
        time_steps = self.time_steps
        time_future = self.time_future
        test_size = 0.2
        table_name = 'apicalidadaire_'+station+'_norm'
        X_train, X_test, y_train, y_test = self.load_dataset(table_name, target,station,time_steps, time_future, test_size)

        # A previously logged GridSearchCV wrapper is unwrapped to its best
        # estimator; pipeline parameters need the "<step>__" prefix.
        base_estimator = getattr(self.model, "best_estimator_", self.model)
        prefix = ''
        if isinstance(base_estimator, Pipeline):
            prefix = base_estimator.steps[-1][0] + '__'
        param_grid = {prefix + name: values for name, values in self.PARAM_GRID.items()}
        retrain_model = GridSearchCV(estimator=base_estimator, param_grid=param_grid, cv=5, scoring='r2',return_train_score=True)
        retrain_model.fit(X_train, y_train)

        # Evaluate in real units, like the metrics stored with the registered
        # models, so that the RMSE comparison below is meaningful.
        dir_scaler = RUTA_MODELOS+'air-poll-predict-dev/ML/Scalers/'+station.upper()+'_scaler.pkl'
        # NOTE: pickle executes arbitrary code on load; only use trusted scaler files.
        with open(dir_scaler, 'rb') as file:
            loaded_scaler = pickle.load(file)
        min_val = loaded_scaler.data_min_[self.TARGET_SCALER_INDEX]
        max_val = loaded_scaler.data_max_[self.TARGET_SCALER_INDEX]
        predicciones = retrain_model.predict(X_test) * (max_val - min_val) + min_val
        y_test_real = y_test * (max_val - min_val) + min_val
        metrics_results = metrics(X_test, y_test_real, predicciones, printData=False)

        run_name = f"{target}-{time_future}hr-{station}test"
        artifact_path = f"{station} station model for {target}-{time_future}hr forecasting"
        with mlflow.start_run(run_name=run_name) as run:
            run_id = run.info.run_id
            mlflow.log_params(retrain_model.best_params_)

            # Keep the scaler next to the model so predictions can be denormalized later.
            mlflow.log_artifact(dir_scaler, artifact_path="artifacts")

            for metric_name, value in metrics_results.items():
                mlflow.log_metric(metric_name, value)

            # Free-text reminder of what this run was for.
            info =  f"XGboost model for {target}-{time_future} hr prediction, with the {station}-station data"
            mlflow.set_tag("Training Info",info)

            # A few rows are enough to describe the input/output schema.
            input_example = X_train[:5]
            signature = infer_signature(input_example, retrain_model.predict(input_example))
            mlflow.sklearn.log_model(
                sk_model=retrain_model,
                artifact_path=artifact_path,
                signature=signature,
                input_example=input_example,
            )

            # register_model also creates the registered model on first use.
            model_uri = f"runs:/{run_id}/{artifact_path}"
            mv = mlflow.register_model(model_uri=model_uri, name=self.model_name)
            client.set_registered_model_alias(self.model_name, "validation_status", mv.version)
            client.set_model_version_tag(
                name=self.model_name,
                version= mv.version,
                key="historicalData",
                value=historical_data_1hrfuture
            )

            # Look up the current champion, if any, and its logged metrics.
            try:
                best_model_info = client.get_model_version_by_alias(self.model_name, self.model_alias)
            except MlflowException:
                best_model_info = None

            if best_model_info is None:
                is_better = True
            else:
                best_metrics = client.get_run(best_model_info.run_id).data.metrics
                is_better = (metrics_results["r2adjusted"] > best_metrics["r2adjusted"]
                             and metrics_results["rmse"] < best_metrics["rmse"])

            if is_better:
                # Promote the new version and remember the one it replaces.
                client.set_registered_model_alias(self.model_name, self.model_alias, mv.version)
                client.delete_registered_model_alias(self.model_name, "validation_status")
                if best_model_info is not None:
                    client.set_registered_model_alias(self.model_name, "old_champion", best_model_info.version)
            else:
                print("No se mejoró el modelo")
            

In [None]:
# Draft of the logic for the future /predict endpoint (128.000:8080/predict):
# take the current time and round it to the nearest whole hour.
datetime_now = datetime.now()
print(datetime_now)
# A scalar datetime converts to a pandas Timestamp, which rounds directly;
# the `.dt` accessor only exists on Series, and the string-parsing options
# (format / dayfirst / errors) do not apply to an object that is already a datetime.
date = pd.Timestamp(datetime_now).round('h')
print(date)
