# Monitor

In this step we will create a scheduled Monitor workflow pipeline, whose purpose is to:

1) call the service created after training a data prediction model using the darts framework and the NBEATS deep learning model (see notebook parcheggi_ml.ipynb);
2) save the predictions in the database.


## Platform Support - Data Ops
We use the platform support to read the training data that was created in the platform by running the notebook parcheggi_data_pipeline.ipynb.

In [None]:
import pandas as pd
import requests
import os
import json
import digitalhub as dh

In [None]:
# CSV export endpoint of the Bologna open-data "disponibilita-parcheggi-storico"
# dataset (Italian labels, UTC timestamps, ';' as delimiter).
URL = (
    "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/"
    "disponibilita-parcheggi-storico/exports/csv"
    "?lang=it&timezone=UTC&use_labels=true&delimiter=%3B"
)

In [None]:
# Platform project that hosts the scheduled monitoring function and workflow.
PROJECT_NAME = "parcheggi-scheduler"

# Fetch the project if it already exists, otherwise create it.
proj = dh.get_or_create_project(PROJECT_NAME)
print(f"created project {PROJECT_NAME}")
PROJECT_NAME

In [None]:
# Look up the data item named "dataset" in the project.
data_item_download = proj.get_dataitem("dataset")

In [None]:
# Load the data item as a DataFrame and preview its first rows.
parkings_df = data_item_download.as_df()
# Optional sanity check (row count of the first column):
# parkings_df[parkings_df.columns[0]].count()
parkings_df.head()

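In the script below, you need to update the identifier of the 'serve' run of the NBEATSModel service. From the project console, go to RUNS (model_serve), find the run in RUNNING state, and copy its identifier (the last part of the key value). Use it as the run identifier passed to `project.get_run`, for example: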

**project.get_run(identifier='f4823893-1785-4a14-aeb3-99335b64f0fb')**


In [None]:
%%writefile "src/predict_nbeats_timeseries.py"
import datetime
import json
import logging
import os
from urllib.parse import quote, quote_plus

import digitalhub as dh
import pandas as pd
import requests
from digitalhub_runtime_python import handler
from sqlalchemy import create_engine
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError

logger = logging.getLogger(__name__)

# Identifier of the RUNNING 'serve' run that exposes the NBEATS model.
# Update this value whenever the model service is redeployed.
SERVE_RUN_ID = '3067478c-5da4-4190-a587-73a1ddb3fac9'

RECORDS_API_URL = 'https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/disponibilita-parcheggi-storico/records'
DATE_FORMAT = '%Y-%m-%dT%H:%M:%S'
PREDICTION_TABLE = 'parkings_prediction_nbeats'
SLICED_TABLE = 'parkings_prediction_nbeats_sliced'
PREDICTION_COLUMNS = ['parcheggio', 'datetime', 'predicted_mean']

# Timeouts (seconds) so that a stalled endpoint cannot hang a scheduled run.
OPEN_DATA_TIMEOUT = 60
PREDICT_TIMEOUT = 300


def _fetch_recent_occupancy(parking_str, date_str):
    """Return the mean occupancy ratio per 30-minute slot for one parking.

    Queries the open-data records API for the latest 100 records of
    ``parking_str`` dated up to ``date_str``.

    Returns:
        A DataFrame indexed by ``date`` with a single ``value`` column
        (``posti_occupati / posti_totali`` averaged per slot), or ``None``
        when the API returns no records.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    # Percent-encode the name so characters such as '&', '#' or '+' cannot
    # alter the query string.
    # NOTE(review): a name containing a single quote would additionally need
    # escaping at the query-language level - confirm against the API docs.
    parking_quoted = quote(str(parking_str), safe='')
    api_url = (
        f'{RECORDS_API_URL}?where=data%3C%3D%27{date_str}%27'
        f'%20and%20parcheggio%3D%27{parking_quoted}%27'
        '&order_by=data%20DESC&limit=100'
    )
    with requests.get(api_url, timeout=OPEN_DATA_TIMEOUT) as response:
        response.raise_for_status()
        results = response.json().get('results') or []
    if not results:
        return None

    records = pd.json_normalize(results)
    occupancy = pd.DataFrame({
        'date': pd.to_datetime(records['data'], utc=True).dt.round('30min'),
        'value': records['posti_occupati'] / records['posti_totali'],
    })
    return occupancy.groupby('date').agg({'value': 'mean'})


def _request_prediction(service_url, occupancy):
    """POST the occupancy series to the model service and return its answer as a DataFrame."""
    # Round-trip through to_json so timestamps are serialised the same way
    # pandas serialises them (epoch milliseconds by default).
    payload = json.loads(occupancy.reset_index().to_json(orient='records'))
    with requests.post(f'http://{service_url}',
                       json={"inference_input": payload},
                       timeout=PREDICT_TIMEOUT) as response:
        response.raise_for_status()
        return pd.DataFrame(response.json())


def _create_db_engine():
    """Build the SQLAlchemy engine from POSTGRES_USER / POSTGRES_PASSWORD."""
    username = os.getenv("POSTGRES_USER")
    password = os.getenv("POSTGRES_PASSWORD")
    if username is None or password is None:
        raise RuntimeError("POSTGRES_USER and POSTGRES_PASSWORD must be set")
    # Quote the credentials so characters like '@' or '/' do not break the URL.
    return create_engine(
        f'postgresql+psycopg2://{quote_plus(username)}:{quote_plus(password)}'
        '@database-postgres-cluster/digitalhub'
    )


def _purge_old_rows(engine, table):
    """Best-effort deletion of rows older than 30 days from ``table``."""
    try:
        # text() + begin() works on both SQLAlchemy 1.4 and 2.x and commits
        # the DELETE when the block exits.
        with engine.begin() as connection:
            connection.execute(
                text(f"DELETE FROM {table} WHERE datetime < now() - interval '30 days'")
            )
    except SQLAlchemyError as exc:
        # Expected on the first run, when the table does not exist yet.
        logger.warning("Could not purge old rows from %s: %s", table, exc)


@handler()
def predict_day(project, parkings_di):
    """
    Monitor and predict parking occupancy.

    For every parking listed in ``parkings_di`` the latest occupancy records
    are fetched from the open-data API, sent to the served NBEATS model, and
    the returned predictions are merged into the ``parkings_prediction_nbeats``
    table. A copy with an extra ``slice_datetime`` column (the time of this
    run) is written to ``parkings_prediction_nbeats_sliced``.

    Args:
        project: project object; only ``get_run`` is used, to locate the
            serving run identified by ``SERVE_RUN_ID``.
        parkings_di: data item whose ``as_df()`` yields a DataFrame with a
            ``parcheggio`` column.

    Raises:
        requests.HTTPError: if the open-data API or the model service
            answers with an error status.
        RuntimeError: if the database credentials are not set.
    """
    # get serving predictor function run and its endpoint (loop-invariant)
    run_serve_model = project.get_run(identifier=SERVE_RUN_ID)
    service_url = run_serve_model.status.to_dict()["service"]["url"]

    # current date and time, truncated to seconds
    # NOTE(review): this is the local time of the runtime while the API
    # timestamps are parsed as UTC - confirm the job runs with TZ=UTC.
    now = datetime.datetime.now().replace(microsecond=0)
    date_str = now.strftime(DATE_FORMAT)

    # get parkings dataset and convert it to a dataframe
    parkings_df = parkings_di.as_df()

    # collect one prediction frame per parking and concatenate once
    frames = []
    for parking_str in parkings_df['parcheggio'].dropna().unique():
        occupancy = _fetch_recent_occupancy(parking_str, date_str)
        if occupancy is None:
            logger.warning("No recent records for parking %r, skipping", parking_str)
            continue
        res_df = _request_prediction(service_url, occupancy)
        res_df = res_df.assign(
            datetime=res_df['date'],
            parcheggio=parking_str,
            predicted_mean=res_df['value'],
        ).drop(columns=['date', 'value'])
        frames.append(res_df)

    if frames:
        pred_df = pd.concat(frames, ignore_index=True)
        extra_columns = [c for c in pred_df.columns if c not in PREDICTION_COLUMNS]
        pred_df = pred_df[PREDICTION_COLUMNS + extra_columns]
    else:
        pred_df = pd.DataFrame(columns=PREDICTION_COLUMNS)
    pred_df['datetime'] = pd.to_datetime(pred_df['datetime'], unit='ms')

    # write data to database
    engine = _create_db_engine()
    for table in (PREDICTION_TABLE, SLICED_TABLE):
        _purge_old_rows(engine, table)

    # read existing table (absent on the first run)
    try:
        saved_df = pd.read_sql_query(f'select * from {PREDICTION_TABLE}', engine)
    except (SQLAlchemyError, pd.errors.DatabaseError) as exc:
        logger.info("No saved predictions loaded from %s: %s", PREDICTION_TABLE, exc)
        saved_df = pd.DataFrame()
    # 'index' is the column to_sql adds when it stores the DataFrame index
    saved_df = saved_df.drop(columns=['index'], errors='ignore')

    non_empty = [df for df in (saved_df, pred_df) if not df.empty]
    new_df = pd.concat(non_empty) if non_empty else pred_df
    # NOTE(review): saved rows come first, so for a given (parcheggio, datetime)
    # the previously stored prediction wins over the fresh one - confirm intent.
    new_df = new_df.drop_duplicates(subset=['parcheggio', 'datetime'])
    new_df.to_sql(PREDICTION_TABLE, engine, if_exists="replace")
    new_df['slice_datetime'] = now
    new_df.to_sql(SLICED_TABLE, engine, if_exists="replace")

In [None]:
# Register the prediction script as a Python function of the project;
# "predict_day" is the handler inside src/predict_nbeats_timeseries.py.
func = proj.new_function(
    name="predict-day-nbeats-model",
    kind="python",
    python_version="PYTHON3_10",
    source={
        "source": "src/predict_nbeats_timeseries.py",
        "handler": "predict_day",
    },
)

In [None]:
# Pass the data item by key. A dedicated name is used so that
# `data_item_download` keeps referring to the data item object and the
# earlier `as_df()` cell can still be re-executed after this one.
dataset_key = proj.get_dataitem("dataset").key
run_monitor_parkings = func.run(
    action="job",
    inputs={"parkings_di": dataset_key},
    outputs={},
)

Wait until prediction run is completed

In [None]:
# Refresh the run and display its current state; re-execute this cell
# until the run reports completion.
run_monitor_parkings.refresh().status.state

In [None]:
# from digitalhub_runtime_python import handler
# from sqlalchemy import create_engine
# import datetime 
# import requests
# import json
# import os
# import pandas as pd
# import digitalhub as dh

# # get serving predictor function run
# run_serve_model =  proj.get_run(identifier='3067478c-5da4-4190-a587-73a1ddb3fac9')

# # get current date and time as string
# date_str = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

# # get parkings dataset and convert it to a dataframe
# parkings_df = data_item_download.as_df()

# # initialize an empty dataframe for predictions
# pred_df = pd.DataFrame(columns=['parcheggio', 'datetime', 'predicted_mean'])

# # iterate over each parking in the dataset
# parcheggi =  parkings_df['parcheggio'].unique()
# for parking_str in parcheggi:
#     # construct API URL based on parking and current date
#     API_URL = f'https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/disponibilita-parcheggi-storico/records?where=data%3C%3D%27{date_str}%27%20and%20parcheggio%3D%27{parking_str}%27&order_by=data%20DESC&limit=100'

#     # define the file to store the latest data
#     latest_data_file = 'last_records.json'

#     # fetch data from the API and save it to a file
#     with requests.get(API_URL) as r:
#         with open(latest_data_file, "wb") as f:
#             f.write(r.content)

#     # read the latest data from the file and process it
#     with open(latest_data_file) as f:
#         json_data = json.load(f)
#         df_latest = pd.json_normalize(json_data['results']).drop(columns=['guid', 'occupazione']).rename(columns={"coordinate.lon": "lon", "coordinate.lat": "lat"})
#         df_latest.data = df_latest.data.astype('datetime64[ns, UTC]')
#         df_latest['value'] = df_latest.posti_occupati / df_latest.posti_totali
#         df_latest['date'] = df_latest.data.dt.round('30min')
#         df_latest = df_latest.drop(columns=['parcheggio'])
#         df_latest = df_latest.groupby('date').agg({'value': 'mean'})

#     # convert the processed data to JSON and make a request to the serving predictor function
#     jsonstr = df_latest.reset_index().to_json(orient='records')
#     arr = json.loads(jsonstr)
#     SERVICE_URL = run_serve_model.status.to_dict()["service"]["url"]
#     with requests.post(f'http://{SERVICE_URL}', json={"inference_input":arr}) as r:
#         res = json.loads(r.content)
#     res_df = pd.DataFrame(res)
#     res_df['datetime'] = res_df['date']
#     res_df['parcheggio'] = parking_str
#     res_df['predicted_mean'] = res_df['value']
#     res_df = res_df.drop(columns=['date', 'value'])
#     pred_df = pd.concat([pred_df, res_df], ignore_index=True)

#     # write data to database
#     USERNAME = os.getenv("POSTGRES_USER")
#     PASSWORD = os.getenv("POSTGRES_PASSWORD")
#     engine = create_engine('postgresql+psycopg2://'+USERNAME+':'+PASSWORD+'@database-postgres-cluster/digitalhub')
    
# # save in db.
# with engine.connect() as connection:
#     try: connection.execute("DELETE FROM parkings_prediction_nbeats WHERE datetime < now() - interval '30 days'")
#     except: pass

# # save in db.
# with engine.connect() as connection:
#     try: connection.execute("DELETE FROM parkings_prediction_nbeats_sliced WHERE datetime < now() - interval '30 days'")
#     except: pass


# pred_df['datetime'] = pd.to_datetime(pred_df['datetime'], unit='ms')

# query = 'select * from parkings_prediction_nbeats'
# saved_df = pd.DataFrame()
# try:
#     saved_df = pd.read_sql_query(query, engine)
#     saved_df = saved_df.drop(columns=['index'])
# except: pass
    
# new_df = pd.concat([saved_df, pred_df])
# new_df = new_df.drop_duplicates(subset=['parcheggio', 'datetime'])
# len(new_df)
# new_df.to_sql('parkings_prediction_nbeats', engine, if_exists="replace")
# new_df['slice_datetime'] = date_str
# new_df.to_sql('parkings_prediction_nbeats_sliced', engine, if_exists="replace")

## Pipeline

In this step we will create a workflow pipeline that takes the dataset data item (the data fetched by the download function of the data pipeline) and passes it to the predict_day function, which produces predictions based on the NBEATS model. The entire workflow is scheduled for frequent runs, with the frequency provided as a CRON expression.

In [None]:
%%writefile "src/parkings_pipeline_nbeats_model.py"

from digitalhub_runtime_kfp.dsl import pipeline_context

def myhandler(di):
    """Define the pipeline: one job step running the NBEATS prediction function.

    Args:
        di: value forwarded to the step as its ``parkings_di`` input.
    """
    step_inputs = {"parkings_di": di}
    with pipeline_context() as pc:
        predict_step = pc.step(
            name="predict-day-nbeats-model",
            function="predict-day-nbeats-model",
            action="job",
            inputs=step_inputs,
            outputs={},
        )

In [None]:
# Register the pipeline script as a KFP workflow of the project.
workflow = proj.new_workflow(
    name="pipeline_parcheggi_nbeats_model",
    kind="kfp",
    source={
        "source": "src/parkings_pipeline_nbeats_model.py",
        "handler": "myhandler",
    },
)

## Schedule

The NBEATS model pipeline workflow is scheduled for frequent runs using a cron expression.

In [None]:
# Key of the "dataset" data item, passed to the workflow as its `di` parameter.
di = proj.get_dataitem("dataset").key
# Uncomment for a single run without a schedule:
# workflow.run(parameters={"di": di})

In [None]:
# Schedule the workflow with the cron expression "0 8-18 * * *":
# minute 0 of every hour from 08 to 18, every day.
# NOTE(review): the time zone used by the scheduler is not visible here - confirm.
workflow.run(parameters={"di": di}, schedule="0 8-18 * * *")