# Monitor

In this step we will create a scheduled Monitor workflow pipeline whose purpose is to:

1) call the service created after training a prediction model with the darts framework and the NBEATS deep learning model (see notebook parcheggi_ml.ipynb);
2) save the predictions in the database.


## Platform Support - Data Ops
We use the platform support to read the data that was stored in the platform by running the notebook parcheggi_data_pipeline.ipynb and that was used for training.

In [2]:
import pandas as pd
import requests
import os
import json
import digitalhub as dh

In [3]:
# CSV export endpoint of the "disponibilita-parcheggi-storico" dataset
# (Italian labels, UTC timestamps, ';' as delimiter).
URL = (
    "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/"
    "disponibilita-parcheggi-storico/exports/csv"
    "?lang=it&timezone=UTC&use_labels=true&delimiter=%3B"
)

In [4]:
# Open the project, creating it if it does not exist yet, and echo its name.
PROJECT_NAME = "parcheggi-scheduler"
proj = dh.get_or_create_project(PROJECT_NAME)
print(f"created project {PROJECT_NAME}")
PROJECT_NAME

created project parcheggi-scheduler


'parcheggi-scheduler'

In [4]:
# Fetch the "dataset" data item from the project.
data_item_download = proj.get_dataitem("dataset")

In [5]:
# Load the data item into a DataFrame and display it as the cell output.
parkings_df = data_item_download.as_df()
parkings_df

Unnamed: 0,parcheggio,data,posti_liberi,posti_occupati,posti_totali,lat,lon
0,Riva Reno,2024-07-02T08:09:00+00:00,343.0,127.0,470,44.501153,11.336062
1,VIII Agosto,2024-07-02T08:29:00+00:00,385.0,240.0,625,44.500297,11.345368
2,Autostazione,2024-07-02T08:49:00+00:00,214.0,51.0,265,44.504422,11.346514
3,Riva Reno,2024-07-02T08:59:00+00:00,319.0,151.0,470,44.501153,11.336062
4,Autostazione,2024-07-02T08:52:00+00:00,212.0,53.0,265,44.504422,11.346514
...,...,...,...,...,...,...,...
49981,Autostazione,2024-09-25T02:59:00+00:00,190.0,75.0,265,44.504422,11.346514
49982,Autostazione,2024-09-25T03:09:00+00:00,190.0,75.0,265,44.504422,11.346514
49983,Autostazione,2024-09-25T03:49:00+00:00,191.0,74.0,265,44.504422,11.346514
49984,VIII Agosto,2024-09-25T03:59:00+00:00,469.0,156.0,625,44.500297,11.345368


In this script, you need to update the identifier of the model 'serve' RUN. In the project console, open RUNS, find the model_serve run that is in the RUNNING state, and copy its identifier (the last part of the key value) into the following call:

**project.get_run(identifier='f4823893-1785-4a14-aeb3-99335b64f0fb')**


In [1]:
%%writefile "src/predict_nbeats_timeseries.py"
from digitalhub_runtime_python import handler
import datetime 
import requests
import json
import pandas as pd
import digitalhub as dh

def _fetch_recent_occupancy(parking_str, date_str):
    """Fetch the latest records of one parking and average occupancy per slot.

    Queries the Bologna open-data API for up to 100 records of
    ``parking_str`` dated no later than ``date_str``, most recent first.

    Returns a DataFrame indexed by ``date`` (record time rounded to 30
    minutes) with a single ``value`` column holding the mean of
    ``posti_occupati / posti_totali``, or ``None`` when the API returns no
    records for the parking.

    Raises ``requests.HTTPError`` when the API answers with an error status.
    """
    # Local import: the script's top-level imports do not include urllib.
    from urllib.parse import quote

    # Percent-encode the name so spaces and reserved characters ('&', '#', ...)
    # cannot break the query string.
    # NOTE(review): a name containing an apostrophe would still terminate the
    # quoted literal inside the `where` clause -- confirm no such name exists.
    parking_q = quote(str(parking_str), safe='')
    api_url = (
        'https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/'
        'disponibilita-parcheggi-storico/records'
        f'?where=data%3C%3D%27{date_str}%27%20and%20parcheggio%3D%27{parking_q}%27'
        '&order_by=data%20DESC&limit=100'
    )

    # Parse the response directly; no scratch file is needed in between.
    with requests.get(api_url) as r:
        r.raise_for_status()
        records = r.json()['results']
    if not records:
        return None

    df_latest = pd.json_normalize(records)
    slot = df_latest['data'].astype('datetime64[ns, UTC]').dt.round('30min')
    ratio = df_latest['posti_occupati'] / df_latest['posti_totali']
    return pd.DataFrame({'date': slot, 'value': ratio}).groupby('date').agg({'value': 'mean'})


def _request_prediction(service_url, history_df):
    """Post the occupancy history to the serving function and return its answer.

    ``history_df`` is serialised as a list of ``{date, value}`` records and
    sent as ``inference_input``. The JSON response is returned as a DataFrame.

    Raises ``requests.HTTPError`` when the service answers with an error status.
    """
    payload = json.loads(history_df.reset_index().to_json(orient='records'))
    with requests.post(f'http://{service_url}', json={"inference_input": payload}) as r:
        r.raise_for_status()
        return pd.DataFrame(r.json())


def _log_with_history(project, name, new_df, dedup_subset=None):
    """Append ``new_df`` to the data item ``name`` (if loadable) and log the result.

    When the existing data item cannot be loaded (for instance on the first
    run, when it does not exist yet) only ``new_df`` is logged. If
    ``dedup_subset`` is given, duplicates on those columns are dropped.
    """
    combined = new_df
    try:
        previous = project.get_dataitem(name).as_df()
    except Exception as exc:
        # Best effort, as before, but no longer silent and no longer catching
        # KeyboardInterrupt/SystemExit.
        print(f"Could not load existing data item '{name}' ({exc}); logging new rows only.")
    else:
        combined = pd.concat([previous, new_df], ignore_index=True)
        if dedup_subset is not None:
            # NOTE(review): drop_duplicates keeps the first occurrence, so an
            # already stored prediction wins over a newer one -- confirm intended.
            combined = combined.drop_duplicates(subset=dedup_subset)
    project.log_dataitem(name, data=combined, kind="table")


@handler()
def predict_day(project,  parkings_di):
    """
    Monitor and predict parking occupancy.

    For every parking found in ``parkings_di`` the latest occupancy records
    are fetched from the Bologna open-data API, sent to the model serving
    run, and the returned predictions are collected. The predictions are then
    logged to two data items of ``project``:

    * ``parking_prediction_nbeats_model``: history de-duplicated on
      ``(parcheggio, datetime)``;
    * ``parking_prediction_nbeats_model_sliced``: full history, each row
      tagged with the ``slice_datetime`` of the run that produced it.

    Parameters:
        project: project object providing ``get_run``, ``get_dataitem`` and
            ``log_dataitem``.
        parkings_di: data item whose ``as_df()`` yields a DataFrame with a
            ``parcheggio`` column.

    Raises:
        requests.HTTPError: if the open-data API or the serving function
            answers with an error status.
    """
    pred_columns = ['parcheggio', 'datetime', 'predicted_mean']

    # get serving predictor function run
    run_serve_model =  project.get_run(identifier='f4823893-1785-4a14-aeb3-99335b64f0fb')
    # The service URL does not depend on the parking: resolve it once.
    service_url = run_serve_model.status.to_dict()["service"]["url"]

    # get current date and time as string
    # NOTE(review): this is naive local time while the API timestamps are UTC;
    # confirm that the job runs with a UTC clock.
    date_str = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

    # get parkings dataset and convert it to a dataframe
    parkings_df = parkings_di.as_df()

    # Collect one prediction frame per parking and concatenate once at the end.
    frames = []
    for parking_str in parkings_df['parcheggio'].dropna().unique():
        df_latest = _fetch_recent_occupancy(parking_str, date_str)
        if df_latest is None:
            print(f"No recent records for parking '{parking_str}'; skipping.")
            continue

        res_df = _request_prediction(service_url, df_latest)
        res_df['datetime'] = res_df['date']
        res_df['parcheggio'] = parking_str
        res_df['predicted_mean'] = res_df['value']
        frames.append(res_df.drop(columns=['date', 'value']))

    if frames:
        pred_df = pd.concat(frames, ignore_index=True)
        # Keep the canonical columns first, followed by anything else the
        # service may have returned.
        extra = [c for c in pred_df.columns if c not in pred_columns]
        pred_df = pred_df[pred_columns + extra]
    else:
        pred_df = pd.DataFrame(columns=pred_columns)

    # log the de-duplicated prediction history as a dataset in the project
    _log_with_history(project, 'parking_prediction_nbeats_model', pred_df,
                      dedup_subset=['parcheggio', 'datetime'])

    # log the per-run ("sliced") prediction history as a dataset in the project
    sliced_df = pred_df.copy()
    sliced_df['slice_datetime'] = date_str
    _log_with_history(project, 'parking_prediction_nbeats_model_sliced', sliced_df)

Writing src/predict_nbeats_timeseries.py


In [6]:
# Register the script written above as a Python function of the project.
func = proj.new_function(
    name="predict-day-nbeats-model",
    kind="python",
    python_version="PYTHON3_10",
    source={
        "source": "src/predict_nbeats_timeseries.py",
        "handler": "predict_day",
    },
)

In [7]:
# Run the function once as a job, passing the key of the "dataset" data item
# as its `parkings_di` input.
data_item_download = proj.get_dataitem("dataset").key
run_monitor_parkings = func.run(
    action="job",
    inputs={"parkings_di": data_item_download},
    outputs={},
)

## Pipeline

In this step we will create a workflow pipeline whose purpose is to call the download function to fetch the data and pass it to the predict_day function, which produces predictions based on the NBEATS model. The entire workflow is scheduled for recurring runs at the frequency given by a CRON expression.

In [17]:
%%writefile "src/parkings_pipeline_nbeats_model.py"

from digitalhub_runtime_kfp.dsl import pipeline_context

def myhandler(url):
    """Define the two-step pipeline: download the data, then predict on it.

    Parameters:
        url: value forwarded as the ``url`` input of the download step.
    """
    with pipeline_context() as pc:
        s1_dataset = pc.step(
            name="download",
            function="downloader-funct",
            action="job",
            inputs={"url": url},
            outputs={"dataset": "dataset"},
        )
        # The function must be referenced by the name it was registered with
        # in the project ("predict-day-nbeats-model"), not "predict-day".
        pc.step(
            name="predict-day-nbeats-model",
            function="predict-day-nbeats-model",
            action="job",
            inputs={"parkings_di": s1_dataset.outputs['dataset']},
            outputs={},
        )

Writing src/parkings_pipeline_nbeats_model.py


In [18]:
# Register the pipeline script written above as a KFP workflow of the project.
workflow = proj.new_workflow(
    name="pipeline_parcheggi_nbeats_model",
    kind="kfp",
    source={
        "source": "src/parkings_pipeline_nbeats_model.py",
        "handler": "myhandler",
    },
)

## Schedule

The NBEATS model pipeline workflow is scheduled for recurring runs using a cron expression.

In [22]:
# Store the CSV export URL as a data item and schedule the workflow hourly.
# For a run every five minutes, pass schedule="*/5 * * * *" instead.
di = proj.new_dataitem(name="url_data_item", kind="table", path=URL)
workflow.run(parameters={"url": di.key}, schedule="@hourly")

{'kind': 'kfp+run', 'metadata': {'project': 'parcheggi-nk-scheduler-tenant2', 'name': 'd9a7f68c-6019-4a68-84f3-10d9462b88ff', 'created': '2024-10-02T10:48:42.985Z', 'updated': '2024-10-02T10:48:43.002Z', 'created_by': 'tenant2userid', 'updated_by': 'tenant2userid'}, 'spec': {'task': 'kfp+pipeline://parcheggi-nk-scheduler-tenant2/pipeline_parcheggi_nbeats_model:454416a4-33e3-4933-9e1f-05beefab227e', 'local_execution': False, 'source': {'source': 'src/parkings_pipeline_nbeats_model.py', 'handler': 'myhandler', 'base64': 'CmZyb20gZGlnaXRhbGh1Yl9ydW50aW1lX2tmcC5kc2wgaW1wb3J0IHBpcGVsaW5lX2NvbnRleHQKCmRlZiBteWhhbmRsZXIodXJsKToKICAgIHdpdGggcGlwZWxpbmVfY29udGV4dCgpIGFzIHBjOgogICAgICAgIHMxX2RhdGFzZXQgPSBwYy5zdGVwKG5hbWU9ImRvd25sb2FkIiwgZnVuY3Rpb249ImRvd25sb2FkZXItZnVuY3QiLCBhY3Rpb249ImpvYiIsIGlucHV0cz17InVybCI6dXJsfSxvdXRwdXRzPXsiZGF0YXNldCI6ImRhdGFzZXQifSkKICAgICAgICBzMl9wcmVkaWN0ID0gcGMuc3RlcChuYW1lPSJwcmVkaWN0LWRheS1uYmVhdHMtbW9kZWwiLCBmdW5jdGlvbj0icHJlZGljdC1kYXkiLCBhY3Rpb249ImpvYiIsIGlucHV