# Platform Support - Data Ops
We use the platform support to read the data stored in the platform by the notebook `parcheggi_data_pipeline.ipynb` and use it for training.

In [1]:
import pandas as pd
import requests
import os
import json

In [2]:
URL = "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/disponibilita-parcheggi-storico/exports/csv?lang=it&timezone=UTC&use_labels=true&delimiter=%3B"

In [3]:
import digitalhub as dh
import getpass as gt

# The project name is suffixed with the current OS user name (getpass.getuser()).
PROJECT_NAME = "parcheggi-nk-scheduler-"+gt.getuser()
# get_or_create_project is called with that name; the returned handle is used
# by the later cells to register functions and workflows.
proj = dh.get_or_create_project(PROJECT_NAME)
print("created project {}".format(PROJECT_NAME))
# Bare expression so the notebook echoes the project name as the cell result.
PROJECT_NAME

created project parcheggi-nk-scheduler-khurshid


'parcheggi-nk-scheduler-khurshid'

In [4]:
import os
# Folder that receives the sources written by the %%writefile cells below.
directory = "src"
# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(directory, exist_ok=True)

# Predict day (Regression SARIMAX)

In [5]:
%%writefile "src/predict_sarimax_regression.py"
from digitalhub_runtime_python import handler
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pandas as pd
from sqlalchemy import create_engine
import datetime
from pandas.tseries.offsets import DateOffset
import os
import json

# Define a custom function to serialize datetime objects 
def serialize_datetime(obj):
    """Return the ISO 8601 text of a datetime; reject every other type.

    Meant to be passed as the ``default=`` hook of ``json.dumps``.

    Args:
        obj: The object ``json.dumps`` could not serialize on its own.

    Returns:
        str: ``obj.isoformat()`` when ``obj`` is a ``datetime.datetime``.

    Raises:
        TypeError: If ``obj`` is not a ``datetime.datetime``.
    """
    if not isinstance(obj, datetime.datetime):
        raise TypeError("Type not serializable")
    return obj.isoformat()

    
def to_point(point):
    """
    Convert a half-hour slot index into a JSON-encoded timestamp for today.

    Each unit of ``point`` stands for 30 minutes counted from today's
    midnight (as reported by ``datetime.datetime.today()``), so 0 maps to
    00:00 and 47 maps to 23:30.

    Args:
        point (int | float): Half-hour slot index within the current day.

    Returns:
        str: The slot time serialized with ``json.dumps``, i.e. an ISO 8601
            timestamp wrapped in double quotes (the quotes are part of the
            returned value).

    Raises:
        ValueError: If the derived hour is 24 or more (``point >= 48``).

    Example:
        >>> to_point(45)  # doctest: +SKIP  (result depends on today's date)
        '"2022-01-01T22:30:00"'
    """
    today = datetime.datetime.today()
    # point * 30 is the offset in minutes from midnight; split it into h/m.
    hour = int(point * 30 / 60)
    minute = int(point * 30 % 60)
    dt = datetime.datetime(today.year, today.month, today.day, hour, minute)
    # The datetime is deliberately returned as JSON text rather than as a
    # datetime object; serialize_datetime supplies the ISO 8601 conversion.
    return json.dumps(dt, default=serialize_datetime)


# Number of days of history kept for fitting the model.
_HISTORY_DAYS = 30
# Readings are bucketed in 30-minute slices, so one day is 48 slices; this is
# used both as the seasonal period and as the forecast horizon.
_SLICES_PER_DAY = 48


def _prepare_history(df):
    """Return the cleaned readings used to fit the per-parking models."""
    # drop() already returns a new frame, so the caller's data is untouched.
    df_clean = df.drop(columns=['lat', 'lon'])

    # Convert 'data' column to timezone-aware (UTC) datetimes
    df_clean['data'] = df_clean['data'].astype('datetime64[ms, UTC]')

    # Round to the nearest 30 minutes, then strip the timezone
    slice_start = df_clean['data'].dt.round('30min').dt.tz_convert(None)
    reading_time = df_clean['data'].dt.tz_convert(None)
    df_clean['date_time_slice'] = slice_start

    # NOTE(review): the columns above are UTC wall-clock values while `now` is
    # the naive local time of the process; they line up only if the job runs
    # with a UTC clock -- confirm.
    now = datetime.datetime.today()

    # Keep only the last 30 days and discard readings newer than one day ago
    in_window = (
        (slice_start >= (now - pd.DateOffset(_HISTORY_DAYS)))
        & (reading_time <= (now - pd.DateOffset(1)))
    )
    df_clean = df_clean[in_window].copy()

    if df_clean.empty:
        # Without this guard the failure surfaces later as an obscure pandas
        # error; the exception type (ValueError) is the same.
        raise ValueError("No parking readings available in the training window")

    # Ensure that 'posti_occupati' is within the range of [0, posti_totali]
    df_clean['posti_occupati'] = df_clean.apply(
        lambda x: max(0, min(x['posti_totali'], x['posti_occupati'])), axis=1
    )
    return df_clean


def _forecast_parking(history, parcheggio):
    """Fit a SARIMAX model for one parking and forecast the next 48 slices."""
    parc_df = history[history['parcheggio'] == parcheggio]

    # Total occupied and total available places per 30-minute slice
    totals = (
        parc_df.groupby('date_time_slice')[['posti_occupati', 'posti_totali']]
        .sum()
        .sort_index()
    )

    # Occupancy rate per slice as a plain, positionally indexed Series
    data = (
        (totals['posti_occupati'] / totals['posti_totali'])
        .rename('occupied')
        .reset_index(drop=True)
    )

    # Seasonal period of 48 slices = one day of 30-minute slices
    sarima_model = SARIMAX(
        data, order=(1, 0, 1), seasonal_order=(1, 1, 1, _SLICES_PER_DAY)
    )
    results_SAR = sarima_model.fit(disp=-1)

    # reset_index() turns the forecast Series into a frame whose positional
    # index 0..47 is the slot number within the predicted day.
    pred = results_SAR.forecast(steps=_SLICES_PER_DAY).reset_index()
    pred['parcheggio'] = parcheggio
    pred['datetime'] = [to_point(step) for step in range(len(pred))]
    return pred


def _store_predictions(predictions):
    """Replace the rows of the 'parkings_prediction' table with ``predictions``."""
    # Function-scope imports keep this block self-contained.
    from urllib.parse import quote_plus

    from sqlalchemy import inspect, text

    username = os.getenv("POSTGRES_USER")
    password = os.getenv("POSTGRES_PASSWORD")
    if username is None or password is None:
        raise RuntimeError(
            "POSTGRES_USER and POSTGRES_PASSWORD must be set to store predictions"
        )

    # Credentials are URL-escaped so characters such as '@' or ':' in the
    # password cannot corrupt the connection URL.
    engine = create_engine(
        'postgresql+psycopg2://' + quote_plus(username) + ':' + quote_plus(password)
        + '@database-postgres-cluster/digitalhub'
    )

    # engine.begin() commits on success. text() is needed because SQLAlchemy
    # 2.x rejects raw SQL strings. Checking for the table first replaces the
    # previous bare `except: pass`, so a first run (no table yet) is still
    # fine while real database errors are no longer hidden.
    with engine.begin() as connection:
        if inspect(connection).has_table('parkings_prediction'):
            connection.execute(text("DELETE FROM parkings_prediction"))

    predictions.to_sql('parkings_prediction', engine, if_exists="append")


@handler(outputs=["parking_data_predicted_regression"])
def predict_day(project, parkings_di):
    """
    Predict the occupancy of every parking for the next 48 half-hour slices
    and store the result in the 'parkings_prediction' PostgreSQL table.

    Args:
        project: Project object passed to the handler; not used by this
            function.
        parkings_di: The data item containing the parking data. Its
            ``as_df()`` frame must provide the columns 'lat', 'lon', 'data',
            'parcheggio', 'posti_occupati' and 'posti_totali'.

    Returns:
        pandas.DataFrame: One row per parking and slice with the columns
            'predicted_mean', 'parcheggio' and 'datetime'. The ``@handler``
            decorator declares it as output
            "parking_data_predicted_regression".

    Raises:
        ValueError: If no readings fall inside the training window.
        RuntimeError: If POSTGRES_USER or POSTGRES_PASSWORD is not set.
    """
    # Convert data item to pandas DataFrame and clean it
    history = _prepare_history(parkings_di.as_df())

    # One independent model per parking location
    forecasts = [
        _forecast_parking(history, parcheggio)
        for parcheggio in history['parcheggio'].unique()
    ]

    predictions = pd.concat(forecasts, ignore_index=True)[
        ['predicted_mean', 'parcheggio', 'datetime']
    ]

    _store_predictions(predictions)
    return predictions

Writing src/predict_sarimax_regression.py


In [10]:
# Register the source file written above as a project function; "handler" names
# the entry point inside that file (predict_day).
func = proj.new_function(name="predict-day-sarimax-regression",
                         kind="python",
                         python_version="PYTHON3_10",
                         source={"source": "src/predict_sarimax_regression.py", "handler": "predict_day"})

In [11]:
# Run the function's "build" action with an extra install instruction for
# statsmodels, which src/predict_sarimax_regression.py imports.
func.run(action="build", instructions=["pip3 install statsmodels"])

{'kind': 'python+run', 'metadata': {'project': 'parcheggi-nk-scheduler-khurshid', 'name': '773f11f3-5d2e-403e-aa0e-ef242c7cf85e', 'created': '2024-10-11T07:54:26.01Z', 'updated': '2024-10-11T07:54:26.037Z', 'created_by': 'khurshid@fbk.eu', 'updated_by': 'khurshid@fbk.eu'}, 'spec': {'task': 'python+build://parcheggi-nk-scheduler-khurshid/predict-day-sarimax-regression:4e58846e-2153-47a4-9dac-791d1c043b67', 'local_execution': False, 'function': 'python://parcheggi-nk-scheduler-khurshid/predict-day-sarimax-regression:4e58846e-2153-47a4-9dac-791d1c043b67', 'source': {'source': 'src/predict_sarimax_regression.py', 'handler': 'predict_day', 'base64': 'ZnJvbSBkaWdpdGFsaHViX3J1bnRpbWVfcHl0aG9uIGltcG9ydCBoYW5kbGVyCmZyb20gc3RhdHNtb2RlbHMudHNhLnN0YXRlc3BhY2Uuc2FyaW1heCBpbXBvcnQgU0FSSU1BWAppbXBvcnQgcGFuZGFzIGFzIHBkCmZyb20gc3FsYWxjaGVteSBpbXBvcnQgY3JlYXRlX2VuZ2luZQppbXBvcnQgZGF0ZXRpbWUKZnJvbSBwYW5kYXMudHNlcmllcy5vZmZzZXRzIGltcG9ydCBEYXRlT2Zmc2V0CmltcG9ydCBvcwppbXBvcnQganNvbgoKIyBEZWZpbmUgYSBjdXN0b2

In [13]:
# Key of the project data item named "dataset"; it is passed as the
# `parkings_di` input of predict_day.
data_item_download = proj.get_dataitem("dataset").key
# Run the prediction once as a job, outside of any pipeline.
run_parkings = func.run(action="job",inputs={"parkings_di":data_item_download},outputs={})

## Pipeline

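In this step we create a workflow pipeline whose purpose is to call the download and predict-day (regression) functions on a schedule. In the current version of the pipeline the download step is commented out, so only the predict-day step runs.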

In [13]:
%%writefile "src/parkings_pipeline_sarimax_regression.py"

from digitalhub_runtime_kfp.dsl import pipeline_context

def myhandler(di):
    """Define the pipeline: a single job step that runs the prediction function.

    Args:
        di: Value forwarded as the ``parkings_di`` input of the
            ``predict-day-sarimax-regression`` function.
    """
    predict_inputs = {"parkings_di": di}
    with pipeline_context() as pc:
        # The download step (function "downloader-funct") is disabled here;
        # only the prediction step is part of the pipeline.
        predict_step = pc.step(
            name="predict-day-sarimax-regression",
            function="predict-day-sarimax-regression",
            action="job",
            inputs=predict_inputs,
            outputs={},
        )

Writing src/parkings_pipeline_sarimax_regression.py


In [14]:
# Register the pipeline source written above as a "kfp" workflow; "handler"
# names the pipeline function inside that file (myhandler).
workflow = proj.new_workflow(name="pipeline_parcheggi_sarimax_regression", kind="kfp", source={"source": "src/parkings_pipeline_sarimax_regression.py", "handler": "myhandler"})

In [None]:
# Key of the project data item named "dataset"; used below as the `di`
# parameter of the workflow.
data_item_download = proj.get_dataitem("dataset").key

In [15]:
# Disabled alternative that would register the source URL itself as a data item:
# di= proj.new_dataitem(name="url_data_item",kind="table",path=URL)
# Run the workflow once; the "di" key matches the parameter name of myhandler.
c = workflow.run(parameters={"di": data_item_download})

## Schedule

The regression pipeline workflow is scheduled to run periodically using a cron expression.

In [16]:
# Schedule the pipeline to run every hour. The parameter key must match the
# `di` argument of myhandler, and the value is the dataset key obtained above
# (the previous `di.key` referred to a variable that is never defined).
workflow.run(parameters={"di": data_item_download}, schedule="@hourly")
# For a run every five minutes use schedule="*/5 * * * *" instead.

{'kind': 'kfp+run', 'metadata': {'project': 'parcheggi-nk-scheduler-digitalhubdev', 'name': 'f82d0f78-9040-48eb-84d8-282d884e849d', 'created': '2024-09-30T11:51:27.291Z', 'updated': '2024-09-30T11:51:27.305Z', 'created_by': 'tenant1userid', 'updated_by': 'tenant1userid'}, 'spec': {'task': 'kfp+pipeline://parcheggi-nk-scheduler-digitalhubdev/pipeline_parcheggi_regression:f7c7d060-4903-4660-9943-f38636ca195f', 'local_execution': False, 'source': {'source': 'src/parkings_pipeline_regression.py', 'handler': 'myhandler', 'base64': 'CmZyb20gZGlnaXRhbGh1Yl9ydW50aW1lX2tmcC5kc2wgaW1wb3J0IHBpcGVsaW5lX2NvbnRleHQKCmRlZiBteWhhbmRsZXIodXJsKToKICAgIHdpdGggcGlwZWxpbmVfY29udGV4dCgpIGFzIHBjOgogICAgICAgIHMxX2RhdGFzZXQgPSBwYy5zdGVwKG5hbWU9ImRvd25sb2FkIiwgZnVuY3Rpb249ImRvd25sb2FkZXItZnVuY3QiLCBhY3Rpb249ImpvYiIsIGlucHV0cz17InVybCI6dXJsfSxvdXRwdXRzPXsiZGF0YXNldCI6ImRhdGFzZXQifSkKICAgICAgICBzMl9wcmVkaWN0ID0gcGMuc3RlcChuYW1lPSJwcmVkaWN0IiwgZnVuY3Rpb249InByZWRpY3QtZGF5IiwgYWN0aW9uPSJqb2IiLCBpbnB1dHM9eyJwYXJraW5