In [1]:
import pandas as pd
import requests
import os
import json

In [2]:
# CSV export (semicolon-delimited) of the "disponibilita-parcheggi-storico" dataset.
URL = "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/disponibilita-parcheggi-storico/exports/csv?lang=it&timezone=UTC&use_labels=true&delimiter=%3B"

# Source column label -> snake_case name used in the cleaned frame.
RENAMED_COLUMNS = {
    'Parcheggio': 'parcheggio',
    'Data': 'data',
    'Posti liberi': 'posti_liberi',
    'Posti occupati': 'posti_occupati',
    'Posti totali': 'posti_totali',
}
# Columns that are not carried over into the cleaned frame.
DROPPED_COLUMNS = ['% occupazione', 'GUID', 'coordinate']

df = pd.read_csv(URL, sep=";")
# 'coordinate' is split on ", " into its two parts, stored as 'lat' and 'lon'.
df[['lat', 'lon']] = df['coordinate'].str.split(', ', expand=True)
df = df.drop(columns=DROPPED_COLUMNS)
df = df.rename(columns=RENAMED_COLUMNS)
df

Unnamed: 0,parcheggio,data,posti_liberi,posti_occupati,posti_totali,lat,lon
0,VIII Agosto,2024-06-07T01:59:00+00:00,484.0,141.0,625,44.500297,11.345368
1,Riva Reno,2024-06-07T02:09:00+00:00,369.0,101.0,470,44.501153,11.336062
2,Riva Reno,2024-06-07T02:19:00+00:00,369.0,101.0,470,44.501153,11.336062
3,VIII Agosto,2024-06-07T02:29:00+00:00,487.0,138.0,625,44.500297,11.345368
4,Riva Reno,2024-06-07T02:29:00+00:00,369.0,101.0,470,44.501153,11.336062
...,...,...,...,...,...,...,...
44394,Autostazione,2024-09-26T08:59:00+00:00,179.0,86.0,265,44.504422,11.346514
44395,VIII Agosto,2024-09-26T09:09:00+00:00,374.0,251.0,625,44.500297,11.345368
44396,Autostazione,2024-09-26T09:09:00+00:00,173.0,92.0,265,44.504422,11.346514
44397,Riva Reno,2024-09-26T09:19:00+00:00,331.0,139.0,470,44.501153,11.336062


In [5]:
import digitalhub as dh
import getpass as gt

# The project name is the fixed prefix immediately followed by the current user name.
current_user = gt.getuser()
PROJECT_NAME = "parcheggi-nk-scheduler" + current_user
proj = dh.get_or_create_project(PROJECT_NAME)
message = "created project {}"
print(message.format(PROJECT_NAME))
PROJECT_NAME

created project parcheggi-nk-schedulerdigitalhubdev


'parcheggi-nk-schedulerdigitalhubdev'

In [14]:
import os

# exist_ok=True keeps this cell re-runnable: without it a second execution
# raises FileExistsError because "src" already exists.
os.makedirs("src", exist_ok=True)

In [15]:
%%writefile "src/download_all.py"
from digitalhub_runtime_python import handler
import pandas as pd

@handler(outputs=["dataset"])
def downloader(project, url):
    """Read the parking CSV behind ``url`` and return it as a cleaned DataFrame.

    Args:
        project: Not used by this function.
        url: Object exposing ``as_df``; it is read as a semicolon-separated CSV.

    Returns:
        DataFrame with snake_case column names, without the '% occupazione',
        'GUID' and 'coordinate' columns, and with numeric 'lat' / 'lon'
        columns obtained by splitting 'coordinate' on ", ".
    """
    column_names = {
        'Parcheggio': 'parcheggio',
        'Data': 'data',
        'Posti liberi': 'posti_liberi',
        'Posti occupati': 'posti_occupati',
        'Posti totali': 'posti_totali',
    }
    unused_columns = ['% occupazione', 'GUID', 'coordinate']

    frame = url.as_df(file_format='csv', sep=";")
    frame[['lat', 'lon']] = frame['coordinate'].str.split(', ', expand=True)
    frame = frame.drop(columns=unused_columns).rename(columns=column_names)
    # The split yields strings; convert both coordinate columns to numbers.
    for coordinate in ("lat", "lon"):
        frame[coordinate] = pd.to_numeric(frame[coordinate])
    return frame

Writing src/download_all.py


In [16]:
# Register src/download_all.py as a Python function with `downloader` as its handler.
downloader_source = {"source": "src/download_all.py", "handler": "downloader"}
func = proj.new_function(
    name="downloader-funct",
    kind="python",
    python_version="PYTHON3_9",
    source=downloader_source,
)

In [17]:
# Register the remote CSV export as a "table" data item pointing at URL.
url_item_spec = dict(name="url_data_item", kind="table", path=URL)
di = proj.new_dataitem(**url_item_spec)

In [20]:
# Run the downloader as a job: the data item key is passed as the "url" input
# and the handler's "dataset" output is mapped to the name "dataset".
download_inputs = {"url": di.key}
download_outputs = {"dataset": "dataset"}
run_download = func.run(action="job", inputs=download_inputs, outputs=download_outputs)

# Predict day (Regression SARIMAX)

In [43]:
%%writefile "src/predict_parkings.py"
from digitalhub_runtime_python import handler
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pandas as pd
from sqlalchemy import create_engine
import datetime
from pandas.tseries.offsets import DateOffset
import os
import json

def serialize_datetime(obj):
    """Fallback serializer for ``json.dumps(..., default=serialize_datetime)``.

    Args:
        obj: The object ``json`` could not encode on its own.

    Returns:
        str: ``obj.isoformat()`` when ``obj`` is a ``datetime.datetime``.

    Raises:
        TypeError: For any other type.
    """
    if not isinstance(obj, datetime.datetime):
        raise TypeError("Type not serializable")
    return obj.isoformat()

    
def to_point(point):
    """
    Map a half-hour slot index to today's timestamp for that slot, encoded as JSON.

    Slot ``point`` lies ``point * 30`` minutes after midnight of the current
    local day (as given by ``datetime.datetime.today()``); any fractional
    minute is truncated.

    Args:
        point (int or float): Slot index; 0 is 00:00, 1 is 00:30, ..., 47 is 23:30.

    Returns:
        str: The ISO-8601 timestamp serialized with ``json.dumps``, i.e. the
        returned text includes the surrounding double quotes.

    Raises:
        ValueError: If the computed hour falls outside 0-23 (e.g. ``point >= 48``).

    Example (when run on 2022-01-01):
        >>> to_point(45)
        '"2022-01-01T22:30:00"'
    """
    slot_minutes = point * 30
    hour = int(slot_minutes / 60)
    minute = int(slot_minutes % 60)
    today = datetime.datetime.today()
    slot_start = datetime.datetime(today.year, today.month, today.day, hour, minute)
    return json.dumps(slot_start, default=serialize_datetime)


# Observations are bucketed into 30-minute slices, so one day is 48 steps.
_SLICE_FREQ = '30min'
_STEPS_PER_DAY = 48
_HISTORY_DAYS = 30


def _build_engine():
    """Create the SQLAlchemy engine for the predictions database from POSTGRES_USER / POSTGRES_PASSWORD."""
    username = os.getenv("POSTGRES_USER")
    password = os.getenv("POSTGRES_PASSWORD")
    if username is None or password is None:
        # Without this check the string concatenation below fails with an
        # unhelpful "can only concatenate str" TypeError.
        raise RuntimeError("POSTGRES_USER and POSTGRES_PASSWORD must be set to store the predictions")
    return create_engine('postgresql+psycopg2://' + username + ':' + password + '@database-postgres-cluster/digitalhub')


def _prepare_history(df, now):
    """Clean the raw observations and keep those between 30 days ago and 24 hours ago (relative to ``now``)."""
    history = df.drop(columns=['lat', 'lon'])
    history['data'] = history['data'].astype('datetime64[ms, UTC]')

    # Both time columns are made timezone-naive so they can be compared with the naive `now`.
    observed_at = history['data'].dt.tz_convert(None)
    history['date_time_slice'] = history['data'].dt.round(_SLICE_FREQ).dt.tz_convert(None)

    not_too_old = history['date_time_slice'] >= now - pd.Timedelta(days=_HISTORY_DAYS)
    not_too_recent = observed_at <= now - pd.Timedelta(days=1)
    history = history[not_too_old & not_too_recent]

    # Rows with a missing count are dropped. The previous row-wise
    # max(0, min(total, occupied)) silently turned a missing 'posti_occupati'
    # into 'posti_totali', i.e. reported the parking as completely full.
    history = history.dropna(subset=['posti_occupati', 'posti_totali'])

    # Clamp 'posti_occupati' into [0, posti_totali]: upper bound first, then
    # lower bound, matching max(0, min(total, occupied)).
    clamped = history['posti_occupati'].clip(upper=history['posti_totali']).clip(lower=0)
    return history.assign(posti_occupati=clamped)


def _forecast_parking(parcheggio, parking_history):
    """Fit a SARIMAX model on one parking's occupancy rate and return its 48-step forecast as a DataFrame."""
    # Occupancy per slice = occupied places / total places, summed over the
    # observations that fall in the slice. groupby sorts the slices in time order.
    sums = parking_history.groupby('date_time_slice')[['posti_occupati', 'posti_totali']].sum()
    occupancy = (sums['posti_occupati'] / sums['posti_totali']).reset_index(drop=True)

    model = SARIMAX(occupancy, order=(1, 0, 1), seasonal_order=(1, 1, 1, _STEPS_PER_DAY))
    fitted = model.fit(disp=-1)
    forecast = fitted.forecast(steps=_STEPS_PER_DAY)

    return pd.DataFrame({
        'predicted_mean': forecast.to_numpy(),
        'parcheggio': parcheggio,
        # Forecast step i is labelled with today's i-th half-hour slot.
        'datetime': [to_point(step) for step in range(len(forecast))],
    })


def _store_predictions(engine, predictions):
    """Replace the content of the 'parkings_prediction' table with ``predictions``."""
    from sqlalchemy import inspect, text

    # engine.begin() commits on success and rolls back on error, so the old
    # rows are only removed if the new ones are written. text() is required
    # because SQLAlchemy 2.x no longer executes plain SQL strings; the former
    # bare `except: pass` hid that failure and the table kept growing.
    with engine.begin() as connection:
        if inspect(connection).has_table('parkings_prediction'):
            connection.execute(text("DELETE FROM parkings_prediction"))
        predictions.to_sql('parkings_prediction', connection, if_exists="append")


@handler(outputs=["parking_data_predicted_regression"])
def predict_day(project, parkings_di):
    """
    Forecast the occupancy rate of every parking for 48 half-hour steps and store the result in PostgreSQL.

    The training data are the observations from the last 30 days, excluding
    the most recent 24 hours. One SARIMAX model is fitted per parking on the
    30-minute occupancy rate. The forecasts replace the content of the
    'parkings_prediction' table.

    Args:
        project: Not used by this function.
        parkings_di: The data item containing the parking data; it must
            provide the columns 'parcheggio', 'data', 'posti_occupati',
            'posti_totali', 'lat' and 'lon'.

    Returns:
        pandas.DataFrame: One row per parking and forecast step, with the
        columns 'predicted_mean', 'parcheggio' and 'datetime'.

    Raises:
        RuntimeError: If POSTGRES_USER or POSTGRES_PASSWORD is not set.
        ValueError: If no usable observation falls inside the training window.
    """
    # Build the engine first so that a missing configuration is reported
    # before the (slow) model fitting starts. No connection is opened here.
    engine = _build_engine()

    now = pd.Timestamp(datetime.datetime.today())
    history = _prepare_history(parkings_di.as_df(), now)
    if history.empty:
        raise ValueError("No parking observations in the training window; nothing to forecast")

    # sort=False keeps the parkings in order of first appearance.
    forecasts = [
        _forecast_parking(parcheggio, parking_history)
        for parcheggio, parking_history in history.groupby('parcheggio', sort=False)
    ]
    predictions = pd.concat(forecasts, ignore_index=True)[['predicted_mean', 'parcheggio', 'datetime']]

    _store_predictions(engine, predictions)
    return predictions

Overwriting src/predict_parkings.py


In [44]:
# Register src/predict_parkings.py as a Python function with `predict_day` as its handler.
predict_source = {"source": "src/predict_parkings.py", "handler": "predict_day"}
func = proj.new_function(
    name="predict-day",
    kind="python",
    python_version="PYTHON3_9",
    source=predict_source,
)

In [45]:
# Look up the "dataset" data item and pass its key to the prediction job.
dataset_item = proj.get_dataitem("dataset")
data_item_download = dataset_item.key
run_parkings = func.run(
    action="job",
    inputs={"parkings_di": data_item_download},
    outputs={},
)

## Pipeline

In this step we create a workflow pipeline that calls the download function and then the predict-day (regression) function, so that both can be run on a schedule.

In [64]:
%%writefile "src/parkings_pipeline_regression.py"

from digitalhub_runtime_kfp.dsl import pipeline_context

def myhandler(url):
    """Declare the two pipeline steps: download the dataset, then run the prediction on it.

    Args:
        url: Value passed as the "url" input of the "downloader-funct" job.
    """
    with pipeline_context() as pc:
        download_step = pc.step(
            name="download",
            function="downloader-funct",
            action="job",
            inputs={"url": url},
            outputs={"dataset": "dataset"},
        )
        # The prediction step consumes the "dataset" output of the download step.
        predict_step = pc.step(
            name="predict",
            function="predict-day",
            action="job",
            inputs={"parkings_di": download_step.outputs['dataset']},
            outputs={},
        )

Overwriting src/parkings_pipeline_regression.py


In [65]:
# Register the pipeline file as a "kfp" workflow with `myhandler` as its entry point.
pipeline_source = {"source": "src/parkings_pipeline_regression.py", "handler": "myhandler"}
workflow = proj.new_workflow(
    name="pipeline_parcheggi_regression",
    kind="kfp",
    source=pipeline_source,
)

In [67]:
# NOTE(review): a data item named "url_data_item" is also created in an earlier
# cell of this notebook - confirm that registering it again here is intended.
di = proj.new_dataitem(name="url_data_item", kind="table", path=URL)
pipeline_parameters = {"url": di.key}
c = workflow.run(parameters=pipeline_parameters)

## Schedule

In [3]:
# Schedule expression for the workflow run. A cron expression such as
# "*/5 * * * *" can be used here instead of "@hourly".
schedule_expression = "@hourly"
workflow.run(parameters={"url": di.key}, schedule=schedule_expression)

# Monitor

In this step we will create a scheduled Monitor workflow pipeline whose purpose is to:

1) call the service created after training a data prediction model with the darts framework and the N-BEATS deep learning model (see notebook parcheggi_ml.ipynb);
2) save the prediction in the database.
