In [None]:
import pandas as pd
import requests
import os
import json

## 1. Data Exploration

### 1.1. Download data
Download data from the API, and load it into a pandas dataframe.

In [None]:
# Open-data export of the historical parking availability; the query string asks
# for a semicolon-delimited CSV with labelled columns and UTC timestamps.
URL = "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/disponibilita-parcheggi-storico/exports/csv?lang=it&timezone=UTC&use_labels=true&delimiter=%3B"

df = pd.read_csv(URL, sep=";")
# 'coordinate' packs both values into one "lat, lon" string: split it in two columns.
df[['lat', 'lon']] = df['coordinate'].str.split(', ',expand=True)
df = df.drop(columns=['% occupazione', 'GUID', 'coordinate']).rename(columns={'Parcheggio': 'parcheggio', 'Data': 'data', 'Posti liberi': 'posti_liberi', 'Posti occupati': 'posti_occupati', 'Posti totali': 'posti_totali'})
# The split yields strings; store the coordinates as numbers, exactly as the
# `downloader` platform function does, so both code paths produce the same frame.
df[['lat', 'lon']] = df[['lat', 'lon']].apply(pd.to_numeric)
df

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

### 1.2. Extract parkings
Extract distinct parkings from the dataframe.

In [None]:
# Columns that describe a parking (name and position).
KEYS = ['parcheggio', 'lat', 'lon']

# Take the first reading of every parking and keep only the descriptive columns.
first_per_parking = df.groupby(['parcheggio']).first()
df_parcheggi = first_per_parking.reset_index()[KEYS]
df_parcheggi

### 1.3 Aggregate Parking Data
Aggregate Parking Data by date, hour, dow, and parking.

In [None]:
# Work on a copy so the raw frame stays available to the other cells.
rdf = df.copy()
rdf['data'] = pd.to_datetime(rdf['data'])
# Hourly bucket: floor() also clears sub-second parts, which
# replace(second=0, minute=0) would leave in place and split the groups.
rdf['day'] = rdf['data'].dt.floor('h')
# Hour of day and day of week, as produced by the `aggregate_parkings` function.
rdf['hour'] = rdf['day'].dt.hour
rdf['dow'] = rdf['day'].dt.dayofweek
rdf[['lat', 'lon']] = rdf[['lat', 'lon']].astype(float)
rdf = rdf.drop(columns=['data'])
# Average every remaining column per parking and hourly bucket.
grouped = rdf.groupby(['parcheggio', 'day']).mean()
df_aggregated = grouped.reset_index()
df_aggregated

## 2. Platform Support - Data Ops

We use the platform support to load the data into the platform, version it, and automate the execution of the data management operations.


### 2.1. Initialization
Create the working context: a data management project for the parking data processing. The project is a placeholder for the code, the data, and the management of the parking data operations. To keep it reproducible, we use the `git` source type to store the definition and code.

In [None]:
import digitalhub as dh

# Name of the platform project that holds the functions, data items and runs.
PROJECT_NAME = "demo-platform-2025"

# presumably reuses the project when it already exists — confirm against the SDK docs
proj = dh.get_or_create_project(PROJECT_NAME)

project_message = "created project {}".format(PROJECT_NAME)
print(project_message)
PROJECT_NAME

### 2.2. Data management functions
We convert the data management ETL operations into functions - single executable operations that can be executed in the platform.

Create a directory named 'src' to hold all the Python source files.

In [None]:
import os
# Folder that receives the handler sources written by the %%writefile cells.
directory = "src"
# exist_ok=True keeps the cell idempotent on re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

In [None]:
%%writefile "src/download_all.py"
from digitalhub_runtime_python import handler
import pandas as pd

@handler(outputs=["dataset"])
def downloader(project, url):
    """Load the parking CSV and return it as a cleaned DataFrame.

    Args:
        project: project context passed by the runtime (not used here).
        url: data item exposing ``as_df``; read as a ';'-separated CSV.

    Returns:
        DataFrame with renamed columns and numeric 'lat' / 'lon' columns.
    """
    dropped = ['% occupazione', 'GUID', 'coordinate']
    renamed = {'Parcheggio': 'parcheggio', 'Data': 'data', 'Posti liberi': 'posti_liberi', 'Posti occupati': 'posti_occupati', 'Posti totali': 'posti_totali'}

    table = url.as_df(file_format='csv', sep=";")

    # 'coordinate' holds "lat, lon" in one string: split it before dropping it.
    coordinates = table['coordinate'].str.split(', ', expand=True)
    table[['lat', 'lon']] = coordinates

    table = table.drop(columns=dropped)
    table = table.rename(columns=renamed)

    for axis in ("lat", "lon"):
        table[axis] = pd.to_numeric(table[axis])
    return table

In [None]:
# Register the `downloader` handler from src/download_all.py as a Python function.
func = proj.new_function(
    name="downloader-funct",
    kind="python",
    python_version="PYTHON3_10",
    source={
        "source": "src/download_all.py",
        "handler": "downloader",
    },
)

In [None]:
# Register the open-data export URL as a table data item, so runs can take it as input.
di = proj.new_dataitem(
    name="url_data_item",
    kind="table",
    path=URL,
)

In [None]:
# Launch the download as a job: the URL data item is the input, "dataset" the output.
run_download = func.run(
    action="job",
    inputs={"url": di.key},
    outputs={"dataset": "dataset"},
    local_execution=False,
)

In [None]:
# Refresh the run and show its current state; re-run this cell until the job has finished.
run_download.refresh().status.state

Wait for the run to finish. Monitor the execution status of the run using the console or with the run ``refresh`` function.

In [None]:
%%writefile "src/extract_parkings.py"
from digitalhub_runtime_python import handler
import pandas as pd

@handler(outputs=["parkings"])
def extract_parkings(project, di):
    """Return one row per parking with its position and total capacity.

    Args:
        project: project context passed by the runtime (not used here).
        di: data item exposing ``as_df()``.

    Returns:
        DataFrame with the 'parcheggio', 'lat', 'lon' and 'posti_totali'
        columns, taken from the first row of each parking.
    """
    readings = di.as_df()
    first_per_parking = readings.groupby(['parcheggio']).first()
    columns = ['parcheggio', 'lat', 'lon', 'posti_totali']
    return first_per_parking.reset_index()[columns]

In [None]:
# Register the `extract_parkings` handler from src/extract_parkings.py.
func = proj.new_function(
    name="extract-parkings",
    kind="python",
    python_version="PYTHON3_10",
    source={
        "source": "src/extract_parkings.py",
        "handler": "extract_parkings",
    },
)

In [None]:
# Key of the "dataset" data item (the output name used by the download run).
data_item_download = proj.get_dataitem("dataset").key

# Extract the distinct parkings from that dataset as a job.
run_parkings = func.run(
    action="job",
    inputs={"di": data_item_download},
    outputs={"parkings": "parkings"},
)

Wait for the run to finish. Monitor the execution status of the run using the console or with the run ``refresh`` function.

In [None]:
# Refresh the run and show its current state; re-run this cell until the job has finished.
run_parkings.refresh().status.state

Once the run is 'Completed', proceed to the next step.

In [None]:
%%writefile "src/aggregations_parkings.py"
from datetime import datetime
from digitalhub_runtime_python import handler
import pandas as pd

@handler(outputs=["parking_data_aggregated"])
def aggregate_parkings(project, di):
    """Average the parking readings per parking and hour.

    Args:
        project: project context passed by the runtime (not used here).
        di: data item exposing ``as_df()``; the frame must provide the
            'parcheggio', 'data', 'lat' and 'lon' columns. The remaining
            columns are averaged, so they are assumed to be numeric.

    Returns:
        DataFrame with one row per ('parcheggio', 'day') pair, where 'day' is
        the hourly bucket expressed as a POSIX timestamp, plus the mean of
        every other column (including 'hour' and 'dow').
    """
    rdf = di.as_df()
    rdf['data'] = pd.to_datetime(rdf['data'])
    # Hourly bucket: floor() also clears sub-second parts, which
    # replace(second=0, minute=0) would leave in place and split the groups.
    rdf['day'] = rdf['data'].dt.floor('h')
    rdf['hour'] = rdf['day'].dt.hour
    rdf['dow'] = rdf['day'].dt.dayofweek
    # Timestamps are not JSON serializable, so 'day' is stored as a POSIX
    # timestamp. The conversion is kept as datetime.timestamp because the
    # to_db handler decodes the value with datetime.fromtimestamp.
    rdf['day'] = rdf['day'].apply(datetime.timestamp)
    rdf = rdf.drop(columns=['data'])
    rdf[['lat', 'lon']] = rdf[['lat', 'lon']].astype(float)
    grouped = rdf.groupby(['parcheggio', 'day']).mean()
    return grouped.reset_index()

In [None]:
# Register the `aggregate_parkings` handler from src/aggregations_parkings.py.
func = proj.new_function(
    name="aggregate-parkings",
    kind="python",
    python_version="PYTHON3_10",
    source={
        "source": "src/aggregations_parkings.py",
        "handler": "aggregate_parkings",
    },
)

In [None]:
# Aggregate the downloaded dataset as a job; the result is stored as "parking_data_aggregated".
run_aggregate = func.run(
    action="job",
    inputs={"di": data_item_download},
    outputs={"parking_data_aggregated": "parking_data_aggregated"},
)

In [None]:
# Refresh the run and show its current state; re-run this cell until the job has finished.
run_aggregate.refresh().status.state

Once completed, proceed to the next step.

In [None]:
%%writefile "src/parkings_to_db.py"
from digitalhub_runtime_python import handler
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime
import datetime as dtt
import os

@handler()
def to_db(project, agg_di , parkings_di ):
    """Reload the 'parkings' and 'parking_data_aggregated' database tables.

    Both tables are emptied (best effort) and then filled again from the two
    data items. Only the aggregated rows of the last 730 days are stored.

    Args:
        project: project context passed by the runtime (not used here).
        agg_di: data item read with ``as_df(file_format="parquet")``; its
            'day' column must hold POSIX timestamps.
        parkings_di: data item read with ``as_df()``; written as is.

    Raises:
        RuntimeError: if POSTGRES_USER or POSTGRES_PASSWORD is not set.
    """
    # Local imports: names this handler needs beyond the module-level ones.
    from urllib.parse import quote_plus
    from sqlalchemy import text
    from sqlalchemy.exc import SQLAlchemyError

    username = os.getenv("POSTGRES_USER")
    password = os.getenv("POSTGRES_PASSWORD")
    if not username or not password:
        # Fail with a clear message instead of a TypeError on str + None.
        raise RuntimeError("POSTGRES_USER and POSTGRES_PASSWORD must be set")
    # Escape the credentials so characters such as '@' or '/' cannot break the URL.
    engine = create_engine(
        'postgresql+psycopg2://' + quote_plus(username) + ':' + quote_plus(password)
        + '@database-postgres-cluster/digitalhub'
    )

    agg_df = agg_di.as_df(file_format="parquet")

    # Keep only the last two years (730 days) of data.
    cutoff = dtt.date.today() - dtt.timedelta(days=365*2)
    # 'day' arrives as a POSIX timestamp: turn it back into a datetime.
    agg_df['day'] = agg_df['day'].apply(lambda t: datetime.fromtimestamp(t))
    agg_df = agg_df[agg_df['day'].dt.date >= cutoff]

    # Empty each table in its own committed transaction. A failure (for example
    # a table that does not exist yet) is reported and must not block the other
    # DELETE or the inserts below.
    for table in ("parkings", "parking_data_aggregated"):
        try:
            with engine.begin() as connection:
                connection.execute(text("DELETE FROM " + table))
        except SQLAlchemyError as exc:
            print(exc)

    agg_df.to_sql("parking_data_aggregated", engine, if_exists="append")
    parkings_di.as_df().to_sql('parkings', engine, if_exists="append")
    return

In [None]:
# Register the `to_db` handler from src/parkings_to_db.py.
# NOTE(review): the handler connects through 'postgresql+psycopg2' — confirm the
# psycopg2 driver is available where the function runs.
func = proj.new_function(
    name="to-db",
    kind="python",
    requirements=["sqlalchemy"],
    python_version="PYTHON3_10",
    source={
        "source": "src/parkings_to_db.py",
        "handler": "to_db",
    },
)

In [None]:
# Keys of the data items registered under the earlier runs' output names.
data_item_parkings = proj.get_dataitem("parkings").key
data_item_aggregate = proj.get_dataitem("parking_data_aggregated").key

# Load both data items into the database; this run is executed locally.
run_to_db = func.run(
    action="job",
    inputs={
        "agg_di": data_item_aggregate,
        "parkings_di": data_item_parkings,
    },
    outputs={},
    local_execution=True,
)

Wait for the run to finish. Monitor the execution status of the run using the console or with the run ``refresh`` function.

In [None]:
# Refresh the run and show its current state.
run_to_db.refresh().status.state

### 2.3 Data Management Pipeline

We put together the operations in an automated pipeline.

In [None]:
# Register the pipeline definition as a "kfp" workflow.
# NOTE(review): no cell shown here writes src/parkings_pipeline.py — confirm the
# file and its `myhandler` entry point exist before running this cell.
workflow = proj.new_workflow(
    name="parkings_pipeline",
    kind="kfp",
    code_src="src/parkings_pipeline.py",
    handler="myhandler",
)

In [None]:
# Start the pipeline, passing the key of the URL data item as the "url" parameter.
workflow_run = workflow.run(parameters={"url": di.key})