In [1]:
import pandas as pd
import requests
import os
import json

## 1. Data Exploration

### 1.1. Download data
Download data from the API, and load it into a pandas dataframe.

In [2]:
# Open-data export of the historical parking availability (semicolon-delimited CSV, UTC timestamps).
URL = "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/disponibilita-parcheggi-storico/exports/csv?lang=it&timezone=UTC&use_labels=true&delimiter=%3B"

df = pd.read_csv(URL, sep=";")

# 'coordinate' is a single "lat, lon" string: split it into two (still string-typed) columns.
df[['lat', 'lon']] = df['coordinate'].str.split(', ', expand=True)

# Drop the columns that are not needed, then switch to snake_case column names.
df = df.drop(columns=['% occupazione', 'GUID', 'coordinate'])
df = df.rename(
    columns={
        'Parcheggio': 'parcheggio',
        'Data': 'data',
        'Posti liberi': 'posti_liberi',
        'Posti occupati': 'posti_occupati',
        'Posti totali': 'posti_totali',
    }
)
df

Unnamed: 0,parcheggio,data,posti_liberi,posti_occupati,posti_totali,lat,lon
0,Autostazione,2024-06-07T01:59:00+00:00,244.0,21.0,265,44.504422,11.346514
1,VIII Agosto,2024-06-07T02:19:00+00:00,486.0,139.0,625,44.500297,11.345368
2,Autostazione,2024-06-07T02:19:00+00:00,244.0,21.0,265,44.504422,11.346514
3,VIII Agosto,2024-06-07T02:49:00+00:00,488.0,137.0,625,44.500297,11.345368
4,Autostazione,2024-06-07T02:49:00+00:00,244.0,21.0,265,44.504422,11.346514
...,...,...,...,...,...,...,...
10012,VIII Agosto,2024-07-02T08:39:00+00:00,367.0,258.0,625,44.500297,11.345368
10013,Riva Reno,2024-07-02T08:39:00+00:00,329.0,141.0,470,44.501153,11.336062
10014,VIII Agosto,2024-07-02T08:49:00+00:00,359.0,266.0,625,44.500297,11.345368
10015,VIII Agosto,2024-07-02T09:09:00+00:00,325.0,300.0,625,44.500297,11.345368


### 1.2. Extract parkings
Extract distinct parkings from the dataframe.

In [3]:
KEYS = ['parcheggio', 'lat', 'lon']

# One row per parking: GroupBy.first() keeps the first non-null value of each column,
# then only the identifying columns are retained.
first_per_parking = df.groupby(['parcheggio']).first().reset_index()
df_parcheggi = first_per_parking[KEYS]
df_parcheggi

Unnamed: 0,parcheggio,lat,lon
0,Autostazione,44.504422,11.346514
1,Riva Reno,44.501153,11.336062
2,VIII Agosto,44.500297,11.345368


### 1.3 Aggregate Parking Data
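Aggregate the parking data per parking and per hourly time slot (the `day` column holds the timestamp truncated to the hour). The platform version of this step in section 2.2 additionally derives the `hour` and day-of-week (`dow`) columns.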

In [4]:
# Work on a copy so the raw frame loaded earlier is left untouched.
rdf = df.copy()
rdf['data'] = pd.to_datetime(rdf['data'])
# Truncate every timestamp to the start of its hour. Despite its name, 'day' is an hourly bucket.
# dt.floor zeroes minutes, seconds and sub-second parts in one vectorized call, so all
# observations of the same hour end up in the same group.
rdf['day'] = rdf['data'].dt.floor('h')
# Coordinates were split out of a string column, so they are still strings at this point.
rdf['lat'] = rdf['lat'].astype(float)
rdf['lon'] = rdf['lon'].astype(float)
rdf = rdf.drop(columns=['data'])
# Mean of every remaining (numeric) column per parking and hour.
grouped = rdf.groupby(['parcheggio', 'day']).mean()
df_aggregated = grouped.reset_index()
df_aggregated

Unnamed: 0,parcheggio,day,posti_liberi,posti_occupati,posti_totali,lat,lon
0,Autostazione,2024-06-07 01:00:00+00:00,244.000000,21.000000,265.0,44.504422,11.346514
1,Autostazione,2024-06-07 02:00:00+00:00,244.000000,21.000000,265.0,44.504422,11.346514
2,Autostazione,2024-06-07 03:00:00+00:00,244.000000,21.000000,265.0,44.504422,11.346514
3,Autostazione,2024-06-07 04:00:00+00:00,244.333333,20.666667,265.0,44.504422,11.346514
4,Autostazione,2024-06-07 05:00:00+00:00,242.666667,22.333333,265.0,44.504422,11.346514
...,...,...,...,...,...,...,...
1705,VIII Agosto,2024-07-02 05:00:00+00:00,440.000000,185.000000,625.0,44.500297,11.345368
1706,VIII Agosto,2024-07-02 06:00:00+00:00,438.500000,186.500000,625.0,44.500297,11.345368
1707,VIII Agosto,2024-07-02 07:00:00+00:00,431.833333,193.166667,625.0,44.500297,11.345368
1708,VIII Agosto,2024-07-02 08:00:00+00:00,375.500000,249.500000,625.0,44.500297,11.345368


## 2. Platform Support - Data Ops

We use the platform support to load the data into the platform, version it, and automate the execution of the data management operations.


### 2.1. Initialization
Create the working context: a data management project for the parking data processing. The project is a container for the code, the data, and the management of the parking data operations. To keep it reproducible, we use the `git` source type to store the definition and code.

In [32]:
import digitalhub as dh

PROJECT_NAME = "parcheggi"
# Fetch the project or create it (per the function name). The git source is NOT passed to the
# call at the moment; the repository URL is only kept in the trailing comment for reference.
proj = dh.get_or_create_project(PROJECT_NAME) # source="git://github.com/scc-digitalhub/gdb-project-parkings.git"

### 2.2. Data management functions
We convert the data management ETL operations into functions - single executable operations that can be executed in the platform.

In [33]:
%%writefile "src/download_all_dh_core.py"
from digitalhub_runtime_python import handler
import pandas as pd

@handler(outputs=["dataset"])
def downloader(project, url):
    """Load the raw parking CSV and return it as a cleaned table.

    Args:
        project: Passed in by the caller; not used by this function.
        url: Object exposing ``as_df(file_format=..., sep=...)`` that yields the
            semicolon-separated source data as a pandas DataFrame.

    Returns:
        DataFrame with snake_case column names, without the ``% occupazione``,
        ``GUID`` and ``coordinate`` columns, and with numeric ``lat`` / ``lon``.
    """
    unused_columns = ['% occupazione', 'GUID', 'coordinate']
    snake_case_names = {
        'Parcheggio': 'parcheggio',
        'Data': 'data',
        'Posti liberi': 'posti_liberi',
        'Posti occupati': 'posti_occupati',
        'Posti totali': 'posti_totali',
    }

    table = url.as_df(file_format='csv', sep=";")
    # 'coordinate' is a "lat, lon" string; expand it into two separate columns.
    table[['lat', 'lon']] = table['coordinate'].str.split(', ', expand=True)
    table = table.drop(columns=unused_columns).rename(columns=snake_case_names)
    # The split produces strings, so convert both coordinates to numbers.
    for axis in ("lat", "lon"):
        table[axis] = pd.to_numeric(table[axis])
    return table

Overwriting src/download_all_dh_core.py


In [34]:
FUNCTION_NAME = "downloader-funct"

# Register the `downloader` handler from the file written above as a Python function of the project.
func = proj.new_function(
    name=FUNCTION_NAME,
    kind="python",
    python_version="PYTHON3_9",
    source={
        "source": "src/download_all_dh_core.py",
        "handler": "downloader",
    },
)

In [35]:
# Wrap the remote CSV URL in a table data item; its key is passed to the downloader run as input.
di = proj.new_dataitem(
    name="url_data_item",
    kind="table",
    path=URL,
)

In [36]:
# Run the downloader as a local job; the handler's "dataset" output is stored under the same name.
run_download = func.run(
    action="job",
    local_execution=True,
    inputs={"url": di.key},
    outputs={"dataset": "dataset"},
)

2024-07-02 07:41:12,202 - INFO - Validating task.
2024-07-02 07:41:12,202 - INFO - Validating run.
2024-07-02 07:41:12,203 - INFO - Starting task.
2024-07-02 07:41:12,203 - INFO - Configuring execution.
2024-07-02 07:41:12,205 - INFO - Composing function arguments.
2024-07-02 07:41:12,206 - INFO - Function parameters: True
2024-07-02 07:41:12,261 - INFO - Executing run.
2024-07-02 07:41:23,015 - INFO - Task completed, returning run status.


In [40]:
# Display the state and the outputs recorded for the download run.
run_download.status

{'state': 'COMPLETED', 'outputs': {'dataset': 'store://parcheggi/dataitems/table/dataset:f259bfd4-7a44-4efb-8a2d-43be2f09935f'}, 'results': {}}

In [41]:
# Presumably re-reads the run from the platform; the returned run description is displayed.
run_download.refresh()

{'project': 'parcheggi', 'id': 'ebe3d7f5-9c63-4fb3-92c1-f2529dd965f0', 'kind': 'python+run', 'key': 'store://parcheggi/runs/python+run/ebe3d7f5-9c63-4fb3-92c1-f2529dd965f0', 'metadata': {'project': 'parcheggi', 'name': 'ebe3d7f5-9c63-4fb3-92c1-f2529dd965f0', 'created': '2024-07-02T07:41:12.139Z', 'updated': '2024-07-02T07:41:23.069Z', 'created_by': 'tenant1userid', 'updated_by': 'tenant1userid'}, 'spec': {'task': 'python+job://parcheggi/downloader-funct:f19c7610-f7e0-433a-b1f5-51d2af98b693', 'local_execution': True, 'source': {'source': 'src/download_all_dh_core.py', 'handler': 'downloader', 'base64': 'ZnJvbSBkaWdpdGFsaHViX3J1bnRpbWVfcHl0aG9uIGltcG9ydCBoYW5kbGVyCmltcG9ydCBwYW5kYXMgYXMgcGQKCkBoYW5kbGVyKG91dHB1dHM9WyJkYXRhc2V0Il0pCmRlZiBkb3dubG9hZGVyKHByb2plY3QsIHVybCk6CiAgICBkZiA9IHVybC5hc19kZihmaWxlX2Zvcm1hdD0nY3N2JyxzZXA9IjsiKQogICAgZGZbWydsYXQnLCAnbG9uJ11dID0gZGZbJ2Nvb3JkaW5hdGUnXS5zdHIuc3BsaXQoJywgJyxleHBhbmQ9VHJ1ZSkKICAgIGRmID0gZGYuZHJvcChjb2x1bW5zPVsnJSBvY2N1cGF6aW9uZScsICdHVUlEJy

In [45]:
# Key of the 'dataset' output of the download run; it is the input of the extract and aggregate steps.
data_item_download = run_download.outputs()['dataset'].key

In [46]:
%%writefile "src/extract_parkings_dh_core.py"
from digitalhub_runtime_python import handler
import pandas as pd

@handler(outputs=["parkings"])
def extract_parkings(project, di):
    """Build the table of distinct parkings from the downloaded observations.

    Args:
        project: Passed in by the caller; not used by this function.
        di: Object whose ``as_df()`` returns the parking observations as a DataFrame.

    Returns:
        DataFrame with one row per ``parcheggio`` holding the first non-null
        ``lat``, ``lon`` and ``posti_totali`` found for it.
    """
    observations = di.as_df()
    # GroupBy.first() keeps the first non-null value of each column within a parking.
    per_parking = observations.groupby(['parcheggio']).first()
    columns = ['parcheggio', 'lat', 'lon', 'posti_totali']
    return per_parking.reset_index()[columns]

Overwriting src/extract_parkings_dh_core.py


In [47]:
FUNCTION_NAME = "extract-parkings"

# Register the `extract_parkings` handler from the file written above as a Python function of the project.
func = proj.new_function(
    name=FUNCTION_NAME,
    kind="python",
    python_version="PYTHON3_9",
    source={
        "source": "src/extract_parkings_dh_core.py",
        "handler": "extract_parkings",
    },
)

In [48]:
# Run the extraction as a local job on the downloaded dataset; the result is stored as "parkings".
run_parkings = func.run(
    action="job",
    local_execution=True,
    inputs={"di": data_item_download},
    outputs={"parkings": "parkings"},
)

2024-07-02 07:42:13,053 - INFO - Validating task.
2024-07-02 07:42:13,054 - INFO - Validating run.
2024-07-02 07:42:13,055 - INFO - Starting task.
2024-07-02 07:42:13,055 - INFO - Configuring execution.
2024-07-02 07:42:13,057 - INFO - Composing function arguments.
2024-07-02 07:42:13,058 - INFO - Function parameters: True
2024-07-02 07:42:13,096 - INFO - Executing run.
2024-07-02 07:42:13,203 - INFO - Task completed, returning run status.


In [49]:
# Key of the 'parkings' output of the extraction run; it is passed to the database step later on.
data_item_parkings = run_parkings.outputs()['parkings'].key

In [50]:
%%writefile "src/aggregations_parkings_dh_core.py"
from digitalhub_runtime_python import handler
import pandas as pd

@handler(outputs=["parking_data_aggregated"])
def aggregate_parkings(project, di):
    """Average the parking observations per parking and per hour.

    Args:
        project: Passed in by the caller; not used by this function.
        di: Object whose ``as_df()`` returns a DataFrame with at least the
            ``parcheggio``, ``data``, ``lat`` and ``lon`` columns.

    Returns:
        DataFrame with one row per (``parcheggio``, ``day``) pair, where ``day``
        is the observation timestamp truncated to the hour and every other
        column is the mean over that hour. ``hour`` and ``dow`` (day of week)
        are derived from ``day``.
    """
    rdf = di.as_df()
    rdf['data'] = pd.to_datetime(rdf['data'])
    # Despite its name, 'day' is an hourly bucket. dt.floor zeroes minutes, seconds AND
    # sub-second parts, so every observation of the same hour lands in the same group.
    rdf['day'] = rdf['data'].dt.floor('h')
    rdf['hour'] = rdf['day'].dt.hour
    rdf['dow'] = rdf['day'].dt.dayofweek
    rdf = rdf.drop(columns=['data'])
    # Coordinates may arrive as strings; convert them in one vectorized step.
    rdf['lat'] = rdf['lat'].astype(float)
    rdf['lon'] = rdf['lon'].astype(float)
    # Mean of every remaining (numeric) column per parking and hour.
    return rdf.groupby(['parcheggio', 'day']).mean().reset_index()

Overwriting src/aggregations_parkings_dh_core.py


In [51]:
FUNCTION_NAME = "aggregate-parkings"

# Register the `aggregate_parkings` handler from the file written above as a Python function of the project.
func = proj.new_function(
    name=FUNCTION_NAME,
    kind="python",
    python_version="PYTHON3_9",
    source={
        "source": "src/aggregations_parkings_dh_core.py",
        "handler": "aggregate_parkings",
    },
)

In [52]:
#new_data_item = run.outputs()['dataset'].key

In [54]:
# Run the aggregation as a local job on the downloaded dataset; the result is stored as
# "parking_data_aggregated".
run_aggregate = func.run(
    action="job",
    local_execution=True,
    inputs={"di": data_item_download},
    outputs={"parking_data_aggregated": "parking_data_aggregated"},
)

2024-07-02 07:42:39,762 - INFO - Validating task.
2024-07-02 07:42:39,762 - INFO - Validating run.
2024-07-02 07:42:39,763 - INFO - Starting task.
2024-07-02 07:42:39,763 - INFO - Configuring execution.
2024-07-02 07:42:39,765 - INFO - Composing function arguments.
2024-07-02 07:42:39,766 - INFO - Function parameters: True
2024-07-02 07:42:39,795 - INFO - Executing run.
2024-07-02 07:42:39,930 - INFO - Task completed, returning run status.


In [56]:
# Key of the 'parking_data_aggregated' output of the aggregation run; it is passed to the database step.
data_item_aggregate = run_aggregate.outputs()['parking_data_aggregated'].key

In [88]:
# Show which database user is configured without ever echoing the password: cell outputs are
# saved with the notebook and easily end up in version control.
print(os.getenv("POSTGRES_USER"), "<redacted>" if os.getenv("POSTGRES_PASSWORD") else None)

digitalhub_owner_user <redacted>


In [80]:
%%writefile "src/parkings_to_db.py"
from digitalhub_runtime_python import handler
import pandas as pd
from sqlalchemy import create_engine
import datetime
import os

@handler()
def to_db(project, agg_di , parkings_di ):
    """Replace the contents of the PostgreSQL tables with the latest parking data.

    Both tables are emptied (when they exist) and then filled again:
    ``parking_data_aggregated`` with the aggregated data of roughly the last two
    years, ``parkings`` with the list of parkings.

    Args:
        project: Passed in by the caller; not used by this function.
        agg_di: Object whose ``as_df()`` returns the aggregated data, including a
            datetime ``day`` column.
        parkings_di: Object whose ``as_df()`` returns the parkings table.

    Raises:
        RuntimeError: If ``POSTGRES_USER`` or ``POSTGRES_PASSWORD`` is not set.
    """
    # Local imports keep this block self-contained; sqlalchemy is already a dependency of the file.
    from urllib.parse import quote

    from sqlalchemy import text
    from sqlalchemy.exc import ProgrammingError

    # Credentials come from the environment. The original code hinted at project secrets
    # (DB_USERNAME / DB_PASSWORD) as an alternative source.
    username = os.getenv("POSTGRES_USER")
    password = os.getenv("POSTGRES_PASSWORD")
    if not username or not password:
        raise RuntimeError("POSTGRES_USER and POSTGRES_PASSWORD must be set to reach the database")

    # Percent-encode the credentials so characters such as '@', ':' or '/' cannot corrupt the URL.
    engine = create_engine(
        'postgresql://' + quote(username, safe='') + ':' + quote(password, safe='')
        + '@database-postgres-cluster/digitalhub'
    )

    # Load both frames before touching the database, so a read failure cannot leave the tables empty.
    agg_df = agg_di.as_df()
    parkings_df = parkings_di.as_df()

    # Keep roughly the last two years (2 * 365 days) of aggregated data.
    oldest_day = datetime.date.today() - datetime.timedelta(days=365*2)
    agg_df = agg_df[agg_df['day'].dt.date >= oldest_day]

    # Empty each table in its own committed transaction. text() is required for raw SQL on
    # SQLAlchemy 2.x, and a separate transaction per table keeps a failed DELETE from
    # aborting the other one.
    for table in ("parkings", "parking_data_aggregated"):
        try:
            with engine.begin() as connection:
                connection.execute(text("DELETE FROM " + table))
        except ProgrammingError:
            # Expected on the very first run, when the table does not exist yet;
            # to_sql below creates it. Any other error is no longer swallowed.
            pass

    agg_df.to_sql("parking_data_aggregated", engine, if_exists="append")
    parkings_df.to_sql('parkings', engine, if_exists="append")

Overwriting src/parkings_to_db.py


In [81]:
FUNCTION_NAME = "to-db"

# Register the `to_db` handler from the file written above as a Python function of the project.
func = proj.new_function(
    name=FUNCTION_NAME,
    kind="python",
    python_version="PYTHON3_9",
    source={
        "source": "src/parkings_to_db.py",
        "handler": "to_db",
    },
)

In [89]:
## Set secrets
#secret_a = proj.new_secret(name="DB_USERNAME_NEW", secret_value="digitalhub_owner_user")
#secret_b = proj.new_secret(name="DB_PASSWORD", secret_value="secret")

In [83]:
# Run the database load as a local job; it takes both derived data items and declares no outputs.
run_to_db = func.run(
    action="job",
    local_execution=True,
    inputs={"agg_di": data_item_aggregate, "parkings_di": data_item_parkings},
    outputs={},
)

2024-07-02 08:56:45,685 - INFO - Validating task.
2024-07-02 08:56:45,686 - INFO - Validating run.
2024-07-02 08:56:45,686 - INFO - Starting task.
2024-07-02 08:56:45,686 - INFO - Configuring execution.
2024-07-02 08:56:45,688 - INFO - Composing function arguments.
2024-07-02 08:56:45,689 - INFO - Function parameters: True
2024-07-02 08:56:45,725 - INFO - Executing run.
2024-07-02 08:56:45,961 - INFO - Task completed, returning run status.


In [84]:
# Display the state recorded for the database-load run.
run_to_db.status

{'state': 'COMPLETED', 'outputs': {}, 'results': {}}

In [85]:
# Presumably re-reads the run from the platform; the returned run description is displayed.
run_to_db.refresh()

{'project': 'parcheggi', 'id': '6bb986d4-b64c-4ac9-8194-94660b7d96c5', 'kind': 'python+run', 'key': 'store://parcheggi/runs/python+run/6bb986d4-b64c-4ac9-8194-94660b7d96c5', 'metadata': {'project': 'parcheggi', 'name': '6bb986d4-b64c-4ac9-8194-94660b7d96c5', 'created': '2024-07-02T08:56:45.622Z', 'updated': '2024-07-02T08:56:45.992Z', 'created_by': 'tenant1userid', 'updated_by': 'tenant1userid'}, 'spec': {'task': 'python+job://parcheggi/to-db:1068555f-8a3f-4fa0-8b27-d73c8d89a892', 'local_execution': True, 'source': {'source': 'src/parkings_to_db.py', 'handler': 'to_db', 'base64': 'ZnJvbSBkaWdpdGFsaHViX3J1bnRpbWVfcHl0aG9uIGltcG9ydCBoYW5kbGVyCmltcG9ydCBwYW5kYXMgYXMgcGQKZnJvbSBzcWxhbGNoZW15IGltcG9ydCBjcmVhdGVfZW5naW5lCmltcG9ydCBkYXRldGltZQppbXBvcnQgb3MKCkBoYW5kbGVyKCkKZGVmIHRvX2RiKHByb2plY3QsIGFnZ19kaSAsIHBhcmtpbmdzX2RpICk6CiAgICBVU0VSTkFNRSA9IG9zLmdldGVudigiUE9TVEdSRVNfVVNFUiIpI3Byb2plY3QuZ2V0X3NlY3JldChlbnRpdHlfbmFtZT0nREJfVVNFUk5BTUUnKS5yZWFkX3NlY3JldF92YWx1ZSgpCiAgICBQQVNTV09SRCA9IG9z

### 2.3 Data Management Pipeline
We create a data management pipeline that executes the data management functions in the platform.

In [86]:
%%writefile "src/parking_data_pipeline.py"

from kfp import dsl
from digitalhub_runtime_python import handler
import digitalhub as dh

URL = "https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/disponibilita-parcheggi-storico/exports/csv?lang=it&timezone=UTC&use_labels=true&delimiter=%3B"

@dsl.pipeline(name="Parking data pipeline")
def parking_pipeline():
    """Chain the four data-management functions: download, extract, aggregate, load to DB.

    The extraction and the aggregation both consume the dataset produced by the
    download step; the database step consumes the outputs of both.
    """
    project = dh.get_current_project()

    # The downloader is registered in this notebook as "downloader-funct".
    # NOTE(review): the handler calls `url.as_df(...)`, and the interactive run passes a data
    # item key rather than the raw URL string. Confirm that a plain URL is accepted here.
    run_download = project.run_function("downloader-funct",inputs={'url':URL}, outputs=["dataset"])
    dataset_key = run_download.outputs()["dataset"].key

    run_parkings = project.run_function("extract-parkings", inputs={'di':dataset_key}, outputs=["parkings"])

    run_aggregate = project.run_function("aggregate-parkings", inputs={'di':dataset_key}, outputs=["parking_data_aggregated"])

    project.run_function("to-db", inputs={'agg_di': run_aggregate.outputs()["parking_data_aggregated"].key, 'parkings_di': run_parkings.outputs()["parkings"].key})


Writing src/parking_data_pipeline.py


In [90]:
# Point the workflow at the file and function actually created by the %%writefile cell
# (src/parking_data_pipeline.py, handler `parking_pipeline`).
proj.set_workflow("pipeline", "src/parking_data_pipeline.py", handler="parking_pipeline")

NameError: name 'project' is not defined

In [None]:
# Launch the workflow registered under the name "pipeline".
proj.run("pipeline")