## 1. Latest Data Exploration

### 1.1. Download data
Download the latest data from the API and load it into a pandas DataFrame.

In [None]:
import pandas as pd
import requests
import os
import json
import datetime
from sqlalchemy import create_engine

In [None]:
import digitalhub as dh

# Name of the digitalhub project used by the functions and workflows in this notebook.
PROJECT_NAME = "parcheggi-scheduler"
# Obtain the project handle (per the call's name: fetched if present, created otherwise).
proj = dh.get_or_create_project(PROJECT_NAME)
print("created project {}".format(PROJECT_NAME))
# Bare expression so the notebook displays the project name as the cell result.
PROJECT_NAME

Create a directory named `src` to hold all the Python source files.

In [None]:
import os

# Directory that receives the Python source files written by the %%writefile cells.
directory = "src"
# exist_ok=True makes the call idempotent and avoids the check-then-create race;
# it still raises if "src" exists but is not a directory, instead of silently skipping.
os.makedirs(directory, exist_ok=True)

In [None]:
%%writefile "src/parkings_latest.py"
from digitalhub_runtime_python import handler
import datetime
import json
import os
import pandas as pd
import requests
from sqlalchemy import create_engine

@handler()
def parkings_last_data():
    """Download today's latest parking records and store them in PostgreSQL.

    Queries the Bologna open-data API for up to 100 records of the
    ``disponibilita-parcheggi-storico`` dataset whose ``data`` field is on or
    after today's date (newest first), saves the raw response body to
    ``latest_records.json``, and replaces the rows of the ``parkings_latest``
    table with the downloaded records.

    If the API returns no records, the function returns early and leaves the
    table untouched. Database credentials are read from the ``POSTGRES_USER``
    and ``POSTGRES_PASSWORD`` environment variables.

    Raises:
        RuntimeError: If either credential environment variable is unset.
        requests.RequestException: If the request times out, fails, or the
            API answers with an HTTP error status.
    """
    # Imported locally so the block does not depend on edits to the file's
    # import section; sqlalchemy is already a dependency of this module.
    from sqlalchemy import text
    from sqlalchemy.exc import ProgrammingError

    # Fail early with a clear message instead of a TypeError on string concatenation.
    username = os.getenv("POSTGRES_USER")
    password = os.getenv("POSTGRES_PASSWORD")
    if username is None or password is None:
        raise RuntimeError(
            "POSTGRES_USER and POSTGRES_PASSWORD environment variables must be set"
        )

    date_str = datetime.datetime.now().strftime('%Y-%m-%d')
    latest_data_file = 'latest_records.json'
    api_url = f'https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/disponibilita-parcheggi-storico/records?where=data%3E%3D%27{date_str}%27&order_by=data%20DESC&limit=100'

    # Download the latest data from the API. The timeout keeps a scheduled job
    # from hanging forever, and raise_for_status stops an error page from being
    # parsed as data.
    with requests.get(api_url, timeout=30) as response:
        response.raise_for_status()
        payload = response.content

    # Keep a copy of the raw response on disk, as before.
    with open(latest_data_file, "wb") as f:
        f.write(payload)

    # Parse the bytes directly rather than re-reading the file with the
    # platform's default text encoding.
    records = json.loads(payload)['results']
    if not records:
        # Nothing published yet for today: keep the current table contents
        # rather than failing on the column drop below.
        return

    df_latest = (
        pd.json_normalize(records)
        .drop(columns=['guid', 'occupazione'])
        .rename(columns={"coordinate.lon": "lon", "coordinate.lat": "lat"})
    )

    # convert 'data' column to datetime
    df_latest['data'] = df_latest['data'].astype('datetime64[ns, UTC]')

    # write data to database
    engine = create_engine(
        f'postgresql+psycopg2://{username}:{password}@database-postgres-cluster/digitalhub'
    )

    # Empty the table before appending the fresh snapshot. text() is required
    # because SQLAlchemy 2.x no longer executes raw strings, and engine.begin()
    # commits the DELETE on success.
    try:
        with engine.begin() as connection:
            connection.execute(text("DELETE FROM parkings_latest"))
    except ProgrammingError:
        # Most likely the table does not exist yet (first run); to_sql below
        # creates it. Other database errors are no longer silently swallowed.
        pass

    df_latest.to_sql('parkings_latest', engine, if_exists="append")

In [36]:
# Register the handler stored in src/parkings_latest.py as a Python function of the project.
func = proj.new_function(
    name="parkings-latest",
    kind="python",
    python_version="PYTHON3_10",
    source={
        "source": "src/parkings_latest.py",
        "handler": "parkings_last_data",
    },
)

In [37]:
# Launch the "parkings-latest" function once as a job, passing no inputs or outputs.
run_latest = func.run(action="job",inputs={},outputs={})

In [None]:
# Refresh the run object and display its current state as the cell result.
run_latest.refresh().status.state

## Pipeline for the latest data

In [38]:
%%writefile "src/latest_parkings_pipeline.py"

from digitalhub_runtime_kfp.dsl import pipeline_context

def myhandler(di):
    """Declare the latest-parkings pipeline, which consists of a single step.

    The step, named "latest-parkings", runs the "parkings-latest" function
    as a job with no inputs and no outputs. The ``di`` argument is accepted
    but not used by this body.
    """
    with pipeline_context() as ctx:
        latest_parkings_step = ctx.step(
            name="latest-parkings",
            function="parkings-latest",
            action="job",
            inputs={},
            outputs={},
        )

Overwriting src/latest_parkings_pipeline.py


In [39]:
# Register a KFP workflow whose entry point is `myhandler` in the pipeline source file.
workflow = proj.new_workflow(
    name="pipeline_latest_parkings",
    kind="kfp",
    source={
        "source": "src/latest_parkings_pipeline.py",
        "handler": "myhandler",
    },
)

In [40]:
# Run the workflow on the cron schedule "*/10 * * * *", i.e. every 10 minutes.
# NOTE(review): a one-off, unscheduled run is presumably `workflow.run()` — confirm against the digitalhub API.
workflow.run(parameters={}, schedule="*/10 * * * *")

{'kind': 'kfp+run', 'metadata': {'project': 'parcheggi-scheduler', 'name': 'b239f503-73b0-4310-91a7-77b072ae8687', 'created': '2024-11-11T11:58:33.045Z', 'updated': '2024-11-11T11:58:33.062Z', 'created_by': 'khurshid@fbk.eu', 'updated_by': 'khurshid@fbk.eu'}, 'spec': {'task': 'kfp+pipeline://parcheggi-scheduler/pipeline_latest_parkings:89e793c2-a297-45dd-9ab6-b6de0b1254f9', 'local_execution': False, 'function': 'kfp://parcheggi-scheduler/pipeline_latest_parkings:89e793c2-a297-45dd-9ab6-b6de0b1254f9', 'source': {'source': 'src/latest_parkings_pipeline.py', 'handler': 'myhandler', 'base64': 'CmZyb20gZGlnaXRhbGh1Yl9ydW50aW1lX2tmcC5kc2wgaW1wb3J0IHBpcGVsaW5lX2NvbnRleHQKCmRlZiBteWhhbmRsZXIoZGkpOgogICAgd2l0aCBwaXBlbGluZV9jb250ZXh0KCkgYXMgcGM6CiAgICAgICAgczFfbGF0ZXN0X2RhdGEgPSBwYy5zdGVwKG5hbWU9ImxhdGVzdC1wYXJraW5ncyIsIGZ1bmN0aW9uPSJwYXJraW5ncy1sYXRlc3QiLCBhY3Rpb249ImpvYiIsIGlucHV0cz17fSwgb3V0cHV0cz17fSkK', 'lang': 'python'}, 'schedule': '*/10 * * * *', 'inputs': {}, 'outputs': {}, 'parameters'