In [1]:
import datetime
import json
import os

import mlrun
import pandas as pd
import requests

## Initialize project
Load the project from the Git repository and set the secrets needed to access GitLab. We set `clone=True` so that the repository is cloned again on every load, overwriting the local copy of the code.

In [2]:
# Load the project from the GitLab repository.
# The access token is read from the GIT_TOKEN environment variable so that it is
# never stored in the notebook; a missing variable fails fast with a KeyError.
project = mlrun.load_project(
    "./parcheggi",
    "git://gitlab.hpc.cineca.it/dt-data-projects/gdb-project-parkings.git",
    clone=True,  # clone the repository again on every load, overwriting the local code
    secrets={"GIT_TOKEN": os.environ["GIT_TOKEN"]},
)

In [3]:
# Print the loaded project definition to check which functions came from the repository.
print(project)

{'kind': 'project', 'metadata': {'name': 'parcheggi', 'created': '2023-12-21T16:37:58.071000'}, 'spec': {'functions': [{'url': 'src/download-all.py', 'name': 'download-all', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'downloader'}, {'url': 'src/extract_parkings.py', 'name': 'extract-parkings', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'extract_parkings'}, {'url': 'src/aggregate_parkings.py', 'name': 'aggregate-parkings', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'aggregate_parkings'}, {'url': 'src/parkings_last_data.py', 'name': 'extract-parkings-latest', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'parkings_last_data', 'requirements': ['sqlalchemy', 'psycopg2-binary']}, {'url': 'src/parkings_to_db.py', 'name': 'to-db', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'to_db', 'requirements': ['sqlalchemy', 'psycopg2-binary']}, {'url': 'src/train_predictors.py', 'name': 'predict-day', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'predict_day', 'requir

### Set project secrets for execution
Set the secrets for the project: the database credentials and the GitLab access token. These secrets will be used by the project functions.

In [4]:
# Register the credentials used by the project functions as project secrets.
# The token and the database password are read from the environment instead of
# being hardcoded in the notebook; a missing variable raises KeyError here,
# before any function is built or run. The database user name is not sensitive,
# so it keeps its previous value as the default and can be overridden via DB_USERNAME.
project.set_secrets(
    {
        "GIT_TOKEN": os.environ["GIT_TOKEN"],
        "DB_USERNAME": os.environ.get("DB_USERNAME", "digitalhub_owner_user"),
        "DB_PASSWORD": os.environ["DB_PASSWORD"],
    }
)

### Build functions
Build function images for remote execution. We use the base image "mlrun/mlrun" for the functions.

In [5]:
# Build one image per function, in the same order as before, all on the same base image.
functions_to_build = ("extract-parkings-latest", "predict-day", "to-db", "train-multimodel")
build_statuses = [
    project.build_function(function_name, base_image="mlrun/mlrun")
    for function_name in functions_to_build
]
# Show the status of the last build as the cell output.
build_statuses[-1]

> 2024-01-23 10:26:24,372 [info] Started building image: .mlrun/func-parcheggi-extract-parkings-latest:latest
[36mINFO[0m[0000] Retrieving image manifest mlrun/mlrun:1.4.0  
[36mINFO[0m[0000] Retrieving image mlrun/mlrun:1.4.0 from registry index.docker.io 
[36mINFO[0m[0001] Built cross stage deps: map[]                
[36mINFO[0m[0001] Retrieving image manifest mlrun/mlrun:1.4.0  
[36mINFO[0m[0001] Returning cached image manifest              
[36mINFO[0m[0001] Executing 0 build triggers                   
[36mINFO[0m[0001] Unpacking rootfs as cmd RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt requires it. 
[36mINFO[0m[0019] RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt 
[36mINFO[0m[0019] Taking snapshot of full filesystem...        
[36mINFO[0m[0033] cmd: /bin/sh                                 
[36mINFO[0m[0033] args: [-c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] 


BuildStatus(ready=True, outputs={'image': '.mlrun/func-parcheggi-train-multimodel:latest'})

## Run the code
The project logically defines two procedures: one for data collection and aggregation and the other for the latest data update.
The first one, data collection, is defined in the pipeline `data-update-pipeline`, and the second one is defined in the function `extract-parkings-latest`. We schedule the execution of the two procedures accordingly: `data-update-pipeline` runs every night, while `extract-parkings-latest` runs every 10 minutes.

Once the operations are executed, the data is stored in the database and in the S3 bucket. It is possible to access it using SQLPad (for PostgreSQL), Minio (for S3), and MLRun UI for metadata. 

In [6]:
# Schedule the data-update workflow to run every night (cron: daily at 00:00).
nightly_cron = "0 0 * * *"
pipeline_artifact_path = "s3://datalake/projects/parcheggi/artifacts/data-update-pipeline"
project.run(
    "data-update-pipeline",
    schedule=nightly_cron,
    artifact_path=pipeline_artifact_path,
)

> 2024-01-23 10:46:22,657 [info] executing workflow scheduling 'workflow-runner-pipeline' remotely with kfp engine
> 2024-01-23 10:46:22,660 [info] Storing function: {'name': 'pipeline', 'uid': '3c237655b7df4182ba40d3bcb182be77', 'db': None}
> 2024-01-23 10:46:22,746 [info] task schedule created: {'schedule': '0 0 * * *', 'project': 'parcheggi', 'name': 'pipeline'}


In [7]:
# Schedule the latest-data extraction as a remote (non-local) run every 10 minutes.
every_ten_minutes_cron = "*/10 * * * *"
project.run_function(
    "extract-parkings-latest",
    schedule=every_ten_minutes_cron,
    local=False,
)

> 2024-01-23 10:46:31,909 [info] Storing function: {'name': 'extract-parkings-latest-parkings-last-data', 'uid': '7373bff597d949ebba2f669773f61568', 'db': 'http://mlrun-api:8080'}
> 2024-01-23 10:46:31,992 [info] task schedule created: {'schedule': '*/10 * * * *', 'project': 'parcheggi', 'name': 'extract-parkings-latest-parkings-last-data'}


## Train the Model
The procedure ``train-multimodel`` is implemented in the file ``src/train_multimodel.py``. It builds an N-BEATS deep-learning global model for predicting parking occupancy. The model is trained for the specified number of epochs and is stored in the MLRun model storage.

In [8]:
# Train the model on the latest downloaded dataset for a fixed number of epochs.
training_inputs = {"parkings_di": "store://datasets/parcheggi/download-all_dataset#0:latest"}
training_params = {"n_epochs": 10}
project.run_function(
    "train-multimodel",
    inputs=training_inputs,
    params=training_params,
)

> 2024-01-23 10:49:43,866 [info] Storing function: {'name': 'train-multimodel-train-model', 'uid': 'aa2a78f007a04b44969c037c662ad2b3', 'db': 'http://mlrun-api:8080'}
> 2024-01-23 10:49:43,987 [info] Job is running in the background, pod: train-multimodel-train-model-84r6n
The `LightGBM` module could not be imported. To enable LightGBM support in Darts, follow the detailed instructions in the installation guide: https://github.com/unit8co/darts/blob/master/INSTALL.md
The `Prophet` module could not be imported. To enable Prophet support in Darts, follow the detailed instructions in the installation guide: https://github.com/unit8co/darts/blob/master/INSTALL.md
The `CatBoost` module could not be imported. To enable CatBoost support in Darts, follow the detailed instructions in the installation guide: https://github.com/unit8co/darts/blob/master/INSTALL.md
Epoch 9: 100%|##########| 213/213 [00:16<00:00, 13.29it/s, train_loss=0.00943]:08, 13.14it/s, train_loss=0.0292]it/s, train_loss=0.020]

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
parcheggi,...2ad2b3,0,Jan 23 10:51:20,completed,train-multimodel-train-model,v3io_user=testkind=jobowner=testmlrun/client_version=1.4.0mlrun/client_python_version=3.9.13host=train-multimodel-train-model-84r6n,parkings_di,n_epochs=10,,parcheggi_predictor_model





> 2024-01-23 10:54:08,064 [info] Run execution finished: {'status': 'completed', 'name': 'train-multimodel-train-model'}


<mlrun.model.RunObject at 0x7f24f4129430>

### Deploy Model
Next, we deploy the model in the serving function. Given that the interaction does not correspond to a standard MLRun model server, we will deploy the function as a custom server. See the implementation of ``predictor_serving.py`` for more details. 

For this, we add the model to the serving function using the ``add_model`` method.

In [9]:
# Register the custom serving function with the pinned packages it needs at runtime.
serving_requirements = [
    'darts==0.25.0',
    'pandas==1.4.4',
    'numpy==1.22.4',
    'patsy==0.5.2',
]
serving_fn = project.set_function(
    'src/predictor_serving.py',
    name='serving-predictor',
    kind='serving',
    image='mlrun/mlrun',
    requirements=serving_requirements,
)

In [10]:
# Attach the trained model to the serving function, then deploy it.
predictor_model_uri = (
    "store://models/parcheggi/train-multimodel-train-model_parcheggi_predictor_model#0:latest"
)
serving_fn.add_model(
    "parcheggi_predictor_model",
    class_name="ParkingPredictorModel",
    model_path=predictor_model_uri,
)
project.deploy_function(serving_fn)

> 2024-01-23 11:25:04,394 [info] Starting remote function deploy
2024-01-23 11:25:04  (info) Deploying function
2024-01-23 11:25:04  (info) Building
2024-01-23 11:25:04  (info) Staging files and preparing base images
2024-01-23 11:25:04  (info) Building processor image
2024-01-23 11:26:59  (info) Build complete
2024-01-23 11:27:09  (info) Function deploy complete
> 2024-01-23 11:27:15,008 [info] successfully deployed function: {'internal_invocation_urls': ['nuclio-parcheggi-serving-predictor.digitalhub-test.svc.cluster.local:8080'], 'external_invocation_urls': [':30650']}


DeployStatus(state=ready, outputs={'endpoint': 'http://:30650', 'name': 'parcheggi-serving-predictor'})

### Test the API

Once deployed, we can test the API. Let us test it with the latest data taken from the OpenData API. We take the latest data for the Autostazione parking and convert it into the format expected by the API: a list of dates and values, where each value is the parking occupancy expressed as a fraction (occupied spaces divided by total spaces).

In [11]:
# Fetch the most recent records of one parking lot from the Bologna OpenData API
# and reshape them into the list of {date, value} records sent to the model:
# one record per 30-minute slot, value = mean occupancy fraction in that slot.
date_str = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
parking_str = 'Autostazione'
API_URL = f'https://opendata.comune.bologna.it/api/explore/v2.1/catalog/datasets/disponibilita-parcheggi-storico/records?where=data%3C%3D%27{date_str}%27%20and%20parcheggio%3D%27{parking_str}%27&order_by=data%20DESC&limit=100'
REQUEST_TIMEOUT_S = 30  # fail instead of hanging the kernel when the API does not answer

latest_data_file = 'last_records.json'

with requests.get(API_URL, timeout=REQUEST_TIMEOUT_S) as r:
    # Stop on an HTTP error instead of saving and parsing an error page.
    r.raise_for_status()
    # Keep a copy of the raw response on disk, as before.
    with open(latest_data_file, "wb") as f:
        f.write(r.content)
    # Parse the response directly rather than re-reading the file in text mode,
    # which would decode it with the platform's locale encoding.
    json_data = r.json()

records = json_data['results']
if not records:
    raise ValueError(f"No records returned for parking {parking_str!r} up to {date_str}")

df_records = pd.json_normalize(records)
df_latest = (
    pd.DataFrame(
        {
            # pd.to_datetime replaces astype('datetime64'), which is rejected by
            # pandas 2.x as a unit-less dtype. utc=True normalises any timezone
            # offset to UTC, so the epoch values produced below are unambiguous.
            'date': pd.to_datetime(df_records['data'], utc=True).dt.round('30min'),
            'value': df_records['posti_occupati'] / df_records['posti_totali'],
        }
    )
    .groupby('date')
    .agg({'value': 'mean'})
)

# Round-trip through JSON so that dates become epoch milliseconds and the result
# is a plain list of dicts.
jsonstr = df_latest.reset_index().to_json(orient='records')
arr = json.loads(jsonstr)
arr

[{'date': 1705962600000, 'value': 0.1081761006},
 {'date': 1705964400000, 'value': 0.1044025157},
 {'date': 1705966200000, 'value': 0.1006289308},
 {'date': 1705968000000, 'value': 0.0981132075},
 {'date': 1705969800000, 'value': 0.0981132075},
 {'date': 1705971600000, 'value': 0.0955974843},
 {'date': 1705973400000, 'value': 0.0943396226},
 {'date': 1705975200000, 'value': 0.0943396226},
 {'date': 1705977000000, 'value': 0.0943396226},
 {'date': 1705978800000, 'value': 0.0943396226},
 {'date': 1705980600000, 'value': 0.0943396226},
 {'date': 1705982400000, 'value': 0.0981132075},
 {'date': 1705984200000, 'value': 0.0981132075},
 {'date': 1705986000000, 'value': 0.0943396226},
 {'date': 1705987800000, 'value': 0.0943396226},
 {'date': 1705989600000, 'value': 0.093081761},
 {'date': 1705991400000, 'value': 0.0918238994},
 {'date': 1705993200000, 'value': 0.0993710692},
 {'date': 1705995000000, 'value': 0.1044025157},
 {'date': 1705996800000, 'value': 0.1119496855},
 {'date': 17059986000

In [12]:
# Send the prepared records to the deployed model's inference endpoint.
infer_path = "/v2/models/parcheggi_predictor_model/infer"
infer_body = {"inputs": arr}
serving_fn.invoke(path=infer_path, body=infer_body)

> 2024-01-23 14:57:43,717 [info] invoking function: {'method': 'POST', 'path': 'http://nuclio-parcheggi-serving-predictor.digitalhub-test.svc.cluster.local:8080/v2/models/parcheggi_predictor_model/infer'}


{'id': '3556029e-91f1-41c3-8e1c-01707a2f4561',
 'model_name': 'parcheggi_predictor_model',
 'outputs': [{'date': 1706023800000, 'value': 0.2431646884},
  {'date': 1706025600000, 'value': 0.2431481308},
  {'date': 1706027400000, 'value': 0.2334761067},
  {'date': 1706029200000, 'value': 0.2281330399},
  {'date': 1706031000000, 'value': 0.2107615773},
  {'date': 1706032800000, 'value': 0.1943966158},
  {'date': 1706034600000, 'value': 0.1683475054},
  {'date': 1706036400000, 'value': 0.1814267973},
  {'date': 1706038200000, 'value': 0.2069454072},
  {'date': 1706040000000, 'value': 0.200577501},
  {'date': 1706041800000, 'value': 0.2067392763},
  {'date': 1706043600000, 'value': 0.191728182}]}