In [1]:
import os

import mlrun
import pandas as pd

## Initialize project
Load the project from the Git repository, passing the secrets needed to access GitLab. We set `clone=True` so that the repository is cloned again on every run, overwriting any existing local copy of the code.

In [2]:
# Clone the project repository into ./parcheggi and load it as an MLRun project.
# The GitLab token is read from the GIT_TOKEN environment variable so that no
# credential is stored in the notebook; a KeyError is raised if it is unset.
project = mlrun.load_project(
    "./parcheggi",
    "git://gitlab.hpc.cineca.it/dt-data-projects/gdb-project-parkings.git",
    clone=True,  # re-clone each time, overwriting the local code
    secrets={"GIT_TOKEN": os.environ["GIT_TOKEN"]},
)

In [3]:
# Print the loaded project's definition (metadata and spec, including the
# registered functions) as a quick check that the project loaded as expected.
print(project)

{'kind': 'project', 'metadata': {'name': 'parcheggi', 'created': '2023-12-21T16:37:58.071000'}, 'spec': {'functions': [{'url': 'src/download-all.py', 'name': 'download-all', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'downloader'}, {'url': 'src/extract_parkings.py', 'name': 'extract-parkings', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'extract_parkings'}, {'url': 'src/aggregate_parkings.py', 'name': 'aggregate-parkings', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'aggregate_parkings'}, {'url': 'src/parkings_last_data.py', 'name': 'extract-parkings-latest', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'parkings_last_data', 'requirements': ['sqlalchemy', 'psycopg2-binary']}, {'url': 'src/parkings_to_db.py', 'name': 'to-db', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'to_db', 'requirements': ['sqlalchemy', 'psycopg2-binary']}, {'url': 'src/train_predictors.py', 'name': 'predict-day', 'kind': 'job', 'image': 'mlrun/mlrun', 'handler': 'predict_day', 'requir

### Set project secrets for execution
Set the project secrets: the database credentials and the GitLab access token. The project's functions use these secrets when they are executed.

In [4]:
# Register the secrets used by the project's functions. The GitLab token and
# the database password are read from the GIT_TOKEN and DB_PASSWORD environment
# variables instead of being written into the notebook; a missing variable
# raises KeyError before set_secrets is called.
project.set_secrets(
    {
        "GIT_TOKEN": os.environ["GIT_TOKEN"],
        "DB_USERNAME": "digitalhub_owner_user",
        "DB_PASSWORD": os.environ["DB_PASSWORD"],
    }
)

### Build functions
Build function images for remote execution. We use the base image "mlrun/mlrun" for the functions.

In [5]:
# Build an image for each function below on top of the shared base image.
BASE_IMAGE = "mlrun/mlrun"

for function_name in ("extract-parkings-latest", "predict-day"):
    project.build_function(function_name, base_image=BASE_IMAGE)

# Kept as the cell's final expression so its BuildStatus is displayed.
project.build_function("to-db", base_image=BASE_IMAGE)

> 2024-01-15 10:37:35,692 [info] Started building image: .mlrun/func-parcheggi-extract-parkings-latest:latest
INFO[0000] Retrieving image manifest mlrun/mlrun:1.4.0  
INFO[0000] Retrieving image mlrun/mlrun:1.4.0 from registry index.docker.io 
INFO[0001] Built cross stage deps: map[]                
INFO[0001] Retrieving image manifest mlrun/mlrun:1.4.0  
INFO[0001] Returning cached image manifest              
INFO[0001] Executing 0 build triggers                   
INFO[0001] Unpacking rootfs as cmd RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt requires it. 
INFO[0019] RUN echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt 
INFO[0019] Taking snapshot of full filesystem...        
INFO[0034] cmd: /bin/sh                                 
INFO[0034] args: [-c echo 'Installing /empty/requirements.txt...'; cat /empty/requirements.txt] 


BuildStatus(ready=True, outputs={'image': '.mlrun/func-parcheggi-to-db:latest'})

## Run the code
The project logically defines two procedures: one for data collection and aggregation, and one for updating the latest data.
The first is defined in the pipeline `data-update-pipeline`; the second is defined in the function `extract-parkings-latest`. We schedule the two procedures accordingly: `data-update-pipeline` runs every night (cron `0 0 * * *`), while `extract-parkings-latest` runs every 10 minutes (cron `*/10 * * * *`).

Once the operations have run, the data is stored in the database and in the S3 bucket. It can be accessed through SQLPad (for PostgreSQL), MinIO (for S3), and the MLRun UI (for metadata).

In [6]:
# Schedule the data-update workflow; the cron expression fires once a day at
# minute 0 of hour 0. Workflow artifacts are written under the S3 path below.
NIGHTLY_CRON = "0 0 * * *"
PIPELINE_ARTIFACT_PATH = "s3://datalake/projects/parcheggi/artifacts/data-update-pipeline"

project.run(
    "data-update-pipeline",
    schedule=NIGHTLY_CRON,
    artifact_path=PIPELINE_ARTIFACT_PATH,
)

> 2024-01-15 10:40:03,120 [info] executing workflow scheduling 'workflow-runner-pipeline' remotely with kfp engine
> 2024-01-15 10:40:03,123 [info] Storing function: {'name': 'pipeline', 'uid': 'f0cdb1d00b9044aebf33bab163568cd6', 'db': None}
> 2024-01-15 10:40:03,188 [info] task schedule modified: {'schedule': '0 0 * * *', 'project': 'parcheggi', 'name': 'pipeline'}


In [7]:
# Schedule the latest-data extraction as a non-local run; the cron expression
# fires every 10 minutes.
EVERY_TEN_MINUTES_CRON = "*/10 * * * *"

project.run_function(
    "extract-parkings-latest",
    local=False,
    schedule=EVERY_TEN_MINUTES_CRON,
)

> 2024-01-15 10:40:03,201 [info] Storing function: {'name': 'extract-parkings-latest-parkings-last-data', 'uid': '1a912282bba3419e9f63447d16036209', 'db': 'http://mlrun-api:8080'}
> 2024-01-15 10:40:03,272 [info] task schedule modified: {'schedule': '*/10 * * * *', 'project': 'parcheggi', 'name': 'extract-parkings-latest-parkings-last-data'}
