In [1]:
import mlrun
import os
import pandas as pd

In [2]:
project = mlrun.get_or_create_project("daticomuni", context="./")

> 2024-07-24 13:43:33,549 [info] Project loaded successfully: {'project_name': 'daticomuni'}


In [3]:
new_folder = 'src'
if not os.path.exists(new_folder):
    os.makedirs(new_folder)


In [4]:
%%writefile "src/convert-all.py"

import mlrun
import pandas as pd

@mlrun.handler()
def convert_all(context, source_url_base: str):
    # direct processing, no actions required
    for ds_name in ["azioni", "campi", "macroambiti", "piani", "tassonomia"]:
        source_url = source_url_base + ds_name + ".txt"
        input_data = mlrun.get_dataitem(source_url)
        df = input_data.as_df(format="csv", encoding="windows-1251", delimiter=";")
        df.reset_index(drop=True, inplace=True)
        context.log_dataset(ds_name, df=df, index=False)

    # comuni: process name and dates
    source_url = source_url_base + "comuni" + ".txt"
    input_data = mlrun.get_dataitem(source_url)
    df = input_data.as_df(format="csv", encoding="windows-1251", delimiter=";")
    df["comune"] = df["NomeOrganizzazione"].str.replace("COMUNE DI ", "").str.upper()
    df["Data_det_assegnazione"] = pd.to_datetime(df["Data_det_assegnazione"], format="%d/%m/%Y %H:%M:%S", errors="ignore")
    df["Data_det_revoca"] = df["Data_det_revoca"].fillna("")
    df["Data_det_revoca"] = pd.to_datetime(df["Data_det_revoca"], format="%d/%m/%Y %H:%M:%S", errors="ignore")
    df.reset_index(drop=True, inplace=True)
    context.log_dataset("comuni", df=df, index=False)

    # valutazioni: process dates
    source_url = source_url_base + "valutazioni" + ".txt"
    input_data = mlrun.get_dataitem(source_url)
    df = input_data.as_df(format="csv", encoding="windows-1251", delimiter=";")
    df["data_pub"] = pd.to_datetime(df["data_pub"], format="%d/%m/%Y %H:%M:%S", errors="ignore")
    df.reset_index(drop=True, inplace=True)
    context.log_dataset("valutazioni", df=df, index=False)

Writing src/convert-all.py


In [5]:
project.set_function("src/convert-all.py", "convert-all", kind="job", image="mlrun/mlrun", handler="convert_all")
project.save()

<mlrun.projects.project.MlrunProject at 0x7bd574793760>

In [6]:
source_url_base = "s3://datalake/projects/daticomuni/base/"
project.run_function("convert-all", params={"source_url_base": source_url_base})

> 2024-07-24 13:43:33,673 [info] Storing function: {'name': 'convert-all-convert-all', 'uid': 'fd6739e49170494f94e8205eaa2b84c7', 'db': 'http://mlrun-api:8080'}
> 2024-07-24 13:43:33,778 [info] Job is running in the background, pod: convert-all-convert-all-hp2tj


The clone_target_dir attribute is deprecated in 1.6.2 and will be removed in 1.8.0. Use spec.build.source_code_target_dir instead.


> 2024-07-24 13:43:38,034 [info] To track results use the CLI: {'info_cmd': 'mlrun get run fd6739e49170494f94e8205eaa2b84c7 -p daticomuni', 'logs_cmd': 'mlrun logs fd6739e49170494f94e8205eaa2b84c7 -p daticomuni'}
> 2024-07-24 13:43:38,035 [info] Run execution finished: {'status': 'completed', 'name': 'convert-all-convert-all'}


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
daticomuni,...2b84c7,0,Jul 24 13:43:35,completed,convert-all-convert-all,v3io_user=ramankind=jobowner=ramanmlrun/client_version=1.6.2-rc1mlrun/client_python_version=3.9.13host=convert-all-convert-all-hp2tj,,source_url_base=s3://datalake/projects/daticomuni/base/,,azionicampimacroambitipianitassonomiacomunivalutazioni





> 2024-07-24 13:43:43,113 [info] Run execution finished: {'status': 'completed', 'name': 'convert-all-convert-all'}


<mlrun.model.RunObject at 0x7bd4f8a54670>

In [7]:
%%writefile "src/convert-aziendali.py"

import mlrun
import pandas as pd
import numpy as np

@mlrun.handler()
def convert_aziendali(context, source_url_base: str):
    converters={
        '2024_06_25 PIANI AZIENDALI': {'IDorganizzazione': np.int64, 'ANNUALITA': np.int64, 'Versione': np.int64, 'AnnoCompilazione': np.int64, 'CodiceCampoAzione': np.int64, 'CodiceTassonomiaAzione': np.int64, 'BeneF': np.int64, 'BeneM': np.int64, 'IDdettaglioAccorpamento': np.int64},
        'NuovaTassonomia': {},
        'T_NuovaTassonomia_DettaglioRev': {},
    }

    for ds_name in ["2024_06_25 PIANI AZIENDALI", "NuovaTassonomia", "T_NuovaTassonomia_DettaglioRev"]:
        source_url = source_url_base + ds_name + ".xlsx"
        input_data = mlrun.get_dataitem(source_url)
        df = pd.read_excel(input_data.get(), sheet_name=0, header=0, converters=converters[ds_name])
        df.reset_index(drop=True, inplace=True)
        context.log_dataset(ds_name, df=df, index=False)

Overwriting src/convert-aziendali.py


In [8]:
project.set_function("src/convert-aziendali.py", "convert-aziendali", kind="job", image="mlrun/mlrun", handler="convert_aziendali", requirements=["openpyxl"])
project.save()

<mlrun.projects.project.MlrunProject at 0x7bd574793760>

In [9]:
source_url_base = "s3://datalake/projects/daticomuni/base/"
project.run_function("convert-aziendali", params={"source_url_base": source_url_base}, local=True)

> 2024-07-24 13:43:43,202 [info] Storing function: {'name': 'convert-aziendali-convert-aziendali', 'uid': 'f0db7d89576a4f5ebd05deed98fb3e2d', 'db': 'http://mlrun-api:8080'}


The clone_target_dir attribute is deprecated in 1.6.2 and will be removed in 1.8.0. Use spec.build.source_code_target_dir instead.
Passing bytes to 'read_excel' is deprecated and will be removed in a future version. To read from a byte string, wrap it in a `BytesIO` object.
Passing bytes to 'read_excel' is deprecated and will be removed in a future version. To read from a byte string, wrap it in a `BytesIO` object.
Converting input from bool to <class 'numpy.uint8'> for compatibility.
Passing bytes to 'read_excel' is deprecated and will be removed in a future version. To read from a byte string, wrap it in a `BytesIO` object.


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
daticomuni,...fb3e2d,0,Jul 24 13:43:43,completed,convert-aziendali-convert-aziendali,v3io_user=ramankind=localowner=ramanhost=jupyter-raman-testraman-6cbb8f7d55-b6trl,,source_url_base=s3://datalake/projects/daticomuni/base/,,2024_06_25 PIANI AZIENDALINuovaTassonomiaT_NuovaTassonomia_DettaglioRev





> 2024-07-24 13:43:56,629 [info] Run execution finished: {'status': 'completed', 'name': 'convert-aziendali-convert-aziendali'}


<mlrun.model.RunObject at 0x7bd54982f0a0>