In [14]:
import os
import pandas as pd
import digitalhub as dh

In [15]:
# Obtain the handle of the project every later cell works against.
PROJECT_NAME = 'daticomuni'
project = dh.get_or_create_project(PROJECT_NAME)

In [16]:
# Folder that receives the modules written by the %%writefile cells below.
new_folder = 'src'
# exist_ok=True makes the cell safely re-runnable without a separate
# (race-prone) existence check.
os.makedirs(new_folder, exist_ok=True)

In [9]:
# Register the zip archive as the 'daticomuni' artifact of the project.
di = project.new_artifact(
    name="daticomuni",
    kind="artifact",
    path='/daticomuni.zip',
)

In [10]:
# Look the artifact up by name and show its store key as the cell output.
di = project.get_artifact("daticomuni")
di.key

'store://daticomuni/artifact/artifact/daticomuni:1f7e46b1ef92406daacc50197ed4f96e'

In [11]:
%%writefile "src/convert-all.py"

import pandas as pd
from os import path, makedirs
import zipfile
        
file_basepath = "daticomuni"

# Every .txt export in the archive shares this encoding and delimiter.
_CSV_OPTIONS = {"encoding": "windows-1251", "delimiter": ";"}
# Timestamp layout used by the date columns of the exports.
_DATETIME_FORMAT = "%d/%m/%Y %H:%M:%S"


def _read_dataset(data_dir, ds_name):
    """Load ``<data_dir>/<ds_name>.txt`` into a DataFrame with a fresh 0..n-1 index."""
    df = pd.read_csv(f"{data_dir}/{ds_name}.txt", **_CSV_OPTIONS)
    return df.reset_index(drop=True)


def _parse_datetime_or_keep(series):
    """Parse ``series`` as datetimes; return it unchanged if any value does not parse.

    Explicit replacement for the deprecated ``errors="ignore"`` option of
    ``pd.to_datetime``, which had the same "give back the input" fallback.
    """
    try:
        return pd.to_datetime(series, format=_DATETIME_FORMAT)
    except (ValueError, TypeError):
        return series


def convert_all(project, source_artifact):
    """Download the source archive, extract it and log each dataset as a table dataitem.

    Args:
        project: object exposing ``log_dataitem(name, data=..., kind=..., index=...)``.
        source_artifact: object exposing ``download(destination)`` that returns
            the path of a zip archive containing the ``<name>.txt`` exports.

    Raises:
        Exception: whatever the download or the extraction raises is re-raised
            after being printed, so the job fails on the real cause instead of
            on a later "file not found" while reading the exports.
    """
    # Local import: the module-level imports of this file do not include shutil.
    import shutil

    data_dir = f"{file_basepath}/data"

    # Start from an empty data directory; a missing directory is not an error.
    shutil.rmtree(data_dir, ignore_errors=True)
    makedirs(data_dir, exist_ok=True)

    try:
        archive_file = source_artifact.download(data_dir)  # this must change in the function
        with zipfile.ZipFile(archive_file, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
    except Exception as exc:
        print(f"Error downloading data: {exc}")
        raise

    # Datasets that are logged exactly as read.
    for ds_name in ["azioni", "campi", "macroambiti", "piani", "tassonomia"]:
        project.log_dataitem(ds_name, data=_read_dataset(data_dir, ds_name), kind='table', index=False)

    # comuni: process name and dates
    comuni = _read_dataset(data_dir, "comuni")
    # Literal (non-regex) removal of the prefix, then upper-casing.
    comuni["comune"] = comuni["NomeOrganizzazione"].str.replace("COMUNE DI ", "", regex=False).str.upper()
    comuni["Data_det_assegnazione"] = _parse_datetime_or_keep(comuni["Data_det_assegnazione"])
    comuni["Data_det_revoca"] = _parse_datetime_or_keep(comuni["Data_det_revoca"].fillna(""))
    project.log_dataitem("comuni", data=comuni, kind='table', index=False)

    # valutazioni: process dates
    valutazioni = _read_dataset(data_dir, "valutazioni")
    valutazioni["data_pub"] = _parse_datetime_or_keep(valutazioni["data_pub"])
    project.log_dataitem("valutazioni", data=valutazioni, kind='table', index=False)

Writing src/convert-all.py


In [98]:
# Register the handler from the module written by the %%writefile cell above.
convert_all_source = {"source": "src/convert-all.py", "handler": "convert_all"}
func_convert_all = project.new_function(
    name="convert_all",
    kind="python",
    python_version="PYTHON3_10",
    source=convert_all_source,
)

In [99]:
# Launch the conversion as a non-local job, handing it the artifact key as input.
convert_all_inputs = {"source_artifact": di.key}
run_convert_all = func_convert_all.run(
    action="job",
    inputs=convert_all_inputs,
    outputs={},
    local_execution=False,
)

In [17]:
# Display the store key of whatever entity `di` is currently bound to.
di.key

'store://daticomuni/dataitem/table/source_url_base:835b1b8603094aa7982660123036ef48'

In [61]:
 # Experiment: address one file by appending "/<name>.txt" to the key of `di`
 # and read it through the project as a ';'-delimited windows-1251 CSV.
 # NOTE(review): the recorded run of this cell ended with
 # "IsADirectoryError: [Errno 21] Is a directory: 'tmp_data'"; presumably a key
 # with a file name appended is not a valid dataitem reference — confirm
 # before relying on this cell.
 for ds_name in ["azioni"]: #, "campi", "macroambiti", "piani", "tassonomia"
     source_url = di.key + '/' + ds_name + ".txt"
     print(source_url)
     input_data = project.get_dataitem(source_url)
     # print(input_data)
     df = input_data.as_df(file_format="csv", encoding="windows-1251", delimiter=";")     

store://daticomuni/dataitem/table/source_url_base:ec16b8c26bcb44aeaff79b5ede6f4c36/azioni.txt


IsADirectoryError: [Errno 21] Is a directory: 'tmp_data'

In [12]:
%%writefile "src/convert-aziendali.py"

import pandas as pd
import numpy as np

file_basepath = "daticomuni"

def convert_aziendali(project, source_artifact):
    """Download the source archive, extract it and log each Excel workbook as a table dataitem.

    Args:
        project: object exposing ``log_dataitem(name, data=..., kind=..., index=...)``.
        source_artifact: object exposing ``download(destination)`` that returns
            the path of a zip archive containing the ``<name>.xlsx`` workbooks.

    Raises:
        Exception: whatever the download or the extraction raises is re-raised
            after being printed, so the job fails on the real cause instead of
            on a later "file not found" while reading the workbooks.
    """
    # Local imports: the module-level imports of this file only bring in
    # pandas and numpy.
    import shutil
    import zipfile
    from os import makedirs

    # Per-workbook column converters handed to pandas.read_excel; the dict
    # order is also the order in which the workbooks are processed.
    converters={
        '2024_06_25 PIANI AZIENDALI': {'IDorganizzazione': np.int64, 'ANNUALITA': np.int64, 'Versione': np.int64, 'AnnoCompilazione': np.int64, 'CodiceCampoAzione': np.int64, 'CodiceTassonomiaAzione': np.int64, 'BeneF': np.int64, 'BeneM': np.int64, 'IDdettaglioAccorpamento': np.int64},
        'NuovaTassonomia': {},
        'T_NuovaTassonomia_DettaglioRev': {},
    }

    data_dir = f"{file_basepath}/data"

    # Start from an empty data directory; a missing directory is not an error.
    shutil.rmtree(data_dir, ignore_errors=True)
    makedirs(data_dir, exist_ok=True)

    try:
        archive_file = source_artifact.download(data_dir)  # this must change in the function
        with zipfile.ZipFile(archive_file, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
    except Exception as exc:
        print(f"Error downloading data: {exc}")
        raise

    # NOTE(review): the workbook name (spaces included) is reused as the
    # dataitem name — confirm the platform accepts such names.
    for ds_name, ds_converters in converters.items():
        source_url = data_dir + '/' + ds_name + ".xlsx"
        # The sources are Excel workbooks, so they are read with read_excel
        # (first sheet, first row as header), not as delimited text.
        df = pd.read_excel(source_url, sheet_name=0, header=0, converters=ds_converters)
        df = df.reset_index(drop=True)
        project.log_dataitem(ds_name, data=df, kind='table', index=False)



Writing src/convert-aziendali.py


In [18]:
# Register the handler from the module written to "src/convert-aziendali.py";
# the source must point at that file (extension included) and the handler
# must name the function it defines, convert_aziendali.
func_az = project.new_function(name="convert_aziendali",
                         kind="python",
                         python_version="PYTHON3_10",
                         source={"source": "src/convert-aziendali.py", "handler": "convert_aziendali"})

In [None]:
# convert_aziendali(project, source_artifact) needs its source archive, so the
# artifact key is passed the same way as for the convert_all run.
# NOTE(review): assumes the .xlsx workbooks ship in the archive behind `di` —
# confirm, or point this at the artifact that actually contains them.
run_convert_az = func_az.run(action="job",inputs={"source_artifact": di.key},outputs={}, local_execution=False)

In [None]:
# Refresh the run and show its state as the cell output.
latest_run = run_convert_az.refresh()
latest_run.status.state