In [1]:
import os
import pandas as pd
import digitalhub as dh

In [2]:
# Project handle used by every later cell (function registration, runs, dataitem lookup).
# Per the helper's name, the 'datiprotezione' project is fetched if it exists and created otherwise.
proj = dh.get_or_create_project('datiprotezione')

In [3]:
# Folder that receives the function sources written by the %%writefile cells below.
new_folder ='src'
# exist_ok=True makes the call idempotent without a separate (racy) existence check.
# Unlike the previous check, this raises if 'src' exists but is not a directory,
# which would otherwise only surface later when %%writefile fails.
os.makedirs(new_folder, exist_ok=True)

In [4]:
%%writefile "src/create-list.py"

import requests
import pandas as pd
from deep_translator import GoogleTranslator
import json

# Shared translator instance used by translate_text below:
# source language Italian ('it'), target language English ('en').
translator = GoogleTranslator(source='it', target='en')
def translate_text(text):
    """Translate a value with the module-level ``translator``.

    ``None`` is handed back unchanged. Any other value is converted to its
    string form and the translator's result is returned.
    """
    if text is None:
        return text
    return translator.translate(str(text))

def givemelink(x):
    """Pick the most relevant link out of a catalogue ``link`` value.

    ``x`` is either a single string or a list of strings; each string is a set
    of fields separated by ``"||"``. A field (or list entry) is considered
    relevant when it contains one of the data keywords, case-insensitively.

    Selection rules:
      * list input: the LAST entry mentioning a keyword is used; when no entry
        matches, the first string entry is used as a fallback;
      * within the chosen string, the LAST ``"||"``-separated field mentioning
        a keyword is returned;
      * when no field matches, the whole string is returned with the ``"||"``
        separators removed.

    Returns:
        str: the selected link, or ``""`` when ``x`` holds no usable string
        (``None``, NaN, an empty list, a list without strings).
    """
    datakeywords = ('siat', 'urbanistica-dati', 'catasto')

    def _mentions_keyword(text):
        lowered = text.lower()
        return any(keyword in lowered for keyword in datakeywords)

    if isinstance(x, list):
        # Ignore non-string entries (e.g. None) instead of failing on .lower().
        entries = [entry for entry in x if isinstance(entry, str)]
        if not entries:
            return ""
        matching = [entry for entry in entries if _mentions_keyword(entry)]
        # Previously a list without any match reached `.replace` as a list and
        # raised AttributeError; fall back to the first entry instead.
        chosen = matching[-1] if matching else entries[0]
    elif isinstance(x, str):
        chosen = x
    else:
        # Missing values (None / NaN) carry no link at all.
        return ""

    rd = chosen
    for urlitem in chosen.split("||"):
        if _mentions_keyword(urlitem):
            rd = urlitem  # keep iterating: the last matching field wins

    return rd.replace("||", "")

# Search endpoint of the SIAT GeoNetwork catalogue. {start} and {to} are the
# 1-based, inclusive positions of the first and last record of the requested page.
_SIAT_SEARCH_URL = (
    "https://siat.provincia.tn.it/geonetwork/srv/ita/q?_content_type=json&any=&bucket=s101"
    "&facet.q=&fast=index&from={start}&resultType=details&sortBy=relevance&sortOrder=&to={to}"
)

# Seconds to wait for the catalogue before giving up on a request.
_SIAT_REQUEST_TIMEOUT = 60

# Columns kept in the logged table, in output order ('title_en' is appended afterwards).
_SIAT_COLUMNS = ['title','abstract','lineage','resourceConstraints','type',
                 'legalConstraints','identifier',"crsDetails","maintenanceAndUpdateFrequency_text",
                 'spatialRepresentationType_text','denominator',
                 'tempExtentBegin','tempExtentEnd','serviceType',
                 'updateFrequency','revisionDate','classification_text','defaultTitle',
                 'publicationDate','creationDate','crs',"parentId",'link']


def _fetch_page(start, to):
    """Request records ``start``..``to`` (inclusive) and return the decoded JSON payload."""
    url = _SIAT_SEARCH_URL.format(start=start, to=to)
    response = requests.get(url, timeout=_SIAT_REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return response.json()


def _page_records(page):
    """Return the metadata records of a page as a list (empty when the key is absent)."""
    records = page.get('metadata', [])
    # NOTE(review): the service may return a bare object instead of a list when
    # a page holds a single record - confirm against the live API.
    if isinstance(records, dict):
        return [records]
    return list(records)


def _first_if_list(value):
    """Return the first element of a list (None for an empty list); pass other values through."""
    if isinstance(value, list):
        return value[0] if value else None
    return value


def create_list(project): #context
    """Download the whole SIAT catalogue listing and log it as the 'siat_trentino' table.

    The catalogue is read page by page (100 records per request). The total
    number of records is taken from the ``summary/@count`` field of the first
    page. The resulting table keeps the columns listed in ``_SIAT_COLUMNS``
    (columns absent from the response are filled with missing values) plus
    ``title_en``, the result of ``translate_text`` applied to ``defaultTitle``.

    Args:
        project: project object exposing ``log_dataitem``; the table is logged
            through it under the name ``"siat_trentino"``.

    Raises:
        requests.HTTPError: when the catalogue answers with an error status.
        requests.Timeout: when a request exceeds ``_SIAT_REQUEST_TIMEOUT``.
    """
    max_page_size = 100

    # The first page doubles as the probe that tells how many records exist.
    first_page = _fetch_page(1, max_page_size)
    total = int(first_page['summary']['@count'])
    records = _page_records(first_page)

    # Walk the remaining pages. Starting every page from an explicit offset
    # covers totals below one page and exact multiples of the page size, both
    # of which the former "full pages + remainder" arithmetic got wrong.
    for start in range(max_page_size + 1, total + 1, max_page_size):
        to = min(start + max_page_size - 1, total)
        records.extend(_page_records(_fetch_page(start, to)))

    # One frame built from all records; reindex keeps only the wanted columns
    # and returns a new frame, so the column assignments below are safe.
    data = pd.DataFrame(records).reindex(columns=_SIAT_COLUMNS)

    # Some attributes hold a list in some records: keep only the first value
    # (change it if more information is needed).
    data['type'] = data['type'].apply(_first_if_list)
    # Multiple identifiers are flattened into one comma-separated string.
    data['identifier'] = data['identifier'].apply(
        lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)
    # The former 'format' clean-up was dropped: that column is not part of the output.

    data['link'] = data['link'].apply(givemelink)
    data['title_en'] = data['defaultTitle'].apply(translate_text)

    project.log_dataitem("siat_trentino", data=data, kind='table', index=False)

Writing src/create-list.py


In [5]:
# Register src/create-list.py as the project function "create_list": a python
# function whose handler is the create_list function of that file, with
# deep_translator declared as an extra requirement (the file imports it).
func = proj.new_function(name="create_list",
                         kind="python",
                         python_version="PYTHON3_10",
                         source={"source": "src/create-list.py", "handler": "create_list"},
                         requirements=["deep_translator"]) #"geopandas"

In [6]:
# Launch the create_list function as a job with no inputs and no outputs.
# local_execution=False presumably asks the platform to run it remotely
# rather than in this kernel - TODO confirm.
run_create_list = func.run(action="job",inputs={},outputs={}, local_execution=False)

In [7]:
# Look up the 'siat_trentino' dataitem (the name create_list logs its table under)
# and load it through as_df() for inspection; `di` is reused later as the input
# of the convert_list run.
di = proj.get_dataitem('siat_trentino')
df = di.as_df()

In [8]:
%%writefile "src/convert-list.py"

# import geopandas
import os
import urllib.request
import pandas as pd

# def read_geopandas(name, url, project):
#     gdf = geopandas.read_file(url)
#     gdf.to_parquet('./'+name+'.parquet')
#     with open('./'+name+'.parquet', 'rb') as in_file:
#         content = in_file.read()
#         project.log_dataitem(name=name, kind="table", data=content)
#         os.remove('./'+name+'.parquet')
        

def read_file(name, url, project):
    """Download ``url`` to a temporary local file and log it as a project artifact.

    The local file is named ``./<name><extension>``, where the extension is
    taken from the path component of the URL (so query strings and fragments do
    not end up in the file name; a URL without an extension yields ``./<name>``).
    The file is always removed afterwards, even when the download or the
    logging fails.

    Args:
        name: artifact name, also used as the local file stem.
        url: location of the file to download.
        project: project object exposing ``log_artifact``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``project.log_artifact``
        raise; the temporary file is cleaned up before the error propagates.
    """
    from urllib.parse import urlparse

    # Extension (including the leading dot) of the last path segment only.
    extension = os.path.splitext(urlparse(url).path)[1]
    path = f"./{name}{extension}"
    try:
        urllib.request.urlretrieve(url, path)
        project.log_artifact(name=name, kind="artifact", source=path)
    finally:
        # A failed download may leave no file, or only a partial one, behind.
        if os.path.exists(path):
            os.remove(path)
 
def convert_list(project, list):
    """Store every downloadable entry of the catalogue table as a project artifact.

    Each row needs a ``title_en`` (turned into the artifact name) and a
    ``link``. Links ending in ``.zip`` or ``.png`` are downloaded and logged
    through ``read_file``. Every row that could not be converted is collected
    and logged as the ``"error_data"`` table: unsupported link types, rows
    whose title or link is not a string, and rows whose download or logging
    failed.

    Args:
        project: project object exposing ``log_artifact`` / ``log_dataitem``.
        list: dataitem exposing ``as_df()``. The name shadows the builtin but
            is the input name the job is launched with, so it is kept.
    """
    supported_extensions = (".zip", ".png")
    records = list.as_df().to_dict(orient='records')
    error_data = []
    for item in records:
        name = item['title_en']
        link = item['link']
        print(f"Converting {name} to artifact (link {link})")

        # Missing titles/links (None, NaN) and unsupported file types cannot
        # be converted: record them instead of crashing on .lower()/.endswith().
        convertible = (isinstance(name, str) and isinstance(link, str)
                       and link.endswith(supported_extensions))
        if not convertible:
            print(f"Error reading file: {item['link']}")
            error_data.append(item)
            continue

        # convert name to lower case and replace spaces with underscores
        name = name.lower().replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_")
        try:
            read_file(name, link, project)
        except Exception as exc:
            # Best effort: one failing entry must not abort the whole run, but
            # it is reported and kept in error_data so it can be retried.
            print(f"Error reading file: {item['link']} ({type(exc).__name__}: {exc})")
            error_data.append(item)

    project.log_dataitem(name="error_data", kind="table", data=pd.DataFrame(error_data))

Writing src/convert-list.py


In [9]:
# Register src/convert-list.py as the project function "convert_list" (handler:
# the convert_list function of that file). Note that this rebinds `func`, which
# previously referred to the create_list function.
func = proj.new_function(name="convert_list",
                         kind="python",
                         python_version="PYTHON3_10",
                         source={"source": "src/convert-list.py", "handler": "convert_list"}) #requirements=["geopandas"] 

In [10]:
# Launch convert_list as a job; the key of the 'siat_trentino' dataitem is passed
# as the "list" input, matching the `list` parameter of the handler.
# local_execution=False presumably requests remote execution - TODO confirm.
run_convert_list = func.run(action="job",inputs={"list": di.key}, local_execution=False) 

In [11]:
# %pip install geopandas

In [12]:
# import pandas as pd
# import urllib.request
# import os

# name = 'Penalty for Deep Gravitational Slope Deformations (DGPV)'
# name = name.lower().replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_")

# url = 'https://siatservices.provincia.tn.it/idt/vector/p_TN_a9108393-45b3-4b9d-a5b9-24158aa7dca4.zip'
# extension = url.split(".")[-1]
# path = f"./{name}.{extension}"
# urllib.request.urlretrieve(url, path)
# proj.log_artifact(name='test', kind="artifact", source=path)
# os.remove(path)   

In [1]:
# import pandas as pd
# import urllib.request
# import os
# import geopandas

# url = 'https://siatservices.provincia.tn.it/idt/vector/p_TN_51662a53-60cc-4b00-ba14-ab457bcf4f44.zip'
# name = 'Ordinary contour lines 10m step for CTP'
# name = name.lower().replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_")
# gdf = geopandas.read_file(url)
# gdf

Unnamed: 0,OBJECTID_1,id,contour,sez_id,cv_liv_dt,ty_cv_liv,cv_liv_q,cv_liv_cer,fonte,scala,...,cv_10,cv_25,cv_50,cv_100,cv_200,ent_vs,semplifica,Shape_Leng,classid,geometry
0,1.0,1.0,3740.0,24160.0,03,02,3740.0,01,03,04,...,1.0,0.0,0.0,0.0,0.0,1,0.0,393.958142,GEO001_1,"LINESTRING (624169.076 5144568.924, 624169.673..."
1,2.0,2.0,3380.0,24160.0,03,02,3380.0,01,03,04,...,1.0,0.0,0.0,0.0,0.0,1,0.0,40.128869,GEO001_2,"LINESTRING (624998.958 5144346.416, 624999.601..."
2,3.0,3.0,3420.0,24160.0,03,02,3420.0,01,03,04,...,1.0,0.0,0.0,0.0,0.0,1,0.0,1.719230,GEO001_3,"LINESTRING (624767.823 5140988.177, 624767.689..."
3,4.0,4.0,2870.0,25100.0,03,02,2870.0,01,03,04,...,1.0,0.0,0.0,0.0,0.0,1,0.0,14.208717,GEO001_4,"LINESTRING (634756.899 5148493.92, 634756.207 ..."
4,5.0,5.0,2560.0,42140.0,03,02,2560.0,01,03,04,...,1.0,0.0,0.0,0.0,0.0,1,0.0,1540.277400,GEO001_5,"LINESTRING (636900.956 5121106.561, 636901.348..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116716,116718.0,116718.0,2910.0,0.0,03,02,2910.0,01,03,04,...,1.0,0.0,0.0,0.0,0.0,1,0.0,290.073645,GEO001_116718,"LINESTRING (623429.86 5110496.828, 623429.773 ..."
116717,116719.0,116719.0,2890.0,0.0,03,02,2890.0,01,03,04,...,1.0,0.0,0.0,0.0,0.0,1,0.0,354.400800,GEO001_116719,"LINESTRING (623508.774 5110492.786, 623508.677..."
116718,116720.0,116720.0,2880.0,0.0,03,02,2880.0,01,03,04,...,1.0,0.0,0.0,0.0,0.0,1,0.0,340.211264,GEO001_116720,"LINESTRING (623514.346 5110498.631, 623513.626..."
116719,116721.0,116721.0,2870.0,0.0,03,02,2870.0,01,03,04,...,1.0,0.0,0.0,0.0,0.0,1,0.0,329.177394,GEO001_116721,"LINESTRING (623529.045 5110509.653, 623526.759..."


In [2]:
# gdf.type

0         LineString
1         LineString
2         LineString
3         LineString
4         LineString
             ...    
116716    LineString
116717    LineString
116718    LineString
116719    LineString
116720    LineString
Length: 116721, dtype: object

In [19]:
# df1 = pd.DataFrame(gdf)

In [None]:
# proj.log_dataitem(name=name, kind="table", source=df1)