In [6]:
import json, requests, time, itertools
import pandas as pd
from tqdm import tqdm
from dateutil.parser import parse

## #1. Retrieve packages
This script retrieves package metadata from data.gov.ua and saves it locally in **packages.json**.

In [7]:
# Get the package identifiers and turn each one into a package_show URL.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() reports an HTTP error here instead of as a confusing
# JSON/KeyError on the next line.
listing = requests.get("https://data.gov.ua/api/3/action/package_list", timeout=30)
listing.raise_for_status()
package_list = ["https://data.gov.ua/api/3/action/package_show?id={}".format(i)
                for i in listing.json()["result"]]

# Get the metadata of every package, keeping only the "result" payload of
# replies that the API marks as successful.
packages = []
for url in tqdm(package_list):
    try:
        reply = requests.get(url, timeout=30).json()
    except (requests.RequestException, ValueError) as err:
        # A single failed request or non-JSON reply must not discard everything
        # downloaded so far; report it and carry on with the next package.
        tqdm.write("skipped {}: {}".format(url, err))
    else:
        if reply.get("success"):
            packages.append(reply["result"])
    # pause between requests so the API is not hammered
    time.sleep(0.25)

# Save a copy of the JSON locally.
# The path matches the data/ directory that the parsing step reads from, and the
# encoding is explicit because ensure_ascii=False writes non-ASCII characters
# verbatim, which the platform default encoding may not be able to represent.
with open('data/packages.json', 'w', encoding='utf-8') as outfile:
    json.dump(packages, outfile, ensure_ascii=False)

## #2. Parse JSON

In [8]:
# Load the locally saved package metadata. The encoding is explicit because
# JSON files are UTF-8 by specification, while open() otherwise falls back to
# the platform default encoding.
with open('data/packages.json', encoding='utf-8') as json_file:
    packages = json.load(json_file)

# Sample datasets of publishers from the spreadsheet.
publishers = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vTqiHo39Su2M7E31zMrKrWAqCZVMlrl49_VdItmFjf2lWAelJmO8P8dcMuzc6gq9LcVgmCcXMX6z-Eg/pub?gid=1990056977&single=true&output=csv")

# Build the lookup once, as a set, instead of re-creating a list per package.
publisher_names = set(publishers["organization_name"])

# Keep only packages that belong to a sampled publisher. `or {}` guards against
# packages whose "organization" entry is missing or null, which would otherwise
# raise before the membership test is reached.
packages = [i for i in packages
            if (i.get("organization") or {}).get("name") in publisher_names]

# Create the DataFrame, one row per package. Optional keys fall back to the
# literal string 'null' when a package does not carry them.
packages_df = pd.DataFrame({
    "id": [i["id"] for i in packages],
    "title": [i["title"] for i in packages],
    "notes": [i["notes"] for i in packages],
    "purpose": [i["purpose_of_collecting_information"] for i in packages],
    "license_title": [i.get("license_title", 'null') for i in packages],
    "license_id": [i.get("license_id", 'null') for i in packages],
    "num_resources": [i.get("num_resources", 'null') for i in packages],
    "tag_string": [i["tag_string"] for i in packages],
    "num_tags": [i.get("num_tags", 'null') for i in packages],
    # timestamps are reduced to their calendar date in ISO format (YYYY-MM-DD)
    "metadata_created": [parse(i["metadata_created"]).date().isoformat() for i in packages],
    "metadata_modified": [parse(i["metadata_modified"]).date().isoformat() for i in packages],
    "organization_title": [i["organization"]["title"] for i in packages],
    "organization_name": [i["organization"]["name"] for i in packages],
    "maintainer": [i.get("maintainer", 'null') for i in packages],
    "maintainer_email": [i.get("maintainer_email", 'null') for i in packages],
})

# Derived columns: the dataset page URL and a "title (id)" label.
packages_df["url"] = ["https://data.gov.ua/dataset/{}".format(i) for i in packages_df["id"]]
packages_df["package_title_id"] = packages_df["title"] + " (" + packages_df["id"] + ")"

packages_df.to_csv("data/packages_all.csv", index=False)