# Import metadata from Datenregister Berlin

In [5]:
import pandas as pd

pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500

import requests
from tqdm.notebook import tqdm
import json
import warnings
import time

warnings.simplefilter("ignore", category=(UserWarning, FutureWarning))

In [13]:
# Constants
CKAN_API_LINK = (
    "https://datenregister.berlin.de/api/3/action/current_package_list_with_resources"
)

DATA_PATH = "01_dcat_de_metadata.csv"

In [6]:
def get_full_package_list(limit=500, sleep=2):
    """Get full package list from CKAN API"""
    offset = 0
    frames = []
    while True:
        print(f"{offset} packages retrieved.")
        url = CKAN_API_LINK + f"?limit={limit}&offset={offset}"
        res = requests.get(url)
        data = json.loads(res.content)
        if data["result"] == []:
            break
        data = pd.DataFrame(pd.json_normalize(data["result"]))
        frames.append(data)
        offset += limit
        time.sleep(sleep)
    data = pd.concat(frames)
    data = data.reset_index(drop=True)
    return data

# Load data

In [7]:
# Retrieve metadata for all datasets from CKAN API.
df = get_full_package_list()
df.to_csv(DATA_PATH, index=False)

0 packages retrieved.
500 packages retrieved.
1000 packages retrieved.
1500 packages retrieved.
2000 packages retrieved.
2500 packages retrieved.
3000 packages retrieved.
3500 packages retrieved.


In [33]:
print(
    f"We have {len(df):,.0f} datasets in the catalogue and {df.shape[1]} properties.\n"
)
display(df.info(memory_usage="deep"))
display(df.head())

We have 3,310 datasets in the catalogue and 54 properties.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3310 entries, 0 to 3309
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   author                        3310 non-null   object
 1   author_email                  923 non-null    object
 2   berlin_source                 3310 non-null   object
 3   berlin_type                   3310 non-null   object
 4   creator_user_id               3310 non-null   object
 5   date_released                 3310 non-null   object
 6   date_updated                  2880 non-null   object
 7   geographical_coverage         3298 non-null   object
 8   geographical_granularity      3297 non-null   object
 9   id                            3310 non-null   object
 10  isopen                        3310 non-null   bool  
 11  license_id                    3310 non-null   object
 12  license_title   

None

Unnamed: 0,author,author_email,berlin_source,berlin_type,creator_user_id,date_released,date_updated,geographical_coverage,geographical_granularity,id,...,organization.is_organization,organization.approval_status,organization.state,attribution_text,username,hvd_category,preview_image,sample_record,extras,ressource_count
0,"Senatsverwaltung für Arbeit, Soziales, Gleichs...",bildungszeit@senasgiva.berlin.de,simplesearch,datensatz,50e8a594-1945-487e-b77e-3baa14f6a3c7,2015-10-19,2024-02-14,Berlin,Berlin,a6828125-ae95-404f-9fb0-52eb91302fd8,...,True,approved,active,,,,,,,5
1,LAGeSo,webmaster@lageso.berlin.de,simplesearch,datensatz,b1e7b5de-7dab-497a-ba76-867bf93f22b3,2015-07-22,2024-07-26,Berlin,Berlin,4bd75f18-20bf-45e9-af17-3bf03ab6dc58,...,True,approved,active,,,,,,,9
2,"Presse,- Öffentlichkeitsarbeit & Digitale Komm...",internet@ba-sz.berlin.de,simplesearch,datensatz,50e8a594-1945-487e-b77e-3baa14f6a3c7,2020-02-18,2024-08-02,Steglitz-Zehlendorf,Bezirk,ab9cc108-b6f1-4115-8169-68d9d793b484,...,True,approved,active,,,,,,,5
3,Industrie- und Handelskammer zu Berlin,,api-gitaction,datensatz,caa908e5-339d-4411-a11d-b13eb8e17f0d,2023-03-10,2024-08-02,Berlin,GPS-Koordinaten,52bb6a20-819d-49dc-bf75-58941f134e59,...,True,approved,active,,Theresa Ewert,,,,,1
4,VBB - Verkehrsverbund Berlin-Brandenburg GmbH,,webform,datensatz,b136bb34-d39b-4be2-8dd7-8779c2d92fe6,2013-07-01,2024-08-01,Berlin,GPS-Koordinaten,96643363-971f-43f9-b93e-e721c5b84872,...,True,approved,active,VBB Verkehrsverbund Berlin-Brandenburg GmbH,,,,,,1


# Examine resources

In [23]:
df["ressource_count"] = df.resources.apply(len)
print(
    f"Datasets have between {df.ressource_count.min()} and {df.ressource_count.max()} resources."
)
display(df.ressource_count.value_counts().sort_index())

Datasets have between 0 and 143 resources.


ressource_count
0         1
1      1022
2       353
3       176
4       595
5      1034
6        64
7        21
8         2
9        31
11        2
12        2
13        2
14        1
15        1
18        1
44        1
143       1
Name: count, dtype: int64

In [30]:
# Create dataframe from distributions.

results = []
for idx, row in df.iterrows():
    tmp = [pd.json_normalize(x) for x in row.resources]
    try:
        tmp = pd.concat(tmp)
    except Exception as e:
        print(f"Error at index {idx}: {e}")
        continue
    results.append(tmp)

dist = pd.concat(results)
dist.reset_index(drop=True, inplace=True)

Error at index 3121: No objects to concatenate


In [32]:
print(f"We have {len(dist):,.0f} ressources in the {len(df)} datasets.")
print()
display(dist.info())
display(dist.head())

We have 10,938 ressources in the 3310 datasets.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10938 entries, 0 to 10937
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   cache_last_updated         0 non-null      object
 1   cache_url                  7 non-null      object
 2   created                    10938 non-null  object
 3   description                10938 non-null  object
 4   format                     10938 non-null  object
 5   hash                       10938 non-null  object
 6   id                         10938 non-null  object
 7   language                   928 non-null    object
 8   last_modified              16 non-null     object
 9   metadata_modified          10938 non-null  object
 10  mimetype                   466 non-null    object
 11  mimetype_inner             352 non-null    object
 12  name                       9598 non-null   object
 13  package_id  

None

Unnamed: 0,cache_last_updated,cache_url,created,description,format,hash,id,language,last_modified,metadata_modified,...,main,resource_locator_function,resource_locator_protocol,weight,resource_group_id,webstore_last_updated,webstore_url,apiurl,Abteilung,Landesstelle
0,,,2024-08-03T00:28:38.320849,"Webseite von ""Suche nach anerkannten Veranstal...",HTML,,ccd6c622-8137-463f-b636-74e72c0787ca,de,,2024-08-03T00:28:38.290405,...,,,,,,,,,,
1,,,2024-08-03T00:28:38.320853,"REST-API mit JSON-Ausgabe für ""Suche nach aner...",JSON,,8d32c8ba-c9ba-49e2-aa4c-e41f0a731835,de,,2024-08-03T00:28:38.290607,...,,,,,,,,,,
2,,,2024-08-03T00:28:38.320855,"REST-API mit XML-Ausgabe für ""Suche nach anerk...",XML,,11c644dc-d341-4c3e-acdf-e17101e943c7,de,,2024-08-03T00:28:38.290741,...,,,,,,,,,,
3,,,2024-08-03T00:28:38.320856,"REST-API mit Excel(XLS)-Ausgabe für ""Suche nac...",XLS,,b076df5d-dfa1-41a5-8371-091937957d88,de,,2024-08-03T00:28:38.290862,...,,,,,,,,,,
4,,,2024-08-03T00:28:38.320858,"REST-API mit CSV-Ausgabe für ""Suche nach anerk...",CSV,,4a5ea596-fbbc-4da8-a438-810c19bb0cc1,de,,2024-08-03T00:28:38.290979,...,,,,,,,,,,
