# Merging and Transforming Parliamentary Data

This notebook consolidates all JSON data extracted from various state parliaments, transforms the structure, cleans up inconsistencies, and sorts entries chronologically. The result is a unified file `combined_transformed_data.json` ready for further processing.

In [1]:
# Import libraries 
import os      
import re      
import json    
from datetime import datetime  

**Define functions**

In [2]:
def parse_links(element_id: str):
    """
    Splits the 'ElementID' field into a list of dictionaries with 'label' and 'url'.

    The scraper joins several links with '|'. Each part is either 'label:url'
    or a bare value without a label.

    Args:
        element_id: Raw 'ElementID' string. None or an empty string is treated
            as "no links".

    Returns:
        A list of dictionaries with the keys 'label' and 'url', in input order.
        'label' is None for parts that contain no ':'.

    Note:
        A part is split at its FIRST ':'. A bare URL such as 'https://host/doc'
        therefore yields label 'https' and url '//host/doc'; callers have to
        repair such entries themselves.
    """
    # A missing or null field (e.g. JSON null) means there is nothing to parse
    if not element_id:
        return []

    links = []
    for raw_part in element_id.split("|"):
        part = raw_part.strip()
        if not part:
            # Skip empty segments such as the one produced by 'a | | b'
            continue
        label, separator, url = part.partition(":")
        if separator:
            # 'label:url' -> split only at the first ':' so the URL keeps its own colons
            links.append({"label": label.strip(), "url": url.strip()})
        else:
            # No ':' at all: store the whole part as the URL without a label
            links.append({"label": None, "url": part})
    return links

def parse_date(s: str):
    """
    Parse a date string of the form 'DD.MM.YYYY' into a datetime object.

    Any input that cannot be parsed (wrong format, empty string, non-string
    value) yields datetime.min instead of raising.
    """
    try:
        parsed = datetime.strptime(s, "%d.%m.%Y")
    except Exception:
        # Unparseable input: fall back to the smallest datetime value
        parsed = datetime.min
    return parsed

# German month names mapped to their two-digit month number
_GERMAN_MONTHS = {
    "Januar": "01", "Februar": "02", "März": "03", "April": "04",
    "Mai": "05", "Juni": "06", "Juli": "07", "August": "08",
    "September": "09", "Oktober": "10", "November": "11", "Dezember": "12"
}

# 'vom <day>[.] <month name> <year>', e.g. 'vom 29. August 2023'
_VOM_DATE_PATTERN = re.compile(r"vom\s*(\d{1,2})\.?\s*([A-Za-zäöüÄÖÜ]+)\s+(\d{4})")


def extract_date_from_text(text):
    """
    Extracts a date from texts like '(vom 29. August 2023)' or 'vom 29. August 2023'.

    All 'vom <day> <month name> <year>' occurrences are examined in order and
    the first one that names a known German month AND forms a real calendar
    date is used. This way a phrase such as 'vom 3. Quartal 2023' no longer
    hides a genuine date later in the text, and impossible dates such as
    '31. Februar 2023' are not returned.

    Args:
        text: Free text to search. None or an empty string yields ''.

    Returns:
        The date in the format 'DD.MM.YYYY' (e.g. '29.08.2023') if successful,
        otherwise an empty string.
    """
    # A missing or null description cannot contain a date
    if not text:
        return ""

    for match in _VOM_DATE_PATTERN.finditer(text):
        day, month_str, year = match.groups()
        month = _GERMAN_MONTHS.get(month_str)
        if month is None:
            # The word after the day is not a month name; try the next occurrence
            continue
        try:
            # Constructing a datetime rejects impossible day/month/year combinations
            datetime(int(year), int(month), int(day))
        except ValueError:
            continue
        # Return in the format 'DD.MM.YYYY'
        return f"{int(day):02d}.{month}.{year}"
    return ""

**Load and combine JSON files from all parliaments**

In [None]:
# One JSON export per state parliament
json_paths = [
    "./Brandenburg/Brandenburg_Data.json",
    "./Sachsen/Sachsen_Data.json",
    "./Thueringen/Thueringen_Data.json",
    "./Sachsen-Anhalt/SachsenAnhalt_Data.json",
    "./MeckPom/MeckPom_Data.json",
]

raw_data = []  # Collects the entries of every file that could be loaded

for path in json_paths:
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            content = json.load(f)
            if isinstance(content, list):
                # A list contributes each of its items as a separate entry
                raw_data.extend(content)
                entry_count = len(content)
            else:
                # Anything else is kept as one single entry
                raw_data.append(content)
                entry_count = 1
        # Report how many entries this file contributed
        print(f"Loaded {path}: {entry_count} entries")
    else:
        # Missing files are reported and skipped rather than aborting the run
        print(f"File not found: {path}")

# Report the overall number of entries across all files
print(f"Total loaded entries: {len(raw_data)}")

Loaded ./Brandenburg/Brandenburg_Data.json: 704 entries
Loaded ./Sachsen/Sachsen_Data.json: 1717 entries
Loaded ./Thueringen/Thueringen_Data.json: 763 entries
Loaded ./Sachsen-Anhalt/SachsenAnhalt_Data.json: 870 entries
Loaded ./MeckPom/MeckPom_Data.json: 593 entries
Total loaded entries: 4647


**Transform structure, normalize URLs, assign unique (counter) IDs**

In [None]:
transformed = []  # Transformed entries, in the same order as raw_data
counter = {}      # Running number of entries per parliament, used to build the IDs

for entry in raw_data:
    parliament = entry.get("Landtag", "XX")  # Parliament code, fallback 'XX' if the key is missing
    counter[parliament] = counter.get(parliament, 0) + 1
    new_id = f"{parliament}_{counter[parliament]}"  # Unique per parliament, e.g. SN_7, BB_1...

    # `or ""` also covers fields that exist but are null in the JSON; a plain
    # .get(key, "") would hand None to .strip() / the helpers and raise.
    date = (entry.get("Datum") or "").strip()
    if not date:
        # No explicit date: fall back to a date mentioned in the description text
        date = extract_date_from_text(entry.get("Beschreibungstext") or "")

    links = parse_links(entry.get("ElementID") or "")  # Parse links from the ElementID field
    cleaned_links = []  # Links after normalization

    for link in links:
        # Special case for MV and TH: a bare 'https://...' URL was split at the scheme
        # colon by parse_links, leaving label 'https' and a URL starting with '//'.
        is_split_https_url = (
            parliament in ["MV", "TH"]
            and link.get("label") == "https"
            and link.get("url", "").startswith("//")
        )
        if is_split_https_url:
            link["url"] = "https:" + link["url"]  # Restore the scheme
            link["label"] = "Dokument"            # Replace the bogus 'https' label
        cleaned_links.append(link)

    # Build the transformed entry with normalized fields
    transformed_entry = {
        "ID": new_id,
        "Landtag": parliament,
        "Datum": date,
        "Beschreibungstext": entry.get("Beschreibungstext", ""),
        "FilterDetails": entry.get("FilterDetails", []),
        "Links": cleaned_links,
    }

    # Carry over the extracted text only if the original entry has it
    if "ExtrahierterText" in entry:
        transformed_entry["ExtrahierterText"] = entry["ExtrahierterText"]

    transformed.append(transformed_entry)

print(f"Transformed entries: {len(transformed)}")

Transformed entries: 4647


**Final sorting and export**

In [None]:
def _chronological_key(item):
    """Sort key: the entry's 'Datum' parsed by parse_date (unparseable dates become datetime.min)."""
    return parse_date(item.get("Datum", ""))


# Sort the transformed data by date, in place
transformed.sort(key=_chronological_key)

# Write the combined, transformed data to a new JSON file
with open("combined_transformed_data.json", "w", encoding="utf-8") as f:
    json.dump(transformed, f, ensure_ascii=False, indent=4)

print("combined_transformed_data.json written successfully.")


combined_transformed_data.json written successfully.
