# FZ8 PREP (2023 - 2025)

In [1]:
"""
Process all “regular” FZ-8 workbooks (2023-2025) and build the final,
clean data sets used downstream.

Overview
========
Unlike the legacy 2020-2022 file (single PDF-style workbook), the
2023-2025 data arrive month-by-month as normal Excel files
`fz8_YYYYMM.xlsx`.  
For every file we

1. **Detect & parse** sheets 8.2, 8.3, 8.6, 8.7, 8.8, 8.9 and 8.16  
   (sheet 8.1 no longer exists in this era).
2. **Clean** each sheet with a dedicated parser (remove totals, tidy
   headers, convert NaN → “”, etc.).
3. **Append** a `Date` column (the YYYYMM extracted from the filename).
4. **Accumulate** rows per sheet across all months.
5. **Export** one “raw” CSV per sheet for 2023-2025:

       ../data/raw/fz8/csv/fz_8.<N>_2023-2025_raw.csv

6. **Validate**: compare the new headers with those from the 2020-2022
   period; if they match 1:1, concatenate the two eras.
7. **Save** the final, merged files:

       ../data/processed/fz_8.<N>_raw.csv

Folder layout
-------------
* `../data/raw/fz8/`        monthly Excel workbooks  
* `../data/raw/fz8/csv/`    intermediate “raw” CSVs (both eras)  
* `../data/processed/`      final merged CSVs for analytics

Key points
----------
* **Robust sheet lookup** – tolerates “FZ 8.2” *and* “FZ8.2”.
* **Strict header check** – prevents silent schema drift.
* **All cells are stored as text** (`dtype=str`), no numeric coercion.
* **Console log** prints a concise ✓ for each successful step and an entry for any discrepancies.
"""



## ─────────────────────────────  IMPORTS & PATHS  ─────────────────────────────

In [2]:
import re
from pathlib import Path
import pandas as pd
from openpyxl import load_workbook
import warnings

# NOTE: this silences *every* FutureWarning (pandas deprecations included),
# not only known-noisy ones – lift it temporarily when upgrading dependencies.
warnings.filterwarnings("ignore", category=FutureWarning)

DATA_DIR = Path("../data/raw/fz8")          # input *.xlsx files
OUT_DIR  = Path("../data/raw/fz8/csv")      # will hold *_raw.csv
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Final merged CSVs.  The previous value "../data/processed/," carried a stray
# trailing comma, so mkdir() created a sub-folder literally named ",".
DST_DIR = Path("../data/processed")
DST_DIR.mkdir(parents=True, exist_ok=True)

## ─────────────────────────────  HELPERS  ─────────────────────────────

In [3]:
def _date_from_fname(p):
    """Return YYYYMM extracted from filename `fz8_YYYYMM.xlsx`."""
    return re.search(r"(\d{6})", p.name).group(1)

def _col(ws, letter, r0, r1):
    """Return values of column *letter* from rows *r0 … r1* inclusive."""
    return [ws[f"{letter}{row}"].value for row in range(r0, r1 + 1)]

# def _clean_header(s):
#     """Normalize header cell: collapse multiple spaces + remove newlines."""
#     return str(s).replace('\n', ' ').replace('  ', ' ').strip().upper() if s is not None else s

def _clean_header(s):
    """Normalize header cell: collapse multiple spaces + remove newlines."""
    return (str(s).translate(str.maketrans("äÄöÖüÜ", "aAoOuU")).replace("\n", " ").replace("  ", " ").strip().upper()) if s is not None else s

def _strip_cols(df):
    """
    Normalise all column labels of *df* with `_clean_header`.

    The labels are replaced on the passed object itself (no copy is
    made); the same object is returned so the call can be chained.
    """
    df.columns = list(map(_clean_header, df.columns))
    return df

def _unique(cols):
    """Ensure uniqueness by adding numeric suffixes."""
    seen, out = {}, []
    for c in cols:
        if c in seen:
            seen[c] += 1
            out.append(f"{c}{seen[c]}")
        else:
            seen[c] = 0
            out.append(c)
    return out

def _find_sheet(wb, num):
    """Locate sheet whose name matches *pattern* (case-insensitive)."""
    pattern = re.compile(fr"^FZ\s*8\.{re.escape(num)}$", flags=re.IGNORECASE)
    for name in wb.sheetnames:
        if pattern.match(name.strip()):
            return name
    return None

# def _strip_upper(df):
#     """
#     Trim whitespace and up-case every object column (in-place).
#     Faster & future-proof replacement for the old `applymap`.
#     """
#     for col in df.columns:
#         if df[col].dtype == "object":
#             df[col] = df[col].str.strip().str.upper()
#     return df

## ─────────────────────────────  PARSERS  ─────────────────────────────

In [4]:
"""
Each fz8_X returns a cleaned DataFrame for one sheet.
Only key points differ, so code is condensed for brevity.
If you need to adjust one sheet,
change the corresponding function only.
"""

'\nEach fz8_X returns a cleaned DataFrame for one sheet.\nOnly key points differ, so code is condensed for brevity.\nIf you need to adjust one sheet,\nchange the corresponding function only.\n'

In [5]:
def fz8_1(ws):
    """
    Parse sheet **“FZ 8.1”** – monthly total registrations by make.

    NOTE(review): the notebook header states that sheet 8.1 no longer
    exists in the 2023-2025 era – confirm whether this parser is still
    needed.

    Cleaning workflow
    -----------------
    1. Header texts are read from fixed cells, normalised with
       `_clean_header()` and passed through `_unique()`.
    2. Rows 10–100 of the header columns are loaded with ``dtype=object``
       (raw cell values, no int → float upcast when a column has blanks);
       completely empty rows are dropped.
    3. `_strip_cols()` runs again for idempotent safety.
    4. Remove any row whose first column (*Marke*) contains one of
       “INSGESAMT, FLENSBURG, HINWEIS, UMBENANNT”.
    5. String cells are trimmed, double blanks collapsed, upper-cased.
    6. Value columns are rendered as text; blank cells, the placeholders
       “-” / “.” and zero become ``""`` (blank cells used to leak out as
       the literal strings "None" / "nan").

    Parameters
    ----------
    ws : openpyxl.worksheet.Worksheet
        Worksheet instance for tab “FZ 8.1”.

    Returns
    -------
    pandas.DataFrame
        Clean table ready for vertical concatenation across months.
    """
    header_cells = [
        "B8",  # Marke
        "C9",  # Anzahl
    ]
    first_row, last_row = 10, 100

    cols = _unique([_clean_header(ws[addr].value) for addr in header_cells])

    # read data block – every column sits directly below its header cell
    df = pd.DataFrame(
        {
            name: _col(ws, addr.rstrip("0123456789"), first_row, last_row)
            for name, addr in zip(cols, header_cells)
        },
        dtype=object,
    ).dropna(how="all")

    # normalise header texts once more (harmless if already clean)
    df = _strip_cols(df)

    # drop meta rows (addressed by position, so a re-cleaned label cannot break the lookup)
    trash = r"INSGESAMT|FLENSBURG|HINWEIS|UMBENANNT"
    mask = df.iloc[:, 0].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # post-cleanup
    def _tidy(v):
        # trim, collapse double blanks and upper-case strings; other types pass through
        return v.replace("  ", " ").strip().upper() if isinstance(v, str) else v

    def _as_text(v):
        # value cell → text; blanks, "-" / "." placeholders and zero → ""
        v = _tidy(v)
        if pd.isna(v):
            return ""
        text = str(v)
        return "" if text in ("-", ".", "0") else text

    # Series.map replaces the deprecated DataFrame.applymap
    for name in list(df.columns[:1]):
        df[name] = df[name].map(_tidy)
    for name in list(df.columns[1:]):
        df[name] = df[name].map(_as_text)

    # an exact "0" in the label column is blanked as well (unchanged behaviour)
    return df.replace("0", "")

In [6]:
def fz8_2(ws):
    """
    Parse sheet **“FZ 8.2”** – environmental breakdown by make.

    Cleaning workflow
    -----------------
    1. Header texts are read from fixed cells; `_clean_header()` trims
       new-lines and duplicate spaces, `_unique()` appends numeric
       suffixes to duplicate titles (if any).
    2. Rows 11–100 of the header columns are loaded with ``dtype=object``
       (raw cell values, no int → float upcast when a column has blanks);
       fully-empty rows are removed.
    3. Run `_strip_cols()` (idempotent but keeps headers tidy).
    4. Drop rows whose first column (*Marke*) contains one of
       “INSGESAMT, FLENSBURG, HINWEIS, UMBENANNT”.
    5. String cells are trimmed, double blanks collapsed, upper-cased.
    6. Value columns are rendered as text; blank cells, the placeholders
       “-” / “.” and zero become ``""`` (blank cells used to leak out as
       the literal strings "None" / "nan").

    Parameters
    ----------
    ws : openpyxl.worksheet.Worksheet
        The worksheet object for tab “FZ 8.2”.

    Returns
    -------
    pandas.DataFrame
        Clean table, ready to be concatenated with other months.
    """
    header_cells = [
        "B8",   # Marke
        "C8",   # Anzahl
        "D8",   # CO2-Emission in g/km
        "F9",   # Euro 6
        "I9",   # Elektro (BEV)
        "J9",   # Hybrid
        "K10",  # darunter Plug-in
    ]
    first_row, last_row = 11, 100

    cols = _unique([_clean_header(ws[addr].value) for addr in header_cells])

    # read data block – every column sits directly below its header cell
    df = pd.DataFrame(
        {
            name: _col(ws, addr.rstrip("0123456789"), first_row, last_row)
            for name, addr in zip(cols, header_cells)
        },
        dtype=object,
    ).dropna(how="all")

    # normalise header texts once more (harmless if already clean)
    df = _strip_cols(df)

    # drop meta rows (addressed by position, so a re-cleaned label cannot break the lookup)
    trash = r"INSGESAMT|FLENSBURG|HINWEIS|UMBENANNT"
    mask = df.iloc[:, 0].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # post-cleanup
    def _tidy(v):
        # trim, collapse double blanks and upper-case strings; other types pass through
        return v.replace("  ", " ").strip().upper() if isinstance(v, str) else v

    def _as_text(v):
        # value cell → text; blanks, "-" / "." placeholders and zero → ""
        v = _tidy(v)
        if pd.isna(v):
            return ""
        text = str(v)
        return "" if text in ("-", ".", "0") else text

    # Series.map replaces the deprecated DataFrame.applymap
    for name in list(df.columns[:1]):
        df[name] = df[name].map(_tidy)
    for name in list(df.columns[1:]):
        df[name] = df[name].map(_as_text)

    # an exact "0" in the label column is blanked as well (unchanged behaviour)
    return df.replace("0", "")

In [7]:
def fz8_3(ws):
    """
    Parse sheet **“FZ 8.3”** – drive-train mix by vehicle segment and model line.

    Cleaning workflow
    -----------------
    1. _Header normalisation_ – header texts are read from fixed cells;
       `_clean_header()` trims new-lines / double spaces; `_unique()`
       appends numeric suffixes to duplicates (the CO2 and consumption
       titles repeat per fuel type).
    2. Rows 11–500 of the header columns are loaded with ``dtype=object``
       (raw cell values, no int → float upcast when a column has blanks);
       completely empty rows are dropped.
    3. `_strip_cols()`: idempotent header clean-up.
    4. Forward-fill the **Segment** column so sub-rows inherit their segment.
    5. Remove meta / total rows whose *segment* column contains any of
       “ZUSAMMEN, INSGESAMT, HINWEISE, AUSGEWIESEN, KRAFTSTOFFVERBRAUCH,
        FLENSBURG, HINWEIS, UMBENANNT”.
    6. If a segment row is marked “SONSTIGE”, force *Modellreihe* to the
       literal `"SONSTIGE"` for clarity.
    7. String cells are trimmed, double blanks collapsed, upper-cased.
    8. Value columns (third column onwards) are rendered as text; blank
       cells, the placeholders “-” / “.” and zero become ``""`` (blank
       cells used to leak out as the literal strings "None" / "nan").

    Parameters
    ----------
    ws : openpyxl.worksheet.Worksheet
        Worksheet handle for sheet “FZ 8.3”.

    Returns
    -------
    pandas.DataFrame
        Clean table, ready to be concatenated across months.
    """
    header_cells = [
        "B8",   # Segment
        "C8",   # Modellreihe
        "D8",   # Insgesamt
        "E10",  # CO2-Emission in g/km
        "F9",   # Benzin
        "G10",  # CO2-Emission in g/km
        "H10",  # Kraftstoffverbrauch in l/100 km
        "I9",   # Diesel
        "J10",  # CO2-Emission in g/km
        "K10",  # Kraftstoffverbrauch in l/100 km
        "L9",   # Erdgas (CNG) (einschl. bivalent)
        "M9",   # Flüssiggas (LPG) (einschl. bivalent)
        "N9",   # Hybrid
        "O10",  # dar. Plug-in
        "P9",   # Elektro (BEV)
    ]
    first_row, last_row = 11, 500
    n_label_cols = 2  # Segment + Modellreihe hold text, everything else values

    cols = _unique([_clean_header(ws[addr].value) for addr in header_cells])

    # read data block – every column sits directly below its header cell
    df = pd.DataFrame(
        {
            name: _col(ws, addr.rstrip("0123456789"), first_row, last_row)
            for name, addr in zip(cols, header_cells)
        },
        dtype=object,
    ).dropna(how="all")

    # normalise header texts once more (harmless if already clean)
    df = _strip_cols(df)

    # forward-fill Segment (falls back to the first column if no label mentions it)
    seg_col = next((c for c in df.columns if "segment" in str(c).lower()), df.columns[0])
    df[seg_col] = df[seg_col].ffill()

    # drop meta rows
    trash = r"ZUSAMMEN|INSGESAMT|HINWEISE|AUSGEWIESEN|KRAFTSTOFFVERBRAUCH|FLENSBURG|HINWEIS|UMBENANNT"
    mask = df[seg_col].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # flag “SONSTIGE” rows
    mod_col = next((c for c in df.columns if "modellreihe" in str(c).lower()), None)
    if mod_col:
        is_sonstige = df[seg_col].astype(str).str.contains(r"\bSONSTIGE\b", case=False, na=False)
        df.loc[is_sonstige, mod_col] = "SONSTIGE"

    # post-cleanup
    def _tidy(v):
        # trim, collapse double blanks and upper-case strings; other types pass through
        return v.replace("  ", " ").strip().upper() if isinstance(v, str) else v

    def _as_text(v):
        # value cell → text; blanks, "-" / "." placeholders and zero → ""
        v = _tidy(v)
        if pd.isna(v):
            return ""
        text = str(v)
        return "" if text in ("-", ".", "0") else text

    # Series.map replaces the deprecated DataFrame.applymap
    for name in list(df.columns[:n_label_cols]):
        df[name] = df[name].map(_tidy)
    for name in list(df.columns[n_label_cols:]):
        df[name] = df[name].map(_as_text)

    # an exact "0" in a label column is blanked as well (unchanged behaviour)
    return df.replace("0", "")

In [8]:
def fz8_6(ws):
    """
    Parse sheet **“FZ 8.6”** – power-train mix by federal state.

    Cleaning workflow
    -----------------
    1. Header texts are read from fixed cells and normalised via
       `_clean_header()`; duplicates get a numeric suffix via `_unique()`.
    2. Rows 10–30 of the header columns are loaded with ``dtype=object``
       (raw cell values, no int → float upcast when a column has blanks);
       fully empty rows are dropped.
    3. Run `_strip_cols()` (idempotent header tidy-up).
    4. Remove any row whose first column contains one of the keywords
       “FLENSBURG, HINWEIS, UMBENANNT”.
    5. String cells are trimmed, double blanks collapsed, upper-cased.
    6. Value columns are rendered as text; blank cells, the placeholders
       “-” / “.” and zero become ``""`` (blank cells used to leak out as
       the literal strings "None" / "nan").

    Parameters
    ----------
    ws : openpyxl.worksheet.Worksheet
        Worksheet object for tab “FZ 8.6”.

    Returns
    -------
    pandas.DataFrame
        Clean table ready for monthly concatenation.
    """
    header_cells = [
        "B8",  # Bundesland
        "C8",  # Benzin insgesamt
        "D8",  # Darunter Euro 6
        "G8",  # Diesel insgesamt
        "H8",  # Darunter Euro 6
        "K8",  # Flüssiggas (LPG)  (einschl. bivalent)
        "L8",  # Erdgas (CNG) (einschl. bivalent)
        "M8",  # Elektro (BEV)
        "N8",  # Hybrid
        "O9",  # darunter Plug-in
        "P8",  # Sonstige
    ]
    first_row, last_row = 10, 30

    cols = _unique([_clean_header(ws[addr].value) for addr in header_cells])

    # read data block – every column sits directly below its header cell
    df = pd.DataFrame(
        {
            name: _col(ws, addr.rstrip("0123456789"), first_row, last_row)
            for name, addr in zip(cols, header_cells)
        },
        dtype=object,
    ).dropna(how="all")

    # normalise header texts once more (harmless if already clean)
    df = _strip_cols(df)

    # drop meta rows (addressed by position, so a re-cleaned label cannot break the lookup)
    trash = r"FLENSBURG|HINWEIS|UMBENANNT"
    mask = df.iloc[:, 0].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # post-cleanup
    def _tidy(v):
        # trim, collapse double blanks and upper-case strings; other types pass through
        return v.replace("  ", " ").strip().upper() if isinstance(v, str) else v

    def _as_text(v):
        # value cell → text; blanks, "-" / "." placeholders and zero → ""
        v = _tidy(v)
        if pd.isna(v):
            return ""
        text = str(v)
        return "" if text in ("-", ".", "0") else text

    # Series.map replaces the deprecated DataFrame.applymap
    for name in list(df.columns[:1]):
        df[name] = df[name].map(_tidy)
    for name in list(df.columns[1:]):
        df[name] = df[name].map(_as_text)

    # an exact "0" in the label column is blanked as well (unchanged behaviour)
    return df.replace("0", "")

In [9]:
def fz8_7(ws):
    """
    Parse sheet **“FZ 8.7”** – colour distribution by make.

    Cleaning workflow
    -----------------
    1. Header texts are read from fixed cells and normalised with
       `_clean_header()`.  Duplicate titles get a numbered suffix via
       `_unique()`.
    2. Rows 10–100 of the header columns are loaded with ``dtype=object``
       (raw cell values, no int → float upcast when a column has blanks);
       fully-empty rows are dropped.
    3. Run `_strip_cols()` again (idempotent safeguard).
    4. Remove any row whose first column contains one of the keywords
       “INSGESAMT, DARUNTER, FLENSBURG, HINWEIS, UMBENANNT”.
    5. String cells are trimmed, double blanks collapsed, upper-cased.
    6. Value columns are rendered as text; blank cells, the placeholders
       “-” / “.” and zero become ``""`` (blank cells used to leak out as
       the literal strings "None" / "nan").

    Parameters
    ----------
    ws : openpyxl.worksheet.Worksheet
        Worksheet object pointing to “FZ 8.7”.

    Returns
    -------
    pandas.DataFrame
        Cleaned table ready to be concatenated with monthly data.
    """
    header_cells = [
        "B8",  # Marke
        "C8",  # Insgesamt
        "D9",  # weiß
        "E9",  # gelb
        "F9",  # orange
        "G9",  # rot
        "H9",  # lila/violett
        "I9",  # blau
        "J9",  # grün
        "K9",  # grau
        "L9",  # braun
        "M9",  # schwarz
        "N9",  # sonstige
    ]
    first_row, last_row = 10, 100

    cols = _unique([_clean_header(ws[addr].value) for addr in header_cells])

    # read data block – every column sits directly below its header cell
    df = pd.DataFrame(
        {
            name: _col(ws, addr.rstrip("0123456789"), first_row, last_row)
            for name, addr in zip(cols, header_cells)
        },
        dtype=object,
    ).dropna(how="all")

    # normalise header texts once more (harmless if already clean)
    df = _strip_cols(df)

    # drop meta rows (addressed by position, so a re-cleaned label cannot break the lookup)
    trash = r"INSGESAMT|DARUNTER|FLENSBURG|HINWEIS|UMBENANNT"
    mask = df.iloc[:, 0].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # post-cleanup
    def _tidy(v):
        # trim, collapse double blanks and upper-case strings; other types pass through
        return v.replace("  ", " ").strip().upper() if isinstance(v, str) else v

    def _as_text(v):
        # value cell → text; blanks, "-" / "." placeholders and zero → ""
        v = _tidy(v)
        if pd.isna(v):
            return ""
        text = str(v)
        return "" if text in ("-", ".", "0") else text

    # Series.map replaces the deprecated DataFrame.applymap
    for name in list(df.columns[:1]):
        df[name] = df[name].map(_tidy)
    for name in list(df.columns[1:]):
        df[name] = df[name].map(_as_text)

    # an exact "0" in the label column is blanked as well (unchanged behaviour)
    return df.replace("0", "")

In [10]:
def fz8_8(ws):
    """
    Parse sheet **“FZ 8.8”** – vehicle distribution by engine-capacity class.

    NOTE(review): judging by the header comments below, the rows appear
    to be holder age groups and the columns engine-capacity classes –
    confirm against a workbook.

    Cleaning workflow
    -----------------
    1. **Header normalisation** – header texts are read from fixed cells;
       `_clean_header()` removes new-lines and duplicate spaces;
       `_unique()` adds numeric suffixes to duplicates.
    2. **Load data** – rows 11–35 of the header columns, ``dtype=object``
       (raw cell values, no int → float upcast when a column has blanks).
    3. **Drop empty rows** – `dropna(how="all")`.
    4. **Filter meta rows** – any row whose first column contains one of
       the keywords “HINWEIS, HUBRAUM, FLENSBURG, UMBENANNT” is discarded.
    5. **Tidy cells** – strings are trimmed, double blanks collapsed,
       upper-cased; value columns are rendered as text with blank cells,
       the placeholders “-” / “.” and zero turned into ``""`` (blank
       cells used to leak out as the literal strings "None" / "nan").

    Parameters
    ----------
    ws : openpyxl.worksheet.Worksheet
        Handle to sheet “FZ 8.8”.

    Returns
    -------
    pandas.DataFrame
        Clean table ready to be concatenated with other months.
    """
    header_cells = [
        "B8",   # Lebensalter der Halterinnen und Halter
        "C10",  # bis 1399
        "D10",  # 1400 bis 1999
        "E10",  # 2000 und mehr
        "F10",  # unbekannt
        "G9",   # Insgesamt
        "H9",   # darunter Halterinnen
    ]
    first_row, last_row = 11, 35

    cols = _unique([_clean_header(ws[addr].value) for addr in header_cells])

    # read data block – every column sits directly below its header cell
    df = pd.DataFrame(
        {
            name: _col(ws, addr.rstrip("0123456789"), first_row, last_row)
            for name, addr in zip(cols, header_cells)
        },
        dtype=object,
    ).dropna(how="all")

    # normalise header texts once more (harmless if already clean)
    df = _strip_cols(df)

    # drop meta rows (addressed by position, so a re-cleaned label cannot break the lookup)
    trash = r"HINWEIS|HUBRAUM|FLENSBURG|UMBENANNT"
    mask = df.iloc[:, 0].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # post-cleanup
    def _tidy(v):
        # trim, collapse double blanks and upper-case strings; other types pass through
        return v.replace("  ", " ").strip().upper() if isinstance(v, str) else v

    def _as_text(v):
        # value cell → text; blanks, "-" / "." placeholders and zero → ""
        v = _tidy(v)
        if pd.isna(v):
            return ""
        text = str(v)
        return "" if text in ("-", ".", "0") else text

    # Series.map replaces the deprecated DataFrame.applymap
    for name in list(df.columns[:1]):
        df[name] = df[name].map(_tidy)
    for name in list(df.columns[1:]):
        df[name] = df[name].map(_as_text)

    # an exact "0" in the label column is blanked as well (unchanged behaviour)
    return df.replace("0", "")

In [11]:
def fz8_9(ws):
    """
    Parse sheet **“FZ 8.9”** – distribution of holders (private vs. business).

    Cleaning rules
    --------------
    1. Header texts are read from fixed cells and normalised with
       `_clean_header()`; duplicates receive a numeric suffix via
       `_unique()`.
    2. Rows 11–100 of the header columns are loaded with ``dtype=object``
       (raw cell values, no int → float upcast when a column has blanks);
       entirely empty rows are dropped (`dropna(how="all")`).
    3. Rows whose first column matches one of the keywords
       “INSGESAMT, HINWEIS, ERBRINGUNG, FLENSBURG, UMBENANNT”
       are removed (totals / footnotes).
    4. String cells are trimmed, double blanks collapsed, upper-cased.
    5. Value columns are rendered as text; blank cells, the placeholders
       “-” / “.” and zero become ``""`` (blank cells used to leak out as
       the literal strings "None" / "nan").

    Parameters
    ----------
    ws : openpyxl.worksheet.Worksheet
        Worksheet object for the “FZ 8.9” tab.

    Returns
    -------
    pandas.DataFrame
        Cleaned table ready for monthly concatenation.
    """
    header_cells = [
        "B8",  # Marke
        "C8",  # Insgesamt
        "D8",  # Private Halterinnen und Halter
        "F8",  # Gewerbliche Halterinnen und Halter
        "H9",  # Kfz-Handel
        "J9",  # Kfz-Herstellung
        "L9",  # Kfz-Vermietung und Carsharing
        "N9",  # Erbringung sonstiger Dienstleistungen
        "P9",  # sonstige gewerbliche Halterinnen und Halter
        "R8",  # Unbekannt
    ]
    first_row, last_row = 11, 100

    cols = _unique([_clean_header(ws[addr].value) for addr in header_cells])

    # read data block – every column sits directly below its header cell
    df = pd.DataFrame(
        {
            name: _col(ws, addr.rstrip("0123456789"), first_row, last_row)
            for name, addr in zip(cols, header_cells)
        },
        dtype=object,
    ).dropna(how="all")

    # normalise header texts once more (harmless if already clean)
    df = _strip_cols(df)

    # drop meta rows (addressed by position, so a re-cleaned label cannot break the lookup)
    trash = r"INSGESAMT|HINWEIS|ERBRINGUNG|FLENSBURG|UMBENANNT"
    mask = df.iloc[:, 0].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # post-cleanup
    def _tidy(v):
        # trim, collapse double blanks and upper-case strings; other types pass through
        return v.replace("  ", " ").strip().upper() if isinstance(v, str) else v

    def _as_text(v):
        # value cell → text; blanks, "-" / "." placeholders and zero → ""
        v = _tidy(v)
        if pd.isna(v):
            return ""
        text = str(v)
        return "" if text in ("-", ".", "0") else text

    # Series.map replaces the deprecated DataFrame.applymap
    for name in list(df.columns[:1]):
        df[name] = df[name].map(_tidy)
    for name in list(df.columns[1:]):
        df[name] = df[name].map(_as_text)

    # an exact "0" in the label column is blanked as well (unchanged behaviour)
    return df.replace("0", "")

In [12]:
def fz8_16(ws):
    """
    Parse sheet **“FZ 8.16”** (zulässige Gesamtmasse).

    Cleaning rules
    --------------
    1. Header texts are read from fixed cells and normalised via
       `_clean_header()` to remove new-lines and double spaces.
       Duplicate names get a numeric suffix from `_unique()`.
    2. Rows 11–50 of the header columns are loaded with ``dtype=object``
       (raw cell values, no int → float upcast when a column has blanks);
       entirely empty rows are dropped (`dropna(how="all")`).
    3. Rows whose first column contains any of the keywords
       “INSGESAMT, HINWEIS, SATTELANHÄNGER, VERORDNUNG, FLENSBURG”
       are regarded as meta / totals and removed.
    4. String cells are trimmed, double blanks collapsed, upper-cased.
    5. Value columns are rendered as text; blank cells, the placeholders
       “-” / “.” and zero become ``""`` (blank cells used to leak out as
       the literal strings "None" / "nan").

    Parameters
    ----------
    ws : openpyxl.worksheet.Worksheet
        Worksheet object pointing to sheet “FZ 8.16”.

    Returns
    -------
    pandas.DataFrame
        Cleaned table ready for concatenation with other months.
    """
    header_cells = [
        "B8",  # Zulässige Gesamtmasse in kg
        "C8",  # Personenkraftwagen
        "D9",  # darunter  Wohnmobile
    ]
    first_row, last_row = 11, 50

    cols = _unique([_clean_header(ws[addr].value) for addr in header_cells])

    # read data block – every column sits directly below its header cell
    df = pd.DataFrame(
        {
            name: _col(ws, addr.rstrip("0123456789"), first_row, last_row)
            for name, addr in zip(cols, header_cells)
        },
        dtype=object,
    ).dropna(how="all")

    # normalise header texts once more (harmless if already clean)
    df = _strip_cols(df)

    # drop meta rows (addressed by position, so a re-cleaned label cannot break the lookup)
    trash = r"INSGESAMT|HINWEIS|SATTELANHÄNGER|VERORDNUNG|FLENSBURG"
    mask = df.iloc[:, 0].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # post-cleanup
    def _tidy(v):
        # trim, collapse double blanks and upper-case strings; other types pass through
        return v.replace("  ", " ").strip().upper() if isinstance(v, str) else v

    def _as_text(v):
        # value cell → text; blanks, "-" / "." placeholders and zero → ""
        v = _tidy(v)
        if pd.isna(v):
            return ""
        text = str(v)
        return "" if text in ("-", ".", "0") else text

    # Series.map replaces the deprecated DataFrame.applymap
    for name in list(df.columns[:1]):
        df[name] = df[name].map(_tidy)
    for name in list(df.columns[1:]):
        df[name] = df[name].map(_as_text)

    # an exact "0" in the label column is blanked as well (unchanged behaviour)
    return df.replace("0", "")

## ─────────────────────────────  VALIDATION FUNCTION  ─────────────────────────────

In [13]:
"""
-------------------------------------------------------------------------
LAYOUT-VALIDATION UTILITY FOR FZ-8 WORKBOOKS
-------------------------------------------------------------------------
Goal
────
Some Excel files (fz8_YYYYMM.xlsx) may have slightly shifted headers or
data blocks.  This helper checks every workbook in *DATA_DIR* and tells
you whether the coordinates and header texts are **exactly** the same
for each sheet type (8.1 … 8.16).  Run it once before you parse data so
you can fix misaligned sources early.

How it works
────────────
1. `header_map`   – expected cell addresses that contain column titles.
2. `data_start_row` – first row where real data (not headers) should start.
3. For each sheet number N:
      • read every workbook
      • collect header texts from those coordinates
      • compare to the first workbook (acts as “reference”)
      • verify that *at least one* value exists in the first data row

If anything is different, a human-readable list of issues is printed.
Otherwise you get a green message that all layouts match.

Extend / modify
───────────────
• Add / remove sheet numbers in `header_map`.
• Update coordinates if the KBA changes its template.
• Adjust `data_start_row` if header block grows/shrinks.
-------------------------------------------------------------------------
"""

# Expected header-cell addresses, keyed by sheet number (the "N" in "FZ 8.N").
# Insertion order is the order in which the sheets are checked.
header_map = {
    "1":  "B8 C9".split(),
    "2":  "B8 C8 D8 F9 I9 J9 K10".split(),
    "3":  "B8 C8 D8 E10 F9 G10 H10 I9 J10 K10 L9 M9 N9 O10 P9".split(),
    "6":  "B8 C8 D8 G8 H8 K8 L8 M8 N8 O9 P8".split(),
    "7":  "B8 C8 D9 E9 F9 G9 H9 I9 J9 K9 L9 M9 N9".split(),
    "8":  "B8 C10 D10 E10 F10 G9 H9".split(),
    "9":  "B8 C8 D8 F8 H9 J9 L9 N9 P9 R8".split(),
    "16": "B8 C8 D9".split(),
}

# First worksheet row expected to hold real data (not headers), per sheet number.
data_start_row = {
    "1": 10,
    "2": 11,
    "3": 11,
    "6": 10,
    "7": 10,
    "8": 11,
    "9": 11,
    "16": 11,
}

def check_fz8_layout():
    """
    Validate that every “FZ 8.x” sheet in `DATA_DIR` is aligned exactly
    like the first file encountered for that sheet number.

    For each sheet number in `header_map` and each workbook matching
    ``fz8_*.xlsx`` (visited in sorted filename order) the function

    1. reads the header texts at the coordinates listed in `header_map`
       and compares them with those of the first workbook that contains
       the sheet (the "reference");
    2. checks that the row given by `data_start_row` holds at least one
       value in the columns of those header coordinates.

    Each workbook is opened only once and all sheet numbers are checked
    against it; findings are still reported grouped by sheet number, in
    `header_map` order.

    Prints a bullet list with every discrepancy; prints a success
    message when no issues are found.

    Returns
    -------
    None
        The function is purely side-effecting (console output only).
    """
    reference = {}                                     # num -> (header texts, filename)
    issues_by_sheet = {num: [] for num in header_map}  # keeps the per-sheet grouping

    for path in sorted(DATA_DIR.glob("fz8_*.xlsx")):
        wb = load_workbook(path, data_only=True)       # load once, check all sheets

        for num, coords in header_map.items():
            sheet_issues = issues_by_sheet[num]

            sn = _find_sheet(wb, num)
            if not sn:
                sheet_issues.append(f"{path.name}: workbook 8.{num} not found")
                continue

            # collect header texts at the expected coordinates
            ws = wb[sn]
            names = [_clean_header(ws[c].value) for c in coords]

            # (1) compare to reference workbook
            if num not in reference:
                reference[num] = (names, path.name)
            else:
                ref_names, ref_file = reference[num]
                if names != ref_names:
                    sheet_issues.append(f"{path.name}: 8.{num} – {names} ≠ {ref_names} (reference {ref_file})")

            # (2) make sure the first data row is populated
            r0 = data_start_row[num]
            # Take the full column-letter prefix so multi-letter columns
            # (e.g. "AA8") are addressed correctly, not just their first letter.
            columns = [re.match(r"[A-Za-z]+", c).group(0) for c in coords]
            first_row = [ws[f"{col}{r0}"].value for col in columns]
            # Only None / "" count as empty; a numeric 0 is real data.
            if all(value is None or value == "" for value in first_row):
                sheet_issues.append(f"{path.name}: 8.{num} – row {r0} is empty, first data row shifted?")

    # Flatten in header_map order so the report stays grouped by sheet number
    issues = [msg for num in header_map for msg in issues_by_sheet[num]]

    # Report
    if issues:
        print("⚠️  Discrepancies have been detected:")
        for msg in issues:
            print(" •", msg)
    else:
        print("The layouts of all FZ8 sheets are identical (coordinates, headers, first data row)")

# Run the check once; it only prints its findings and returns None:
check_fz8_layout()

The layouts of all FZ8 sheets are identical (coordinates, headers, first data row)


## ─────────────────────────────  DICT WORKBOOK → PARSER  ─────────────────────────────

In [14]:
"""
-------------------------------------------------------------------------
Sheet-ID → parser mapping
-------------------------------------------------------------------------
• Keys are the trailing digits of the sheet name “FZ 8.<N>”
  kept as *strings*, because later we build filenames like:
      f"fz_8.{num}_raw.csv"

• Values are callables that take an `openpyxl.Worksheet` instance
  and return a *cleaned* `pandas.DataFrame` for that sheet.

Example
-------
    ws = workbook["FZ 8.3"]
    df = sheet_parsers["3"](ws)

`globals_by_sheet`
------------------
Pre-allocates one empty DataFrame per sheet ID.  
During the main loop we simply `pd.concat()` every monthly chunk onto
its accumulator, so when the loop ends each key contains **all rows**
for that sheet across every workbook.
-------------------------------------------------------------------------
"""

# Sheet number (as string) → parser callable for that sheet
sheet_parsers = {
    "2": fz8_2,
    "3": fz8_3,
    "6": fz8_6,
    "7": fz8_7,
    "8": fz8_8,
    "9": fz8_9,
    "16": fz8_16,
}

# One empty accumulator DataFrame per sheet number handled by a parser
globals_by_sheet = dict((num, pd.DataFrame()) for num in sheet_parsers.keys())

## ─────────────────────────────  MAIN LOOP  ─────────────────────────────

In [15]:
"""
-------------------------------------------------------------------------
Harvest every monthly workbook ─ append rows to the global accumulators
-------------------------------------------------------------------------
Workflow
========
1. Walk through all files that match “fz8_YYYYMM.xlsx”.
   `sorted()` keeps them in chronological order (optional, but tidy).

2. For each workbook:
     • Load with `data_only=True` so formulas are resolved to values.
     • Derive the period tag `YYYYMM` from the filename; this becomes
       a new column **DATE** in every parsed row.

3. For each sheet ID in `sheet_parsers` (2, 3, 6, 7, 8, 9, 16):
     • Locate the actual worksheet via `_find_sheet()` — tolerant of
       both “FZ 8.2” and “FZ8.2”.
     • If the sheet is missing, print a warning and continue gracefully.
     • Otherwise:
         ▸ run its parser → clean `DataFrame`
         ▸ prepend the **DATE** column
         ▸ `pd.concat()` onto the corresponding accumulator
           in `globals_by_sheet`.

Outcome
-------
After the loops finish, each entry in `globals_by_sheet`
holds **all rows from every workbook** for that sheet type.
They can be saved to disk or processed further in one go.
-------------------------------------------------------------------------
"""

# Monthly chunks are gathered in plain lists and concatenated once per sheet
# after the loop: growing a DataFrame with pd.concat() on every iteration
# re-copies all previously collected rows each time.
monthly_chunks = {num: [] for num in sheet_parsers}

for path in sorted(DATA_DIR.glob("fz8_*.xlsx")):
    wb   = load_workbook(path, data_only=True)      # read Excel as values
    date = _date_from_fname(path)                   # e.g. "202401"

    for num, parser in sheet_parsers.items():
        sname = _find_sheet(wb, num)                 # locate “FZ 8.<num>”
        if not sname:                               # skip missing sheets
            print(f"{path.name}: workbook 8.{num} not found")
            continue

        df = parser(wb[sname])                      # parse & clean
        df.insert(0, "DATE", date)                  # add period column
        monthly_chunks[num].append(df)

# Rebuild every accumulator from scratch (instead of appending to whatever it
# already holds) so that re-running this cell never duplicates rows.
for num, chunks in monthly_chunks.items():
    globals_by_sheet[num] = (
        pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
    )

## ─────────────────────────────  SAVE RAW CSVs  ─────────────────────────────

In [16]:
"""
-------------------------------------------------------------------------
Export 2023-2025 “raw” CSVs — one per sheet
-------------------------------------------------------------------------
• Each accumulator in `globals_by_sheet` now contains ONLY rows parsed
  from the 2023-2025 Excel workbooks.
• We store them as text-only CSV files named
        fz_8.<N>_2023-2025_raw.csv
  inside `OUT_DIR` (../data/raw/fz8/csv).
• Prior to saving we
      1. Convert every NaN → ""     (`fillna('')`)
      2. Cast every column to str   (`astype(str)`)
  so downstream tools will not see mixed dtypes.
• A compact log message is printed for each file, followed by
  `df.info()` (memory footprint + dtype overview) for manual sanity
  checks.
-------------------------------------------------------------------------
"""

for num in globals_by_sheet:
    # text-only snapshot of the accumulator: NaN → "", every column → str
    df = globals_by_sheet[num].fillna('').astype(str)

    # target: <OUT_DIR>/fz_8.<num>_2023-2025_raw.csv
    out_csv = OUT_DIR / f"fz_8.{num}_2023-2025_raw.csv"
    df.to_csv(out_csv, index=False, encoding="utf-8")

    # log the file name and shape, then the dtype overview, then a separator
    print(f"• Saved {out_csv.name}  →  {df.shape}\n")
    df.info()
    print("\n\n")

• Saved fz_8.2_2023-2025_raw.csv  →  (1653, 8)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1653 entries, 0 to 1652
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   DATE                  1653 non-null   object
 1   MARKE                 1653 non-null   object
 2   ANZAHL                1653 non-null   object
 3   CO2-EMISSION IN G/KM  1653 non-null   object
 4   EURO 6                1653 non-null   object
 5   ELEKTRO (BEV)         1653 non-null   object
 6   HYBRID                1653 non-null   object
 7   DARUNTER PLUG-IN      1653 non-null   object
dtypes: object(8)
memory usage: 103.4+ KB



• Saved fz_8.3_2023-2025_raw.csv  →  (10146, 16)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10146 entries, 0 to 10145
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   DATE 

## ──────────────────────────  CONCAT WITH 2020-2022  ──────────────────────────

In [17]:
"""
-------------------------------------------------------------------------
Merge “old” (2020-2022) and “new” (2023-2025) raw CSVs
-------------------------------------------------------------------------
Why two eras?
  • 2020-2022 data come from a single “PDF layout” workbook (§1 script).
  • 2023-2025 data are parsed month-by-month (§2 script).

Guard-rails
  1. We merge only if **both** CSV files exist.
  2. Headers must match *exactly* (same count, order, spelling).
     Otherwise we log a mismatch and skip that sheet.

Result
  One unified CSV per sheet:
      ../data/processed/fz_8.<N>_raw.csv
  All cells stored as text; NaN → "".
-------------------------------------------------------------------------
"""

sheet_nums = ["2", "3", "6", "7", "8", "9", "16"]
issues = []

for num in sheet_nums:
    f_old = OUT_DIR / f"fz_8.{num}_2020-2022_raw.csv"
    f_new = OUT_DIR / f"fz_8.{num}_2023-2025_raw.csv"

    # existence check – name every missing era, not just the first one
    missing = [era for era, f in (("old", f_old), ("new", f_new)) if not f.exists()]
    if missing:
        issues.append(f"8.{num}: missing {' and '.join(missing)} file")
        continue

    # keep_default_na=False: cells are text, so literal values such as
    # "NA", "None" or "null" must stay as written instead of being parsed
    # as NaN (and then blanked by fillna below). Empty fields load as "".
    df_old = pd.read_csv(f_old, dtype=str, keep_default_na=False)
    df_new = pd.read_csv(f_new, dtype=str, keep_default_na=False)

    # header consistency check (same count, order and spelling)
    if list(df_old.columns) != list(df_new.columns):
        issues.append(
            f"8.{num}: header mismatch.\n"
            f"  old: {list(df_old.columns)}\n"
            f"  new: {list(df_new.columns)}"
        )
        continue

    # concatenate & export
    df_all = (
        pd.concat([df_old, df_new], ignore_index=True)
          .fillna("")       # NaN → empty string (safety net)
          .astype(str)      # guarantee text dtype
    )

    # File name follows the documented pattern fz_8.<N>_raw.csv; the log line
    # is derived from the real path so the two cannot drift apart again.
    # NOTE(review): earlier runs wrote "fz_08.<N>_raw.csv" – confirm that no
    # downstream reader depends on that spelling.
    out_csv = DST_DIR / f"fz_8.{num}_raw.csv"
    df_all.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"{out_csv.name}  →  {df_all.shape}")

# summary report
if issues:
    print("\nDiscrepancies detected:")
    for msg in issues:
        print(" •", msg)
else:
    print("\nAll sheets have been successfully validated and merged.")

fz_8.2_raw.csv  →  (2949, 8)
fz_8.3_raw.csv  →  (21820, 16)
fz_8.6_raw.csv  →  (936, 12)
fz_8.7_raw.csv  →  (2953, 14)
fz_8.8_raw.csv  →  (1344, 8)
fz_8.9_raw.csv  →  (2949, 11)
fz_8.16_raw.csv  →  (2112, 4)

All sheets have been successfully validated and merged.
