# FZ10 PREP (2020 - 2025)

In [1]:
# ============================================================================#
#  FZ-10 WORKBOOK INGEST (2020-2025)                                          #
#  -------------------------------------------------------------------------  #
#  • Reads every “fz10_YYYY_MM.xlsx” in ../data/raw/fz10/                     #
#  • Sheet of interest:  **FZ 10.1**  (both “FZ10.1” and “FZ 10.1” accepted)  #
#  • 2020 files have a *short* column set (B-S); newer files use B-AQ.        #
#  • Cleans, de-dupes, forward-fills “Marke”, flags “SONSTIGE”, builds        #
#    composite column “Modellreihe” = Marke + Modell.                         #
#  • Saves raw CSV (all cells as text) to ../data/raw/fz10/csv/               #
#    and a convenience copy to ../data/processed/                             #
# ============================================================================#

## ─────────────────────────────  IMPORTS & PATHS  ─────────────────────────────

In [2]:
import re
from pathlib import Path
import pandas as pd
from openpyxl import load_workbook

# Directory layout (all paths are relative to the notebook's working directory).
DATA_DIR = Path("../data/raw/fz10")             # source workbooks
OUT_DIR  = DATA_DIR / "csv"                     # intermediate CSVs
OUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE: the previous value ended in "/," which created (and wrote into) a
# sub-directory literally named "," instead of ../data/processed/.
DST_DIR = Path("../data/processed")             # final analytics CSV
DST_DIR.mkdir(parents=True, exist_ok=True)

## ─────────────────────────────  HELPERS  ─────────────────────────────

In [3]:
def _date_from_fname(p):
    """Return YYYYMM extracted from filename `fz10_YYYY_MM.xlsx`."""
    y, m = re.search(r"(\d{4})_(\d{2})", p.name).groups()
    return y + m

def _clean_header(s):
    """Normalize header cell: collapse multiple spaces + remove newlines."""
    return str(s).replace('\n', ' ').replace('  ', ' ').strip() if s is not None else s

def _strip_cols(df):
    """Run every column label of *df* through `_clean_header`.

    The frame's ``columns`` attribute is reassigned (so the caller's object is
    modified) and the same frame is returned for convenient chaining.
    """
    cleaned_labels = []
    for label in df.columns:
        cleaned_labels.append(_clean_header(label))
    df.columns = cleaned_labels
    return df

def _unique(cols):
    """Ensure uniqueness by adding numeric suffixes."""
    seen, out = {}, []
    for c in cols:
        if c in seen:
            seen[c] += 1
            out.append(f"{c}{seen[c]}")
        else:
            seen[c] = 0
            out.append(c)
    return out

def _col(ws, letter, r0, r1):
    """Return values of column *letter* from rows *r0 … r1* inclusive."""
    return [ws[f"{letter}{row}"].value for row in range(r0, r1 + 1)]

def _find_sheet(wb, pattern=r"^FZ\s*10\.1$"):
    """Locate sheet whose name matches *pattern* (case-insensitive)."""
    regex = re.compile(pattern, flags=re.IGNORECASE)
    for name in wb.sheetnames:
        if regex.match(name.strip()):
            return name
    return None

## ─────────────────────────────  PARSER  ─────────────────────────────

In [4]:
def fz10_1(ws, is_2020, first_row=10, last_row=1000):
    """
    Parse sheet **FZ 10.1**.

    Parameters
    ----------
    ws : openpyxl.worksheet.Worksheet
    is_2020 : bool
        • True  → use legacy 2020 column map (letters B … S)
        • False → use current map      (letters B … AQ)
    first_row, last_row : int, optional
        Inclusive row range of the data block (defaults: rows 10-1000).

    Returns
    -------
    pandas.DataFrame
        Clean table with meta rows removed, “Marke” forward-filled, the
        sheet's “Modellreihe” column renamed to “Modell” and a new composite
        column “Modellreihe” (= Marke + " " + Modell) inserted right after it.

    Raises
    ------
    ValueError
        If one of the expected header cells is empty.
    """

    # (data column letter, header cell) pairs. The first five columns are
    # shared by both layouts; only the last three moved.
    # Expected labels, in order: Marke, Modellreihe, Insgesamt,
    # mit Dieselantrieb, mit Hybridantrieb (incl. Plug-in-Hybrid),
    # mit Elektroantrieb (BEV), mit Allradantrieb, Cabriolets.
    shared    = [("B", "B9"), ("C", "C9"), ("D", "D8"), ("G", "G8"), ("J", "J8")]
    tail_2020 = [("M", "M8"), ("P", "P8"), ("S", "S8")]
    tail_new  = [("AK", "AK8"), ("AN", "AN8"), ("AQ", "AQ8")]
    layout    = shared + (tail_2020 if is_2020 else tail_new)

    letters = [letter for letter, _ in layout]
    headers = []
    for _, coord in layout:
        text = _clean_header(ws[coord].value)
        if text is None:
            # An empty header used to surface as AttributeError on None.replace
            raise ValueError(f"FZ 10.1: header cell {coord} is empty — unexpected sheet layout")
        # remove verbose tail fragments
        for fragment in ("(incl. Plug-in-Hybrid)", "(BEV)"):
            text = text.replace(fragment, "").strip()
        headers.append(text)
    cols = _unique(headers)

    # read data block
    df = pd.DataFrame({
        name: _col(ws, letter, first_row, last_row)
        for name, letter in zip(cols, letters)
    }).dropna(how="all")

    # normalise header texts once more (harmless if already clean)
    df = _strip_cols(df)

    # rename “Modellreihe” → “Modell”
    renames = {c: "Modell" for c in df.columns if "modellreihe" in c.lower()}
    df = df.rename(columns=renames)
    modell_col = "Modell" if renames else None

    # fall back to the first column when no header mentions “Marke”
    marke_col = next((c for c in df.columns if "marke" in c.lower()), df.columns[0])

    # drop meta rows (totals, footnotes, publisher lines)
    trash = r"INSGESAMT|ZUSAMMEN|FLENSBURG|ANZAHL|HINWEIS|UMBENANNT"
    is_meta = df[marke_col].astype(str).str.contains(trash, case=False, na=False)
    df = df[~is_meta].reset_index(drop=True)

    # forward-fill Marke + flag SONSTIGE
    df[marke_col] = df[marke_col].ffill()
    # astype(str) keeps the mask robust when the column holds non-string cells
    sonstige_mask = df[marke_col].astype(str).str.contains(r"\bSONSTIGE\b", case=False, na=False)
    if sonstige_mask.any():
        next_col = df.columns[df.columns.get_loc(marke_col) + 1]
        df.loc[sonstige_mask, next_col] = "SONSTIGE"

    # insert composite “Modellreihe”
    if modell_col:
        pos = df.columns.get_loc(modell_col) + 1
        # Cast both parts to str: a numeric model cell would otherwise make
        # the string concatenation raise TypeError.
        marke_txt  = df[marke_col].fillna("").astype(str)
        modell_txt = df[modell_col].fillna("").astype(str)
        df.insert(pos, "Modellreihe", (marke_txt + " " + modell_txt).str.strip())

    return df

## ─────────────────────────────  VALIDATION FUNCTION  ─────────────────────────────

In [5]:
"""
─────────────────────────────  HEADER VALIDATION  ──────────────────────────
Two header layouts exist:
  • 2020 release  → letters  B-S   (M, P, S instead of AK, AN, AQ)
  • 2021-2025     → letters  B-AQ
"""

# Header cell coordinates of the legacy 2020 layout (last three: M, P, S).
HDR_2020 = ["B9", "C9", "D8", "G8", "J8", "M8", "P8", "S8"]
# Header cell coordinates of the 2021-2025 layout (last three: AK, AN, AQ).
# check_fz10_layout() tells the layouts apart via `coords is HDR_NEW`, so
# these two lists must stay distinct objects.
HDR_NEW  = ["B9", "C9", "D8", "G8", "J8", "AK8", "AN8", "AQ8"]
DATA_FIRST_ROW = 10             # first row of real data for *all* years

def _header_coords(path):
    """
    Pick the header-coordinate list that applies to a workbook.

    The decision is made on the filename only (e.g. `fz10_2020_05.xlsx`):
    a name containing “2020” selects the legacy list `HDR_2020`, any other
    name selects `HDR_NEW` (2021-2025). The module-level list object itself
    is returned, not a copy.
    """
    if "2020" in path.name:
        return HDR_2020
    return HDR_NEW

def check_fz10_layout():
    """
    Sanity-check *all* FZ-10 workbooks in `DATA_DIR`.

    What is verified?
    -----------------
    1.  **Sheet presence** – every file must contain “FZ 10.1”.
    2.  **Header texts**   – all files *from 2021 onward* must share the
        exact same header strings (2020 is ignored because its schema differs).
    3.  **Data anchor**    – at least one of the expected data columns must
        hold a value in row `DATA_FIRST_ROW` (catches accidental vertical
        shifts).

    Findings are printed; nothing is returned and nothing is raised for a
    failed check, so a clean log must be confirmed by the reader before the
    ingest loop is run.
    """
    issues = []
    ref_names = None        # header snapshot from first “new” workbook
    ref_file  = None        # file that provided that snapshot

    # iterate newest→oldest for a cleaner log
    for path in sorted(DATA_DIR.glob("fz10_*.xlsx"), reverse=True):
        wb = load_workbook(path, data_only=True)
        sn = _find_sheet(wb)
        if not sn:
            issues.append(f"{path.name}: sheet FZ 10.1 not found")
            continue

        coords = _header_coords(path)          # choose correct layout
        ws     = wb[sn]
        names  = [_clean_header(ws[c].value) for c in coords]

        # header texts: compare with the first workbook of the new layout
        if coords is HDR_NEW:
            if ref_names is None:
                ref_names, ref_file = names, path.name
            elif names != ref_names:
                issues.append(f"{path.name}: headers {names} ≠ {ref_names} (ref {ref_file})")

        # data anchor: strip the row digits to get the *full* column letters.
        # Taking only the first character turned "AK8"/"AN8"/"AQ8" into
        # column "A", so the wrong cells were probed.
        columns = [c.rstrip("0123456789") for c in coords]
        first_row_values = [ws[f"{col}{DATA_FIRST_ROW}"].value for col in columns]
        if all(v is None or v == "" for v in first_row_values):
            issues.append(f"{path.name}: row {DATA_FIRST_ROW} is empty — data block shifted?")

    # Report
    if issues:
        print("⚠️ Discrepancies detected:")
        for m in issues:
            print(" •", m)
    else:
        print("✓ All FZ-10 workbooks share identical layout per year-class")

# Run the layout check once; it only prints its findings (no exception is
# raised), so read the log before continuing with the ingest cells.
check_fz10_layout()

✓ All FZ-10 workbooks share identical layout per year-class


## ─────────────────────────────  DICT WORKBOOK → PARSER  ─────────────────────────────

In [6]:
"""
---------------------------------------------------------------------------
 DISPATCH TABLE & ACCUMULATORS
 --------------------------------------------------------------------------
 • `sheet_parsers` keeps a mapping  {sheet_id → parser_function}.
   Right now we only care about sheet “FZ 10.1”, hence one entry.
   If later you need “FZ 10.2” etc. just add:
       sheet_parsers["2"] = fz10_2

 • `globals_by_sheet` is a dict of *empty* DataFrames, one per sheet-id.
   During the main loop we keep appending monthly chunks to the correct
   DataFrame, so at the end `globals_by_sheet["1"]` holds the full
   2020-2025 history for sheet 10.1.
---------------------------------------------------------------------------
"""

# Dispatch table: sheet id → parser function
sheet_parsers = {
    "1": fz10_1,
}

# Accumulators: a fresh, empty DataFrame for every registered sheet id
globals_by_sheet = {sheet_id: pd.DataFrame() for sheet_id in sheet_parsers.keys()}

## ─────────────────────────────  MAIN LOOP  ─────────────────────────────

In [7]:
"""
──────────────────────────────────────────────────────────────────────────────
Ingest loop ─ read every monthly **FZ-10** workbook and append to one table
──────────────────────────────────────────────────────────────────────────────
Workflow
========
1.  Iterate over every file that matches  `fz10_YYYY_MM.xlsx`
    (sorted in **reverse** order so the console log starts with
    the most recent month).

2.  For each workbook:
      • Load via *openpyxl* with  `data_only=True`  → we get pure values  
        (no formula objects).  
      • Derive the period tag **YYYYMM** from the filename – this becomes
        the first column **Date** in every parsed row.  
      • Decide whether the file is a **2020-layout** book  
        (keyword “2020” in the filename) – the parser needs this flag.

3.  Locate the worksheet *FZ 10.1* using `_find_sheet()`; the helper
    accepts both spelling variants “FZ10.1” and “FZ 10.1”.
      • If the sheet is missing: emit a warning and skip the file.  
      • Otherwise:  
          ▸ call `fz10_1(ws, is_2020=…)` to obtain a cleaned `DataFrame`  
          ▸ insert the **Date** column at position 0  
          ▸ concatenate the rows onto the single accumulator
            `globals_by_sheet["1"]` (ignore the old index).

Outcome
-------
When the loop completes, `globals_by_sheet["1"]` contains **every row from
every workbook (2020-2025) of sheet FZ 10.1** – ready for CSV export or
further analysis.
──────────────────────────────────────────────────────────────────────────────
"""

# Collect one parsed frame per workbook and concatenate once at the end:
#   • avoids the quadratic cost of pd.concat inside the loop
#   • makes the cell idempotent — re-running it replaces the accumulator
#     instead of appending every month a second time.
monthly_frames = []

for path in sorted(DATA_DIR.glob("fz10_*.xlsx"), reverse=True):
    wb   = load_workbook(path, data_only=True)      # open workbook
    date = _date_from_fname(path)                   # e.g. "202403"

    is_2020 = "2020" in path.name                   # old vs new layout

    sheet = _find_sheet(wb)                         # find FZ 10.1
    if not sheet:                                   # skip if absent
        print(f"{path.name}: sheet FZ 10.1 not found — skipped")
        continue

    df = fz10_1(wb[sheet], is_2020)                 # parse sheet
    df.insert(0, "Date", date)                      # add date column
    monthly_frames.append(df)

# store the full history in dict entry "1" (empty frame if nothing was parsed,
# because pd.concat raises on an empty list)
globals_by_sheet["1"] = (
    pd.concat(monthly_frames, ignore_index=True) if monthly_frames else pd.DataFrame()
)

## ─────────────────────────────  SAVE RAW CSVs  ─────────────────────────────

In [8]:
for num, df in globals_by_sheet.items():
    # export everything as text: missing values become empty strings
    df = df.fillna('').astype(str)

    # same file name in both targets: …/csv/ first, then the processed copy
    fname = f"fz_10.{num}_raw.csv"
    for target_dir in (OUT_DIR, DST_DIR):
        out_csv = target_dir / fname
        df.to_csv(out_csv, index=False, encoding="utf-8")

    # console log
    print(f"• Saved {out_csv.name}  →  {df.shape}\n")
    df.info()           # quick dtype audit
    print("\n\n")       # visual separator

• Saved fz_10.1_raw.csv  →  (22682, 10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22682 entries, 0 to 22681
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Date                22682 non-null  object
 1   Marke               22682 non-null  object
 2   Modell              22682 non-null  object
 3   Modellreihe         22682 non-null  object
 4   Insgesamt           22682 non-null  object
 5   mit Dieselantrieb   22682 non-null  object
 6   mit Hybridantrieb   22682 non-null  object
 7   mit Elektroantrieb  22682 non-null  object
 8   mit Allradantrieb   22682 non-null  object
 9   Cabriolets          22682 non-null  object
dtypes: object(10)
memory usage: 1.7+ MB



