### FZ10

In [None]:
import re
from pathlib import Path
import pandas as pd
from openpyxl import load_workbook

# Directory that is globbed for the raw ``fz10_*.xlsx`` workbooks.
DATA_DIR = Path("./data/raw/neuzulassungen/fz10")

In [None]:
def _date_from_fname(p):
    """Return the reporting period encoded in a file name as ``"YYYYMM"``.

    Args:
        p: Object exposing a ``name`` attribute (e.g. ``pathlib.Path``) whose
            file name contains a ``YYYY_MM`` token, such as
            ``fz10_2023_05.xlsx``.

    Returns:
        The four-digit year and the two-digit month joined into one string.

    Raises:
        ValueError: If the file name contains no ``YYYY_MM`` token.
    """
    match = re.search(r"(\d{4})_(\d{2})", p.name)
    if match is None:
        # Fail with an actionable message instead of the opaque
        # "'NoneType' object has no attribute 'groups'".
        raise ValueError(
            f"No 'YYYY_MM' date token found in file name {p.name!r}"
        )
    year, month = match.groups()
    return year + month

def _strip_cols(df):
    """Tidy the string column labels of *df* in place and return *df*.

    For every label that is a ``str``: newlines become spaces, surrounding
    whitespace is removed, and each run of two or more whitespace characters
    collapses to a single space.  Non-string labels are left untouched.
    """
    cleaned = []
    for label in df.columns:
        if not isinstance(label, str):
            cleaned.append(label)
            continue
        flattened = str(label).replace('\n', ' ').strip()
        cleaned.append(re.sub(r'\s{2,}', ' ', flattened))
    df.columns = cleaned
    return df

def _unique(cols):
    """Return *cols* with repeated labels made unique by a numeric suffix.

    The first occurrence of a label is kept unchanged; later occurrences get
    the label followed by a running counter (``"a", "a1", "a2", ...``).  A
    suffixed name is never allowed to collide with a label that has already
    been emitted, so the result is safe to use as dictionary keys.

    Args:
        cols: Iterable of hashable labels.

    Returns:
        A list with one entry per input label, all entries distinct.
    """
    used = set()     # every label emitted so far (original or suffixed)
    counters = {}    # last suffix handed out per base label
    out = []
    for c in cols:
        name = c
        if name in used:
            n = counters.get(c, 0) + 1
            # Skip suffixes that clash with an existing label, e.g.
            # ["a", "a", "a1"] must not yield "a1" twice.
            while f"{c}{n}" in used:
                n += 1
            counters[c] = n
            name = f"{c}{n}"
        used.add(name)
        out.append(name)
    return out

def _col(ws, letter, r0, r1):
    """Collect the values of column *letter* for rows *r0*..*r1* inclusive.

    *ws* is indexed with A1-style coordinates (``ws["B12"]``) and every
    returned cell must expose a ``value`` attribute.  The result is an empty
    list when ``r1 < r0``.
    """
    values = []
    for row in range(r0, r1 + 1):
        cell = ws[f"{letter}{row}"]
        values.append(cell.value)
    return values

def _find_sheet(wb, pattern=r"^FZ\s*10\.1$"):
    """Return the first sheet name of *wb* matching *pattern*, else ``None``.

    Names from ``wb.sheetnames`` are matched case-insensitively after
    stripping surrounding whitespace, but the name is returned exactly as it
    is stored in the workbook.
    """
    matches = re.compile(pattern, flags=re.IGNORECASE).match
    return next(
        (title for title in wb.sheetnames if matches(title.strip())),
        None,
    )

In [None]:
def fz10_1(ws):
    """Parse an ``FZ 10.1`` worksheet into a cleaned DataFrame.

    Header labels are read from the fixed cells listed in ``hdr_map`` and the
    data from rows 10-1000 of the same columns.  The frame is then cleaned:

    * header whitespace is normalised and the hybrid / BEV labels shortened,
    * every column whose label contains "modellreihe" is renamed ``"Modell"``,
    * rows whose brand cell contains INSGESAMT, ZUSAMMEN, FLENSBURG, ANZAHL
      or HINWEIS are dropped,
    * the brand column is forward-filled,
    * for rows whose brand contains SONSTIGE, the column directly right of the
      brand column is set to ``"SONSTIGE"``,
    * a ``"Modellreihe"`` column ("<brand> <model>") is inserted right after
      ``"Modell"``.

    Args:
        ws: Worksheet-like object supporting ``ws["B9"].value`` cell access
            (presumably an openpyxl worksheet).

    Returns:
        pandas.DataFrame holding the retained rows.
    """
    # Column letter -> row that holds this column's header label.
    hdr_map = {
        "B": 9,  "C": 9,  "D": 8,  "G": 8,  "J": 8,  "M": 8,
        "P": 8,  "S": 8,  "V": 8,  "Y": 8,  "AB": 8, "AE": 8,
        "AH": 8, "AK": 8, "AN": 8, "AQ": 8,
    }
    headers = [ws[f"{col}{row}"].value for col, row in hdr_map.items()]
    cols    = _unique(headers)

    df = pd.DataFrame(
        {col: _col(ws, let, 10, 1000) for col, let in zip(cols, hdr_map)}
    ).dropna(how="all")
    df = _strip_cols(df)

    # Shorten the two verbose drive-type labels to a stable short name.
    rename_map = {}
    for c in df.columns:
        cl = str(c).strip()
        if cl.lower() == "mit hybridantrieb (incl. plug-in-hybrid)":
            rename_map[c] = "mit Hybridantrieb"
        elif cl.lower() == "mit elektroantrieb (bev)":
            rename_map[c] = "mit Elektroantrieb"
    if rename_map:
        df.rename(columns=rename_map, inplace=True)

    modell_col = None
    for c in list(df.columns):
        if "modellreihe" in str(c).lower():
            df.rename(columns={c: "Modell"}, inplace=True)
            modell_col = "Modell"

    # Brand column: first label containing "marke", else the first column.
    marke_col = next(
        (c for c in df.columns if "marke" in str(c).lower()), df.columns[0]
    )

    # Drop summary / footer rows before forward-filling the brand.
    mask = df[marke_col].astype(str).str.contains(
        r"INSGESAMT|ZUSAMMEN|FLENSBURG|ANZAHL|HINWEIS", case=False, na=False
    )
    df = df[~mask].reset_index(drop=True)

    df[marke_col] = df[marke_col].ffill()

    sonstige_mask = df[marke_col].astype(str).str.contains(
        r"\bSONSTIGE\b", case=False, na=False
    )
    seg_idx = df.columns.get_loc(marke_col)
    if seg_idx + 1 < len(df.columns):
        next_col = df.columns[seg_idx + 1]
        df.loc[sonstige_mask, next_col] = "SONSTIGE"

    if modell_col and marke_col:
        insert_pos = df.columns.get_loc(modell_col) + 1
        # Cast to str before concatenating: a model cell may hold a number
        # rather than text, and str + int would raise a TypeError.
        marke_txt = df[marke_col].fillna("").astype(str)
        modell_txt = df[modell_col].fillna("").astype(str)
        df.insert(
            insert_pos,
            "Modellreihe",
            (marke_txt + " " + modell_txt).str.strip(),
        )

    return df

In [None]:
# Parse the FZ 10.1 sheet of every monthly workbook and stack the results.
# Frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies all previously accumulated rows per file.
fz10_frames = []

for path in sorted(DATA_DIR.glob("fz10_*.xlsx")):
    wb   = load_workbook(path, data_only=True)
    date = _date_from_fname(path)

    sheet_name = _find_sheet(wb)
    if not sheet_name:
        print(f"'FZ10.1' not found in {path.name} — skipped")
        continue

    df_fz101 = fz10_1(wb[sheet_name])
    df_fz101.insert(0, "Date", date)

    # Kept for backward compatibility: the key starts with the date string,
    # so the frame is reachable as globals()["<date>_fz10_1"].
    globals()[f"{date}_fz10_1"] = df_fz101
    fz10_frames.append(df_fz101)

if fz10_frames:
    df_fz10_global = pd.concat(fz10_frames, ignore_index=True)
    # Keep only the first ten columns of the combined frame.
    df_fz10_global.drop(columns=df_fz10_global.columns[10:], inplace=True)
else:
    # No workbook yielded a sheet: keep the empty-frame result.
    df_fz10_global = pd.DataFrame()

In [None]:
# Persist the combined table; the output directory is created if missing.
OUT_DIR = "./data/processed"
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

out_file = f"{OUT_DIR}/df_fz10.csv"
df_fz10_global.to_csv(out_file, index=False, encoding="utf-8")
# Report the path that was actually written (the file is df_fz10.csv, not
# df_fz10_global.csv as the previous message claimed).
print(f"✓ Saved {out_file}")

In [None]:
# Notebook cell: bare expression to show the combined FZ10 frame.
df_fz10_global

In [None]:
# Notebook cell: print the column dtypes / non-null summary of the frame.
df_fz10_global.info()

In [None]:
# Notebook cell: bare expression to show the column labels that were kept.
df_fz10_global.columns