### FZ8 + 83

In [None]:
import re
from pathlib import Path
import pandas as pd
from openpyxl import load_workbook

DATA_DIR = Path("./data/raw/neuzulassungen/fz8")

In [None]:
def _date_from_fname(p):
    """Return the first run of six digits found in the file name of *p*.

    Args:
        p: A path-like object exposing a ``name`` attribute
            (e.g. ``pathlib.Path``).

    Returns:
        The six-digit string (presumably a YYYYMM stamp -- confirm against
        the naming scheme of the raw files).

    Raises:
        ValueError: If the file name contains no six consecutive digits.
    """
    match = re.search(r"(\d{6})", p.name)
    if match is None:
        # Fail with a message that names the offending file instead of the
        # opaque "'NoneType' object has no attribute 'group'".
        raise ValueError(f"No six-digit date stamp found in file name {p.name!r}")
    return match.group(1)

def _col(ws, letter, r0, r1):
    """Collect the cell values of one worksheet column.

    Args:
        ws: Worksheet-like object indexable by cell reference (``ws["B10"]``)
            whose items expose a ``value`` attribute.
        letter: Column letter, e.g. ``"B"``.
        r0: First row number (inclusive).
        r1: Last row number (inclusive).

    Returns:
        A list with the ``value`` of every cell from row *r0* to row *r1*,
        in row order. Empty when ``r1 < r0``.
    """
    values = []
    for row in range(r0, r1 + 1):
        cell = ws[f"{letter}{row}"]
        values.append(cell.value)
    return values

def _unique(cols):
    """Return *cols* with repeated labels made unique by a numeric suffix.

    The first occurrence of a label is kept as is; later occurrences get
    ``1``, ``2``, ... appended (``["a", "a", "a"] -> ["a", "a1", "a2"]``).
    A suffix is skipped when the resulting label is already in use, so the
    result never contains duplicates even for inputs such as
    ``["a", "a1", "a"]``.

    Args:
        cols: Iterable of hashable labels (strings, ``None``, numbers, ...).

    Returns:
        A list of the same length as *cols*. Labels that had to be
        disambiguated are returned as strings.
    """
    counters = {}   # label -> last numeric suffix handed out for it
    taken = set()   # every label emitted so far
    result = []
    for label in cols:
        if label not in counters and label not in taken:
            counters[label] = 0
            candidate = label
        else:
            suffix = counters.get(label, 0)
            # Advance until the suffixed label does not clash with a label
            # that is already part of the output.
            while True:
                suffix += 1
                candidate = f"{label}{suffix}"
                if candidate not in taken:
                    break
            counters[label] = suffix
        taken.add(candidate)
        result.append(candidate)
    return result

def _strip_cols(df):
    """Normalise whitespace in the string column labels of *df* in place.

    Every run of spaces and/or newlines inside a string label is collapsed
    to a single space and surrounding whitespace is stripped. Non-string
    labels are left untouched.

    Args:
        df: DataFrame whose ``columns`` are rewritten (the frame is mutated).

    Returns:
        The same DataFrame object, to allow ``df = _strip_cols(df)``.
    """
    # A regex is used rather than one ``replace('  ', ' ')`` pass, which
    # leaves a double space behind when three or more whitespace characters
    # are adjacent (e.g. "a \n b").
    df.columns = [
        re.sub(r"[ \n]+", " ", label).strip() if isinstance(label, str) else label
        for label in df.columns
    ]
    return df

def find_sheet(wb, num: str):
    """Look up the sheet named ``FZ 8.<num>`` in a workbook.

    Sheet names are compared after stripping surrounding whitespace,
    case-insensitively, and with optional whitespace between ``FZ`` and
    ``8``.

    Args:
        wb: Workbook-like object exposing ``sheetnames``.
        num: Sub-table number as text, e.g. ``"3"`` for ``FZ 8.3``.

    Returns:
        The first matching sheet name exactly as stored in the workbook
        (unstripped), or ``None`` when no sheet matches.
    """
    sheet_re = re.compile(fr"^FZ\s*8\.{re.escape(num)}$", flags=re.IGNORECASE)
    candidates = (title for title in wb.sheetnames if sheet_re.match(title.strip()))
    return next(candidates, None)

In [None]:
def fz8_1(ws):
    """Extract the two-column table of an ``FZ 8.1`` worksheet.

    Headers are read from cells B8 and C9, data from rows 10-100 of columns
    B and C. Rows that are empty in both columns are dropped, as are rows
    whose first column contains ``INSGESAMT`` or ``FLENSBURG``
    (case-insensitive; presumably total and footer lines).

    Args:
        ws: Worksheet-like object indexable by cell reference.

    Returns:
        DataFrame with whitespace-normalised column labels and a fresh
        ``0..n-1`` index.
    """
    cols = _unique([ws["B8"].value, ws["C9"].value])
    df = pd.DataFrame({
        cols[0]: _col(ws, "B", 10, 100),
        cols[1]: _col(ws, "C", 10, 100),
    }).dropna(how="all")

    df = _strip_cols(df)

    # Select the first column by position: _strip_cols may have changed its
    # label, so the raw header in cols[0] is not a reliable key any more.
    first = df.iloc[:, 0]
    mask = first.astype(str).str.contains(r"INSGESAMT|FLENSBURG", case=False, na=False)

    return df[~mask].reset_index(drop=True)

In [None]:
def fz8_2(ws):
    """Extract the nine-column table of an ``FZ 8.2`` worksheet.

    Headers are taken from fixed cells in rows 8-10 (the third header is the
    concatenation of D8 and D9); data comes from rows 11-100 of columns
    B, C, D, F, G, H, I, J and K (column E is not read). Rows that are empty
    in every column are dropped, as are rows whose first column contains
    ``INSGESAMT`` or ``FLENSBURG`` (case-insensitive; presumably total and
    footer lines).

    Args:
        ws: Worksheet-like object indexable by cell reference.

    Returns:
        DataFrame with whitespace-normalised column labels and a fresh
        ``0..n-1`` index.
    """
    raw = [
        ws["B8"].value, ws["C8"].value,
        f"{ws['D8'].value} {ws['D9'].value}".strip(),
        ws["F9"].value, ws["G10"].value, ws["H10"].value,
        ws["I9"].value, ws["J9"].value, ws["K10"].value,
    ]
    cols = _unique(raw)
    letters = "BCDFGHIJK"  # one source column per header, in header order
    df = pd.DataFrame(
        {name: _col(ws, letter, 11, 100) for name, letter in zip(cols, letters)}
    ).dropna(how="all")

    df = _strip_cols(df)

    # Select the first column by position: _strip_cols may have changed its
    # label, so the raw header in cols[0] is not a reliable key any more.
    first = df.iloc[:, 0]
    mask = first.astype(str).str.contains(r"INSGESAMT|FLENSBURG", case=False, na=False)

    return df[~mask].reset_index(drop=True)

In [None]:
def fz8_3(ws):
    """Extract the fifteen-column table of an ``FZ 8.3`` worksheet.

    Steps:
      1. Read the headers from fixed cells in rows 8-10 and de-duplicate them.
      2. Read rows 11-500 of columns B-P, skipping rows that are empty in
         all fifteen columns.
      3. Forward-fill the segment column (the first column whose label
         contains "segment", else the first column).
      4. Drop rows whose segment value matches one of the footer/total
         keywords.
      5. Where the segment contains the word ``SONSTIGE``, overwrite the
         column whose label contains "modellreihe" (if any) with
         ``"SONSTIGE"``.

    Args:
        ws: Worksheet-like object indexable by cell reference.

    Returns:
        DataFrame with whitespace-normalised column labels and a fresh
        ``0..n-1`` index.
    """
    header_refs = (
        "B8", "C8", "D10", "E10",
        "F9", "G10", "H10",
        "I9", "J10", "K10",
        "L9", "M9", "N9",
        "O10", "P9",
    )
    cols = _unique([ws[ref].value for ref in header_refs])

    letters = "BCDEFGHIJKLMNOP"
    rows = []
    for r in range(11, 501):
        values = [ws[f"{c}{r}"].value for c in letters]
        if any(v is not None for v in values):
            rows.append(values)

    df = _strip_cols(pd.DataFrame(rows, columns=cols))

    # Locate the segment column by label, falling back to the first column.
    seg_col = df.columns[0]
    for label in df.columns:
        if "segment" in str(label).lower():
            seg_col = label
            break
    # The segment is only filled in on some rows; propagate it downwards.
    df[seg_col] = df[seg_col].ffill()

    trash = r"ZUSAMMEN|INSGESAMT|HINWEISE|AUSGEWIESEN|KRAFTSTOFFVERBRAUCH|FLENSBURG|UMBENANNT"
    is_trash = df[seg_col].astype(str).str.contains(trash, case=False, na=False)
    df = df[~is_trash].reset_index(drop=True)

    mod_col = None
    for label in df.columns:
        if "modellreihe" in str(label).lower():
            mod_col = label
            break
    if mod_col:
        segment_text = df[seg_col].astype(str)
        is_sonstige = segment_text.str.contains(r"\bSONSTIGE\b", case=False, na=False)
        df.loc[is_sonstige, mod_col] = "SONSTIGE"

    return df

In [None]:
def fz8_7(ws):
    """Extract the thirteen-column table of an ``FZ 8.7`` worksheet.

    Headers are read from B8, C8 and D9-N9; data from rows 10-100 of columns
    B-N, skipping rows that are empty in every column. The first column
    whose label contains "insgesamt" is renamed to ``"Anzahl"``. Rows whose
    first column contains ``INSGESAMT``, ``DARUNTER`` or ``FLENSBURG``
    (case-insensitive) are dropped.

    Args:
        ws: Worksheet-like object indexable by cell reference.

    Returns:
        DataFrame with whitespace-normalised column labels and a fresh
        ``0..n-1`` index.
    """
    raw = [
        ws["B8"].value, ws["C8"].value, ws["D9"].value, ws["E9"].value,
        ws["F9"].value, ws["G9"].value, ws["H9"].value, ws["I9"].value,
        ws["J9"].value, ws["K9"].value, ws["L9"].value, ws["M9"].value, ws["N9"].value,
    ]
    cols = _unique(raw)
    letters = "BCDEFGHIJKLMN"
    rows = []
    for r in range(10, 101):
        # Read each cell once, then decide whether the row carries data.
        values = [ws[f"{c}{r}"].value for c in letters]
        if any(v is not None for v in values):
            rows.append(values)

    df = pd.DataFrame(rows, columns=cols)
    df = _strip_cols(df)

    for c in df.columns:
        if "insgesamt" in str(c).lower():
            df.rename(columns={c: "Anzahl"}, inplace=True)
            break

    # Select the first column by position: _strip_cols and the rename above
    # may have changed its label, so cols[0] is not a reliable key any more.
    first = df.iloc[:, 0]
    mask = first.astype(str).str.contains(
        r"INSGESAMT|DARUNTER|FLENSBURG", case=False, regex=True, na=False)

    return df[~mask].reset_index(drop=True)

In [None]:
def fz8_9(ws):
    """Extract the ten-column table of an ``FZ 8.9`` worksheet.

    Headers are read from fixed cells in rows 8-9; data from rows 11-100 of
    columns B, C, D, F, H, J, L, N, P and R, skipping rows that are empty in
    every one of those columns. The first column whose label contains
    "insgesamt" is renamed to ``"Anzahl"``. Rows whose first column contains
    ``INSGESAMT``, ``HINWEIS``, ``ERBRINGUNG`` or ``FLENSBURG``
    (case-insensitive) are dropped.

    Args:
        ws: Worksheet-like object indexable by cell reference.

    Returns:
        DataFrame with whitespace-normalised column labels and a fresh
        ``0..n-1`` index.
    """
    raw = [
        ws["B8"].value, ws["C8"].value, ws["D8"].value, ws["F8"].value,
        ws["H9"].value, ws["J9"].value, ws["L9"].value,
        ws["N9"].value, ws["P9"].value, ws["R8"].value,
    ]
    cols = _unique(raw)
    letters = ["B", "C", "D", "F", "H", "J", "L", "N", "P", "R"]
    rows = []
    for r in range(11, 101):
        # Read each cell once, then decide whether the row carries data.
        values = [ws[f"{c}{r}"].value for c in letters]
        if any(v is not None for v in values):
            rows.append(values)

    df = pd.DataFrame(rows, columns=cols)
    df = _strip_cols(df)

    for c in df.columns:
        if "insgesamt" in str(c).lower():
            df.rename(columns={c: "Anzahl"}, inplace=True)
            break

    # Select the first column by position: _strip_cols and the rename above
    # may have changed its label, so cols[0] is not a reliable key any more.
    first = df.iloc[:, 0]
    mask = first.astype(str).str.contains(
        r"INSGESAMT|HINWEIS|ERBRINGUNG|FLENSBURG", case=False, regex=True, na=False)

    return df[~mask].reset_index(drop=True)

In [None]:
# Accumulators filled by the loop below:
#   df_fz8_global  - per-file merge of sheets 8.1, 8.2, 8.7 and 8.9
#   df_fz83_global - per-file contents of sheet 8.3
df_fz8_global = pd.DataFrame()
df_fz83_global = pd.DataFrame()

# Process every workbook matching fz8_*.xlsx in DATA_DIR, in sorted file-name order.
for path in sorted(DATA_DIR.glob("fz8_*.xlsx")):
    # data_only=True: openpyxl returns stored cell values instead of formulas.
    wb   = load_workbook(path, data_only=True)
    date = _date_from_fname(path)

    # Resolve the actual sheet names (None when a sheet is missing).
    s81 = find_sheet(wb, '1'); s82 = find_sheet(wb, '2')
    s83 = find_sheet(wb, '3'); s87 = find_sheet(wb, '7')
    s89 = find_sheet(wb, '9')

    # A workbook is skipped entirely unless all five sheets are present.
    if not all([s81, s82, s83, s87, s89]):
        print(f"'FZ 8.x' not found in {path.name} — skipped")
        continue

    # --- Sheet 8.3: tag with the file's date and append to its accumulator.
    df_fz83 = fz8_3(wb[s83])
    df_fz83.insert(0, "Date", date)
    # Also exposes the per-file frame as a global named "<date>_fz8_3".
    # NOTE(review): such a name starts with a digit, so it is only reachable
    # through globals()[...], not as a plain identifier.
    globals()[f"{date}_fz8_3"] = df_fz83
    df_fz83_global = pd.concat([df_fz83_global, df_fz83], ignore_index=True)

    # Keep at most the first 16 columns (Date + the 15 columns fz8_3 returns).
    # NOTE(review): this is positional and runs on every iteration, after the
    # previous iteration already removed columns below -- which columns
    # survive when header names differ between files depends on file order.
    df_fz83_global.drop(columns=df_fz83_global.columns[16:], inplace=True)

    # Remove every column whose label mentions "Emission" or "Kraftstoffverbrauch".
    drop_cols = [c for c in df_fz83_global.columns if "Emission" in str(c)
        or "Kraftstoffverbrauch" in str(c)]
    df_fz83_global.drop(columns=drop_cols, inplace=True)

    # --- Sheets 8.1 / 8.2 / 8.7 / 8.9: parse each and tag with the file's date.
    df_fz81 = fz8_1(wb[s81]); df_fz81.insert(0, "Date", date)
    df_fz82 = fz8_2(wb[s82]); df_fz82.insert(0, "Date", date)
    df_fz87 = fz8_7(wb[s87]); df_fz87.insert(0, "Date", date)
    df_fz89 = fz8_9(wb[s89]); df_fz89.insert(0, "Date", date)

    # Merge keys. NOTE(review): assumes each of the four frames has a column
    # labelled exactly 'Marke'; merge raises KeyError otherwise -- confirm.
    keys = ['Date', 'Marke']

    # Chain of outer merges. Columns that already exist on the left side are
    # suffixed '_dup' on the right side and discarded after every step, so
    # the left-most occurrence of a shared column wins.
    tmp = df_fz81.merge(df_fz82, on=keys, how='outer', suffixes=('', '_dup'))
    tmp = tmp.loc[:, ~tmp.columns.str.endswith('_dup')]
    tmp = tmp.merge(df_fz87, on=keys, how='outer', suffixes=('', '_dup'))
    tmp = tmp.loc[:, ~tmp.columns.str.endswith('_dup')]
    df_fz8_total = tmp.merge(df_fz89, on=keys, how='outer', suffixes=('', '_dup'))
    df_fz8_total = df_fz8_total.loc[:, ~df_fz8_total.columns.str.endswith('_dup')]

    # Per-file merged frame, exposed as a global named "<date>_fz8_total"
    # (same caveat as above: reachable via globals()[...] only).
    globals()[f"{date}_fz8_total"] = df_fz8_total

    # NOTE(review): concatenating inside the loop re-copies the accumulated
    # frame for every file; collecting the frames in a list and concatenating
    # once would be cheaper, but would change when the drops below are applied.
    df_fz8_global = pd.concat([df_fz8_global, df_fz8_total], ignore_index=True)

    # df_fz8_global.drop(columns=df_fz8_global.columns[23:], inplace=True)

    # Remove every column whose label contains "Emission", "darunter" or "Euro ".
    drop_cols = [c for c in df_fz8_global.columns if "Emission" in str(c)
        or "darunter" in str(c) or "Euro " in str(c)]
    df_fz8_global.drop(columns=drop_cols, inplace=True)

In [None]:
# Write both accumulated tables to OUT_DIR as UTF-8 CSV files without the index.
OUT_DIR = "./data/processed"
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)  # create the folder on first run

df_fz8_global.to_csv(f"{OUT_DIR}/df_fz8.csv", index=False, encoding="utf-8")
df_fz83_global.to_csv(f"{OUT_DIR}/df_fz83.csv", index=False, encoding="utf-8")
# Report the paths that were actually written (the previous messages named
# files "df_fz8_global.csv" / "df_fz83_global.csv" that are never created).
print(f"✓ Saved {OUT_DIR}/df_fz8.csv")
print(f"✓ Saved {OUT_DIR}/df_fz83.csv")

In [None]:
df_fz8_global

In [None]:
df_fz83_global

In [None]:
df_fz8_global.info()

In [None]:
df_fz83_global.info()

In [None]:
df_fz8_global.columns

In [None]:
df_fz83_global.columns