In [None]:
import re
from pathlib import Path
import pandas as pd
from openpyxl import load_workbook

# Directory scanned for the fz8_*.xlsx workbooks (relative to the working directory).
DATA_DIR = Path("./data/raw/neuzulassungen/fz8")

In [None]:
def _date_from_fname(p):
    return re.search(r"(\d{6})", p.name).group(1)

def _col(ws, letter, r0, r1):
    return [ws[f"{letter}{row}"].value for row in range(r0, r1 + 1)]

def _unique(cols):
    seen, out = {}, []
    for c in cols:
        if c in seen:
            seen[c] += 1
            out.append(f"{c}{seen[c]}")
        else:
            seen[c] = 0
            out.append(c)
    return out

In [None]:
def fz8_1(ws):
    """Extract the two-column table from an "FZ 8.1" worksheet.

    Header names are read from cells B8 and C9 and de-duplicated with
    ``_unique``. Data come from columns B and C, rows 10 through 100.
    Rows empty in both columns are dropped, as are rows whose first column
    contains "INSGESAMT" or "FLENSBURG" (case-insensitive).

    Parameters
    ----------
    ws : worksheet
        Sheet indexable by cell address (``ws["B8"].value``).

    Returns
    -------
    pandas.DataFrame
        Remaining rows with a fresh 0-based index.
    """
    first_row, last_row = 10, 100
    name_col, value_col = _unique([ws["B8"].value, ws["C9"].value])

    table = pd.DataFrame({
        name_col: _col(ws, "B", first_row, last_row),
        value_col: _col(ws, "C", first_row, last_row),
    })
    table = table.dropna(how="all")

    # Presumably total and footer lines; they are recognised only by keyword
    # in the first column.
    is_noise = table[name_col].astype(str).str.contains(
        r"INSGESAMT|FLENSBURG", case=False, regex=True, na=False
    )
    return table.loc[~is_noise].reset_index(drop=True)

In [None]:
def fz8_2(ws):
    """Extract the nine-column table from an "FZ 8.2" worksheet.

    Headers are taken from B8, C8, the text of D8 and D9 joined by a space
    (then stripped), F9, G10, H10, I9, J9 and K10, and de-duplicated with
    ``_unique``. Data come from columns B, C, D, F, G, H, I, J and K
    (column E is not read), rows 11 through 100. Rows empty in every column
    are dropped, as are rows whose first column contains "INSGESAMT" or
    "FLENSBURG" (case-insensitive).

    Parameters
    ----------
    ws : worksheet
        Sheet indexable by cell address (``ws["B8"].value``).

    Returns
    -------
    pandas.DataFrame
        Remaining rows with a fresh 0-based index.
    """
    def cell(addr):
        return ws[addr].value

    names = _unique([
        cell("B8"),
        cell("C8"),
        f"{cell('D8')} {cell('D9')}".strip(),
        cell("F9"),
        cell("G10"),
        cell("H10"),
        cell("I9"),
        cell("J9"),
        cell("K10"),
    ])

    # Pair each header with the sheet column that holds its data.
    data = {
        name: _col(ws, letter, 11, 100)
        for name, letter in zip(names, "BCDFGHIJK")
    }
    table = pd.DataFrame(data).dropna(how="all")

    is_noise = table[names[0]].astype(str).str.contains(
        r"INSGESAMT|FLENSBURG", case=False, regex=True, na=False
    )
    return table.loc[~is_noise].reset_index(drop=True)

In [None]:
# def fz8_3(ws):
#     raw_headers = [
#         ws["B8"].value, ws["C8"].value, ws["D10"].value, ws["E10"].value,
#         ws["F9"].value, ws["G10"].value, ws["H10"].value,
#         ws["I9"].value, ws["J10"].value, ws["K10"].value,
#         ws["L9"].value, ws["M9"].value, ws["N9"].value,
#         ws["O10"].value, ws["P9"].value,
#     ]
#     cols = _unique(raw_headers)

#     letters = "BCDEFGHIJKLMNOP"[:15]
#     rows = [
#         [ws[f"{c}{r}"].value for c in letters]
#         for r in range(11, 501)
#         if not all(ws[f"{c}{r}"].value is None for c in letters)
#     ]
#     df = pd.DataFrame(rows, columns=cols)

#     seg_col = next((c for c in df.columns if c and "segment" in str(c).lower()), df.columns[0])
#     df[seg_col] = df[seg_col].ffill()

#     trash = r"ZUSAMMEN|INSGESAMT|HINWEISE|AUSGEWIESEN|KRAFTSTOFFVERBRAUCH|FLENSBURG"
#     mask = df[seg_col].astype(str).str.contains(trash, case=False, regex=True, na=False)
#     df = df[~mask].reset_index(drop=True)

#     mod_col = next((c for c in df.columns if c and "modellreihe" in str(c).lower()), None)
#     if mod_col:
#         is_sonstige = df[seg_col].astype(str).str.contains(r"\bSONSTIGE\b", case=False, regex=True, na=False)
#         df.loc[is_sonstige, mod_col] = "SONSTIGE"

#     return df

In [None]:
def fz8_7(ws):
    """Extract the thirteen-column table from an "FZ 8.7" worksheet.

    Headers are read from B8, C8 and D9 through N9 and de-duplicated with
    ``_unique``. Data come from columns B through N, rows 10 through 100;
    rows in which every one of those cells is ``None`` are skipped. The first
    string header containing "insgesamt" (case-insensitive) is renamed to
    "Anzahl". Rows whose first column contains "INSGESAMT", "DARUNTER" or
    "FLENSBURG" (case-insensitive) are removed.

    Parameters
    ----------
    ws : worksheet
        Sheet indexable by cell address (``ws["B8"].value``).

    Returns
    -------
    pandas.DataFrame
        Remaining rows with a fresh 0-based index.
    """
    def cell(addr):
        return ws[addr].value

    header_addrs = (
        "B8", "C8", "D9", "E9", "F9", "G9", "H9",
        "I9", "J9", "K9", "L9", "M9", "N9",
    )
    names = _unique([cell(addr) for addr in header_addrs])

    records = []
    for row_no in range(10, 101):
        values = [cell(f"{letter}{row_no}") for letter in "BCDEFGHIJKLMN"]
        if any(v is not None for v in values):
            records.append(values)
    table = pd.DataFrame(records, columns=names)

    total_col = next(
        (c for c in table.columns if isinstance(c, str) and "insgesamt" in c.lower()),
        None,
    )
    if total_col is not None:
        table = table.rename(columns={total_col: "Anzahl"})

    is_noise = table[names[0]].astype(str).str.contains(
        r"INSGESAMT|DARUNTER|FLENSBURG", case=False, regex=True, na=False
    )
    return table.loc[~is_noise].reset_index(drop=True)

In [None]:
def fz8_9(ws):
    """Extract the ten-column table from an "FZ 8.9" worksheet.

    Headers are read from B8, C8, D8, F8, H9, J9, L9, N9, P9 and R8 and
    de-duplicated with ``_unique``. Data come from columns B, C, D, F, H, J,
    L, N, P and R, rows 11 through 100; rows in which every one of those
    cells is ``None`` are skipped. The first string header containing
    "insgesamt" (case-insensitive) is renamed to "Anzahl". Rows whose first
    column contains "INSGESAMT", "HINWEIS", "ERBRINGUNG" or "FLENSBURG"
    (case-insensitive) are removed.

    Parameters
    ----------
    ws : worksheet
        Sheet indexable by cell address (``ws["B8"].value``).

    Returns
    -------
    pandas.DataFrame
        Remaining rows with a fresh 0-based index.
    """
    header_addrs = ("B8", "C8", "D8", "F8", "H9", "J9", "L9", "N9", "P9", "R8")
    names = _unique([ws[addr].value for addr in header_addrs])

    data_letters = "BCDFHJLNPR"
    grid = (
        [ws[f"{letter}{row_no}"].value for letter in data_letters]
        for row_no in range(11, 101)
    )
    records = [values for values in grid if not all(v is None for v in values)]
    table = pd.DataFrame(records, columns=names)

    total_col = next(
        (c for c in table.columns if isinstance(c, str) and "insgesamt" in c.lower()),
        None,
    )
    if total_col is not None:
        table = table.rename(columns={total_col: "Anzahl"})

    is_noise = table[names[0]].astype(str).str.contains(
        r"INSGESAMT|HINWEIS|ERBRINGUNG|FLENSBURG",
        case=False, regex=True, na=False
    )
    return table.loc[~is_noise].reset_index(drop=True)

In [None]:
# Columns every per-sheet frame is merged on.
keys = ['Date', 'Marke', 'Anzahl']

# One merged frame per workbook; concatenated once after the loop. Calling
# pd.concat inside the loop would copy all previously collected rows on every
# iteration.
fz8_frames = []

for path in sorted(DATA_DIR.glob("fz8_*.xlsx")):
    # data_only=True: read the cached cell values instead of formula strings.
    wb = load_workbook(path, data_only=True)
    date = _date_from_fname(path)

    df_fz81 = fz8_1(wb["FZ 8.1"])
    df_fz81.insert(0, "Date", date)
    df_fz82 = fz8_2(wb["FZ 8.2"])
    df_fz82.insert(0, "Date", date)
    # "FZ 8.3" is not loaded: its parser fz8_3 is currently commented out.
    df_fz87 = fz8_7(wb["FZ 8.7"])
    df_fz87.insert(0, "Date", date)
    df_fz89 = fz8_9(wb["FZ 8.9"])
    df_fz89.insert(0, "Date", date)

    # Outer-merge the sheets left to right. Non-key columns that already exist
    # on the left side get the "_dup" suffix on the right side and are dropped.
    df_fz8_total = df_fz81
    for sheet_df in (df_fz82, df_fz87, df_fz89):
        df_fz8_total = df_fz8_total.merge(
            sheet_df, on=keys, how='outer', suffixes=('', '_dup')
        )
        df_fz8_total = df_fz8_total.loc[:, ~df_fz8_total.columns.str.endswith('_dup')]

    # NOTE(review): kept for backward compatibility. These global names start
    # with a digit, so they are only reachable via globals()["<date>_fz8_total"];
    # consider a plain dict keyed by date instead.
    globals()[f"{date}_fz8_total"] = df_fz8_total

    fz8_frames.append(df_fz8_total)

# Fall back to an empty frame when no workbook matched, since pd.concat raises
# on an empty list.
df_fz8_global = (
    pd.concat(fz8_frames, ignore_index=True) if fz8_frames else pd.DataFrame()
)

In [None]:
# FZ 8.1 frame; after the loop this name holds only the last workbook processed.
df_fz81

In [None]:
# FZ 8.2 frame; after the loop this name holds only the last workbook processed.
df_fz82

In [None]:
# df_fz83

In [None]:
# FZ 8.7 frame; after the loop this name holds only the last workbook processed.
df_fz87

In [None]:
# FZ 8.9 frame; after the loop this name holds only the last workbook processed.
df_fz89

In [None]:
# Merged sheets of the last workbook processed by the loop.
df_fz8_total

In [None]:
# Merged frames of all workbooks, stacked row-wise.
df_fz8_global

In [None]:
# Column dtypes and non-null counts of the combined frame.
df_fz8_global.info()