In [1]:
import re
from pathlib import Path
import pandas as pd
from openpyxl import load_workbook
import warnings

# Suppress every FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Directory layout (relative to the notebook's working directory).
DATA_DIR = Path("../data/raw/fz2")          # input *.xlsx files
OUT_DIR  = Path("../data/raw/fz2/csv")      # will hold *_raw.csv
OUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): the trailing "," makes this a sub-directory literally named ","
# inside ../data/processed — looks like a typo; confirm against the consumers
# of these files before changing it.
DST_DIR = Path("../data/processed/,")         # final merged CSVs
DST_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
def _date_from_fname(p):
    """Return YYYYMM extracted from filename `fz2_YYYY.xlsx`."""
    return re.search(r"(\d{4})", p.name).group(1)

def _col(ws, letter, r0, r1):
    """Return values of column *letter* from rows *r0 … r1* inclusive."""
    return [ws[f"{letter}{row}"].value for row in range(r0, r1 + 1)]

# def _clean_header(s):
#     """Normalize header cell: collapse multiple spaces + remove newlines."""
#     return str(s).replace('\n', ' ').replace('  ', ' ').strip() if s is not None else s

def _clean_header(s):
    """Normalize header cell: collapse multiple spaces + remove newlines."""
    return (str(s).translate(str.maketrans("äÄöÖüÜ", "aAoOuU")).replace("\n", " ").replace("  ", " ").strip().upper()) if s is not None else s

def _strip_cols(df):
    """Rename every column of *df* through `_clean_header` (mutating *df*) and return it."""
    cleaned_names = []
    for name in df.columns:
        cleaned_names.append(_clean_header(name))
    df.columns = cleaned_names
    return df

def _unique(cols):
    """Ensure uniqueness by adding numeric suffixes."""
    seen, out = {}, []
    for c in cols:
        if c in seen:
            seen[c] += 1
            out.append(f"{c}{seen[c]}")
        else:
            seen[c] = 0
            out.append(c)
    return out

def _find_sheet(wb, num):
    """Locate sheet whose name matches *pattern* (case-insensitive)."""
    pattern = re.compile(fr"^FZ\s*2\.{re.escape(num)}$", flags=re.IGNORECASE)
    for name in wb.sheetnames:
        if pattern.match(name.strip()):
            return name
    return None

# def _strip_upper(df):
#     """
#     Trim whitespace and up-case every object column (in-place).
#     Faster & future-proof replacement for the old `applymap`.
#     """
#     for col in df.columns:
#         if df[col].dtype == "object":
#             df[col] = df[col].str.strip().str.upper()
#     return df

In [3]:
def fz2_2(ws):
    """Parse one "FZ 2.2" worksheet into a cleaned DataFrame.

    Hard-coded layout: header texts in B8–I8 and J9–N9, data in columns B–N
    from row 10 down to row 20000.

    Processing steps:
      1. read the 13 header cells, normalise them and make them unique;
      2. read the data block and drop rows that are empty in every column;
      3. forward-fill the column whose name contains "HERSTELLER" and drop
         rows whose value there contains ZUSAMMEN, INSGESAMT, HINWEIS,
         FLENSBURG or SONSTIGE (case-insensitive);
      4. forward-fill the column whose name contains "HANDELSNAME", replace
         "," by ";" in it and drop rows whose value there contains SONSTIGE;
      5. collapse double spaces, strip and upper-case every string cell;
      6. from the 4th column onwards replace "-" and "." by "0" and cast to
         ``str``; finally blank every cell of the frame that equals "0".

    Parameters
    ----------
    ws : worksheet
        Must support ``ws["B8"].value``, ``ws.max_row`` and ``ws.title``
        (an openpyxl worksheet in this notebook).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    StopIteration
        If no column name contains "HERSTELLER" or "HANDELSNAME".
    """
    letters = "BCDEFGHIJKLMN"
    header_cells = [f"{c}8" for c in "BCDEFGHI"] + [f"{c}9" for c in "JKLMN"]
    first_row, last_row = 10, 20000
    n_label_cols = 3          # columns before the first value column

    cols = _unique([_clean_header(ws[ref].value) for ref in header_cells])  # avoid duplicate column names

    # Make the fixed read window visible instead of silently truncating.
    sheet_rows = ws.max_row   # taken before the read below touches cells up to last_row
    if sheet_rows > last_row:
        warnings.warn(
            f"{ws.title}: sheet reports {sheet_rows} rows, rows after {last_row} are ignored",
            stacklevel=2,
        )

    # read data block (same fixed row window for every column)
    df = pd.DataFrame(
        {name: _col(ws, letter, first_row, last_row) for name, letter in zip(cols, letters)}
    ).dropna(how="all")

    # normalise header texts once more (harmless if already clean)
    df = _strip_cols(df)

    # manufacturer column: fill merged/blank cells downwards, then drop meta rows
    seg_col = next(c for c in df.columns if "HERSTELLER" in c)
    df[seg_col] = df[seg_col].ffill()

    trash = r"ZUSAMMEN|INSGESAMT|HINWEIS|FLENSBURG|SONSTIGE"
    mask = df[seg_col].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # trade-name column: same treatment; commas are swapped for semicolons
    seg_col = next(c for c in df.columns if "HANDELSNAME" in c)
    df[seg_col] = df[seg_col].ffill()
    df[seg_col] = df[seg_col].astype(str).str.replace(",", ";", regex=False)

    trash = r"SONSTIGE"
    mask = df[seg_col].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # post-cleanup: trim + upper-case every string cell.
    # DataFrame.applymap is deprecated in favour of DataFrame.map; use whichever
    # the installed pandas provides.
    def _tidy(v):
        return v.replace("  ", " ").strip().upper() if isinstance(v, str) else v

    if hasattr(pd.DataFrame, "map"):
        df = df.map(_tidy)
    else:
        df = df.applymap(_tidy)

    # Select value columns by position so a header changed by _strip_cols
    # cannot cause a KeyError.
    num_cols = list(df.columns[n_label_cols:])
    df[num_cols] = (
        df[num_cols]
          .replace({"-": "0", ".": "0"})
          .astype(str)                       # ensure string for next step
    )

    # "-", "." and zero all end up as empty strings
    df = df.replace("0", "")

    return df

In [4]:
def fz2_4(ws):
    """Parse one "FZ 2.4" worksheet into a cleaned DataFrame.

    Hard-coded layout: header texts in B8–V8, data in columns B–V from row 9
    down to row 20000.

    Processing steps:
      1. read the 21 header cells, normalise them and make them unique;
      2. read the data block and drop rows that are empty in every column;
      3. forward-fill the column whose name contains "HERSTELLER" and drop
         rows whose value there contains ZUSAMMEN, INSGESAMT, HINWEIS,
         FLENSBURG or SONSTIGE (case-insensitive);
      4. forward-fill the column whose name contains "HANDELSNAME", replace
         "," by ";" in it and drop rows whose value there contains SONSTIGE;
      5. collapse double spaces, strip and upper-case every string cell;
      6. from the 3rd column onwards replace "-" and "." by "0" and cast to
         ``str``; finally blank every cell of the frame that equals "0".

    Parameters
    ----------
    ws : worksheet
        Must support ``ws["B8"].value``, ``ws.max_row`` and ``ws.title``
        (an openpyxl worksheet in this notebook).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    StopIteration
        If no column name contains "HERSTELLER" or "HANDELSNAME".
    """
    letters = "BCDEFGHIJKLMNOPQRSTUV"
    header_cells = [f"{c}8" for c in letters]
    first_row, last_row = 9, 20000
    n_label_cols = 2          # columns before the first value column

    cols = _unique([_clean_header(ws[ref].value) for ref in header_cells])  # avoid duplicate column names

    # Make the fixed read window visible instead of silently truncating.
    sheet_rows = ws.max_row   # taken before the read below touches cells up to last_row
    if sheet_rows > last_row:
        warnings.warn(
            f"{ws.title}: sheet reports {sheet_rows} rows, rows after {last_row} are ignored",
            stacklevel=2,
        )

    # read data block (same fixed row window for every column)
    df = pd.DataFrame(
        {name: _col(ws, letter, first_row, last_row) for name, letter in zip(cols, letters)}
    ).dropna(how="all")

    # normalise header texts once more (harmless if already clean)
    df = _strip_cols(df)

    # manufacturer column: fill merged/blank cells downwards, then drop meta rows
    seg_col = next(c for c in df.columns if "HERSTELLER" in c)
    df[seg_col] = df[seg_col].ffill()

    trash = r"ZUSAMMEN|INSGESAMT|HINWEIS|FLENSBURG|SONSTIGE"
    mask = df[seg_col].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # trade-name column: same treatment; commas are swapped for semicolons
    seg_col = next(c for c in df.columns if "HANDELSNAME" in c)
    df[seg_col] = df[seg_col].ffill()
    df[seg_col] = df[seg_col].astype(str).str.replace(",", ";", regex=False)

    trash = r"SONSTIGE"
    mask = df[seg_col].astype(str).str.contains(trash, case=False, na=False)
    df = df[~mask].reset_index(drop=True)

    # post-cleanup: trim + upper-case every string cell.
    # DataFrame.applymap is deprecated in favour of DataFrame.map; use whichever
    # the installed pandas provides.
    def _tidy(v):
        return v.replace("  ", " ").strip().upper() if isinstance(v, str) else v

    if hasattr(pd.DataFrame, "map"):
        df = df.map(_tidy)
    else:
        df = df.applymap(_tidy)

    # Select value columns by position so a header changed by _strip_cols
    # cannot cause a KeyError.
    num_cols = list(df.columns[n_label_cols:])
    df[num_cols] = (
        df[num_cols]
          .replace({"-": "0", ".": "0"})
          .astype(str)                       # ensure string for next step
    )

    # "-", "." and zero all end up as empty strings
    df = df.replace("0", "")

    return df

In [5]:
# Header-cell coordinates per FZ 2.<num> sheet, listed in column order.
# Sheet 2 keeps B–I headers in row 8 and J–N headers in row 9;
# sheet 4 keeps all of B–V in row 8.
header_map = {
    '2': [f"{col}8" for col in "BCDEFGHI"] + [f"{col}9" for col in "JKLMN"],
    '4': [f"{col}8" for col in "BCDEFGHIJKLMNOPQRSTUV"],
}

# Row in which the data block is expected to start, per sheet number.
data_start_row = {
    '2': 10,
    '4': 9,
}

def check_fz2_layout():
    """Check that every ``fz2_*.xlsx`` workbook in ``DATA_DIR`` shares one layout.

    For each sheet number in ``header_map`` and each workbook (sorted by path):
      * the sheet must exist;
      * the normalised header texts at the configured coordinates must equal
        those of the first workbook that contains the sheet;
      * at least one cell of the configured columns must be truthy in the
        expected first data row (``data_start_row``).

    Findings are printed, grouped by sheet number and then by file; nothing is
    returned.  A missing set of input files is reported as a finding as well.
    """
    paths = sorted(DATA_DIR.glob("fz2_*.xlsx"))

    issues_by_sheet = {num: [] for num in header_map}
    reference = {}   # num -> (header texts, file name) of the first workbook with that sheet

    for path in paths:
        # Load each workbook once and run the checks for every sheet number on it.
        wb = load_workbook(path, data_only=True)

        for num, coords in header_map.items():
            found = issues_by_sheet[num]

            sn = _find_sheet(wb, num)
            if not sn:
                found.append(f"{path.name}: workbook 2.{num} not found")
                continue

            # collect header texts at the expected coordinates
            ws = wb[sn]
            names = [_clean_header(ws[c].value) for c in coords]

            # (1) compare to reference workbook
            if num not in reference:
                reference[num] = (names, path.name)
            else:
                ref_names, ref_file = reference[num]
                if names != ref_names:
                    found.append(f"{path.name}: 2.{num} – {names} ≠ {ref_names} (reference {ref_file})")

            # (2) make sure the first data row is populated
            r0 = data_start_row[num]
            # Strip the row digits to get the column letters (also correct for "AA8").
            col_letters = [c.rstrip("0123456789") for c in coords]
            if not any(ws[f"{letter}{r0}"].value for letter in col_letters):
                found.append(f"{path.name}: 2.{num} – row {r0} is empty, first data row shifted?")

    # Flatten in sheet-number order so the report is grouped by sheet, then file.
    issues = [msg for num in header_map for msg in issues_by_sheet[num]]
    if not paths:
        # Without input there is nothing to compare; do not report "identical".
        issues.append(f"no fz2_*.xlsx workbooks found in {DATA_DIR}")

    # Report
    if issues:
        print("⚠️  Discrepancies have been detected:")
        for msg in issues:
            print(" •", msg)
    else:
        print("The layouts of all FZ2 sheets are identical (coordinates, headers, first data row)")

# Run the layout check once; it only prints its findings and returns nothing:
check_fz2_layout()

The layouts of all FZ2 sheets are identical (coordinates, headers, first data row)


In [6]:
# Sheet number ("FZ 2.<num>") -> function that parses that worksheet into a DataFrame.
sheet_parsers = {'2':  fz2_2, '4':  fz2_4,}

# Accumulators: one global DataFrame per sheet number
# (each starts as its own empty frame and is filled by the parsing loop).
globals_by_sheet = {num: pd.DataFrame() for num in sheet_parsers}

In [7]:
# Parse every workbook once, collecting the per-file frames of each sheet number.
# The frames are concatenated a single time at the end: growing a DataFrame with
# pd.concat inside the loop copies all previous rows on every iteration, and
# re-running the cell would append the same data again.
frames_by_sheet = {num: [] for num in sheet_parsers}

for path in sorted(DATA_DIR.glob("fz2_*.xlsx")):
    wb   = load_workbook(path, data_only=True)      # read Excel as values
    date = _date_from_fname(path)                   # e.g. "2024"

    for num, parser in sheet_parsers.items():
        sname = _find_sheet(wb, num)                 # locate “FZ 2.<num>”
        if not sname:                               # skip missing sheets
            print(f"{path.name}: workbook 2.{num} not found")
            continue

        df = parser(wb[sname])                      # parse & clean
        df.insert(0, "Date", date)                  # add period column
        frames_by_sheet[num].append(df)

# Rebuild (not extend) the global accumulators so the cell is idempotent.
for num, frames in frames_by_sheet.items():
    globals_by_sheet[num] = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [8]:
for num, df in globals_by_sheet.items():
    # Render every cell as text; missing values become empty strings.
    df = df.fillna('').astype(str)

    # Same file name in both targets: the raw CSV folder first, then the
    # processed-data folder.
    for target_dir in (OUT_DIR, DST_DIR):
        out_csv = target_dir / f"fz_2.{num}_raw.csv"
        df.to_csv(out_csv, index=False, encoding="utf-8")

    # Console log: file name and shape, a dtype summary, then a spacer.
    print(f"• Saved {out_csv.name}  →  {df.shape}\n")
    df.info()
    print("\n\n")

• Saved fz_2.2_raw.csv  →  (77912, 14)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77912 entries, 0 to 77911
Data columns (total 14 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   Date                                 77912 non-null  object
 1   HERSTELLER                           77912 non-null  object
 2   HANDELSNAME                          77912 non-null  object
 3   TYP-SCHL.-NR.                        77912 non-null  object
 4   KW                                   77912 non-null  object
 5   KRAFTSTOFFART                        77912 non-null  object
 6   ALLRAD                               77912 non-null  object
 7   AUFBAUART                            77912 non-null  object
 8   INSGESAMT                            77912 non-null  object
 9   WOHNMOBILE                           77912 non-null  object
 10  PRIVATE HALTERINNEN UND HALTER       77912 non-null  object
 11  H