In [1]:
import pandas as pd
import csv
from pathlib import Path

INPUT = "population_projections_LAD_raw.csv"
OUTPUT = "lad_population_projections_tidy.csv"

# One dict per (LAD, sex, age group, year) observation; filled by the loop below.
rows = []

# Parser state. The raw export is a sequence of tables, each preceded by
# "Gender" / "Age" metadata lines and a header row; this state carries the
# most recently seen metadata into the data rows that follow it.
current_sex = None
current_age = None
reading_table = False
header = None
# (column index, year) pairs for the current header, parsed once per header
# row instead of once per data row.
year_columns = []

# "utf-8-sig" reads plain UTF-8 unchanged but also strips a leading BOM, which
# would otherwise be glued to the first cell and defeat the startswith() checks.
with open(INPUT, newline="", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    for line in reader:

        # Skip empty lines
        if not line or all(x.strip() == "" for x in line):
            continue

        # --- Metadata lines ---
        # A metadata line needs a value cell; a lone "Gender..."/"Age..." cell
        # is treated as free text rather than raising IndexError.
        if len(line) > 1 and line[0].startswith("Gender"):
            current_sex = line[1].strip()
            continue

        if len(line) > 1 and line[0].startswith("Age"):
            current_age = line[1].strip()
            continue

        # --- Column header ---
        if line[0].startswith("local authority"):
            header = line
            # Columns 0 and 1 are name and code; the rest are year labels.
            # Blank header cells (e.g. from a trailing comma) are ignored;
            # a non-blank, non-integer label still raises ValueError.
            year_columns = [
                (col_idx, int(cell))
                for col_idx, cell in enumerate(header[2:], start=2)
                if cell.strip()
            ]
            reading_table = True
            continue

        # --- Footnotes ---
        # The footnote ends the current table whichever cell it appears in.
        if any(cell.strip().startswith("Figures may not sum") for cell in line):
            reading_table = False
            continue

        # --- Data rows ---
        if reading_table and header:
            # A data row needs at least a name and a code cell; anything
            # shorter is a stray note inside the table area.
            if len(line) < 2:
                continue

            lad_name = line[0].strip()
            lad_code = line[1].strip()

            for col_idx, year in year_columns:
                # Rows shorter than the header simply have no value for the
                # remaining years.
                if col_idx >= len(line):
                    break

                pop = line[col_idx].strip()

                # Blank (or whitespace-only) cell: no projection for this year.
                if pop == "":
                    continue

                rows.append({
                    "lad_code": lad_code,
                    "lad_name": lad_name,
                    "sex": current_sex,
                    "age_group": current_age,
                    "year": year,
                    "population": int(pop)
                })

# Build DataFrame
# Naming the columns explicitly keeps the schema (and the cleaning steps
# below) working even when the parser produced no rows.
df = pd.DataFrame(
    rows,
    columns=["lad_code", "lad_name", "sex", "age_group", "year", "population"],
)

# Clean age labels (optional but recommended)
age_map = {
    "All Ages": "all",
    "Aged 0 to 15": "0_15",
    "Aged 16 to 24": "16_24",
    "Aged 25 to 49": "25_49",
    "Aged 50 to 64": "50_64",
    "Aged 65+": "65+",
}
raw_age = df["age_group"]
clean_age = raw_age.map(age_map)

# Series.map yields NaN for any label missing from age_map, which would
# silently blank the age group in the saved file. Keep the raw label for
# those rows instead, and report which labels were not recognised.
unmapped = clean_age.isna() & raw_age.notna()
if unmapped.any():
    print(f"Warning: unmapped age labels kept as-is: {sorted(raw_age[unmapped].unique())}")
df["age_group"] = clean_age.where(~unmapped, raw_age)

# Normalize sex labels
df["sex"] = df["sex"].str.lower()

# Save
df.to_csv(OUTPUT, index=False)

print(f"Saved tidy LAD projections to {OUTPUT}")

Saved tidy LAD projections to lad_population_projections_tidy.csv


In [3]:
import pandas as pd

# Reload the tidy projections (LAD codes kept as text) and rename the code
# column so it matches the key name used in the merge.
lad = (
    pd.read_csv("lad_population_projections_tidy.csv", dtype={"lad_code": str})
    .rename(columns={"lad_code": "lad22cd"})
)

# Columns that together should identify a single row
# (year is optional depending on the merge step).
KEYS = ["lad22cd", "sex", "age_group", "year"]

# Flag every row whose key combination occurs more than once
# (keep=False marks all members of a duplicate group, not just the repeats).
is_repeated_key = lad.duplicated(subset=KEYS, keep=False)
dups = lad[is_repeated_key].sort_values(KEYS)

print(f"Number of duplicated rows: {len(dups)}")
print(dups)

Number of duplicated rows: 0
Empty DataFrame
Columns: [lad22cd, lad_name, sex, age_group, year, population]
Index: []
