In [1]:
# === Split sheets → CSV, then strip/clean (drop first row & col, set header, rename first col to 'Suburb') ===
import re
from pathlib import Path
import pandas as pd

# -------- CONFIG: update only this path if needed --------
EXCEL_PATH = Path("/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/data/Moving annual median rent by suburb and town - March quarter 2025.xlsx")

# Output folders (we'll keep things under a 'raw' dir next to the Excel file)
RAW_DIR       = EXCEL_PATH.parent / "raw"
SPLIT_DIR     = RAW_DIR / "split_sheets"
STRIPPED_DIR  = RAW_DIR / "split_sheets_stripped"
for d in (RAW_DIR, SPLIT_DIR, STRIPPED_DIR):
    d.mkdir(parents=True, exist_ok=True)

# -------- STEP 1: split each sheet to CSV --------
def _safe_name(s: str) -> str:
    return re.sub(r"[^A-Za-z0-9]+", "_", s).strip("_").lower()

try:
    xls = pd.ExcelFile(EXCEL_PATH)  # requires openpyxl for .xlsx
except Exception as e:
    raise RuntimeError(
        f"Could not open Excel file. If missing, try: pip install openpyxl\nOriginal error: {e}"
    )

print(f"Found sheets: {xls.sheet_names}")

for sheet_name in xls.sheet_names:
    df = pd.read_excel(EXCEL_PATH, sheet_name=sheet_name)
    out_csv = SPLIT_DIR / f"{_safe_name(sheet_name)}.csv"
    df.to_csv(out_csv, index=False)
    print(f"[split] Saved: {out_csv}")

# -------- STEP 2: strip/clean each CSV --------
produced = []
for p in sorted(SPLIT_DIR.glob("*.csv")):
    # Read raw so file's first row really is index 0 (we'll promote a row to header later)
    df = pd.read_csv(p, header=None, dtype=str)

    # Guard: need at least 2 rows & 2 columns to drop first row/col
    if df.shape[0] < 2 or df.shape[1] < 2:
        print(f"[skip] Too small to strip (needs ≥2 rows & ≥2 cols): {p.name}")
        continue

    # Drop first row and first column
    df2 = df.iloc[1:, 1:].reset_index(drop=True)

    # Promote next row to header (the first row of df2 becomes the header)
    new_header = df2.iloc[0].astype(str).str.strip()
    df2 = df2.iloc[1:].reset_index(drop=True)
    df2.columns = new_header

    # Ensure the first column's header is exactly 'Suburb'
    # (works even if header is empty/duplicated)
    df2.columns = ["Suburb"] + df2.columns[1:].tolist()

    # Save stripped/cleaned CSV (with header)
    out = STRIPPED_DIR / p.name
    df2.to_csv(out, index=False)
    produced.append(out)
    print(f"[strip] Saved: {out}")

# -------- STEP 3 (optional): build a combined CSV from stripped files --------
if produced:
    combined = []
    for p in produced:
        tmp = pd.read_csv(p, dtype=str)
        # add dwelling-type from filename for traceability, e.g. '1_bedroom_flat'
        tmp["dwelling_file"] = Path(p).stem
        combined.append(tmp)
    combined_df = pd.concat(combined, ignore_index=True)
    combined_out = RAW_DIR / "combined_stripped.csv"
    combined_df.to_csv(combined_out, index=False)
    print(f"[combine] Saved: {combined_out}")
else:
    print("[combine] No stripped files produced; combined CSV not created.")


Found sheets: ['1 bedroom flat', '2 bedroom flat', '3 bedroom flat', '2 bedroom house', '3 bedroom house', '4 bedroom house', 'All properties']
[split] Saved: /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/data/raw/split_sheets/1_bedroom_flat.csv
[split] Saved: /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/data/raw/split_sheets/2_bedroom_flat.csv
[split] Saved: /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/data/raw/split_sheets/3_bedroom_flat.csv
[split] Saved: /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/data/raw/split_sheets/2_bedroom_house.csv
[split] Saved: /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2

In [2]:
# ==== Reshape property tables to tidy monthly format ==========================================
from pathlib import Path
import pandas as pd
import re

# ---- paths (edit IN_DIR only if yours is different) ----
IN_DIR  = Path("/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/data/raw/split_sheets_stripped")
OUT_DIR = IN_DIR.parent / "tidy_monthly"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- helpers ---------------------------------------------------------------------------------
MONTH_ABBR = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]
FILE_RX = re.compile(r"(?P<beds>\d+)_bedroom_(?P<ptype>flat|house)", re.I)

def _clean_cell(x):
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return None
    x = str(x).strip()
    return None if x in {"", "-"} else x

def load_and_build_header(path: Path) -> pd.DataFrame:
    """
    Reconstruct 2-row headers like:
      row0: 'Mar 2000','Mar 2000','Jun 2000','Jun 2000',...
      row1: 'Count','Median','Count','Median',...
    -> single-line names: 'Mar 2000_Count', 'Mar 2000_Median', ...
    Falls back gracefully if file already has a single header row.
    """
    raw = pd.read_csv(path, header=None, dtype=str)
    raw = raw.applymap(lambda x: None if pd.isna(x) else str(x).strip())

    r0 = raw.iloc[0].tolist()
    r1 = raw.iloc[1].tolist()

    def looks_like_two_row_header(r0, r1):
        has_month = any(any(m in str(c) for m in MONTH_ABBR) for c in r0)
        has_metrics = any(str(c).strip().lower() in {"count","median"} for c in r1)
        return has_month and has_metrics

    if looks_like_two_row_header(r0, r1):
        # forward-fill month labels across duplicate pairs
        r0_ffill, last = [], None
        for c in r0:
            if c not in [None,"","nan"]:
                last = c
            r0_ffill.append(last)

        cols = []
        for i, (top, sub) in enumerate(zip(r0_ffill, r1)):
            top = "Suburb" if i == 0 else top
            name = top if i == 0 else f"{top}_{sub}"
            cols.append(name)

        df = raw.iloc[2:].copy()
        df.columns = cols
    else:
        # already has header row
        df = pd.read_csv(path, dtype=str)
        if not df.columns[0] or df.columns[0].lower().startswith("unnamed"):
            df.rename(columns={df.columns[0]: "Suburb"}, inplace=True)

    df = df.applymap(_clean_cell)
    if "Suburb" not in df.columns:
        df.rename(columns={df.columns[0]: "Suburb"}, inplace=True)

    return df.dropna(how="all")

def wide_to_tidy(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse '<Mon YYYY>_Count/Median' columns into rows; parse month."""
    val_cols = [c for c in df.columns if c != "Suburb"]
    long = df.melt(id_vars=["Suburb"], value_vars=val_cols,
                   var_name="period_metric", value_name="value")

    ext = long["period_metric"].str.extract(r"(?P<period>[A-Za-z]{3}\s+\d{4})[_\s-]*(?P<metric>Count|Median)?")
    long["period"] = ext["period"]
    long["metric"] = ext["metric"].fillna("")

    tidy = (long.pivot_table(index=["Suburb","period"], columns="metric", values="value", aggfunc="first")
                 .reset_index())
    tidy.columns.name = None

    if "Count" in tidy.columns:
        tidy["Count"] = pd.to_numeric(tidy["Count"], errors="coerce")
    if "Median" in tidy.columns:
        tidy["Median"] = pd.to_numeric(tidy["Median"], errors="coerce")

    tidy["date"] = pd.to_datetime(tidy["period"], format="%b %Y", errors="coerce")
    tidy = tidy.drop(columns=["period"])

    cols = ["Suburb","date"] + [c for c in ["Count","Median"] if c in tidy.columns]
    return tidy[cols].sort_values(["Suburb","date"]).reset_index(drop=True)

def annotate_from_filename(path: Path) -> dict:
    m = FILE_RX.search(path.stem)
    if not m:
        return {"bedrooms": None, "property_type": None}
    return {"bedrooms": int(m.group("beds")), "property_type": m.group("ptype").lower()}

def process_file(path: Path) -> pd.DataFrame:
    df = load_and_build_header(path)
    tidy = wide_to_tidy(df)
    meta = annotate_from_filename(path)
    for k, v in meta.items():
        tidy[k] = v
    # reorder columns
    base = ["Suburb","date","bedrooms","property_type"]
    vals = [c for c in ["Count","Median"] if c in tidy.columns]
    return tidy[base + vals]

# ---- run on all CSVs ---------------------------------------------------------
csvs = sorted(p for p in IN_DIR.glob("*.csv") if not p.name.startswith("~$"))
all_frames = []

for p in csvs:
    t = process_file(p)
    t.to_csv(OUT_DIR / f"{p.stem}_tidy.csv", index=False)
    all_frames.append(t)
    print(f"[ok] {p.name:26s} -> {len(t):5d} rows")

if all_frames:
    combo = (pd.concat(all_frames, ignore_index=True)
               .sort_values(["Suburb","date","bedrooms","property_type"]))
    combo.to_csv(OUT_DIR / "all_properties_tidy.csv", index=False)
    print(f"\n[done] Combined -> {OUT_DIR/'all_properties_tidy.csv'}  rows={len(combo):,}")
else:
    print("No CSVs found to process.")
# ==============================================================================================


  raw = raw.applymap(lambda x: None if pd.isna(x) else str(x).strip())
  df = df.applymap(_clean_cell)
  raw = raw.applymap(lambda x: None if pd.isna(x) else str(x).strip())
  df = df.applymap(_clean_cell)
  raw = raw.applymap(lambda x: None if pd.isna(x) else str(x).strip())
  df = df.applymap(_clean_cell)
  raw = raw.applymap(lambda x: None if pd.isna(x) else str(x).strip())
  df = df.applymap(_clean_cell)


[ok] 1_bedroom_flat.csv         -> 13639 rows
[ok] 2_bedroom_flat.csv         -> 14708 rows
[ok] 2_bedroom_house.csv        -> 14225 rows
[ok] 3_bedroom_flat.csv         -> 14079 rows


  raw = raw.applymap(lambda x: None if pd.isna(x) else str(x).strip())
  df = df.applymap(_clean_cell)
  raw = raw.applymap(lambda x: None if pd.isna(x) else str(x).strip())
  df = df.applymap(_clean_cell)
  raw = raw.applymap(lambda x: None if pd.isna(x) else str(x).strip())
  df = df.applymap(_clean_cell)


[ok] 3_bedroom_house.csv        -> 14529 rows
[ok] 4_bedroom_house.csv        -> 14155 rows
[ok] all_properties.csv         -> 14840 rows

[done] Combined -> /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/data/raw/tidy_monthly/all_properties_tidy.csv  rows=100,175
