## Crime dataset ##


In [20]:
# --- 0) Setup ---------------------------------------------------------------
import pandas as pd
import numpy as np
from pathlib import Path


In [21]:

# ---- Input files (your absolute paths) ----
# NOTE(review): both paths are machine-specific absolute paths, so this notebook
# only runs unmodified on the original author's machine.
# P_CORR: correspondence CSV (file name suggests SA2 2021 <-> LGA 2021 — confirm).
# P_LGA : crime dataset CSV; its parent folder is reused later for the outputs.
P_CORR  = Path("/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/CG_SA2_2021_LGA_2021.csv")
P_LGA   = Path("/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_dataset.csv")


In [22]:

# ---- Output files (written alongside the LGA crime dataset) ----
OUT_DIR = P_LGA.parent
OUT_SA2 = OUT_DIR.joinpath("crime_dataset_weighted_to_SA2.csv")
OUT_CHECKS = OUT_DIR.joinpath("weight_checks_by_LGA.csv")


In [23]:

# --- 1) Read data -----------------------------------------------------------
# Every column is loaded as text so identifier codes are never re-typed
# by the parser (e.g., turned into 2.01E+08).
corr, lga = (pd.read_csv(src, dtype=str) for src in (P_CORR, P_LGA))


In [24]:

# --- 2) Normalise/standardise column names ---------------------------------
# Make a case-insensitive mapping to canonical names
def normcols(df):
    """Return a copy of ``df`` whose column labels have surrounding whitespace removed.

    The input frame is left untouched; only the labels change, not their case.
    """
    cleaned = df.copy()
    cleaned.columns = [label.strip() for label in df.columns]
    return cleaned

corr = normcols(corr)
lga  = normcols(lga)


In [25]:

# Try to detect expected key/weight columns in corr
# Common possibilities: SA2_CODE_2021 / SA2_NAME_2021 / LGA_CODE_2021 / LGA_NAME_2021 / RATIO_FROM_TO or RATIO_TO
def pick(name_options, cols):
    """Return the first entry of ``name_options`` that is present in ``cols``.

    Candidates are tried in the order given; ``None`` is returned when none match.
    """
    return next((candidate for candidate in name_options if candidate in cols), None)

# Resolve each logical field of the correspondence file to the header actually
# present. Keeping the logical label next to its candidate names lets the error
# below say WHICH field is missing (previously it could only print [None, ...]).
corr_field_options = {
    "SA2 code": ["SA2_CODE_2021","SA2_CODE21","SA2_MAINCODE_2021","SA2_MAINCODE21","SA2_CODE"],
    "SA2 name": ["SA2_NAME_2021","SA2_NAME21","SA2_NAME"],
    "LGA code": ["LGA_CODE_2021","LGA_CODE21","LGA_CODE"],
    "LGA name": ["LGA_NAME_2021","LGA_NAME21","LGA_NAME"],
    "ratio":    ["RATIO_FROM_TO","RATIO_TO","RATIO","PROP","WEIGHT"],
}
corr_fields = {label: pick(options, corr.columns) for label, options in corr_field_options.items()}

c_sa2_code = corr_fields["SA2 code"]
c_sa2_name = corr_fields["SA2 name"]
c_lga_code = corr_fields["LGA code"]
c_lga_name = corr_fields["LGA name"]
c_ratio    = corr_fields["ratio"]

# Human-readable description of every field that could not be resolved
missing = [
    f"{label} (tried {corr_field_options[label]})"
    for label, found in corr_fields.items()
    if found is None
]
if missing:
    raise ValueError(f"Could not find these columns in correspondence file: {missing}\n"
                     f"Have columns: {list(corr.columns)}")

# Ensure codes are strings and trimmed
for c in [c_sa2_code, c_lga_code]:
    corr[c] = corr[c].astype(str).str.strip()

# Filter to Victoria SA2s only (ASGS 2021 VIC codes begin with '2')
corr_vic = corr[corr[c_sa2_code].str.startswith("2")].copy()

# Prepare LGA dataset: ensure it has an LGA key (either a code or a name will do)
l_lga_code = pick(["LGA_CODE_2021","LGA_CODE21","LGA_CODE"], lga.columns)
l_lga_name = pick(["LGA_NAME_2021","LGA_NAME21","LGA_NAME"], lga.columns)

if (l_lga_code is None) and (l_lga_name is None):
    raise ValueError(f"LGA dataset has no LGA code/name column. Found columns: {list(lga.columns)}")

# Coerce numeric measure columns later; for now keep everything as str


In [26]:

# --- 3) Build per-LGA normalised weights -----------------------------------
# Weights are normalised WITHIN each LGA so that allocations from that LGA
# to its SA2 parts sum to 1. This guarantees LGA totals are preserved.
#
# The cell is written to be idempotent: every derived column is assigned by
# name (never merged in), so re-running it does not create *_x / *_y duplicates.

# Raw ratio from the correspondence file; unparseable / missing values count as 0
corr_vic["ratio_raw"] = pd.to_numeric(corr_vic[c_ratio], errors="coerce").fillna(0.0)

# Prefer joining on LGA code when the LGA dataset has one, else fall back to the name
grp_key = c_lga_code if l_lga_code else c_lga_name

if grp_key == c_lga_name:
    # name path: compare trimmed, lower-cased names on both sides
    corr_vic[c_lga_name + "_key"] = corr_vic[c_lga_name].str.strip().str.lower()
    lga[l_lga_name + "_key"]       = lga[l_lga_name].str.strip().str.lower()
    corr_group_key = c_lga_name + "_key"
    lga_join_key   = l_lga_name + "_key"
else:
    # code path: compare trimmed codes
    corr_vic[c_lga_code + "_key"] = corr_vic[c_lga_code].str.strip()
    lga[l_lga_code + "_key"]       = lga[l_lga_code].str.strip()
    corr_group_key = c_lga_code + "_key"
    lga_join_key   = l_lga_code + "_key"

# Total raw ratio per LGA, broadcast back to each correspondence row.
# Series.map overwrites the column in place on re-run (a merge would not).
sum_by_lga = corr_vic.groupby(corr_vic[corr_group_key])["ratio_raw"].sum().rename("ratio_sum_lga")
corr_vic["ratio_sum_lga"] = corr_vic[corr_group_key].map(sum_by_lga)

# Avoid divide-by-zero; if an LGA has zero sum (shouldn't), fall back to equal split
rows_per_lga = corr_vic.groupby(corr_vic[corr_group_key])[corr_group_key].transform("count")
corr_vic["weight_norm"] = np.where(
    corr_vic["ratio_sum_lga"] > 0,
    corr_vic["ratio_raw"] / corr_vic["ratio_sum_lga"],
    1.0 / rows_per_lga
)


In [27]:

# --- 4) Join LGA data to weights -------------------------------------------
# Identify numeric measure columns in LGA data:
# Counts -> sum after weighting; Rates -> weighted mean after weighting
def starts_with_any(s, prefixes):
    """Tell whether ``s`` begins with at least one of the given ``prefixes``."""
    for prefix in prefixes:
        if s.startswith(prefix):
            return True
    return False

# Column-name prefixes that mark a measure as a count or as a rate
count_prefixes = ("Incidents_", "Victims_")
rate_prefixes  = ("CrimeRate_", "VictimRate_")

# Columns that identify the LGA and must never be treated as measures
lga_key_cols = [l_lga_code, l_lga_name, lga_join_key]

# A column counts as numeric when the whole column converts without error
lga_numeric = []
for c in lga.columns:
    if c in lga_key_cols:
        continue
    try:
        pd.to_numeric(lga[c])
    except Exception:
        continue
    lga_numeric.append(c)

count_cols = list(filter(lambda name: starts_with_any(name, count_prefixes), lga_numeric))
rate_cols  = list(filter(lambda name: starts_with_any(name, rate_prefixes), lga_numeric))

# Keep only the LGA keys that exist plus the measures, then attach them to the weights
to_merge_cols = [col for col in lga_key_cols if col is not None]
lga_for_merge = lga.loc[:, to_merge_cols + count_cols + rate_cols].copy()

merged = pd.merge(
    corr_vic,
    lga_for_merge,
    how="inner",
    left_on=corr_group_key,
    right_on=lga_join_key,
)

# Measures were read as text: convert now (unparseable -> NaN -> 0)
merged = merged.assign(**{
    measure: pd.to_numeric(merged[measure], errors="coerce").fillna(0.0)
    for measure in count_cols + rate_cols
})


In [28]:

# --- 5) Apply weights -------------------------------------------------------
# Counts and rates receive the same per-row treatment here (value * weight_norm);
# they only differ in the aggregation step: counts are summed per SA2, whereas
# rates are turned into a weighted mean by dividing by the summed weights.
for measure in count_cols + rate_cols:
    merged[measure + "_alloc"] = merged[measure] * merged["weight_norm"]


In [29]:

# --- 6) Aggregate to SA2 (unique) ------------------------------------------
sa2_keys = [c_sa2_code, c_sa2_name]
alloc_count_cols = [name + "_alloc" for name in count_cols]
alloc_rate_cols  = [name + "_alloc" for name in rate_cols]

# One grouping, reused for every aggregate; NaN keys are kept as their own group
by_sa2 = merged.groupby(sa2_keys, dropna=False)

# Counts: the allocated pieces simply add up
sa2_counts = by_sa2[alloc_count_cols].sum().rename(columns=lambda x: x.replace("_alloc",""))

# Rates: sum(rate * weight) / sum(weight). The weight total is recomputed from the
# same merge so that it matches the rows that actually contributed.
sa2_weight_sum = by_sa2["weight_norm"].sum().rename("weight_sum_sa2")
sa2_rates_sum = by_sa2[alloc_rate_cols].sum().rename(columns=lambda x: x.replace("_alloc","_weighted_sum"))

weight_totals = sa2_weight_sum.values
has_weight = weight_totals > 0

sa2_rates = sa2_rates_sum.copy()
for c in rate_cols:
    # swap each "<rate>_weighted_sum" column for the finished weighted mean;
    # SA2s without any weight get NaN instead of a division by zero
    weighted_sum = sa2_rates.pop(c + "_weighted_sum")
    sa2_rates[c] = np.where(has_weight, weighted_sum.values / weight_totals, np.nan)

# Combine
sa2_final = sa2_counts.join(sa2_rates, how="outer").reset_index()

# Optional: keep lat/lng if present in your LGA file (usually not). Skip unless you have a rule to pick one.


In [30]:

# --- 7) Sanity checks & exports --------------------------------------------
# For every count measure, add the allocated values back up per LGA and put
# them next to the original LGA value; abs_diff is the gap between the two.
checks = [
    merged.groupby(lga_join_key)[c + "_alloc"].sum().rename("allocated_sum")
    .reset_index()
    .merge(lga_for_merge[[lga_join_key, c]], on=lga_join_key, how="left")
    .assign(
        abs_diff=lambda d, c=c: (d["allocated_sum"] - pd.to_numeric(d[c], errors="coerce")).abs(),
        measure=c,
    )
    [[lga_join_key,"measure","allocated_sum",c,"abs_diff"]]
    for c in count_cols
]

if checks:
    weight_checks = pd.concat(checks, ignore_index=True)
else:
    weight_checks = pd.DataFrame()
weight_checks.to_csv(OUT_CHECKS, index=False)

# Export the SA2 dataset
sa2_final.to_csv(OUT_SA2, index=False)

print(f"Done. SA2-level dataset written to:\n  {OUT_SA2}")
if not weight_checks.empty:
    print(f"Allocation check by LGA written to:\n  {OUT_CHECKS}\n"
          f"(abs_diff should be ~0 per LGA per measure)")


Done. SA2-level dataset written to:
  /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_dataset_weighted_to_SA2.csv
Allocation check by LGA written to:
  /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/weight_checks_by_LGA.csv
(abs_diff should be ~0 per LGA per measure)


## Crime prediction to 2030 ##

In [41]:
#path = /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_dataset_weighted_to_SA2.csv



In [42]:
# %% --------------------------------- CONFIG & IMPORTS ---------------------------------
import warnings, os, re
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# Try to import pmdarima; if not installed, fallback will still work
try:
    from pmdarima import auto_arima
    HAS_ARIMA = True
except Exception:
    HAS_ARIMA = False

# ---- Input file (already weighted to SA2) ----
IN_PATH = "/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_dataset_weighted_to_SA2.csv"

# Last year to forecast (inclusive)
END_YEAR = 2030

# ---- Outputs live in the same folder as IN_PATH; change OUT_DIR to redirect them ----
OUT_DIR = os.path.dirname(IN_PATH)
YEARLY_OUT, QUARTERLY_OUT = (
    os.path.join(OUT_DIR, file_name)
    for file_name in ("crime_predictions_yearly_to_2030.csv",
                      "crime_predictions_quarterly_to_2030.csv")
)


In [43]:
# %% ------------------------------- LOAD & NORMALISE (WIDE -> LONG) -------------------
import re
import pandas as pd

df = pd.read_csv(IN_PATH)

# Identifier columns: the SA2 code is mandatory, the SA2 name is carried along if present
id_cols = [name for name in ("SA2_CODE_2021", "SA2_NAME_2021") if name in df.columns]
if "SA2_CODE_2021" not in id_cols:
    raise ValueError("Expected SA2_CODE_2021 in the file. Found: {}".format(df.columns.tolist()))

# Only "<metric>_<4-digit year>" columns of these three families are used
want_metrics = ("Incidents", "CrimeRate", "VictimRate")
pat = re.compile(rf"^({'|'.join(want_metrics)})_(\d{{4}})$")

# (column name, metric, year) for every column that follows the naming scheme
metric_year_cols = [
    (name, hit.group(1), int(hit.group(2)))
    for name, hit in ((name, pat.match(name)) for name in df.columns)
    if hit
]

if not metric_year_cols:
    raise ValueError("No (Incidents|CrimeRate|VictimRate)_YYYY columns found. "
                     f"Available columns: {df.columns.tolist()}")

# One slice per matching column, each reshaped to: ids, value, metric, Year
long_parts = [
    df[id_cols + [col]].rename(columns={col: "value"}).assign(metric=metric, Year=year)
    for col, metric, year in metric_year_cols
]

long = pd.concat(long_parts, ignore_index=True)
long["value"] = pd.to_numeric(long["value"], errors="coerce")
long = (
    long.dropna(subset=["value"])
        .sort_values(["SA2_CODE_2021", "metric", "Year"])
        .reset_index(drop=True)
)

print("Long shape:", long.shape)
print("Metrics:", long["metric"].unique())
print("Year range:", int(long["Year"].min()), "→", int(long["Year"].max()))


Long shape: (15660, 5)
Metrics: ['CrimeRate' 'Incidents' 'VictimRate']
Year range: 2016 → 2025


In [44]:
# %% --------------------------------- CONFIG & IMPORTS ---------------------------------
import warnings, os, re
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# Try to import pmdarima; if not installed or fails, we fall back to linear trend
try:
    from pmdarima import auto_arima
    HAS_ARIMA = True
except Exception:
    HAS_ARIMA = False

# ---- Input file (already weighted to SA2; wide format with *_YYYY columns) ----
IN_PATH = "/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_dataset_weighted_to_SA2.csv"

# Last forecast year (inclusive)
END_YEAR = 2030

# ---- Outputs (same folder as the input) ----
OUT_DIR = os.path.dirname(IN_PATH)
YEARLY_OUT = os.path.join(OUT_DIR, "crime_predictions_yearly_to_2030.csv")
QUARTERLY_OUT = os.path.join(OUT_DIR, "crime_predictions_quarterly_to_2030.csv")

# %% ------------------------------- LOAD & NORMALISE (WIDE -> LONG) -------------------
df = pd.read_csv(IN_PATH)

# The SA2 code must be present; the SA2 name is optional
id_cols = [name for name in ("SA2_CODE_2021","SA2_NAME_2021") if name in df.columns]
if "SA2_CODE_2021" not in id_cols:
    raise ValueError(f"Expected SA2_CODE_2021 column. Found: {df.columns.tolist()}")

# Only these three metric families are forecast; columns look like <metric>_<YYYY>
want_metrics = ("Incidents", "CrimeRate", "VictimRate")
pat = re.compile(rf"^({'|'.join(want_metrics)})_(\d{{4}})$")

# Pair every column with its regex match, then keep the ones that matched
column_matches = ((name, pat.match(name)) for name in df.columns)
metric_year_cols = [
    (name, hit.group(1), int(hit.group(2)))
    for name, hit in column_matches
    if hit
]

if not metric_year_cols:
    raise ValueError("No (Incidents|CrimeRate|VictimRate)_YYYY columns found.\n"
                     f"Available: {df.columns.tolist()}")

# Stack into long format: SA2_CODE_2021, optional SA2_NAME_2021, value, metric, Year
long_parts = []
for col, metric, year in metric_year_cols:
    piece = df[id_cols + [col]].rename(columns={col: "value"})
    long_parts.append(piece.assign(metric=metric, Year=year))

long = pd.concat(long_parts, ignore_index=True)
long["value"] = pd.to_numeric(long["value"], errors="coerce")
long = (
    long.dropna(subset=["value"])
        .sort_values(["SA2_CODE_2021","metric","Year"])
        .reset_index(drop=True)
)

print(f"[INFO] Long shape: {long.shape} | Metrics: {sorted(long['metric'].unique())} | "
      f"Years: {int(long['Year'].min())}–{int(long['Year'].max())}")

# %% ----------------------------- FORECASTING HELPERS ----------------------------------
def linear_trend_forecast(years, values, target_years):
    """Extrapolate a least-squares straight line through ``(years, values)``.

    Parameters
    ----------
    years : sequence of numbers
        Observation times (calendar years).
    values : sequence of numbers
        Observed values, one per entry in ``years``.
    target_years : sequence of numbers
        Times at which the fitted line is evaluated.

    Returns
    -------
    numpy.ndarray
        One float per entry of ``target_years``:
        * all NaN when there is no history at all;
        * the last observed value repeated when the history is constant or
          covers a single distinct year (no slope can be estimated);
        * otherwise the fitted line evaluated at ``target_years``.
    """
    years = np.asarray(years, dtype=float)
    values = np.asarray(values, dtype=float)
    target_years = np.asarray(target_years, dtype=float)

    # No history: nothing to extrapolate from (np.polyfit would raise here)
    if values.size == 0:
        return np.full(len(target_years), np.nan)

    # Flat history, or every observation in the same year: carry the last value forward
    if np.unique(values).size == 1 or np.unique(years).size < 2:
        return np.full(len(target_years), float(values[-1]))

    # Fit on centred years: raw calendar years (~2000) make the least-squares
    # system needlessly ill-conditioned; centring leaves the fitted line unchanged.
    origin = years.mean()
    slope, intercept = np.polyfit(years - origin, values, deg=1)
    return slope * (target_years - origin) + intercept

def arima_or_trend_predict(years, values, future_years, prefer_arima=True):
    """Forecast ``values`` for ``future_years`` with ARIMA, else a linear trend.

    ARIMA is attempted only when ``prefer_arima`` is true, pmdarima imported
    successfully (``HAS_ARIMA``), there is something to forecast, the history has
    at least 5 points and is not constant. The ARIMA result is accepted only if
    it has one finite value per future year; in every other case (including any
    exception raised while fitting) the linear-trend forecast is returned.

    Parameters
    ----------
    years, values : sequences of numbers
        Historical observation times and values.
    future_years : sequence of numbers
        Years to predict; its length is the forecast horizon.
    prefer_arima : bool, default True
        Set to False to force the linear-trend forecast.

    Returns
    -------
    numpy.ndarray
        One float per entry of ``future_years``.
    """
    y = pd.Series(values, dtype=float)
    horizon = len(future_years)
    can_arima = (
        prefer_arima and HAS_ARIMA and horizon > 0
        and len(y) >= 5 and not np.allclose(y, y.iloc[0])
    )
    if can_arima:
        try:
            model = auto_arima(
                y, seasonal=False, stationary=False, information_criterion="bic",
                suppress_warnings=True, error_action="ignore",
                max_p=3, max_q=3, max_d=2, stepwise=True
            )
            preds = np.asarray(model.predict(n_periods=horizon), dtype=float)
        except Exception:
            # Deliberate best-effort: any ARIMA failure falls through to the trend
            preds = None
        # Reject NaN/inf or wrongly sized output instead of passing it downstream
        if preds is not None and preds.shape == (horizon,) and np.isfinite(preds).all():
            return preds
    return linear_trend_forecast(years, values, future_years)

# %% ----------------------------- BUILD YEARLY FORECASTS -------------------------------
# For every (SA2, metric) series: keep the observed years and append forecasts
# for each year after the last observed one, up to END_YEAR (inclusive).
yearly_frames = []

group_keys = ["SA2_CODE_2021","metric"]
if "SA2_NAME_2021" in long.columns:
    group_keys = ["SA2_CODE_2021","SA2_NAME_2021","metric"]

for keys, g in long.groupby(group_keys):
    g = g.sort_values("Year")
    hist_years  = g["Year"].tolist()
    hist_values = g["value"].tolist()

    last_hist = max(hist_years)
    fut_years = list(range(last_hist + 1, END_YEAR + 1))

    out_years  = hist_years.copy()
    out_values = hist_values.copy()

    # Nothing to forecast when the history already reaches END_YEAR
    if fut_years:
        preds = arima_or_trend_predict(hist_years, hist_values, fut_years, prefer_arima=True)
        out_years.extend(fut_years)
        out_values.extend(preds.tolist())

    tmp = pd.DataFrame({
        "SA2_CODE_2021": g["SA2_CODE_2021"].iloc[0],
        "metric": g["metric"].iloc[0],
        "Year": out_years,
        "value": out_values
    })
    if "SA2_NAME_2021" in g.columns:
        tmp["SA2_NAME_2021"] = g["SA2_NAME_2021"].iloc[0]

    # Counts AND rates are non-negative quantities; an extrapolated downward
    # trend must not be allowed to produce negative incidents or negative rates.
    tmp["value"] = tmp["value"].clip(lower=0)

    yearly_frames.append(tmp)

yearly_all = pd.concat(yearly_frames, ignore_index=True)

# Round Incidents to whole numbers in the SAVED yearly file; yearly_all keeps the
# unrounded values. The column stays float because it also holds the rate metrics.
yearly_save = yearly_all.copy()
mask_inc = yearly_save["metric"] == "Incidents"
yearly_save.loc[mask_inc, "value"] = yearly_save.loc[mask_inc, "value"].round()

# Pivot to wide (Yearly) and save: one row per SA2 x Year, one column per metric
index_cols = ["SA2_CODE_2021","Year"]
if "SA2_NAME_2021" in yearly_save.columns:
    index_cols = ["SA2_CODE_2021","SA2_NAME_2021","Year"]

yearly_wide = (
    yearly_save
    .pivot_table(index=index_cols, columns="metric", values="value", aggfunc="first")
    .reset_index()
    .sort_values(["SA2_CODE_2021","Year"])
)

# Neat column order: identifiers, Year, then the metrics; anything else goes last
ordered_cols = [c for c in ["SA2_CODE_2021","SA2_NAME_2021","Year","Incidents","CrimeRate","VictimRate"] if c in yearly_wide.columns]
yearly_wide = yearly_wide[ordered_cols + [c for c in yearly_wide.columns if c not in ordered_cols]]

yearly_wide.to_csv(YEARLY_OUT, index=False)
print(f"[OK] Yearly forecasts saved -> {YEARLY_OUT}")

# %% -------------------------- QUARTERLY INTERPOLATION (Q1–Q4) -------------------------
# Strategy:
# - Treat each yearly value as at Q4 (Dec-31) using 'A-DEC' freq.
# - Build a full quarterly index (Q-DEC) and interpolate linearly.
# - Forward/back fill edges to avoid NaNs before rounding Incidents.
#
# NOTE(review): the yearly value is carried over unchanged as the Q4 level, with no
# division by 4, so a quarterly "Incidents" figure is an interpolated ANNUAL level,
# not a count for that quarter — confirm this is what downstream consumers expect.
# NOTE(review): Q1–Q3 of the first year have no earlier anchor point, so after the
# bfill they equal that year's Q4 value (visible in the preview printed further down).

quarterly_list = []
grp_cols = ["SA2_CODE_2021","metric"]
name_in = "SA2_NAME_2021" in yearly_all.columns

if name_in:
    grp_cols = ["SA2_CODE_2021","SA2_NAME_2021","metric"]

# One quarterly series per (SA2, metric); built from the UNROUNDED yearly values
for keys, g in yearly_all.groupby(grp_cols):
    g = g.sort_values("Year")
    start_year = int(g["Year"].min())
    end_year   = END_YEAR

    # yearly values at Dec-31
    ts = pd.Series(g["value"].values,
                   index=pd.PeriodIndex(g["Year"].astype(str), freq="A-DEC").to_timestamp(how="end")).sort_index()

    # full quarterly index
    qidx = pd.period_range(f"{start_year}Q1", f"{end_year}Q4", freq="Q-DEC").to_timestamp(how="end")
    # reindex keeps the yearly values on the quarter-end stamps they coincide with
    # and leaves every other quarter as NaN for the interpolation below
    q = ts.reindex(qidx)
    q_interp = q.interpolate(method="time").ffill().bfill()  # ensure no NaNs remain

    out = pd.DataFrame({
        "SA2_CODE_2021": g["SA2_CODE_2021"].iloc[0],
        "metric": g["metric"].iloc[0],
        "QuarterEnd": q_interp.index,
        "Year": q_interp.index.year,
        "Quarter": q_interp.index.to_period("Q-DEC").strftime("Q%q"),
        "value": q_interp.values
    })
    if name_in:
        out["SA2_NAME_2021"] = g["SA2_NAME_2021"].iloc[0]

    # keep non-negative for Incidents
    if out["metric"].iloc[0] == "Incidents":
        out["value"] = out["value"].clip(lower=0)

    quarterly_list.append(out)

quarterly_all = pd.concat(quarterly_list, ignore_index=True)

# Round Incidents safely (no NaNs thanks to ffill/bfill)
# NOTE(review): the rounded integers are written back into the shared "value"
# column; the printed previews show Incidents as e.g. 14606.0, so the column
# appears to remain float — confirm if integer output is required.
quarterly_save = quarterly_all.copy()
mask_q_inc = quarterly_save["metric"] == "Incidents"
quarterly_save.loc[mask_q_inc, "value"] = np.round(quarterly_save.loc[mask_q_inc, "value"]).astype(int)

# Pivot to wide and save: one row per SA2 x quarter, one column per metric
q_index = ["SA2_CODE_2021","Year","Quarter","QuarterEnd"]
if name_in:
    q_index = ["SA2_CODE_2021","SA2_NAME_2021","Year","Quarter","QuarterEnd"]

quarterly_wide = (
    quarterly_save
    .pivot_table(index=q_index, columns="metric", values="value", aggfunc="first")
    .reset_index()
    .sort_values(["SA2_CODE_2021","Year","Quarter"])
)

# Identifiers and time columns first, then the metrics; any other column goes last
ordered_cols_q = [c for c in ["SA2_CODE_2021","SA2_NAME_2021","Year","Quarter","QuarterEnd","Incidents","CrimeRate","VictimRate"] if c in quarterly_wide.columns]
quarterly_wide = quarterly_wide[ordered_cols_q + [c for c in quarterly_wide.columns if c not in ordered_cols_q]]

quarterly_wide.to_csv(QUARTERLY_OUT, index=False)
print(f"[OK] Quarterly panel saved -> {QUARTERLY_OUT}")

# %% --------------------------------- QUICK SUMMARY ------------------------------------
# Print the first rows of both wide tables as plain text for a quick visual check
print("\nSAMPLES (Yearly):")
print(yearly_wide.head(8).to_string(index=False))
print("\nSAMPLES (Quarterly):")
print(quarterly_wide.head(12).to_string(index=False))


[INFO] Long shape: (15660, 5) | Metrics: ['CrimeRate', 'Incidents', 'VictimRate'] | Years: 2016–2025
[OK] Yearly forecasts saved -> /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_predictions_yearly_to_2030.csv
[OK] Quarterly panel saved -> /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_predictions_quarterly_to_2030.csv

SAMPLES (Yearly):
 SA2_CODE_2021 SA2_NAME_2021  Year  Incidents   CrimeRate  VictimRate
     201011001     Alfredton  2016    14606.0 8723.671498 5740.096618
     201011001     Alfredton  2017    14907.0 8730.376785 5300.755085
     201011001     Alfredton  2018    14394.0 8268.901940 4992.193889
     201011001     Alfredton  2019    13148.0 7404.370838 4344.419848
     201011001     Alfredton  2020    14095.0 7804.200815 4677.325451
     2

In [45]:
# %% --------- Make YEARLY and QUARTERLY outputs "wide matrix" (one col per time slice) ---------
import os
import pandas as pd

# Same folder layout as the forecasting cell: everything sits next to IN_PATH
IN_PATH = "/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_dataset_weighted_to_SA2.csv"
OUT_DIR = os.path.dirname(IN_PATH)

# Forecast files produced earlier (read back only if the frames are not in memory)
YEARLY_IN, QUARTERLY_IN = (
    os.path.join(OUT_DIR, file_name)
    for file_name in ("crime_predictions_yearly_to_2030.csv",
                      "crime_predictions_quarterly_to_2030.csv")
)

# Wide-matrix files written by this cell
YEARLY_WIDE_MATRIX_OUT, QUARTERLY_WIDE_MATRIX_OUT = (
    os.path.join(OUT_DIR, file_name)
    for file_name in ("crime_predictions_yearly_to_2030_WIDE_MATRIX.csv",
                      "crime_predictions_quarterly_to_2030_WIDE_MATRIX.csv")
)

# --- Load sources (from memory if present, else from disk) ---
def _ensure_yearly_df():
    """Return the yearly forecast table.

    A copy of the in-memory ``yearly_wide`` frame is used when that name exists
    in the notebook namespace; otherwise the table is read from ``YEARLY_IN``.
    """
    if "yearly_wide" in globals():
        return yearly_wide.copy()
    return pd.read_csv(YEARLY_IN)

def _ensure_quarterly_df():
    """Return the quarterly forecast table.

    A copy of the in-memory ``quarterly_wide`` frame is used when that name exists
    in the notebook namespace; otherwise the table is read from ``QUARTERLY_IN``.
    """
    if "quarterly_wide" in globals():
        return quarterly_wide.copy()
    return pd.read_csv(QUARTERLY_IN)

y_src = _ensure_yearly_df()
q_src = _ensure_quarterly_df()

# Identifier columns actually present in each source
id_cols_year = [name for name in ("SA2_CODE_2021","SA2_NAME_2021") if name in y_src.columns]
id_cols_quart = [name for name in ("SA2_CODE_2021","SA2_NAME_2021") if name in q_src.columns]

# ------------------ YEARLY -> WIDE MATRIX: metric_year columns ------------------
# A source with one column per metric is melted back to long form first;
# a source without metric columns is assumed to be long already (id, Year, metric, value).
y_metrics = [name for name in ["Incidents","CrimeRate","VictimRate"] if name in y_src.columns]
if not y_metrics:
    y_long = y_src.rename(columns={"value":"value", "metric":"metric"})
else:
    y_long = y_src.melt(id_vars=id_cols_year+["Year"], value_vars=y_metrics,
                        var_name="metric", value_name="value")
    y_long = y_long.dropna(subset=["value"])

# Target column label, e.g. "Incidents_2027"
y_long = y_long.assign(colname=y_long["metric"] + "_" + y_long["Year"].astype(int).astype(str))

yearly_matrix = y_long.pivot_table(
    index=id_cols_year, columns="colname", values="value", aggfunc="first"
).reset_index()

# Inputs for the column ordering: identifiers stay first, metrics follow this order
fixed = id_cols_year
metric_order = ["Incidents","CrimeRate","VictimRate"]
time_cols = [label for label in yearly_matrix.columns if label not in fixed]
def _metric_key(c):
    """Sort key for a ``<metric>_<year>`` label: (position in ``metric_order``, year).

    Labels that start with none of the known metrics sort last as ``(99, 9999)``.
    """
    rank = next((i for i, m in enumerate(metric_order) if c.startswith(m+"_")), None)
    if rank is None:
        return (99, 9999)
    return (rank, int(c.split("_")[1]))
# Identifiers first, then metric columns in metric order and chronological order
time_cols_sorted = list(time_cols)
time_cols_sorted.sort(key=_metric_key)
yearly_matrix = yearly_matrix.loc[:, fixed + time_cols_sorted]

yearly_matrix.to_csv(YEARLY_WIDE_MATRIX_OUT, index=False)
print(f"[OK] Yearly WIDE matrix -> {YEARLY_WIDE_MATRIX_OUT}")

# ---------------- QUARTERLY -> WIDE MATRIX: metric_YYYYQn columns ----------------
# Same approach as for the yearly table: melt a per-metric layout, else assume long
q_metrics = [name for name in ["Incidents","CrimeRate","VictimRate"] if name in q_src.columns]
if not q_metrics:
    q_long = q_src.rename(columns={"value":"value", "metric":"metric"})
else:
    q_long = q_src.melt(id_vars=id_cols_quart+["Year","Quarter","QuarterEnd"],
                        value_vars=q_metrics, var_name="metric", value_name="value")
    q_long = q_long.dropna(subset=["value"])

# Quarter is reduced to the characters 'Q' and digits, Period looks like 2027Q3,
# and the final column label like Incidents_2027Q3
q_long = (
    q_long
    .assign(Quarter=lambda d: d["Quarter"].astype(str).str.replace(r"[^Q\d]", "", regex=True))
    .assign(Period=lambda d: d["Year"].astype(int).astype(str) + d["Quarter"])
    .assign(colname=lambda d: d["metric"] + "_" + d["Period"])
)

quarterly_matrix = q_long.pivot_table(
    index=id_cols_quart, columns="colname", values="value", aggfunc="first"
).reset_index()

# Inputs for the quarterly column ordering (metric order, then year, then Q1..Q4)
fixed_q = id_cols_quart
q_time_cols = [label for label in quarterly_matrix.columns if label not in fixed_q]

def _q_key(c):
    # c like "Incidents_2028Q3"
    try:
        m, rest = c.split("_", 1)
        year = int(rest[:-2])  # drop 'Qx'
        qn = int(rest[-1:])
        mpos = metric_order.index(m) if m in metric_order else 99
        return (mpos, year, qn)
    except Exception:
        return (99, 9999, 9)

# Identifiers first, then the quarterly columns in metric / year / quarter order
q_time_cols_sorted = list(q_time_cols)
q_time_cols_sorted.sort(key=_q_key)
quarterly_matrix = quarterly_matrix.loc[:, fixed_q + q_time_cols_sorted]

quarterly_matrix.to_csv(QUARTERLY_WIDE_MATRIX_OUT, index=False)
print(f"[OK] Quarterly WIDE matrix -> {QUARTERLY_WIDE_MATRIX_OUT}")

# Quick peek at the first 15 column labels of each matrix
yearly_sample = yearly_matrix.columns[:15].tolist()
quarterly_sample = quarterly_matrix.columns[:15].tolist()
print("\nYearly wide columns (sample):", yearly_sample, "...")
print("Quarterly wide columns (sample):", quarterly_sample, "...")


[OK] Yearly WIDE matrix -> /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_predictions_yearly_to_2030_WIDE_MATRIX.csv
[OK] Quarterly WIDE matrix -> /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_predictions_quarterly_to_2030_WIDE_MATRIX.csv

Yearly wide columns (sample): ['SA2_CODE_2021', 'SA2_NAME_2021', 'Incidents_2016', 'Incidents_2017', 'Incidents_2018', 'Incidents_2019', 'Incidents_2020', 'Incidents_2021', 'Incidents_2022', 'Incidents_2023', 'Incidents_2024', 'Incidents_2025', 'Incidents_2026', 'Incidents_2027', 'Incidents_2028'] ...
Quarterly wide columns (sample): ['SA2_CODE_2021', 'SA2_NAME_2021', 'Incidents_2016Q1', 'Incidents_2016Q2', 'Incidents_2016Q3', 'Incidents_2016Q4', 'Incidents_2017Q1', 'Incidents_2017Q2', 'Incidents_2017Q3', 'Incidents_201

In [47]:
import pandas as pd
import os

# --- Path setup ---
IN_PATH = "/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_predictions_quarterly_to_2030.csv"
OUT_PATH = IN_PATH.replace(".csv", "_datetime.csv")

# --- Load dataset ---
df = pd.read_csv(IN_PATH)

# Check that required columns exist
required_cols = ["Year", "Quarter"]
for col in required_cols:
    if col not in df.columns:
        raise ValueError(f"Missing column: {col}. Found: {df.columns.tolist()}")

# --- Normalize Quarter column ---
df["Quarter"] = df["Quarter"].astype(str).str.strip().str.upper()

# --- Map quarters to ending months (Mar, Jun, Sep, Dec) ---
quarter_to_month = {"Q1": 3, "Q2": 6, "Q3": 9, "Q4": 12}
quarter_month = df["Quarter"].map(quarter_to_month)

# Fail early with a clear message: an unmapped label would otherwise turn into
# the text "nan" and surface later as an obscure date-parsing error.
unknown_quarters = sorted(df.loc[quarter_month.isna(), "Quarter"].unique())
if unknown_quarters:
    raise ValueError(f"Unrecognised Quarter labels: {unknown_quarters}. "
                     f"Expected one of: {sorted(quarter_to_month)}")

# --- Create QuarterDate = first day of ending month ---
# Assembled from numeric year/month/day components, so no string parsing or
# date-format inference is involved.
df["QuarterDate"] = pd.to_datetime(pd.DataFrame({
    "year": df["Year"].astype(int),
    "month": quarter_month.astype(int),
    "day": 1,
}))

# --- Drop old QuarterEnd column if present ---
if "QuarterEnd" in df.columns:
    df = df.drop(columns=["QuarterEnd"])

# --- Reorder columns neatly (known columns first, anything else afterwards) ---
cols_order = ["SA2_CODE_2021", "SA2_NAME_2021", "Year", "Quarter", "QuarterDate",
              "Incidents", "CrimeRate", "VictimRate"]
df = df[[c for c in cols_order if c in df.columns] + [c for c in df.columns if c not in cols_order]]

# --- Save updated file ---
df.to_csv(OUT_PATH, index=False)
print(f"[OK] File saved with QuarterDate (first day of ending month):\n{OUT_PATH}")

# --- Preview few rows ---
print("\nPreview:")
print(df.head(8).to_string(index=False))


[OK] File saved with QuarterDate (first day of ending month):
/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_predictions_quarterly_to_2030_datetime.csv

Preview:
 SA2_CODE_2021 SA2_NAME_2021  Year Quarter QuarterDate  Incidents   CrimeRate  VictimRate
     201011001     Alfredton  2016      Q1  2016-03-01    14606.0 8723.671498 5740.096618
     201011001     Alfredton  2016      Q2  2016-06-01    14606.0 8723.671498 5740.096618
     201011001     Alfredton  2016      Q3  2016-09-01    14606.0 8723.671498 5740.096618
     201011001     Alfredton  2016      Q4  2016-12-01    14606.0 8723.671498 5740.096618
     201011001     Alfredton  2017      Q1  2017-03-01    14680.0 8725.324856 5631.765829
     201011001     Alfredton  2017      Q2  2017-06-01    14755.0 8726.996585 5522.231365
     201011001     Alfredton  2017      Q3  2017-09-01    14831.0 8728.686685 5411.493225
 

## Joining the crime dataset with prediction_df ##

In [48]:
import pandas as pd

# --- Paths ---
crime_path = "/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/crime_predictions_quarterly_to_2030_datetime.csv"
pred_path  = "/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/predicition-data/prediction_df.csv"
# NOTE(review): merged_out is never used below; the merged file is written to out_path instead
merged_out = "/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/predicition-data/prediction_with_crime.csv"

# --- Load datasets ---
crime_df = pd.read_csv(crime_path)
pred_df  = pd.read_csv(pred_path)

# --- Standardize SA2 column names ---
# A frame lacking SA2_CODE_2021 gets its first column whose name contains both
# "SA2" and "CODE" renamed to it (in place); frames with no such column are left alone.
for frame in (crime_df, pred_df):
    if "SA2_CODE_2021" in frame.columns:
        continue
    candidate = next((c for c in frame.columns if "SA2" in c and "CODE" in c), None)
    if candidate is not None:
        frame.rename(columns={candidate: "SA2_CODE_2021"}, inplace=True)

# --- Convert to datetime ---
crime_df["QuarterDate"] = pd.to_datetime(crime_df["QuarterDate"])
pred_df["date"] = pd.to_datetime(pred_df["date"])

# --- Optional: sort for clarity ---
crime_df = crime_df.sort_values(by=["SA2_CODE_2021","QuarterDate"])
pred_df = pred_df.sort_values(by=["SA2_CODE_2021","date"])

# --- Merge on SA2 code and matching date ---
# Left join: every prediction row is kept; the crime columns are filled where the
# SA2 code and date match. The crime-side date column is dropped afterwards so
# that only one datetime column remains.
merged = pd.merge(
    pred_df,
    crime_df,
    how="left",
    left_on=["SA2_CODE_2021","date"],
    right_on=["SA2_CODE_2021","QuarterDate"],
).drop(columns=["QuarterDate"])

# --- Save merged dataset ---
out_path = "/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/prediction_with_crime.csv"
merged.to_csv(out_path, index=False)

print(f"[OK] Merged file saved to:\n{out_path}")
print("\nPreview:")
print(merged.head(10).to_string(index=False))


[OK] Merged file saved to:
/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/prediction_with_crime.csv

Preview:
       Lat        Lng  SA2_CODE_2021 SA2_NAME21   Suburb       date          ERP  Predicted_Income  t SA2_NAME_2021  Year Quarter  Incidents   CrimeRate  VictimRate
-37.556144 143.836655      201011002   Ballarat Ballarat 2025-06-01 11720.193827      59796.465049 33      Ballarat  2025      Q2    15728.0 7938.753559 4279.657264
-37.556144 143.836655      201011002   Ballarat Ballarat 2025-06-01 11720.193827      59796.465049 33      Ballarat  2025      Q2    15728.0 7938.753559 4279.657264
-37.556144 143.836655      201011002   Ballarat Ballarat 2025-06-01 11720.193827      59796.465049 33      Ballarat  2025      Q2    15728.0 7938.753559 4279.657264
-37.556144 143.836655      201011002   Ballarat Ballarat 2025-06-01 11720.193827      59796.465049 33      Ballarat  20

In [49]:
import pandas as pd

# --- Path to your merged dataset ---
IN_PATH = "/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/prediction_with_crime.csv"
OUT_PATH = IN_PATH.replace(".csv", "_deduped.csv")

# --- Load, then clean in one pass ---
#   1. keep only the first column for any repeated column name
#   2. keep only the first of any fully identical rows
#   3. renumber the rows from 0
df = (
    pd.read_csv(IN_PATH)
    .pipe(lambda frame: frame.loc[:, ~frame.columns.duplicated(keep="first")])
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)

# --- Save cleaned dataset ---
df.to_csv(OUT_PATH, index=False)
print(f"[OK] Duplicates removed and saved -> {OUT_PATH}")

# --- Preview summary ---
print(f"Rows: {len(df):,} | Columns: {len(df.columns)}")
print("\nSample:")
print(df.head(10).to_string(index=False))


[OK] Duplicates removed and saved -> /Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/3. crime/prediction_with_crime_deduped.csv
Rows: 3,197 | Columns: 15

Sample:
       Lat        Lng  SA2_CODE_2021 SA2_NAME21   Suburb       date          ERP  Predicted_Income  t SA2_NAME_2021  Year Quarter  Incidents   CrimeRate  VictimRate
-37.556144 143.836655      201011002   Ballarat Ballarat 2025-06-01 11720.193827      59796.465049 33      Ballarat  2025      Q2    15728.0 7938.753559 4279.657264
-37.556144 143.836655      201011002   Ballarat Ballarat 2025-09-01 11710.778773      60156.514521 34      Ballarat  2025      Q3    16075.0 8068.863846 4262.661917
-37.556144 143.836655      201011002   Ballarat Ballarat 2025-12-01 11701.466056      60512.650411 35      Ballarat  2025      Q4    16422.0 8198.974132 4245.666569
-37.556144 143.836655      201011002   Ballarat Ballarat 2026-03-01 11701.69