In [3]:
# arima
import os
from pathlib import Path

import pandas as pd

# Location of the modelling table. Set the MERGED_DF_PATH environment variable to run
# this notebook on another machine; when it is unset the original author's local path
# is used, so existing behaviour is unchanged.
MERGED_DF_PATH = Path(os.environ.get(
    "MERGED_DF_PATH",
    "/Users/rajaa/Desktop/Applied Data Science MAST30034/project-2-group-real-estate-industry-project-7-2025/Raja-workspace/notebooks/landing/raw/cleaned/10. data for modeling suburb growth 2/merged_df.csv",
))
merged_df = pd.read_csv(MERGED_DF_PATH)

In [4]:
# =========================================================
# UCM (local linear trend + seasonal + exog) — ONLY
# =========================================================

import numpy as np
import pandas as pd
import warnings

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import statsmodels.api as sm
from statsmodels.tsa.statespace.structural import UnobservedComponents
from statsmodels.tools.sm_exceptions import ConvergenceWarning, ValueWarning

# Suppress statsmodels' ConvergenceWarning and ValueWarning process-wide from here on
# (presumably to keep the per-suburb model fits below from flooding the cell output).
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=ValueWarning)

# -------------------------
# Helper: robust MAE/RMSE/R2
# -------------------------
def _metrics(y_true, y_pred):
    """
    Compute MAE, RMSE and R^2 between two equally long sequences, pairing by position.

    Parameters
    ----------
    y_true, y_pred : array-like or pandas.Series
        Observed and predicted values. Entries that cannot be parsed as numbers
        are treated as missing, and a position is used only when both values
        are present.

    Returns
    -------
    tuple of (mae, rmse, r2)
        All three are NaN when no position has both values present; r2 is NaN
        when only a single usable pair remains.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    # Drop any incoming index so the i-th actual is always compared with the i-th
    # prediction. Without this, a date-indexed Series and a plain array are
    # label-aligned by pandas when the NaN mask is built, and the mask no longer
    # lines up with either input.
    y_true = pd.to_numeric(pd.Series(y_true).reset_index(drop=True), errors="coerce")
    y_pred = pd.to_numeric(pd.Series(y_pred).reset_index(drop=True), errors="coerce")
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, got {len(y_true)} and {len(y_pred)}"
        )

    m = y_true.notna() & y_pred.notna()
    if not m.any():
        return np.nan, np.nan, np.nan
    y_true = y_true[m]
    y_pred = y_pred[m]
    mae  = mean_absolute_error(y_true, y_pred)
    mse  = mean_squared_error(y_true, y_pred)
    rmse = float(np.sqrt(mse))  # RMSE derived from MSE explicitly
    r2   = r2_score(y_true, y_pred) if len(y_true) > 1 else np.nan
    return mae, rmse, r2

# -------------------------
# Quarterly panel builder
# -------------------------
def to_quarterly_panel(df, exog_cols=("ERP_quarterly","Income_quarterly_med")):
    """
    Put every suburb on a regular 3-month ("3MS") date grid and fill the gaps.

    For each suburb the rows are re-indexed from its first to its last date in
    3-month steps. "Median" and the columns in `exog_cols` are forward-filled and
    then back-filled; "Suburb" is set on every row, including the inserted ones.
    Any other column is left as NaN on inserted rows.

    NOTE(review): assumes the input dates already fall on that 3-month grid;
    rows with other dates are not matched by `asfreq` — confirm against the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Suburb", "date", "Median" and every column in `exog_cols`.
        The input frame is not modified.
    exog_cols : sequence of str
        Exogenous columns to fill alongside "Median".

    Returns
    -------
    pandas.DataFrame
        One row per suburb and grid date, sorted by Suburb then date.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    exog_cols = list(exog_cols)
    need = ["Suburb", "date", "Median"] + exog_cols
    missing = [c for c in need if c not in df.columns]
    if missing:
        raise KeyError(f"merged_df is missing columns: {missing}")

    out = df.copy()
    out["date"] = pd.to_datetime(out["date"], errors="coerce")
    # Rows whose date could not be parsed cannot be placed on the grid; several
    # NaT entries in one suburb would also make the re-index below fail.
    out = out.dropna(subset=["date"])
    out = out.sort_values(["Suburb","date"]).reset_index(drop=True)

    fill_cols = ["Median"] + exog_cols
    panels = []
    # Explicit loop instead of groupby.apply: apply over the grouping column is
    # deprecated, and the rows inserted by asfreq must get their suburb label back
    # or a later groupby("Suburb") silently discards them.
    for suburb, g in out.groupby("Suburb", sort=True):
        g = g.set_index("date").asfreq("3MS")
        g[fill_cols] = g[fill_cols].ffill().bfill()
        g["Suburb"] = suburb
        panels.append(g.reset_index())

    if not panels:
        return out.iloc[0:0].copy()

    return (pd.concat(panels, ignore_index=True)
              .sort_values(["Suburb","date"])
              .reset_index(drop=True))

# -------------------------
# UCM per suburb (with exog)
# -------------------------
def evaluate_ucm_per_suburb(merged_df,
                            split_date="2024-09-01",
                            exog_cols=("ERP_quarterly","Income_quarterly_med"),
                            seasonal_periods=4,
                            min_train=8,
                            min_test=2):
    """
    Fit one Unobserved Components model per suburb and score it on a hold-out window.

    Each model uses level='local linear trend', seasonal=seasonal_periods and the
    columns in `exog_cols` as exogenous regressors. Rows dated before `split_date`
    are used for fitting; rows on or after it are forecast and compared with the
    actual "Median" values.

    Suburbs with fewer than `min_train` training rows or fewer than `min_test`
    test rows are skipped. If fitting or forecasting raises, the suburb is still
    reported with NaN scores and the exception text under "Error".

    Returns
    -------
    (metrics_df, preds_df)
        metrics_df holds one row per evaluated suburb; preds_df is long-format
        with one row per suburb and test date (an empty DataFrame when no
        forecast succeeded).
    """
    panel = to_quarterly_panel(merged_df, exog_cols)
    cutoff = pd.Timestamp(split_date)
    exog_names = list(exog_cols)
    model_label = f"UCM_LLtrend+Season{seasonal_periods}+exog"

    metric_rows = []
    pred_frames = []
    for suburb, grp in panel.groupby("Suburb"):
        indexed = grp[["date", "Median"] + exog_names].set_index("date")
        y = indexed["Median"]
        X = indexed[exog_names]

        # Chronological split: strictly before the cutoff is train, the rest is test.
        y_tr = y[y.index < cutoff]
        y_te = y[y.index >= cutoff]
        X_tr = X.loc[y_tr.index]
        X_te = X.loc[y_te.index]

        if len(y_tr) < min_train or len(y_te) < min_test:
            continue

        common = {"Suburb": suburb, "Model": model_label}
        sizes = {"Train_n": len(y_tr), "Test_n": len(y_te)}
        try:
            fitted = UnobservedComponents(
                endog=y_tr,
                level="local linear trend",
                seasonal=seasonal_periods,
                exog=X_tr,
            ).fit(disp=False)
            forecast = fitted.get_forecast(steps=len(y_te), exog=X_te)
            yhat = forecast.predicted_mean

            mae, rmse, r2 = _metrics(y_te, yhat)
            metric_rows.append({
                **common,
                "MAE_Test": mae, "RMSE_Test": rmse, "R2_Test": r2,
                **sizes,
            })
            pred_frames.append(pd.DataFrame({
                "Suburb": suburb,
                "date": y_te.index,
                "Actual": y_te.values,
                "Predicted": yhat.values,
                "Model": "UCM",
            }))
        except Exception as e:
            # Keep the suburb in the report so failures stay visible.
            metric_rows.append({
                **common,
                "MAE_Test": np.nan, "RMSE_Test": np.nan, "R2_Test": np.nan,
                **sizes,
                "Error": str(e),
            })

    metrics_df = pd.DataFrame(metric_rows)
    if pred_frames:
        preds_df = pd.concat(pred_frames, ignore_index=True)
    else:
        preds_df = pd.DataFrame()
    return metrics_df, preds_df

# -------------------------
# Run UCM only
# -------------------------
SPLIT = "2024-09-01"
EXOGS = ("ERP_quarterly", "Income_quarterly_med")

# `merged_df` must already have been loaded by an earlier cell.
ucm_metrics, ucm_preds = evaluate_ucm_per_suburb(
    merged_df, split_date=SPLIT, exog_cols=EXOGS,
    seasonal_periods=4,   # quarterly seasonality
    min_train=8, min_test=2,
)

print("UCM rows:", len(ucm_metrics))
try:
    # Rich tables when `display` is available (IPython / Jupyter).
    display(ucm_metrics.head())
    display(ucm_preds.head())
except NameError:
    # No `display` in scope: fall back to plain-text tables.
    print(
        ucm_metrics.head().to_string(index=False),
        ucm_preds.head().to_string(index=False),
        sep="\n",
    )


  .apply(_qfill)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend 

UCM rows: 146


  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)
  trend = spsolve(I+lamb*K.T.dot(K), x, use_umfpack=use_umfpack)


Unnamed: 0,Suburb,Model,MAE_Test,RMSE_Test,R2_Test,Train_n,Test_n
0,Albert Park-Middle Park-West St Kilda,UCM_LLtrend+Season4+exog,42.869214,48.864122,-218.278789,30,3
1,Altona,UCM_LLtrend+Season4+exog,20.28303,25.297622,0.0,30,3
2,Armadale,UCM_LLtrend+Season4+exog,20.233757,22.243057,-2.562226,30,3
3,Aspendale-Chelsea-Carrum,UCM_LLtrend+Season4+exog,33.90651,38.083926,0.0,30,3
4,Bairnsdale,UCM_LLtrend+Season4+exog,15.206298,16.094883,-1.914259,30,3


Unnamed: 0,Suburb,date,Actual,Predicted,Model
0,Albert Park-Middle Park-West St Kilda,2024-09-01,693,708.350087,UCM
1,Albert Park-Middle Park-West St Kilda,2024-12-01,700,740.599971,UCM
2,Albert Park-Middle Park-West St Kilda,2025-03-01,700,772.657582,UCM
3,Altona,2024-09-01,550,551.807454,UCM
4,Altona,2024-12-01,550,570.201893,UCM


In [5]:
# =========================================================
# ARIMA-new (auto-tuned ARMA(p,0,q) with exog, log, interaction) — ONLY
# =========================================================

import numpy as np
import pandas as pd
import warnings
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tools.sm_exceptions import ConvergenceWarning, ValueWarning

# Suppress statsmodels' ConvergenceWarning and ValueWarning process-wide from here on
# (presumably to keep the per-suburb ARMA grid search below from flooding the output).
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=ValueWarning)

# ---------------------------------------------------------
# Robust metrics (version-agnostic RMSE)
# ---------------------------------------------------------
def _metrics(y_true, y_pred):
    """
    Compute MAE, RMSE and R^2 between two equally long sequences, pairing by position.

    Parameters
    ----------
    y_true, y_pred : array-like or pandas.Series
        Observed and predicted values. Entries that cannot be parsed as numbers
        are treated as missing, and a position is used only when both values
        are present.

    Returns
    -------
    tuple of (mae, rmse, r2)
        All three are NaN when no position has both values present; r2 is NaN
        when only a single usable pair remains.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    # Callers in this cell pass a date-indexed Series of actuals together with a
    # plain NumPy array of predictions. Resetting both to a positional index keeps
    # the NaN mask aligned with each input instead of relying on pandas label
    # alignment between unrelated indexes.
    y_true = pd.to_numeric(pd.Series(y_true).reset_index(drop=True), errors="coerce")
    y_pred = pd.to_numeric(pd.Series(y_pred).reset_index(drop=True), errors="coerce")
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, got {len(y_true)} and {len(y_pred)}"
        )

    m = y_true.notna() & y_pred.notna()
    if not m.any():
        return np.nan, np.nan, np.nan
    y_true = y_true[m]
    y_pred = y_pred[m]
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = float(np.sqrt(mse))  # version-agnostic: RMSE derived from MSE explicitly
    r2 = r2_score(y_true, y_pred) if len(y_true) > 1 else np.nan
    return mae, rmse, r2

# ---------------------------------------------------------
# Step 0 — Quarterly panel builder with exog + interactions
# ---------------------------------------------------------
def to_quarterly_panel(df, exog_cols=("ERP_quarterly","Income_quarterly_med")):
    """
    Put every suburb on a regular 3-month ("3MS") grid, fill gaps, add an interaction.

    For each suburb the rows are re-indexed from its first to its last date in
    3-month steps. "Median" and the columns in `exog_cols` are forward-filled and
    then back-filled; "Suburb" is set on every row, including the inserted ones.
    Any other column is left as NaN on inserted rows.

    An "ERPxIncome" column is always added after filling:
      - ERP_quarterly * Income_quarterly_med when both are in `exog_cols`;
      - otherwise the product of the first two entries of `exog_cols`;
      - NaN when fewer than two exogenous columns are given.

    NOTE(review): assumes the input dates already fall on that 3-month grid;
    rows with other dates are not matched by `asfreq` — confirm against the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Suburb", "date", "Median" and every column in `exog_cols`.
        The input frame is not modified.
    exog_cols : sequence of str
        Exogenous columns to fill alongside "Median".

    Returns
    -------
    pandas.DataFrame
        One row per suburb and grid date, sorted by Suburb then date.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    exog_cols = list(exog_cols)
    need = ["Suburb", "date", "Median"] + exog_cols
    missing = [c for c in need if c not in df.columns]
    if missing:
        raise KeyError(f"merged_df is missing columns: {missing}")

    # Decide once which two columns form the interaction term.
    if {"ERP_quarterly", "Income_quarterly_med"}.issubset(exog_cols):
        pair = ("ERP_quarterly", "Income_quarterly_med")
    elif len(exog_cols) >= 2:
        pair = (exog_cols[0], exog_cols[1])
    else:
        pair = None

    out = df.copy()
    out["date"] = pd.to_datetime(out["date"], errors="coerce")
    # Rows whose date could not be parsed cannot be placed on the grid; several
    # NaT entries in one suburb would also make the re-index below fail.
    out = out.dropna(subset=["date"])
    out = out.sort_values(["Suburb","date"]).reset_index(drop=True)

    fill_cols = ["Median"] + exog_cols
    panels = []
    # Explicit loop instead of groupby.apply: apply over the grouping column is
    # deprecated, and the rows inserted by asfreq must get their suburb label back
    # or a later groupby("Suburb") silently discards them.
    for suburb, g in out.groupby("Suburb", sort=True):
        g = g.set_index("date").asfreq("3MS")
        g[fill_cols] = g[fill_cols].ffill().bfill()
        g["Suburb"] = suburb
        g["ERPxIncome"] = g[pair[0]] * g[pair[1]] if pair is not None else np.nan
        panels.append(g.reset_index())

    if not panels:
        empty = out.iloc[0:0].copy()
        empty["ERPxIncome"] = np.nan
        return empty

    return (pd.concat(panels, ignore_index=True)
              .sort_values(["Suburb","date"])
              .reset_index(drop=True))

# ---------------------------------------------------------
# Step 1 — ARMA-new helpers: grid search ARMA(p,0,q) via ARIMA(d=0)
# ---------------------------------------------------------
def arma_new_best_for_suburb(y_tr, X_tr, p_grid=(0,1,2,3), q_grid=(0,1,2,3)):
    """
    Grid-search ARMA(p, q) orders (ARIMA with d=0) and keep the lowest-AIC fit.

    Parameters
    ----------
    y_tr : training target passed to ARIMA as `endog`.
    X_tr : training regressors passed to ARIMA as `exog`.
    p_grid, q_grid : iterables of int
        AR and MA orders to try; every (p, q) combination is fitted.

    Returns
    -------
    dict with keys
        "aic"    : lowest finite AIC found (np.inf if no fit succeeded),
        "order"  : the matching (p, 0, q) tuple, or None,
        "res"    : the fitted results object, or None,
        "errors" : list of ((p, 0, q), message) for every order whose fit raised,
                   so callers can see why a suburb ended up without a model.
    """
    best = {"aic": np.inf, "order": None, "res": None, "errors": []}
    for p in p_grid:
        for q in q_grid:
            order = (p, 0, q)  # d=0 => ARMA
            try:
                res = ARIMA(endog=y_tr, exog=X_tr, order=order).fit()
                if np.isfinite(res.aic) and res.aic < best["aic"]:
                    best.update({"aic": res.aic, "order": order, "res": res})
            except Exception as exc:
                # Best-effort search: one failing order must not abort the grid,
                # but the reason is kept instead of being discarded.
                best["errors"].append((order, f"{type(exc).__name__}: {exc}"))
                continue
    return best

# ---------------------------------------------------------
# Step 2 — Evaluate ARIMA-new per suburb
# ---------------------------------------------------------
def evaluate_arma_new(
    merged_df,
    split_date="2024-09-01",
    exog_cols=("ERP_quarterly","Income_quarterly_med","ERPxIncome"),
    p_grid=(0,1,2,3), q_grid=(0,1,2,3),
    min_train=12, min_test=2,
    use_log=True
):
    """
    Fit and score an AIC-selected ARMA(p, q) model with regressors for every suburb.

    Steps per suburb:
      - build the quarterly panel (the "ERPxIncome" entry of `exog_cols` is
        produced by the panel builder, not read from `merged_df`);
      - split at `split_date` (train: dates before it, test: dates on or after it);
      - optionally fit on log(Median), with Median clipped below at 1.0;
      - pick the (p, q) order with the lowest training AIC;
      - forecast the test window and back-transform with exp when `use_log`.

    Suburbs with fewer than `min_train` training rows or `min_test` test rows are
    skipped. A suburb for which no order could be fitted, or whose forecast
    raises, is reported with NaN scores, AIC = inf and the message under "Error".

    Returns
    -------
    (metrics_df, preds_df)
        metrics_df holds one row per evaluated suburb (with "AIC" and "Order");
        preds_df is long-format with one row per suburb and test date (an empty
        DataFrame when no forecast succeeded).
    """
    exog_cols = list(exog_cols)
    base_exogs = tuple(c for c in exog_cols if c != "ERPxIncome")
    df = to_quarterly_panel(merged_df, exog_cols=base_exogs)
    # Defensive: only needed if the panel builder in scope did not add the interaction.
    if "ERPxIncome" not in df.columns and set(exog_cols).issuperset({"ERP_quarterly","Income_quarterly_med"}):
        df["ERPxIncome"] = df["ERP_quarterly"] * df["Income_quarterly_med"]

    split_date = pd.Timestamp(split_date)
    metrics, preds = [], []

    for suburb, g in df.groupby("Suburb"):
        g = g[["date","Median"] + exog_cols].copy()
        g = g.set_index("date").sort_index().ffill().bfill()

        y = g["Median"]
        X = g[exog_cols]

        y_tr, y_te = y[y.index < split_date], y[y.index >= split_date]
        X_tr, X_te = X.loc[y_tr.index], X.loc[y_te.index]

        if len(y_tr) < min_train or len(y_te) < min_test:
            continue

        try:
            y_tr_fit = np.log(y_tr.clip(lower=1.0)) if use_log else y_tr

            best = arma_new_best_for_suburb(y_tr_fit, X_tr, p_grid=p_grid, q_grid=q_grid)
            if best["res"] is None:
                raise RuntimeError("No converged ARMA-new model for this suburb.")

            fc = best["res"].forecast(steps=len(y_te), exog=X_te)  # ARIMA.forecast accepts exog
            # Take the forecast by position. Passing the forecast Series itself to
            # pd.Series(..., index=...) re-indexes it by label, which yields all-NaN
            # predictions whenever the forecast index differs from the test dates.
            yhat_tr = np.asarray(fc, dtype=float)

            yhat = np.exp(yhat_tr) if use_log else yhat_tr
            # Plain arrays on both sides so the comparison is strictly positional.
            mae, rmse, r2 = _metrics(y_te.to_numpy(), yhat)

            metrics.append({
                "Suburb": suburb,
                "Model": "ARMA-new",
                "MAE_Test": mae, "RMSE_Test": rmse, "R2_Test": r2,
                "Train_n": len(y_tr), "Test_n": len(y_te),
                "AIC": best["aic"],
                "Order": best["order"]
            })
            preds.append(pd.DataFrame({
                "Suburb": suburb, "date": y_te.index, "Actual": y_te.values,
                "Predicted": yhat, "Model": "ARMA-new"
            }))
        except Exception as e:
            # Keep the suburb in the report so failures stay visible.
            metrics.append({
                "Suburb": suburb,
                "Model": "ARMA-new",
                "MAE_Test": np.nan, "RMSE_Test": np.nan, "R2_Test": np.nan,
                "Train_n": len(y_tr), "Test_n": len(y_te),
                "AIC": np.inf, "Error": str(e)
            })

    metrics_df = pd.DataFrame(metrics)
    preds_df = pd.concat(preds, ignore_index=True) if preds else pd.DataFrame()
    return metrics_df, preds_df

# ---------------------------------------------------------
# Step 3 — Run ARIMA-new only
# ---------------------------------------------------------
SPLIT = "2024-09-01"
EXOGS = ("ERP_quarterly","Income_quarterly_med","ERPxIncome")

# `merged_df` must already have been loaded by an earlier cell.
arma_metrics, arma_preds = evaluate_arma_new(
    merged_df,
    split_date=SPLIT,
    exog_cols=EXOGS,
    p_grid=(0,1,2,3), q_grid=(0,1,2,3),
    min_train=12, min_test=2,
    use_log=True
)

# The printed label contained a mis-encoded dash; restored to the intended em dash.
print("Rows — ARMA-new:", len(arma_metrics))
try:
    # Rich tables when `display` is available (IPython / Jupyter).
    display(arma_metrics.head())
    display(arma_preds.head())
except NameError:
    # No `display` in scope: fall back to plain-text tables.
    print(arma_metrics.head().to_string(index=False))
    print(arma_preds.head().to_string(index=False))


  .apply(_qfill)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible startin

Rows â€” ARMA-new: 146


  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-stationary starting autoregressive parameters'


Unnamed: 0,Suburb,Model,MAE_Test,RMSE_Test,R2_Test,Train_n,Test_n,AIC,Order
0,Albert Park-Middle Park-West St Kilda,ARMA-new,54.091857,55.589576,-282.793964,30,3,-110.502382,"(1, 0, 0)"
1,Altona,ARMA-new,30.484753,31.636443,0.0,30,3,-171.252968,"(2, 0, 2)"
2,Armadale,ARMA-new,24.749284,29.440658,-5.240617,30,3,-119.761146,"(3, 0, 0)"
3,Aspendale-Chelsea-Carrum,ARMA-new,7.10462,7.717819,0.0,30,3,-167.780107,"(2, 0, 1)"
4,Bairnsdale,ARMA-new,52.153074,56.872461,-35.387865,30,3,-132.726069,"(2, 0, 0)"


Unnamed: 0,Suburb,date,Actual,Predicted,Model
0,Albert Park-Middle Park-West St Kilda,2024-09-01,693,656.226999,ARMA-new
1,Albert Park-Middle Park-West St Kilda,2024-12-01,700,641.880222,ARMA-new
2,Albert Park-Middle Park-West St Kilda,2025-03-01,700,632.617208,ARMA-new
3,Altona,2024-09-01,550,529.938645,ARMA-new
4,Altona,2024-12-01,550,519.385984,ARMA-new


In [6]:
# Table of the selected ARMA order and its AIC / test scores, one row per suburb.
# Suburbs whose fit failed have no "Order" value and are left out.
arma_orders = (
    arma_metrics
    .dropna(subset=["Order"])
    .loc[:, ["Suburb", "Order", "AIC", "RMSE_Test", "MAE_Test", "R2_Test"]]
    .sort_values("Suburb")
)
arma_orders.head(20)


Unnamed: 0,Suburb,Order,AIC,RMSE_Test,MAE_Test,R2_Test
0,Albert Park-Middle Park-West St Kilda,"(1, 0, 0)",-110.502382,55.589576,54.091857,-282.793964
1,Altona,"(2, 0, 2)",-171.252968,31.636443,30.484753,0.0
2,Armadale,"(3, 0, 0)",-119.761146,29.440658,24.749284,-5.240617
3,Aspendale-Chelsea-Carrum,"(2, 0, 1)",-167.780107,7.717819,7.10462,0.0
4,Bairnsdale,"(2, 0, 0)",-132.726069,56.872461,52.153074,-35.387865
5,Ballarat,"(2, 0, 2)",-166.201138,8.232607,7.406074,-11.199646
6,Balwyn,"(2, 0, 0)",-136.807846,55.125944,53.793958,0.0
7,Bayswater,"(3, 0, 0)",-189.760974,40.784627,38.296845,-7.316929
8,Belmont-Grovedale,"(2, 0, 0)",-193.205871,3.825415,3.372462,0.341479
9,Benalla,"(2, 0, 1)",-138.061045,23.901139,23.304884,-6.909816


In [7]:
import numpy as np

def portfolio_np_summary(metrics_df, label):
    """
    Print the cross-suburb mean of each test metric for one model.

    metrics_df: DataFrame containing 'MAE_Test', 'RMSE_Test' and 'R2_Test'.
    label: text printed in front of the summary.

    Rows in which all three metrics are missing are ignored; 'count' is the
    number of remaining rows. The line printed has the form
    "<label> : {'count': ..., 'MAE_mean': ..., 'RMSE_mean': ..., 'R2_mean': ...}"
    with every mean stored as np.float64 (NaN when no rows remain).
    Nothing is returned.
    """
    metric_to_key = (
        ('MAE_Test', 'MAE_mean'),
        ('RMSE_Test', 'RMSE_mean'),
        ('R2_Test', 'R2_mean'),
    )
    scored = metrics_df[[col for col, _ in metric_to_key]].dropna(how='all')

    summary = {'count': int(len(scored))}
    for col, key in metric_to_key:
        if scored.empty:
            summary[key] = np.float64('nan')
        else:
            summary[key] = np.float64(scored[col].mean())

    print(f"{label} : {summary}")

# Print the cross-suburb averages for both models. Requires `ucm_metrics` and
# `arma_metrics` to have been created already by the UCM and ARMA-new cells.
portfolio_np_summary(ucm_metrics, "UCM")
portfolio_np_summary(arma_metrics, "ARIMA-new")


UCM : {'count': 146, 'MAE_mean': np.float64(16.926441461392915), 'RMSE_mean': np.float64(18.920763262772855), 'R2_mean': np.float64(-33.18928268188232)}
ARIMA-new : {'count': 146, 'MAE_mean': np.float64(26.27789352635148), 'RMSE_mean': np.float64(28.640906645262177), 'R2_mean': np.float64(-79.1174255249944)}
