# This Notebook interpolate the future ERP and Income

ERP interpolate

In [21]:
import pandas as pd

In [22]:
population_df = pd.read_csv('/home/eeamanda/project-2-group-real-estate-industry-project-7-2025/Amanda-workspace/full_erp_only_population_data.csv')
population_df

Unnamed: 0,sa2_code,erp_2015,erp_2016,erp_2017,erp_2018,erp_2019,erp_2020,erp_2021,erp_2022,erp_2023,...,erp_2027,erp_2028,erp_2029,erp_2030,erp_2031,erp_2032,erp_2033,erp_2034,erp_2035,erp_2036
0,201011001,11039.0,11852.0,12649.0,13537.0,14434.0,15507.0,16841.0,18002.0,18995.0,...,21325.893697,21895.531232,22465.168766,23034.806301,23604.443836,24095.619230,24586.794624,25077.970018,25569.145413,26060.320807
1,201011002,12300.0,12301.0,12266.0,12244.0,12320.0,12196.0,12071.0,11938.0,11811.0,...,11719.320995,11740.348397,11761.375799,11782.403201,11803.430603,11839.942960,11876.455316,11912.967673,11949.480030,11985.992387
2,201011005,7191.0,7311.0,7409.0,7418.0,7458.0,7377.0,7229.0,7247.0,7323.0,...,7434.686493,7497.293213,7559.899932,7622.506652,7685.113372,7753.868146,7822.622920,7891.377695,7960.132469,8028.887243
3,201011006,6846.0,7195.0,7622.0,8183.0,8890.0,9755.0,10648.0,11798.0,12865.0,...,16827.266327,17739.346612,18651.426898,19563.507183,20475.587469,21373.510463,22271.433457,23169.356451,24067.279445,24965.202439
4,201011007,3966.0,3990.0,4004.0,4042.0,4112.0,4152.0,4211.0,4223.0,4267.0,...,4341.161505,4370.224481,4399.287456,4428.350431,4457.413406,4511.024293,4564.635179,4618.246065,4671.856951,4725.467837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,217031476,3538.0,3556.0,3635.0,3710.0,3802.0,3911.0,3979.0,3974.0,3983.0,...,4147.083280,4182.682086,4218.280891,4253.879696,4289.478501,4325.233368,4360.988235,4396.743101,4432.497968,4468.252835
526,217041477,6716.0,6709.0,6717.0,6746.0,6798.0,6883.0,6990.0,7046.0,7131.0,...,7221.447801,7256.180692,7290.913584,7325.646476,7360.379368,7392.136323,7423.893278,7455.650233,7487.407188,7519.164143
527,217041478,9467.0,9603.0,9686.0,9783.0,9845.0,9859.0,9967.0,10098.0,10147.0,...,10495.508296,10585.734805,10675.961314,10766.187824,10856.414333,10941.352640,11026.290947,11111.229254,11196.167562,11281.105869
528,217041479,21217.0,21442.0,21688.0,21954.0,22184.0,22416.0,22470.0,22586.0,22761.0,...,23444.843119,23635.499809,23826.156499,24016.813189,24207.469878,24402.183550,24596.897222,24791.610894,24986.324566,25181.038238


In [23]:
properties_df = pd.read_csv("/home/eeamanda/project-2-group-real-estate-industry-project-7-2025/Amanda-workspace/properties_df.csv")
properties_df["date"] = pd.to_datetime(properties_df["date"], format="mixed", errors="raise")

In [24]:
# 1) Long-form annual ERP with a proper date
erp_quarterly = (
    population_df
      .melt(id_vars='sa2_code', var_name='year', value_name='ERP')
      .assign(
          year=lambda x: x['year'].str.extract(r'(\d{4})').astype(int),
          ERP=lambda x: pd.to_numeric(x['ERP'], errors='coerce'),
          date=lambda x: pd.to_datetime(x['year'].astype(str) + '-01-01')
      )
      # if source has duplicates per (sa2_code, date), collapse them
      .groupby(['sa2_code','date'], as_index=False)['ERP'].mean()
)

# 2) Target quarterly index (quarter starts in March)
q_idx = pd.date_range('2025-06-01', '2030-12-01', freq='QS-MAR')

# 3) For each SA2: union with quarterly timeline, interpolate by time, then keep quarters
frames = []
for sa2, g in erp_quarterly.groupby('sa2_code', sort=False):
    g = (g.set_index('date').sort_index())

    # build a union index so interpolate() can "see" the known annual points
    full_idx = g.index.union(q_idx).unique().sort_values()

    # reindex to union, then time-based interpolation
    gg = (g.reindex(full_idx)
            .interpolate(method='time', limit_direction='both')  # extrapolates both ends
         )

    # keep only the quarterly stamps we want
    gg = gg.reindex(q_idx)

    gg['sa2_code'] = sa2
    frames.append(gg)

erp_future_quarterly = (
    pd.concat(frames, axis=0)
      .reset_index()
      .rename(columns={'index': 'date'})
      .loc[:, ['sa2_code', 'date', 'ERP']]
)

# 1️⃣ Build a unique lookup for Suburb per SA2
suburb_lookup = (
    properties_df[["SA2_CODE21", "Suburb"]]
    .dropna(subset=["SA2_CODE21", "Suburb"])
    .drop_duplicates(subset=["SA2_CODE21"])
    .rename(columns={"SA2_CODE21": "sa2_code"})
)

# 2️⃣ Merge Suburb onto ERP quarterly data
erp_future_quarterly = erp_future_quarterly.merge(suburb_lookup, how="right", on="sa2_code")

# 3️⃣ Reorder columns (optional)
erp_future_quarterly = erp_future_quarterly[["sa2_code", "Suburb", "date", "ERP"]]
erp_future_quarterly = erp_future_quarterly.rename(columns={'sa2_code': 'SA2_CODE21'})

# ✅ Preview result
print(erp_future_quarterly.tail(10))

      SA2_CODE21             Suburb       date           ERP
3164   213031352  Yarraville-Seddon 2028-09-01  17120.896274
3165   213031352  Yarraville-Seddon 2028-12-01  17160.524127
3166   213031352  Yarraville-Seddon 2029-03-01  17199.786900
3167   213031352  Yarraville-Seddon 2029-06-01  17239.959987
3168   213031352  Yarraville-Seddon 2029-09-01  17280.133073
3169   213031352  Yarraville-Seddon 2029-12-01  17319.869496
3170   213031352  Yarraville-Seddon 2030-03-01  17359.169254
3171   213031352  Yarraville-Seddon 2030-06-01  17399.342340
3172   213031352  Yarraville-Seddon 2030-09-01  17439.515427
3173   213031352  Yarraville-Seddon 2030-12-01  17479.251849


Income Interpolate

In [25]:
income_df = pd.read_csv("/home/eeamanda/project-2-group-real-estate-industry-project-7-2025/Amanda-workspace/income_predictions_2017_final.csv")
income_df = income_df.drop(income_df.index[0]).reset_index(drop=True) #First row is victoria we dont need it
income_df = income_df.rename(columns={"SA2": "sa2_code"})

In [26]:
import re
import pandas as pd

# --- 0) Ensure sa2_code is integer early
income_df['sa2_code'] = pd.to_numeric(income_df['sa2_code'], errors='coerce').astype('Int64')

# --- 0b) Normalise suburb name column to 'sa2_name'
if 'sa2_name' in income_df.columns:
    pass
elif 'SA2 NAME' in income_df.columns:
    income_df = income_df.rename(columns={'SA2 NAME': 'sa2_name'})
else:
    # if no name column exists, create a placeholder
    income_df['sa2_name'] = pd.NA

# --- 1) Pick predicted columns (2025–2030)
income_year_cols = [c for c in income_df.columns if re.fullmatch(r'Predicted_Income_\d{4}', c)]
if not income_year_cols:
    raise ValueError("No columns matching 'Predicted_Income_YYYY' found.")

# --- 2) Reshape to long format with clean year/date (carry sa2_name)
income_long = (
    income_df[['sa2_code', 'sa2_name'] + income_year_cols]
      .melt(id_vars=['sa2_code','sa2_name'], var_name='year_col', value_name='Predicted_Income')
      .assign(
          year=lambda x: x['year_col'].str.extract(r'(\d{4})').astype(int),
          Predicted_Income=lambda x: pd.to_numeric(x['Predicted_Income'], errors='coerce'),
          date=lambda x: pd.to_datetime(x['year'].astype(str) + '-01-01')
      )
      # guard against accidental dupes
      .groupby(['sa2_code','sa2_name','date'], as_index=False)['Predicted_Income'].mean()
)

# --- 3) Quarterly date index from 2025 → 2030
q_idx = pd.date_range('2025-06-01', '2030-12-01', freq='QS-MAR')
# If you prefer starting at the first quarter boundary in 2025, use '2025-03-01'.

# --- 4) Interpolate + extrapolate quarterly
frames = []
for (sa2, name), g in income_long.groupby(['sa2_code','sa2_name'], sort=False):
    g = g.set_index('date').sort_index()
    g = g.infer_objects(copy=False)
    g['Predicted_Income'] = pd.to_numeric(g['Predicted_Income'], errors='coerce')

    full_idx = g.index.union(q_idx).unique().sort_values()

    gg = g.reindex(full_idx)
    gg = gg.infer_objects(copy=False)  # <--- fixes the warning
    gg['Predicted_Income'] = pd.to_numeric(gg['Predicted_Income'], errors='coerce')

    gg = gg.interpolate(method='time', limit_direction='both')


    gg = gg.reindex(q_idx)
    gg['sa2_code'] = sa2
    gg['sa2_name'] = name
    frames.append(gg)

# --- 5) Combine all SA2 results
income_future_quarterly = (
    pd.concat(frames, axis=0)
      .reset_index()
      .rename(columns={'index': 'date'})
      .loc[:, ['sa2_code', 'sa2_name', 'date', 'Predicted_Income']]
)

# --- 6) Confirm types
income_future_quarterly['sa2_code'] = income_future_quarterly['sa2_code'].astype(int)

print(income_future_quarterly.head())
income_future_quarterly = income_future_quarterly.rename(columns={'sa2_code': 'SA2_CODE21'})
income_future_quarterly = income_future_quarterly.rename(columns={'sa2_name': 'SA2_NAME21'})

  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(me

    sa2_code   sa2_name       date  Predicted_Income
0  201011001  Alfredton 2025-06-01      63004.510059
1  201011001  Alfredton 2025-09-01      63428.473425
2  201011001  Alfredton 2025-12-01      63847.828493
3  201011001  Alfredton 2026-03-01      64262.575264
4  201011001  Alfredton 2026-06-01      64686.538630


  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(method='time', limit_direction='both')
  gg = gg.interpolate(me

In [27]:
erp_income_future = erp_future_quarterly.merge(
    income_future_quarterly,
    on=["SA2_CODE21", "date"],
    how="inner"
)


prediction_df = properties_df.merge(
    erp_income_future,
    on=["SA2_CODE21","SA2_NAME21"],
    how="left"
)
prediction_df = prediction_df.drop(columns=["t", "date_x","Median","Suburb_x"], errors="ignore")
prediction_df = prediction_df.rename(columns={'date_y': 'date' , 'Suburb_y' : 'Suburb'})
print(prediction_df.head())

         Lat         Lng  SA2_CODE21   SA2_NAME21  \
0 -37.853484  144.970161   206051128  Albert Park   
1 -37.853484  144.970161   206051128  Albert Park   
2 -37.853484  144.970161   206051128  Albert Park   
3 -37.853484  144.970161   206051128  Albert Park   
4 -37.853484  144.970161   206051128  Albert Park   

                                  Suburb       date           ERP  \
0  Albert Park-Middle Park-West St Kilda 2025-06-01  17081.458844   
1  Albert Park-Middle Park-West St Kilda 2025-09-01  17063.176312   
2  Albert Park-Middle Park-West St Kilda 2025-12-01  17045.092504   
3  Albert Park-Middle Park-West St Kilda 2026-03-01  17068.777750   
4  Albert Park-Middle Park-West St Kilda 2026-06-01  17115.316752   

   Predicted_Income  
0      80479.028754  
1      81070.846210  
2      81656.230868  
3      82235.182727  
4      82827.000183  


In [28]:
#prediction_df = prediction_df.drop(columns=["t"], errors="ignore")
#print(prediction_df.head())

qnum_hist_last = properties_df["date"].dt.to_period("Q").astype(int).max()
last_t          = int(properties_df["t"].max())
c               = last_t - qnum_hist_last

# Now compute t for predictions: t = qnum + c
qnum_pred = prediction_df["date"].dt.to_period("Q").astype(int)
prediction_df["t"] = qnum_pred + c

# 5) Sanity checks (optional)
# same calendar date should have same t across all suburbs
assert prediction_df.groupby("date")["t"].nunique().max() == 1
# first prediction t should be last_t+1 if predictions start in the next quarter
print("first pred t:", prediction_df["t"].min(), " last hist t:", last_t)

first pred t: 33  last hist t: 32


In [29]:
prediction_df.to_csv("prediction_df.csv", index=False)
