# This notebook is for income prediction, we will focus on median as income are very skewed

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

In [None]:
income_df = pd.read_csv("../Amanda-workspace/predicted_income_till_2030.csv")
income_df 

Unnamed: 0,SA2,SA2 NAME,2017-18.sum,2018-19.sum,2019-20.sum,2020-21.sum,2021-22.sum,2017.med,2018.med,2019.med,2020.med,2021.med,2022.med,2017-18.mean,2018-19.mean,2019-20.mean,2020-21.mean,2021-22.mean
0,Victoria,,2.400000e+11,2.550000e+11,2.650000e+11,2.780000e+11,3.040000e+11,47999,47003,48801,49848,52218,54708,60976,62942,64361,67802,71842
1,201011001,Alfredton,4.830365e+08,5.266747e+08,5.871047e+08,6.719109e+08,7.661202e+08,49982,50151,51620,52777,55499,58316,59967,61305,63327,66618,69685
2,201011002,Ballarat,5.836767e+08,6.140196e+08,6.227198e+08,6.587578e+08,6.838888e+08,48152,48937,51187,50968,53232,55618,71661,74463,75117,80268,82715
3,201011005,Buninyong,2.675157e+08,2.797703e+08,2.882522e+08,3.027231e+08,3.251919e+08,50469,48461,49846,51572,53959,55278,59448,60979,62365,65468,68664
4,201011006,Delacombe,2.307361e+08,2.653591e+08,3.038581e+08,3.477675e+08,4.214570e+08,46355,45598,47940,49810,51041,53390,49954,52505,54270,56825,59595
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518,217031476,Otway,1.080793e+08,1.150822e+08,1.214713e+08,1.353870e+08,1.412112e+08,32085,30663,32420,33597,38795,40989,42251,43608,45632,52824,54543
519,217041477,Moyne - East,2.071029e+08,2.359113e+08,2.487158e+08,2.591970e+08,2.854665e+08,38224,40764,43158,43759,47460,49857,48074,52694,55703,56891,61049
520,217041478,Moyne - West,3.111598e+08,3.363136e+08,3.592223e+08,3.842777e+08,4.308266e+08,41581,41093,43243,44966,48614,51579,50114,53248,55745,59513,64874
521,217041479,Warrnambool - North,7.970469e+08,7.598940e+08,8.041996e+08,8.545918e+08,9.159373e+08,42537,43939,45632,47821,50159,51796,56162,52602,55573,58183,60799


In [5]:
med_cols = [col for col in income_df.columns if col.endswith(".med")]
df_med_growth = income_df[med_cols].pct_change(axis=1) * 100

n_years = 5
def calculate_cagr(start, end, n_years, eps=1e-9):
    start_safe = start + eps
    return ((end / start_safe) ** (1/n_years) - 1) * 100

income_df['med_CAGR'] = income_df.apply(lambda x: calculate_cagr(x[med_cols[0]], x[med_cols[-1]], n_years), axis=1)


print(income_df[['SA2 NAME', 'med_CAGR']])

                SA2 NAME  med_CAGR
0                    NaN  2.651129
1              Alfredton  3.132328
2               Ballarat  2.924842
3              Buninyong  1.836983
4              Delacombe  2.866193
..                   ...       ...
518                Otway  5.020252
519         Moyne - East  5.457629
520         Moyne - West  4.403630
521  Warrnambool - North  4.017371
522  Warrnambool - South  4.781340

[523 rows x 2 columns]


Split train and test. train on 2017-2021 and test on 2022. using linear regression and cagr see which one is better

In [6]:
# 1. Identify columns and years
med_cols = [col for col in income_df.columns if col.endswith(".med")]
years = np.array([int(c.split('.')[0]) for c in med_cols])
train_years = years[years <= 2021]
test_years = years[years > 2021]

# Convert to numeric arrays for regression
train_X = train_years.reshape(-1, 1)
test_X = test_years.reshape(-1, 1)

# Storage for predictions and errors
mae_lin, rmse_lin, mae_cagr, rmse_cagr = [], [], [], []

# 2. Loop by SA2 
for idx, row in income_df.iterrows():
    y = row[med_cols].values.astype(float)
    y_dict = dict(zip(years, y))
    
    # Train/test split
    y_train = np.array([y_dict[y] for y in train_years])
    y_test = np.array([y_dict[y] for y in test_years])
    
    #Linear Regression 
    model = LinearRegression()
    model.fit(train_X, y_train)
    y_pred_lin = model.predict(test_X)
    
    # CAGR 
    n_train_years = train_years[-1] - train_years[0]
    start, end = y_dict[train_years[0]], y_dict[train_years[-1]]
    cagr = ((end / (start + 1e-9)) ** (1 / n_train_years)) - 1
    y_pred_cagr = [end * ((1 + cagr) ** (t - train_years[-1])) for t in test_years]
    
    # Error metrics
    mae_lin.append(mean_absolute_error(y_test, y_pred_lin))
    rmse_lin.append(root_mean_squared_error(y_test, y_pred_lin))
    mae_cagr.append(mean_absolute_error(y_test, y_pred_cagr))
    rmse_cagr.append(root_mean_squared_error(y_test, y_pred_cagr))

# 3. Summarize
results = pd.DataFrame({
    "Model": ["Linear Regression", "CAGR"],
    "MAE": [np.mean(mae_lin), np.mean(mae_cagr)],
    "RMSE": [np.mean(rmse_lin), np.mean(rmse_cagr)]
})

print("Model Accuracy Comparison (Test: 2021–2022)")
print(results)



Model Accuracy Comparison (Test: 2021–2022)
               Model           MAE          RMSE
0  Linear Regression   2600.125048   2600.125048
1               CAGR  97359.065049  97359.065049


We can see that MAE and RMSE is much more smaller for LINEAR REGRESSION hence we will use linear regressions our model

In [7]:
years_numeric = np.array([2017, 2018, 2019, 2020, 2021, 2022]).reshape(-1, 1)

# Years to predict
future_years = np.arange(2023, 2031).reshape(-1, 1) 

predictions_all_years = []

for idx, row in income_df.iterrows():
    y = row[med_cols].values.astype(float)
    
    # Fit linear regression
    model = LinearRegression()
    model.fit(years_numeric, y)
    
    # Predict for all future years
    pred_future = model.predict(future_years)
    
    # Save as a list 
    predictions_all_years.append(pred_future)

future_years_cols = [f'Predicted_Income_{year}' for year in range(2023, 2031)]

pred_df = pd.DataFrame(predictions_all_years, columns=future_years_cols)

# Combine with original DataFrame
income_df = pd.concat([income_df.reset_index(drop=True), pred_df], axis=1)

In [8]:
income_df

Unnamed: 0,SA2,SA2 NAME,2017-18.sum,2018-19.sum,2019-20.sum,2020-21.sum,2021-22.sum,2017.med,2018.med,2019.med,...,2021-22.mean,med_CAGR,Predicted_Income_2023,Predicted_Income_2024,Predicted_Income_2025,Predicted_Income_2026,Predicted_Income_2027,Predicted_Income_2028,Predicted_Income_2029,Predicted_Income_2030
0,Victoria,,2.400000e+11,2.550000e+11,2.650000e+11,2.780000e+11,3.040000e+11,47999,47003,48801,...,71842,2.651129,55119.866667,56555.209524,57990.552381,59425.895238,60861.238095,62296.580952,63731.923810,65167.266667
1,201011001,Alfredton,4.830365e+08,5.266747e+08,5.871047e+08,6.719109e+08,7.661202e+08,49982,50151,51620,...,69685,3.132328,58944.600000,60626.628571,62308.657143,63990.685714,65672.714286,67354.742857,69036.771429,70718.800000
2,201011002,Ballarat,5.836767e+08,6.140196e+08,6.227198e+08,6.587578e+08,6.838888e+08,48152,48937,51187,...,82715,2.924842,56348.600000,57777.057143,59205.514286,60633.971429,62062.428571,63490.885714,64919.342857,66347.800000
3,201011005,Buninyong,2.675157e+08,2.797703e+08,2.882522e+08,3.027231e+08,3.251919e+08,50469,48461,49846,...,68664,1.836983,55824.000000,57031.571429,58239.142857,59446.714286,60654.285714,61861.857143,63069.428571,64277.000000
4,201011006,Delacombe,2.307361e+08,2.653591e+08,3.038581e+08,3.477675e+08,4.214570e+08,46355,45598,47940,...,59595,2.866193,54359.733333,55884.704762,57409.676190,58934.647619,60459.619048,61984.590476,63509.561905,65034.533333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518,217031476,Otway,1.080793e+08,1.150822e+08,1.214713e+08,1.353870e+08,1.412112e+08,32085,30663,32420,...,54543,5.020252,41767.466667,43770.123810,45772.780952,47775.438095,49778.095238,51780.752381,53783.409524,55786.066667
519,217041477,Moyne - East,2.071029e+08,2.359113e+08,2.487158e+08,2.591970e+08,2.854665e+08,38224,40764,43158,...,61049,5.457629,51755.733333,54008.704762,56261.676190,58514.647619,60767.619048,63020.590476,65273.561905,67526.533333
520,217041478,Moyne - West,3.111598e+08,3.363136e+08,3.592223e+08,3.842777e+08,4.308266e+08,41581,41093,43243,...,64874,4.403630,52606.933333,54729.104762,56851.276190,58973.447619,61095.619048,63217.790476,65339.961905,67462.133333
521,217041479,Warrnambool - North,7.970469e+08,7.598940e+08,8.041996e+08,8.545918e+08,9.159373e+08,42537,43939,45632,...,60799,4.017371,53695.066667,55613.466667,57531.866667,59450.266667,61368.666667,63287.066667,65205.466667,67123.866667


In [36]:
income_df.to_csv("income_predictions_2017_final.csv", index=False)