# This notebook aims to find the suburbs with the highest growth, using FOI counts as additional predictors

In [2]:
import os
from pathlib import Path

import pandas as pd
# Directory holding the prepared CSV files. It can be overridden with the
# PROJECT_DATA_DIR environment variable so the notebook runs on other
# machines; the default is the original hard-coded location.
DATA_DIR = Path(os.environ.get(
    'PROJECT_DATA_DIR',
    '/home/eeamanda/project-2-group-real-estate-industry-project-7-2025/Amanda-workspace',
))

# Suburb/date panel used throughout the rest of the notebook.
merged_df = pd.read_csv(DATA_DIR / 'merged_df.csv')

In [7]:
# Inspect the column dtypes of the loaded frame. Note that `date` is read as
# object (string) here and is only parsed to datetime later on.
merged_df.dtypes

Suburb                   object
date                     object
Median                    int64
Lat                     float64
Lng                     float64
SA2_CODE21                int64
SA2_NAME21               object
t                         int64
ERP_quarterly           float64
Income_quarterly_med    float64
dtype: object

## Combine with FOI counts (joined on `SA2_CODE21`)

In [6]:
# Per-SA2 FOI counts, one column per category. The directory can be overridden
# with the PROJECT_DATA_DIR environment variable; the default is the original
# hard-coded location.
foi_data_dir = Path(os.environ.get(
    'PROJECT_DATA_DIR',
    '/home/eeamanda/project-2-group-real-estate-industry-project-7-2025/Amanda-workspace',
))
foi_counts_df = pd.read_csv(foi_data_dir / "pivot_counts.csv")
foi_counts_df.dtypes

SA2_CODE21    int64
cultural      int64
education     int64
health        int64
others        int64
tourist       int64
dtype: object

In [None]:
# merge on SA2 code
# Drop any FOI columns left over from an earlier run of this cell first, so
# that re-running it does not produce suffixed duplicates (cultural_x,
# cultural_y, ...) that would break the later cells.
foi_cols = [c for c in foi_counts_df.columns if c != 'SA2_CODE21']
merged_df = (
    merged_df
    .drop(columns=foi_cols, errors='ignore')
    .merge(
        foi_counts_df,
        on='SA2_CODE21',
        how='left',              # keep all rows from merged_df
        validate='many_to_one',  # raise if an SA2 code repeats in foi_counts_df
    )
)
print(merged_df.head())


                                  Suburb        date  Median        Lat  \
0  Albert Park-Middle Park-West St Kilda  2017-03-01     520 -37.853484   
1  Albert Park-Middle Park-West St Kilda  2017-06-01     532 -37.853484   
2  Albert Park-Middle Park-West St Kilda  2017-09-01     530 -37.853484   
3  Albert Park-Middle Park-West St Kilda  2017-12-01     530 -37.853484   
4  Albert Park-Middle Park-West St Kilda  2018-03-01     550 -37.853484   

          Lng  SA2_CODE21   SA2_NAME21  t  ERP_quarterly  \
0  144.970161   206051128  Albert Park  0   16536.854795   
1  144.970161   206051128  Albert Park  1   16594.323288   
2  144.970161   206051128  Albert Park  2   16651.791781   
3  144.970161   206051128  Albert Park  3   16708.635616   
4  144.970161   206051128  Albert Park  4   16785.060274   

   Income_quarterly_med  cultural  education  health  others  tourist  
0          62618.808219         4          7       2     103       12  
1          62804.068493         4          7

In [10]:
# Display the merged frame to confirm the FOI count columns were appended.
merged_df

Unnamed: 0,Suburb,date,Median,Lat,Lng,SA2_CODE21,SA2_NAME21,t,ERP_quarterly,Income_quarterly_med,cultural,education,health,others,tourist
0,Albert Park-Middle Park-West St Kilda,2017-03-01,520,-37.853484,144.970161,206051128,Albert Park,0,16536.854795,62618.808219,4,7,2,103,12
1,Albert Park-Middle Park-West St Kilda,2017-06-01,532,-37.853484,144.970161,206051128,Albert Park,1,16594.323288,62804.068493,4,7,2,103,12
2,Albert Park-Middle Park-West St Kilda,2017-09-01,530,-37.853484,144.970161,206051128,Albert Park,2,16651.791781,62989.328767,4,7,2,103,12
3,Albert Park-Middle Park-West St Kilda,2017-12-01,530,-37.853484,144.970161,206051128,Albert Park,3,16708.635616,63172.575342,4,7,2,103,12
4,Albert Park-Middle Park-West St Kilda,2018-03-01,550,-37.853484,144.970161,206051128,Albert Park,4,16785.060274,63400.523288,4,7,2,103,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4813,Yarraville-Seddon,2024-03-01,570,-37.812809,144.884163,213031352,Yarraville,28,16280.121038,84590.600000,2,4,2,43,0
4814,Yarraville-Seddon,2024-06-01,590,-37.812809,144.884163,213031352,Yarraville,29,16337.039963,84590.600000,2,4,2,43,0
4815,Yarraville-Seddon,2024-09-01,595,-37.812809,144.884163,213031352,Yarraville,30,16393.958888,84590.600000,2,4,2,43,0
4816,Yarraville-Seddon,2024-12-01,600,-37.812809,144.884163,213031352,Yarraville,31,16450.259129,84590.600000,2,4,2,43,0


In [15]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# --- Stage 1: Within-suburb growth slopes ---
# Fit one linear regression per suburb on the training quarters and keep the
# coefficient on the quarter index `t` as that suburb's growth slope.
df = merged_df.copy()
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['Suburb','date'])

# numeric quarter index, counted in calendar quarters from the earliest date.
# Dividing a day difference by a fixed 90 days drifts against real quarters
# (90-92 days long) and would eventually skip an index value.
t0 = df['date'].min()
df['t'] = (
    (df['date'].dt.year - t0.year) * 4
    + (df['date'].dt.quarter - t0.quarter)
).astype(int)

# Rows after this date are excluded from the per-suburb fits.
train_mask = df['date'] <= '2024-09-01'
train_df = df.loc[train_mask]

slope_features = ['t','ERP_quarterly','Income_quarterly_med']
# Each fit estimates an intercept plus one coefficient per feature. Require
# strictly more rows than parameters; with exactly as many rows the fit is
# exact and the slope carries no information.
min_quarters = len(slope_features) + 2

# NOTE(review): ERP_quarterly and Income_quarterly_med may themselves trend
# almost linearly with `t`; if so the coefficient on `t` is weakly identified.
# Confirm before interpreting it as a pure time trend.
growth_results = []
for suburb, df_train in train_df.groupby('Suburb'):
    if len(df_train) < min_quarters:  # need enough quarters
        continue
    X_train = df_train[slope_features]
    y_train = df_train['Median']
    model = LinearRegression().fit(X_train, y_train)
    coef_t = model.coef_[0]   # slope wrt time (`t` is the first feature)
    growth_results.append({'Suburb': suburb, 'Growth_slope_t': coef_t})

# Explicit columns keep the merge below well-defined even if no suburb qualified.
growth_df = pd.DataFrame(growth_results, columns=['Suburb','Growth_slope_t'])

# --- Stage 2: Cross-sectional regression on FOI counts ---
# One row per suburb: FOI counts are taken from the first row, the two
# time-varying covariates are averaged.
# NOTE(review): the means are taken over every quarter in `df`, including
# those after the training cutoff - confirm this is intended.
foi_df = (
    df.groupby('Suburb')
      .agg({
          'cultural':'first',
          'education':'first',
          'health':'first',
          'others':'first',
          'tourist':'first',
          'ERP_quarterly':'mean',
          'Income_quarterly_med':'mean'
      })
      .reset_index()
)

cross_df = growth_df.merge(foi_df, on='Suburb', how='left')

X = cross_df[['cultural','education','health','others','tourist',
              'ERP_quarterly','Income_quarterly_med']]
y = cross_df['Growth_slope_t']

X = sm.add_constant(X)
cross_model = sm.OLS(y, X).fit()
print(cross_model.summary())

# --- Stage 3: Predict long-run growth potential & rank ---
# In-sample fitted values of the cross-sectional model, used as the ranking score.
cross_df['Predicted_growth'] = cross_model.predict(X)

ranking = (
    cross_df[['Suburb','Growth_slope_t','Predicted_growth',
              'cultural','education','health','others','tourist',
              'ERP_quarterly','Income_quarterly_med']]
    .sort_values('Predicted_growth', ascending=False)
)

print("\n🏆 Top suburbs by predicted long-run growth:")
print(ranking.head(10))




                            OLS Regression Results                            
Dep. Variable:         Growth_slope_t   R-squared:                       0.128
Model:                            OLS   Adj. R-squared:                  0.084
Method:                 Least Squares   F-statistic:                     2.905
Date:                Wed, 08 Oct 2025   Prob (F-statistic):            0.00728
Time:                        01:04:56   Log-Likelihood:                -495.04
No. Observations:                 146   AIC:                             1006.
Df Residuals:                     138   BIC:                             1030.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -2.8801 

In [14]:
import statsmodels.api as sm

# Build one row per suburb. The FOI counts are constant within a suburb, so
# the first row's value is taken; the two time-varying covariates are averaged.
foi_columns = ['cultural','education','health','others','tourist']
covariate_columns = ['ERP_quarterly','Income_quarterly_med']

aggregations = {name: 'first' for name in foi_columns}
aggregations.update({name: 'mean' for name in covariate_columns})
foi_df = df.groupby('Suburb').agg(aggregations).reset_index()

# Attach the per-suburb predictors to the growth slopes.
cross_df = pd.merge(growth_df, foi_df, on='Suburb', how='left')

# Response: per-suburb growth slope. Design matrix: FOI counts and covariate
# means, plus an intercept column.
y = cross_df['Growth_slope_t']
X = sm.add_constant(cross_df[foi_columns + covariate_columns])

cross_model = sm.OLS(y, X).fit()
print(cross_model.summary())


                            OLS Regression Results                            
Dep. Variable:         Growth_slope_t   R-squared:                       0.128
Model:                            OLS   Adj. R-squared:                  0.084
Method:                 Least Squares   F-statistic:                     2.905
Date:                Wed, 08 Oct 2025   Prob (F-statistic):            0.00728
Time:                        01:04:25   Log-Likelihood:                -495.04
No. Observations:                 146   AIC:                             1006.
Df Residuals:                     138   BIC:                             1030.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -2.8801 