In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import accuracy_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree
%matplotlib inline

In [2]:
# Load the postcode-level modelling table; the "postcode" column becomes the row index.
modelDF = pd.read_csv(
    filepath_or_buffer="Files/Cleaned/Postcode-based/Unstacked_Transformed.csv",
    index_col="postcode",
)

# Preview the first five rows.
modelDF.head(n=5)

Unnamed: 0_level_0,mean_price 2020 Q1,mean_price 2020 Q2,mean_price 2020 Q3,mean_price 2020 Q4,mean_price 2021 Q1,median_price 2020 Q1,median_price 2020 Q2,median_price 2020 Q3,median_price 2020 Q4,median_price 2021 Q1,...,35-54yo_Prop,55-64yo_Prop,65+yo_Prop,citizen_AU_Prop,citizen_non_AU_Prop,YARRP <1975_Prop,YARRP 1976-1995_Prop,YARRP 1996-2005_Prop,YARRP 2006-2016_Prop,ATSI_Prop
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000,1541.0,1322.0,1631.0,1379.0,2794.0,1225.0,1000.0,1390.0,1110.0,1371.0,...,0.229762,0.060924,0.061727,0.317062,0.534895,0.019153,0.058006,0.079348,0.475101,0.001934
2007,834.0,739.0,658.0,677.0,754.0,745.0,775.0,655.0,651.0,763.0,...,0.166177,0.037644,0.042957,0.308162,0.568958,0.012322,0.054149,0.067036,0.510626,0.006557
2008,956.0,1144.0,985.0,1184.0,937.0,750.0,1173.0,890.0,991.0,855.0,...,0.158982,0.04013,0.031592,0.372097,0.509051,0.013234,0.046192,0.051827,0.471909,0.006574
2009,1277.0,1282.0,1373.0,1661.0,1427.0,986.0,1100.0,1085.0,1075.0,1188.0,...,0.290018,0.095762,0.089753,0.578085,0.323265,0.03356,0.093733,0.085616,0.291111,0.009756
2010,1357.0,1395.0,1476.0,2267.0,1371.0,1280.0,1325.0,1270.0,1240.0,1201.0,...,0.322504,0.087802,0.088883,0.60401,0.244222,0.03714,0.079436,0.064328,0.226373,0.008474


In [3]:
# Keep columns 0-55 and 96-120 by position.
# Per the original notes this removes the interest rate and bond yield columns
# (they're the same for all postcodes) and the redundant calculated proportion
# variables.
# The positional slice only makes sense on the full, freshly loaded frame, so
# fail with a clear message if this cell is re-run on an already-trimmed modelDF.
assert modelDF.shape[1] >= 121, (
    "modelDF has fewer than 121 columns - re-run the data-loading cell "
    "before re-running this cell"
)
modelDF = modelDF.iloc[:, np.r_[0:56, 96:121]]

# Drop mean price columns
modelDF = modelDF.drop(
    columns=[
        "mean_price 2020 Q1",
        "mean_price 2020 Q2",
        "mean_price 2020 Q3",
        "mean_price 2020 Q4",
        "mean_price 2021 Q1",
    ]
)

# Drop one category from each feature group (optional).
# `columns=` already selects the axis, so no separate `axis` argument is passed.
modelDF = modelDF.drop(
    columns=[
        'INCP_NEG_NIL', 'INCP_NEG_NIL_Prop',
        '65+yo', '65+yo_Prop',
        'CPRF_na', 'CPRF_na_Prop',
        'citizen_AU', 'citizen_AU_Prop',
    ]
)

# Show the resulting (rows, columns) shape, then a one-row preview.
print(modelDF.shape)
modelDF.head(1)

(573, 68)


Unnamed: 0_level_0,median_price 2020 Q1,median_price 2020 Q2,median_price 2020 Q3,median_price 2020 Q4,median_price 2021 Q1,median_rent_newb 2020 Q1,median_rent_newb 2020 Q2,median_rent_newb 2020 Q3,median_rent_newb 2020 Q4,median_rent_newb 2021 Q1,...,15-24yo_Prop,25-34yo_Prop,35-54yo_Prop,55-64yo_Prop,citizen_non_AU_Prop,YARRP <1975_Prop,YARRP 1976-1995_Prop,YARRP 1996-2005_Prop,YARRP 2006-2016_Prop,ATSI_Prop
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000,1225.0,1000.0,1390.0,1110.0,1371.0,700.0,630.0,600.0,550.0,600.0,...,0.212725,0.389479,0.229762,0.060924,0.534895,0.019153,0.058006,0.079348,0.475101,0.001934


In [4]:
# Load the Australian postcode reference table, indexed by its "postcode" column.
coord = pd.read_csv(
    filepath_or_buffer="Files/Map Vis/australian_postcodes.csv",
    index_col="postcode",
)

In [5]:
# Re-wrap `coord` in a DataFrame.
# NOTE(review): `pd.read_csv` already returns a DataFrame, so this step looks
# redundant - confirm and consider removing the cell.
coord = pd.DataFrame(data=coord)

In [6]:
# Preview the first five rows of the postcode reference table.
coord.head(n=5)

Unnamed: 0_level_0,id,locality,state,long,lat,dc,type,status,sa3,sa3name,...,SA3_NAME_2016,SA4_CODE_2016,SA4_NAME_2016,RA_2011,RA_2016,MMM_2015,MMM_2019,ced,altitude,chargezone
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,458,SYDNEY,NSW,151.268071,-33.794883,EAST SUBS MC,LVR,Updated 25-Mar-2020 SA3,11703.0,Sydney Inner City,...,Sydney Inner City,117.0,Sydney - City and Inner South,1.0,1.0,1.0,1.0,,,N1
1002,459,SYDNEY,NSW,151.268071,-33.794883,EAST SUBS MC,LVR,Updated 25-Mar-2020 SA3,11703.0,Sydney Inner City,...,Sydney Inner City,117.0,Sydney - City and Inner South,1.0,1.0,1.0,1.0,,,N1
1003,460,SYDNEY,NSW,151.268071,-33.794883,EAST SUBS MC,LVR,Updated 25-Mar-2020 SA3,11703.0,Sydney Inner City,...,Sydney Inner City,117.0,Sydney - City and Inner South,1.0,1.0,1.0,1.0,,,N1
1004,461,SYDNEY,NSW,151.268071,-33.794883,EAST SUBS MC,LVR,Updated 25-Mar-2020 SA3,11703.0,Sydney Inner City,...,Sydney Inner City,117.0,Sydney - City and Inner South,1.0,1.0,1.0,1.0,,,N1
1005,462,SYDNEY,NSW,151.268071,-33.794883,GPO BOX CENTRE,LVR,Updated 25-Mar-2020 SA3,11703.0,Sydney Inner City,...,Sydney Inner City,117.0,Sydney - City and Inner South,1.0,1.0,1.0,1.0,,,N1


**INDEX**

id:	Primary Key from source database

postcode:	The postcode in numerical format - 0000 to 9999

locality:	The locality of the postcode - typically the city/suburb or postal distribution centre	

state:	The Australian state in which the locality is situated	

long:	The longitude of the locality - defaults to 0 when not available

lat:	The latitude of the locality - defaults to 0 when not available

dc:	The Australia Post distribution centre servicing this postcode - defaults to blank when not available

type:	The type of locality, such as a delivery area, post office or a "Large Volume Recipient" such as a GPO - defaults to blank when not available

SA3:	The SA3 Statistical Area code

SA3 Name:	The name of the SA3 Statistical Area

SA4:	The SA4 Statistical Area code

SA4 Name:	The name of the SA4 Statistical Area

Region:	Designated Regional Area

status:	A note indicating whether the data is new, removed or updated - new column Nov 2018

CED:	The Commonwealth Electoral Division

Altitude:	Altitude/Elevation (meters)

Charge Zone:	Australia Post Charge Zones

SA1 Maincode 2011:	Statistical Area 1 2011 Code

SA1 Maincode 2016:	Statistical Area 1 2016 Code

SA2 Maincode 2016:	Statistical Area 2 2016 Code

SA2 Name 2016:	Statistical Area 2 2016 Name

RA 2011:	Remoteness Area - 2011 Dataset

RA 2016:	Remoteness Area - 2016 Dataset

MMM 2015:	Modified Monash Model - 2015 Dataset **(originally was 2016, not 2015)**

MMM 2019:	Modified Monash Model - 2019 Dataset


In [7]:
# Distinct values present in the "state" column.
set(coord.loc[:, "state"])

{'NSW'}

In [8]:
# Show only the source-database id and the locality name for each row.
coord.loc[:, ["id", "locality"]]

Unnamed: 0_level_0,id,locality
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,458,SYDNEY
1002,459,SYDNEY
1003,460,SYDNEY
1004,461,SYDNEY
1005,462,SYDNEY
...,...,...
4380,21116,RUBY CREEK
4380,21122,UNDERCLIFFE
4383,21127,JENNINGS
4385,21128,CAMP CREEK


In [9]:
# List every column name available in the postcode reference table.
coord.columns

Index(['id', 'locality', 'state', 'long', 'lat', 'dc', 'type', 'status', 'sa3',
       'sa3name', 'sa4', 'sa4name', 'region', 'Lat_precise', 'Long_precise',
       'SA1_MAINCODE_2011', 'SA1_MAINCODE_2016', 'SA2_MAINCODE_2016',
       'SA2_NAME_2016', 'SA3_CODE_2016', 'SA3_NAME_2016', 'SA4_CODE_2016',
       'SA4_NAME_2016', 'RA_2011', 'RA_2016', 'MMM_2015', 'MMM_2019', 'ced',
       'altitude', 'chargezone'],
      dtype='object')

In [16]:
# Inspect the "region" column on its own.
coord.loc[:, "region"]

postcode
1001     R1
1002     R1
1003     R1
1004     R1
1005     R1
       ... 
4380     R3
4380     R3
4383     R3
4385     R3
4385    NaN
Name: region, Length: 5571, dtype: object

In [18]:
# Reduce the postcode table to the id plus longitude/latitude columns.
# NOTE(review): this overwrites `coord`. The cells that follow (In [11], In [23])
# still display columns such as locality/state, so their saved outputs were
# produced out of order and will not reproduce on Restart & Run All.
coord = coord.loc[:, ["id", "long", "lat"]]

In [11]:
# Show the first five columns by position (all of them if there are fewer).
coord.iloc[:, :5]

Unnamed: 0_level_0,id,locality,state,long,lat
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,458,SYDNEY,NSW,151.268071,-33.794883
1002,459,SYDNEY,NSW,151.268071,-33.794883
1003,460,SYDNEY,NSW,151.268071,-33.794883
1004,461,SYDNEY,NSW,151.268071,-33.794883
1005,462,SYDNEY,NSW,151.268071,-33.794883
...,...,...,...,...,...
4380,21116,RUBY CREEK,NSW,152.018346,-28.625911
4380,21122,UNDERCLIFFE,NSW,152.182263,-28.622551
4383,21127,JENNINGS,NSW,151.969412,-28.940512
4385,21128,CAMP CREEK,NSW,150.880359,-28.755116


In [23]:
# Rows whose postcode (the index) lies in the inclusive range 2000-2880.
# NOTE(review): the bounds are hard-coded; presumably the postcode range of
# interest for the modelling data - confirm and consider named constants.
coord.loc[(2000 <= coord.index) & (coord.index <= 2880)]

Unnamed: 0_level_0,id,locality,state,long,lat,dc,type,status,sa3,sa3name,...,SA3_NAME_2016,SA4_CODE_2016,SA4_NAME_2016,RA_2011,RA_2016,MMM_2015,MMM_2019,ced,altitude,chargezone
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000,20208,BARANGAROO,NSW,151.201580,-33.860520,Sydney Metro,Delivery Area,Updated 6-Feb-2020,11703.0,Sydney Inner City,...,Sydney Inner City,117.0,Sydney - City and Inner South,1.0,1.0,1.0,1.0,Sydney,,N1
2000,4478,DARLING HARBOUR,NSW,151.256649,-33.859953,Sydney Metro,,Updated 6-Feb-2020,11703.0,Sydney Inner City,...,Sydney Inner City,117.0,Sydney - City and Inner South,1.0,1.0,1.0,1.0,Sydney,,N1
2000,4479,DAWES POINT,NSW,151.256649,-33.859953,WATERLOO DELIVERY FACILITY,Delivery Area,Updated 6-Feb-2020,11703.0,Sydney Inner City,...,Sydney Inner City,117.0,Sydney - City and Inner South,1.0,1.0,1.0,1.0,Sydney,,N1
2000,4480,HAYMARKET,NSW,151.256649,-33.859953,WATERLOO DELIVERY FACILITY,Delivery Area,Updated 6-Feb-2020,11703.0,Sydney Inner City,...,Sydney Inner City,117.0,Sydney - City and Inner South,1.0,1.0,1.0,1.0,Sydney,,N1
2000,4481,MILLERS POINT,NSW,151.256649,-33.859953,WATERLOO DELIVERY FACILITY,Delivery Area,Updated 6-Feb-2020,11703.0,Sydney Inner City,...,Sydney Inner City,117.0,Sydney - City and Inner South,1.0,1.0,1.0,1.0,Sydney,,N1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2880,20747,PACKSADDLE,NSW,142.170513,-30.441973,Broken Hill,Delivery Area,Updated 6-Feb-2020,10502.0,Broken Hill and Far West,...,Broken Hill and Far West,105.0,Far West and Orana,4.0,4.0,6.0,6.0,Parkes,192.477799,S2
2880,4986,SILVERTON,NSW,142.203381,-30.170441,BROKEN HILL DELIVERY,Delivery Area,Updated 6-Feb-2020,10502.0,Broken Hill and Far West,...,Broken Hill and Far West,105.0,Far West and Orana,4.0,4.0,6.0,6.0,Parkes,192.477799,S2
2880,4987,SOUTH BROKEN HILL,NSW,142.203381,-30.170441,BROKEN HILL SOUTH LPO,Delivery Area,Updated 6-Feb-2020,10502.0,Broken Hill and Far West,...,Broken Hill and Far West,105.0,Far West and Orana,3.0,3.0,3.0,3.0,Parkes,192.477799,S2
2880,4988,STEPHENS CREEK,NSW,142.203381,-30.170441,BROKEN HILL DELIVERY,Delivery Area,Updated 6-Feb-2020,10502.0,Broken Hill and Far West,...,Broken Hill and Far West,105.0,Far West and Orana,4.0,4.0,6.0,6.0,Parkes,192.477799,S2
