In [1]:
import pandas as pd
import numpy as np 

# STAGE II-1 CREATE MASTER SALES FILE

In [22]:
# Read sales 2017-2018

sales17_18 = "Files/sales_2017_2018"
sales17_18 = pd.read_csv(sales17_18, usecols=['postcode', 'dwelling_type', 'median_price', 'mean_price',
                                               'sales_no', 'Qdelta_median', 'Adelta_median', 'Qdelta_count',
                                               'Adelta_count', 'key', 'time_period', 'year', 'quarter'])

cols = ['postcode', 'key', 'time_period', 'year', 'quarter', 
        'dwelling_type', 'median_price', 'mean_price','sales_no', 
        'Qdelta_median', 'Adelta_median', 'Qdelta_count','Adelta_count' ]

sales17_18 = sales17_18[cols]
sales17_18.head(1)

Unnamed: 0,postcode,key,time_period,year,quarter,dwelling_type,median_price,mean_price,sales_no,Qdelta_median,Adelta_median,Qdelta_count,Adelta_count
0,2000,s122,2017 Q3,2017,Q3,Total,1350.0,1516.328059,135.0,0.1345,0.4746,-0.325,-0.3112


In [23]:
# Read sales 2019-2021

sales19_21 = "Files/sales_2019_2021"
sales19_21 = pd.read_csv(sales19_21, usecols=['postcode', 'dwelling_type', 'median_price', 'mean_price',
                                               'sales_no', 'Qdelta_median', 'Adelta_median', 'Qdelta_count',
                                               'Adelta_count', 'key', 'time_period', 'year', 'quarter'])

cols = ['postcode', 'key', 'time_period', 'year', 'quarter', 
        'dwelling_type', 'median_price', 'mean_price','sales_no', 
        'Qdelta_median', 'Adelta_median', 'Qdelta_count','Adelta_count' ]

sales19_21 = sales19_21[cols]
sales19_21.head(1)

Unnamed: 0,postcode,key,time_period,year,quarter,dwelling_type,median_price,mean_price,sales_no,Qdelta_median,Adelta_median,Qdelta_count,Adelta_count
0,2000,s128,2019 Q1,2019,Q1,Total,1160.0,1348.0,103.0,-0.0169,-0.1375,-0.1043,-0.1488


In [24]:
# Concatenate the two sales files 
sales_full = pd.concat([sales17_18, sales19_21])

# Check if all quarters are present
print(sales_full.groupby('time_period').size())

time_period
2017 Q3    1404
2017 Q4    1409
2018 Q1    1371
2018 Q2    1378
2018 Q3    1338
2018 Q4    1372
2019 Q1    1332
2019 Q2    1356
2019 Q3    1377
2019 Q4    1382
2020 Q1    1361
2020 Q2    1332
2020 Q3    1419
2020 Q4    1459
2021 Q1    1427
dtype: int64


In [25]:
# Check null values
sales_full.isnull().sum()

# We don't really care about null values in the delta columns right?

postcode            0
key                 0
time_period         0
year                0
quarter             0
dwelling_type       0
median_price        0
mean_price          0
sales_no            0
Qdelta_median    5857
Adelta_median    5861
Qdelta_count     5857
Adelta_count     5861
dtype: int64

In [None]:
sales_full.to_csv('Files/Cleaned/Sales_2017Q3_2021Q1_Clean.csv', index=False)

# STAGE II-2 CREATE MASTER RENT FILE

In [6]:
# Read rent 2017-2018
rent17_18 = "Files/rent_2017_2018"
rent17_18 = pd.read_csv(rent17_18)

# Read rent 2019-2021
rent19_21 = "Files/rent_2019_2021"
rent19_21 = pd.read_csv(rent19_21)

# Concat both rent files
rent_full = pd.concat([rent17_18, rent19_21])
rent_full.head()

Unnamed: 0,postcode,median_rent_newb,new_bonds_no,total_bonds_no,Qdelta_median_rent,Qdelta_new_bonds,key,Adelta_median_rent,Adelta_new_bonds
0,2000,640.0,1169.0,7914.0,-0.2,0.5545,r122,,
1,2007,535.0,301.0,2231.0,-0.1371,1.1049,r122,,
2,2008,479.0,762.0,5020.0,-0.1812,1.3374,r122,,
3,2009,745.0,232.0,2966.0,-0.0067,-0.072,r122,,
4,2010,650.0,805.0,8978.0,-0.0152,-0.0451,r122,,


In [7]:
# Check null values
rent_full.isnull().sum()

postcode                 0
median_rent_newb         0
new_bonds_no             0
total_bonds_no           0
Qdelta_median_rent    3187
Qdelta_new_bonds      3066
key                      0
Adelta_median_rent    3179
Adelta_new_bonds      3179
dtype: int64

In [8]:
# Check all quarters are presented
rent_full.groupby('key').size()

key
r122    612
r123    612
r124    637
r125    639
r126    638
r127    636
r128    637
r129    640
r130    641
r131    641
r132    619
r133    617
r134    617
r135    620
r136    617
dtype: int64

**@Chirs** Is r136 Q2 2021? I didn't see Q2 2021 rent file in the 'rent/2019-2021' folder

In [9]:
# Need to map keys to time_periods so that can merge it with sales data

tp = ['2017 Q4', 
      '2018 Q1', '2018 Q2', '2018 Q3', '2018 Q4', 
      '2019 Q1', '2019 Q2', '2019 Q3', '2019 Q4', 
      '2020 Q1', '2020 Q2', '2020 Q3', '2020 Q4', 
      '2021 Q1', '2021 Q2']
rkeys = ['r122', 
         'r123','r124','r125','r126',
         'r127','r128','r129','r130',
         'r131','r132','r133','r134',
         'r135','r136']

rent_full['time_period'] = np.nan

for i in list(range(0,15)):
    rent_full.loc[rent_full['key']==rkeys[i], 'time_period']=tp[i]
    
rent_full.groupby('time_period').size()

time_period
2017 Q4    612
2018 Q1    612
2018 Q2    637
2018 Q3    639
2018 Q4    638
2019 Q1    636
2019 Q2    637
2019 Q3    640
2019 Q4    641
2020 Q1    641
2020 Q2    619
2020 Q3    617
2020 Q4    617
2021 Q1    620
2021 Q2    617
dtype: int64

In [10]:
# Update column name of 'key'
rent_full = rent_full.rename(columns={'key':'rkey'})


# Change columns order
cols = ['postcode', 'rkey', 'time_period', 'median_rent_newb', 'new_bonds_no', 'total_bonds_no',
       'Qdelta_median_rent', 'Qdelta_new_bonds', 'Adelta_median_rent',
       'Adelta_new_bonds']
rent_full = rent_full[cols]

rent_full.head()

Unnamed: 0,postcode,rkey,time_period,median_rent_newb,new_bonds_no,total_bonds_no,Qdelta_median_rent,Qdelta_new_bonds,Adelta_median_rent,Adelta_new_bonds
0,2000,r122,2017 Q4,640.0,1169.0,7914.0,-0.2,0.5545,,
1,2007,r122,2017 Q4,535.0,301.0,2231.0,-0.1371,1.1049,,
2,2008,r122,2017 Q4,479.0,762.0,5020.0,-0.1812,1.3374,,
3,2009,r122,2017 Q4,745.0,232.0,2966.0,-0.0067,-0.072,,
4,2010,r122,2017 Q4,650.0,805.0,8978.0,-0.0152,-0.0451,,


In [None]:
# Save full rent data into csv
rent_full.to_csv('Files/Cleaned/Rent_2017Q4_2021Q2_Clean.csv', index=False)

# STAGE II-3 MERGE MASTER SALES AND RENT DATA

Note that we have sales data for Q3 2017 but don't have rent data for the same time period, for the sake of merging, we'll use sales data as the 'left' df and hence will remove data from Q3 2017.

In [30]:
# Remove Q3 2017 data from saels_full
sales_full = sales_full.drop(sales_full[sales_full['time_period']=='2017 Q3'].index)
sales_full.groupby('time_period').size()

time_period
2017 Q4    1409
2018 Q1    1371
2018 Q2    1378
2018 Q3    1338
2018 Q4    1372
2019 Q2    1284
2019 Q3    1377
2019 Q4    1382
2020 Q1    1361
2020 Q2    1332
2020 Q3    1419
2020 Q4    1459
2021 Q1    1427
dtype: int64

In [32]:
# Change the name of 'key' column from sales to 'skey' to differentiate from 'rkey' in rent
sales_full = sales_full.rename(columns={'key':'skey'})

In [39]:
sales_rent_full = pd.merge(sales_full, rent_full, how='left',
                           left_on=['postcode','time_period'],
                           right_on=['postcode', 'time_period'])

sales_rent_full.head()

Unnamed: 0,postcode,skey,time_period,year,quarter,dwelling_type,median_price,mean_price,sales_no,Qdelta_median,...,Qdelta_count,Adelta_count,rkey,median_rent_newb,new_bonds_no,total_bonds_no,Qdelta_median_rent,Qdelta_new_bonds,Adelta_median_rent,Adelta_new_bonds
0,2000,s123,2017 Q4,2017,Q4,Total,1315.0,1823.0,183.0,-0.0259,...,0.3456,-0.0758,r122,640.0,1169.0,7914.0,-0.2,0.5545,,
1,2000,s123,2017 Q4,2017,Q4,Non Strata,4156.0,4497.0,20.0,,...,,0.3333,r122,640.0,1169.0,7914.0,-0.2,0.5545,,
2,2000,s123,2017 Q4,2017,Q4,Strata,1160.0,1340.0,155.0,-0.1407,...,0.1397,-0.1243,r122,640.0,1169.0,7914.0,-0.2,0.5545,,
3,2007,s123,2017 Q4,2017,Q4,Total,770.0,630.0,20.0,-0.0581,...,-0.4167,-0.7375,r122,535.0,301.0,2231.0,-0.1371,1.1049,,
4,2007,s123,2017 Q4,2017,Q4,Strata,770.0,630.0,20.0,-0.0581,...,-0.4167,-0.7342,r122,535.0,301.0,2231.0,-0.1371,1.1049,,


In [44]:
print(sales_rent_full.shape)
print(sales_rent_full.groupby(''))

(17909, 21)
postcode                 0
skey                     0
time_period              0
year                     0
quarter                  0
dwelling_type            0
median_price             0
mean_price               0
sales_no                 0
Qdelta_median         5017
Adelta_median         5020
Qdelta_count          5017
Adelta_count          5020
rkey                     6
median_rent_newb         6
new_bonds_no             6
total_bonds_no           6
Qdelta_median_rent    3560
Qdelta_new_bonds      3371
Adelta_median_rent    3540
Adelta_new_bonds      3540
dtype: int64


In [46]:
print(sales_rent_full.isnull().sum())

postcode                 0
skey                     0
time_period              0
year                     0
quarter                  0
dwelling_type            0
median_price             0
mean_price               0
sales_no                 0
Qdelta_median         5017
Adelta_median         5020
Qdelta_count          5017
Adelta_count          5020
rkey                     6
median_rent_newb         6
new_bonds_no             6
total_bonds_no           6
Qdelta_median_rent    3560
Qdelta_new_bonds      3371
Adelta_median_rent    3540
Adelta_new_bonds      3540
dtype: int64


### @all: do we just drop the 6 nulls in rent?

In [45]:
# Check the 6 postcodes that are null in rent:
sales_rent_full.loc[sales_rent_full['rkey'].isnull()]

Unnamed: 0,postcode,skey,time_period,year,quarter,dwelling_type,median_price,mean_price,sales_no,Qdelta_median,...,Qdelta_count,Adelta_count,rkey,median_rent_newb,new_bonds_no,total_bonds_no,Qdelta_median_rent,Qdelta_new_bonds,Adelta_median_rent,Adelta_new_bonds
1329,2817,s123,2017 Q4,2017,Q4,Total,714.0,714.0,5.0,,...,,,,,,,,,,
1330,2817,s123,2017 Q4,2017,Q4,Non Strata,750.0,801.5,5.0,,...,,,,,,,,,,
1407,3644,s123,2017 Q4,2017,Q4,Total,475.0,484.0,20.0,0.8627,...,0.3333,-0.0769,,,,,,,,
1408,3644,s123,2017 Q4,2017,Q4,Non Strata,475.0,484.0,20.0,0.8627,...,0.3333,-0.0769,,,,,,,,
2778,3644,s124,2018 Q1,2018,Q1,Total,260.0,326.0,20.0,-0.4526,...,-0.0833,0.8333,,,,,,,,
2779,3644,s124,2018 Q1,2018,Q1,Non Strata,260.0,326.0,20.0,-0.4526,...,-0.0833,0.8333,,,,,,,,


In [None]:
sales_rent_full.to_csv('Files/Cleaned/Rent_2017Q4_2021Q2_Clean.csv', index=False)


<BR>

# BELOW ARE CODES FOR STAGE I CLEANING
______

## STEP1. CLEAN SALES DATA (SINGLE QUARTER) #

In [None]:
# Read one file at a time into df, name df by 's'+'issue number ''
s136 = "Files/Sales/2019_2021/Issue-136-Sales-tables-March-2021-quarter.xlsx"
s136 = pd.read_excel(s136, sheet_name="Postcode", na_values='-', header=6)

In [None]:
# Rename columns
rename_cols= {'Postcode':'postcode', 
             'Dwelling Type':'dwelling_type', 
             "First Quartile Sales Price\n$'000s" : '25%_price',
             "Median Sales Price\n$'000s" : 'median_price', 
             "Third Quartile Sales Price\n'000s" : '75%_price',
             "Mean Sales Price\n$'000s" : 'mean_price',
             'Sales\nNo.':'sales_no',
             'Qtly change in Median':'Qdelta_median_price',
             'Annual change in Median':'Adelta_median_price',
             'Qtly change in Count':'Qdelta_sales_no',
             'Annual change in Count':'Adelta_sales_no'}
s136.rename(columns=rename_cols, inplace=True)

# Drop unwanted columns
s136 = s136.drop(columns=['25%_price', '75%_price'], axis=1)

s136.head(5)

In [None]:
print("number of postcodes:", s136['postcode'].nunique())
print("number of rows:", s136.shape[0],'\n')
print(s136.isnull().sum())

<b>IMPUTATION</b>

Roughly 1/4 of the data contains null values, which comes from:
<em>"for confidentiality, we don't report sales in any geographical area where the number of sales is 10 or less (shown as na).</em> Also: <em>"Statistics calculated from sample sizes between 10 an 30 are shown by an 's' in the relevant table".</em> This will be the primary rules of imputation:
* **For 'sales_no':**
    * Replace na with 5
    * Replace s with 20
    
* **For 'median_price' and 'mean_price'**
    * Fill na with the median of price of the same dwelling type

In [None]:
# Imputation of sales number

# na
s136.loc[s136['sales_no'].isnull(), 'sales_no'] = 5.0
s136['sales_no'].isnull().any()

# s
s136.loc[s136['sales_no'] == 's', 'sales_no'] = 20.0
s136['sales_no'] = s136['sales_no'].astype(float) # Cast type as float

print("number of na in sales_no:", s136['sales_no'].isnull().sum())
print("data type of sales_no:", s136['sales_no'].dtype)

In [None]:
# Imputation of median_price (by dwelling type)

# Total
imp_total = s136.loc[(s136['median_price'].notna()) & (s136['dwelling_type']=='Total'),
                     'median_price'].median() # calculate imputer value 
    
s136.loc[(s136['median_price'].isnull()) & (s136['dwelling_type']=='Total'),
         'median_price']= imp_total #impute

# Strata 
imp_strata = s136.loc[(s136['median_price'].notna()) & (s136['dwelling_type']=='Strata'),
                     'median_price'].median() 
    
s136.loc[(s136['median_price'].isnull()) & (s136['dwelling_type']=='Strata'),
         'median_price']= imp_strata 

# Non-Strata
imp_ns = s136.loc[(s136['median_price'].notna()) & (s136['dwelling_type']=='Non Strata'),
                     'median_price'].median() 
    
s136.loc[(s136['median_price'].isnull()) & (s136['dwelling_type']=='Non Strata'),
         'median_price']= imp_ns

print("number of na in median_price:", s136['median_price'].isnull().sum())

In [None]:
# Imputation of mean_price (by dwelling type)

# Total
imp_total = s136.loc[(s136['mean_price'].notna()) & (s136['dwelling_type']=='Total'),
                     'mean_price'].median() # calculate imputer value 
    
s136.loc[(s136['mean_price'].isnull()) & (s136['dwelling_type']=='Total'),
         'mean_price']= imp_total #impute

# Strata 
imp_strata = s136.loc[(s136['mean_price'].notna()) & (s136['dwelling_type']=='Strata'),
                     'mean_price'].median() 
    
s136.loc[(s136['mean_price'].isnull()) & (s136['dwelling_type']=='Strata'),
         'mean_price']= imp_strata 

# Non-Strata
imp_ns = s136.loc[(s136['mean_price'].notna()) & (s136['dwelling_type']=='Non Strata'),
                  'mean_price'].median() 
    
s136.loc[(s136['mean_price'].isnull()) & (s136['dwelling_type']=='Non Strata'),
         'mean_price']= imp_ns

print("number of na in mean_price:", s136['mean_price'].isnull().sum())

In [None]:
s136.describe().round(2)

**ADD TIME PERIOD TAG**

In [None]:
s136['key_s'] = 's136'
s136['time_period'] = '2021 Q1'
s136['year'] = '2021'
s136['quarter'] = '1'

print(s136.shape)

In [None]:
s136

### End of the sales data cleaning process ###
----

# STEP2. CLEAN RENT DATA (SINGLE QUARTER) #


In [None]:
# Read into df
r135 = "Files/Rent/2019_2021/Issue-135-Rent-tables-March-2021-quarter.xlsx"
r135 = pd.read_excel(r135, sheet_name="Postcode", na_values='-', header=7)

# Drop unwanted columns
r135 = r135.drop(columns=['First Quartile Weekly Rent for New Bonds\n$',
                          'Third Quartile Weekly Rent for New Bonds\n$'],
                axis=1)

# Rename columns
rename_cols= {'Postcode':'postcode',
              'Dwelling Types':'dwelling_type', 
              'Number of Bedrooms':'bed_number',
              'Median Weekly Rent for New Bonds\n$': 'median_rent_newb',
              'New Bonds Lodged\nNo.' : 'new_bonds_no',
              'Total Bonds Held\nNo.': 'total_bonds_no',
              'Quarterly change in Median Weekly Rent':'Qdelta_median_rent',
              'Annual change in Median Weekly Rent':'Adelta_median_rent',
              'Quarterly change in New Bonds Lodged':'Qdelta_new_bonds',
              'Annual change in New Bonds Lodged':'Adelta_new_bonds'}
r135.rename(columns=rename_cols,inplace=True)

r135.head(10)

In [None]:
# Check df shape and null values
print(r135.shape)
print(r135.isnull().sum())

**NOTE:**

Note that an alarming 3/4 of the data has null values. This is because the data is broken down to very granular level - first by dwelling type (Total, house, townhouse, flat/unit, other) and then by bed_numbers (see below cell).

In [None]:
print(r135.groupby('dwelling_type').size(),'\n')
print(r135.groupby('bed_number').size())

**AGGREGATION:**

Given the amount of na present, we'll **aggregate the data to the total level of each postcode** (i.e. only consider the total number of bonds without classifying them into dwelling types nor bedroom numbers) to avoid excessive imputation. Although we'll inevitably lose the richness of information, the main function of the rent data is to serve as an input variable in the prediction of sales, where this level of granularity is not strictly necessary.

In [None]:
# Aggregate dwelling type and bed number, save as new df r135_ag

r135_ag = r135.loc[(r135['bed_number']=='Total') & (r135['dwelling_type']=='Total')]
r135_ag = r135_ag.drop(columns=['bed_number','dwelling_type'], axis=1) # Drop bed_number and dwelling_type

print(r135_ag.shape)
print(r135_ag.isnull().sum())

By aggregating the data, we're able to bring down the proportion of na from 3/4 to around 1/3. But there's still need for imputation. According to the data interpretation note:

<em><b>"For confidentiality, we don't report rents in any geographical area where the number of new bonds is 10 or less (shown as na). Statistics calculated from sample sizes between 10 an 30 are shown by an 's' in the relevant table"</b></em>

In [None]:
print("number of 's' in new_bonds_no:", r135_ag.loc[r135_ag['new_bonds_no']=='s'].shape[0])
print("number of 's' in total_bonds_noA:", r135_ag.loc[r135_ag['total_bonds_no']=='s'].shape[0],"\n")
print("number of na in new_bonds_no:", r135_ag.loc[r135_ag['new_bonds_no'].isnull()].shape[0])
print("number of na in total_bonds_no:", r135_ag.loc[r135_ag['total_bonds_no'].isnull()].shape[0])

<b>IMPUTATION</b>
* For 'new_bonds_no' and 'total_bonds_no' columns:
    * Impute na with 5
    * Impute s with 20
    
* For 'median_rent_newb' column
    * Impute na with median of rents of all POAs

In [None]:
# Impute 's' in 'new_bonds_no' and 'total_bonds_no' with 20
r135_ag.loc[r135_ag['new_bonds_no']=='s','new_bonds_no'] = 20.0
r135_ag.loc[r135_ag['total_bonds_no']=='s', 'total_bonds_no'] = 20.0

# Impute na in 'new_bonds_no' and 'total_bonds_no' with 5
r135_ag.loc[r135_ag['new_bonds_no'].isnull(),'new_bonds_no'] = 5.0
r135_ag.loc[r135_ag['total_bonds_no'].isnull(), 'total_bonds_no'] = 5.0

# Cast both variables as float (was object)
r135_ag['new_bonds_no'] = r135_ag['new_bonds_no'].astype(float)
r135_ag['total_bonds_no'] = r135_ag['total_bonds_no'].astype(float)

In [None]:
# Impute na in 'median_rent' with median of the column
r135_ag['median_rent_newb'].fillna(r135_ag['median_rent_newb'].median(), inplace=True)

# Check na in the df again
print(r135_ag.isnull().sum())

In [None]:
# Prepare for merging
r135_ag['key_r'] = 'r135' # Add key

# Set postcode as index
r135_ag = r135_ag.set_index('postcode')
r135_ag.head(1)

In [None]:
r135_ag

### End of the rent data cleaning process ##
----



# STEP3. JOIN SALES AND RENT DATA OF THE SAME QUARTER #

### !! Please note !! ###
**A. the issue number of the sales and rent table of the same quarter are different**
<br>For example, for Q1 2021, the issue number of the sales table is 136 and that of the rent table is 135. The resulting data frames are s136 and r135_ag. For the same quarter, the issue number of sales table is always that of the rent+1 </br>

**B. the sales table and the rent table contains different numbers of postcodes**
<br>For example, s136 (sales table) contains 544 unique postcodes while r135 (rent table) contains 617. We will merge the rent table into the sales table, i.e. only keep postcodes that are available in the sales table </br>

In [None]:
# Merge the rent df into the sales df of the same quarter (Q1 2021)
q121 = s136.join(r135_ag, on='postcode')
q121.head(1)

# Note: naming convention of merged quarterly sales+rent df
# quarter in lowercase + 2digit year
# e.g. Q4 2018 will be q418

In [None]:
q121['postcode'].nunique()

# STEP4 Repeat above process to the sales and rent data of the rest quarters #

**@Chris** - I'm assuming that we will be able to do this easily with the functions created from step 1-3?

# STEP5 Concact merged sales and rent quarterly df into one master df #

In [None]:
# NOTE: Dummy code here 
frames = [q118, q218, q318, q418, 
          q119, q219, q319, q419,
          q120, q220, q320, q420,
          q121]
sr_master = pd.concat(frames)

# STEP 6 Add postcode to LGA Mapping to the master df #

In [None]:
# Read the lga-postcode data into df
mapping = "Files/Area/Postcode_and_LGA.xlsx"
lga_poa = pd.read_excel(mapping, sheet_name="SuburbLGA", 
                        usecols=['lganame','councilnam','suburbname','postcode'])

# Rename columns 
rename_cols= {'lganame':'LGA',
              'councilnam':'council', 
              'suburbname':'suburb'}
lga_poa.rename(columns=rename_cols,inplace=True)

lga_poa.head()

In [None]:
# Check the number of postcodes and null values

print(lga_poa['postcode'].nunique())
print(lga_poa.isnull().sum())

In [None]:
# Check null values in the postcode
lga_poa.loc[lga_poa['postcode'].isnull()]

In [None]:
# unincorporated LGA?
lga_poa.loc[lga_poa['LGA']=='UNINCORPORATED']