This notebook is for cleaning __Construction cost index__ and __Population data__  

- Construction cost index is fairly straightforward
- Population data has a lot of categories

##### To do:
- Decide which population data to include
- How to merge 
- Clean LGA Map data inconsistencies:
    - Albury City vs Albury
    - City of parramatta vs parramatta
    - Nambucca Valley vs Nambucca
    - 'unincorporated NSW' whatever that is
    - Lord Howe

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

# Construction cost index

In [2]:
# --read file, --rename columns
construction_file = "Files/Construction/Quarterly, Building construction prices rose, due to Homebuilder grants and government infrastructure investment.xlsx"
df_cons = pd.read_excel(construction_file,header=1,usecols="A:B", skipfooter=2)
df_cons.columns=['date','constr_index']

In [3]:
# --convert to datetime
df_cons['date'] = pd.to_datetime(df_cons['date'],format='%b-%y')

# --get year and quarter, --concatenate as time_period format, --drop other columns
df_cons['year'] = df_cons.date.dt.year
df_cons['quarter'] = df_cons.date.dt.quarter
df_cons['time_period'] = df_cons.year.map(str) + " Q" + df_cons.quarter.map(str)
df_cons_clean = df_cons.drop(columns=['date','year','quarter'],axis=1)
df_cons_clean.head()

Unnamed: 0,constr_index,time_period
0,100.1,2012 Q2
1,100.3,2012 Q3
2,100.2,2012 Q4
3,101.0,2013 Q1
4,101.6,2013 Q2


#### Join statement:
replace the two xxx with master dataframe

xxx = pd.merge(xxx, df_cons_clean, on='time_period',how='left')

# Population

### LGA to Postcode mapping file

Do we need suburb name?

In [4]:
SuburbLGA = "Files/Area/Postcode_and_LGA.xlsx"
postcodeLGA = pd.read_excel(SuburbLGA, usecols = "A, C, D") #suburbname optional

postcodeLGA = postcodeLGA.dropna()
postcodeLGA["postcode"] = postcodeLGA["postcode"].astype(int)
postcodeLGA["lganame"] = postcodeLGA.lganame.str.title()
postcodeLGA.head()

Unnamed: 0,lganame,suburbname,postcode
0,Albury City,ALBURY,2640
1,Albury City,EAST ALBURY,2640
2,Albury City,ETTAMOGAH,2640
3,Albury City,GLENROY,2640
4,Albury City,HAMILTON VALLEY,2641


### Household per LGA, 2016 and 2021

Long format

In [5]:
popfile = "Files/Population/2019 NSW Population Projections ASGS 2019 LGA.xlsx"
df_hhold = pd.read_excel(popfile,sheet_name='LGA Household Totals',header=6,usecols="A:C",skipfooter=3)

# --convert wide to long format with melt, --rename cols, --clean LGA name
df_hhold = pd.melt(df_hhold, id_vars='Counting households', value_vars=[2016,2021])
df_hhold.columns=['LGA','year','hhold_count']
df_hhold['LGA'] = df_hhold.LGA.str.split('(').str.get(0)
df_hhold

Unnamed: 0,LGA,year,hhold_count
0,Albury,2016,21940
1,Armidale Regional,2016,11755
2,Ballina,2016,18178
3,Balranald,2016,963
4,Bathurst Regional,2016,16105
...,...,...,...
253,Wingecarribee,2021,20577
254,Wollondilly,2021,18402
255,Wollongong,2021,87168
256,Woollahra,2021,24009


Wide format

In [6]:
popfile = "Files/Population/2019 NSW Population Projections ASGS 2019 LGA.xlsx"
df_hhold_wide = pd.read_excel(popfile,sheet_name='LGA Household Totals',header=6,usecols="A:C",skipfooter=3)
df_hhold_wide.columns=['LGA','hhold_count_2016','hhold_count_2021']
df_hhold_wide['LGA'] = df_hhold.LGA.str.split('(').str.get(0)

#optional
df_hhold_wide['hhold_count_delta'] = df_hhold_wide.hhold_count_2021 - df_hhold_wide.hhold_count_2016

df_hhold_wide.head()

Unnamed: 0,LGA,hhold_count_2016,hhold_count_2021,hhold_count_delta
0,Albury,21940,23227,1287
1,Armidale Regional,11755,13041,1286
2,Ballina,18178,19080,902
3,Balranald,963,1015,52
4,Bathurst Regional,16105,17351,1246


Check LGA Map name vs household count LGA Name

In [7]:
lgamap = pd.Series(postcodeLGA.lganame.unique())
lgadf = pd.Series(df_hhold.LGA.unique())

lgacomps = pd.concat([lgamap,lgadf],axis=1)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(lgacomps)

                                          0                               1
0                               Albury City                         Albury 
1                         Armidale Regional              Armidale Regional 
2                                   Ballina                        Ballina 
3                                 Balranald                      Balranald 
4                         Bathurst Regional              Bathurst Regional 
5                                   Bayside                        Bayside 
6                               Bega Valley                    Bega Valley 
7                                 Bellingen                      Bellingen 
8                                  Berrigan                       Berrigan 
9                                 Blacktown                      Blacktown 
10                                    Bland                          Bland 
11                                  Blayney                        Blayney 
12          

There are some inconsistencies with LGA Mapping
- Albury City vs Albury
- City of parramatta vs parramatta
- Nambucca Valley vs Nambucca
- 'unincorporated NSW' 
- Lord Howe Island - Unincorporated Area

### Population movement in 5 year period


In [8]:
df_move = pd.read_excel(popfile,sheet_name='LGA population accounts', header=5, skipfooter=3, usecols="A:C")
df_move.columns=['LGA','pop_move','2016-2021']
df_move.head() 

Unnamed: 0,LGA,pop_move,2016-2021
0,Albury (C),Population at Start of Period,52171
1,Albury (C),Births,3390
2,Albury (C),Deaths,2219
3,Albury (C),Natural change,1171
4,Albury (C),Net Migration (all sources),1031


In [9]:
df_move_melt = pd.melt(df_move,id_vars=['LGA','pop_move'], value_vars=['2016-2021'], var_name='year')
df_move_pivot = df_move_melt.pivot(index=['LGA','year'], columns='pop_move', values='value').reset_index()
df_move_pivot['LGA'] = df_move_pivot.LGA.str.split('(').str.get(0)
df_move_pivot['pop_delta'] = df_move_pivot['Population at End of Period'] - df_move_pivot['Population at Start of Period']
df_move_pivot.head()

pop_move,LGA,year,Births,Deaths,Natural change,Net Migration (all sources),Population at End of Period,Population at Start of Period,pop_delta
0,Albury,2016-2021,3390.0,2219.0,1171.0,1031.0,54374.0,52171.0,2203.0
1,Armidale Regional,2016-2021,1768.0,1266.0,502.0,1921.0,32736.0,30313.0,2423.0
2,Ballina,2016-2021,1790.0,2491.0,-701.0,1945.0,44237.0,42993.0,1244.0
3,Balranald,2016-2021,194.0,96.0,98.0,8.0,2437.0,2330.0,107.0
4,Bathurst Regional,2016-2021,2500.0,1710.0,790.0,1277.0,44310.0,42244.0,2066.0


### Population Age

In [10]:
popfile = "Files/Population/2019 NSW Population Projections ASGS 2019 LGA.xlsx"
df_age = pd.read_excel(popfile,sheet_name='LGA Sex Age projections',header=5,usecols="A:E",skipfooter=3)
df_age.columns=['LGA','sex','age','2016','2021']
df_age['age_delta'] = df_age['2021'] - df_age['2016']
df_age['LGA'] = df_age.LGA.str.split('(').str.get(0)
df_age

Unnamed: 0,LGA,sex,age,2016,2021,age_delta
0,Albury,Female,00-04,1693,1661,-32
1,Albury,Female,05-09,1597,1694,97
2,Albury,Female,10-14,1617,1647,30
3,Albury,Female,15-19,1705,1724,19
4,Albury,Female,20-24,1928,1785,-143
...,...,...,...,...,...,...
4639,Yass Valley,Male,65-69,472,484,12
4640,Yass Valley,Male,70-74,335,433,98
4641,Yass Valley,Male,75-79,209,295,86
4642,Yass Valley,Male,80-84,128,160,32


In [11]:
df_age_pivot = pd.pivot_table(df_age,index=['LGA','age'], values=['2016','2021','age_delta'], 
               aggfunc=({'2016':np.sum, '2021':np.sum, 'age_delta':np.sum})).reset_index()
df_age_pivot.head()

Unnamed: 0,LGA,age,2016,2021,age_delta
0,Albury,00-04,3505,3401,-104
1,Albury,05-09,3279,3510,231
2,Albury,10-14,3228,3370,142
3,Albury,15-19,3381,3306,-75
4,Albury,20-24,3744,3448,-296


In [12]:
df_age_pivot.age.unique()

array(['00-04', '05-09', '10-14', '15-19', '20-24', '25-29', '30-34',
       '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69',
       '70-74', '75-79', '80-84', '85+'], dtype=object)

Group  
0-14: Child  
15-24: Youth  
25-49: Adult  
50-64: Middle Age  
65+ : Senior  

In [13]:
#clusters

Child = df_age_pivot.age.unique()[:3]
Youth = df_age_pivot.age.unique()[3:5]
Adult = df_age_pivot.age.unique()[5:9]
MiddleAge = df_age_pivot.age.unique()[9:13]
Senior = df_age_pivot.age.unique()[13:]

print('Child',Child)
print('Youth',Youth)
print('Adult',Adult)
print('MiddleAge',MiddleAge)
print('Senior',Senior)

Child ['00-04' '05-09' '10-14']
Youth ['15-19' '20-24']
Adult ['25-29' '30-34' '35-39' '40-44']
MiddleAge ['45-49' '50-54' '55-59' '60-64']
Senior ['65-69' '70-74' '75-79' '80-84' '85+']


In [14]:
age_categ = [df_age_pivot['age'].isin(Child),
             df_age_pivot['age'].isin(Youth),
             df_age_pivot['age'].isin(Adult),
             df_age_pivot['age'].isin(MiddleAge),
             df_age_pivot['age'].isin(Senior)]
age_output = ['Child','Youth','Adult','MiddleAge','Senior']

df_age_pivot['age_bracket'] = np.select(age_categ,age_output)
df_age_pivot.head(20)

Unnamed: 0,LGA,age,2016,2021,age_delta,age_bracket
0,Albury,00-04,3505,3401,-104,Child
1,Albury,05-09,3279,3510,231,Child
2,Albury,10-14,3228,3370,142,Child
3,Albury,15-19,3381,3306,-75,Youth
4,Albury,20-24,3744,3448,-296,Youth
5,Albury,25-29,3485,3505,20,Adult
6,Albury,30-34,3400,3543,143,Adult
7,Albury,35-39,3143,3526,383,Adult
8,Albury,40-44,3206,3145,-61,Adult
9,Albury,45-49,3330,3284,-46,MiddleAge
