**The objective of this notebook is to collate the code for cleaning the data below:**
1. Bond yields (F)
2. Interest rate (F)
3. Population (K)
4. Construction (K)
5. Weekly income (A)
6. Household size (A)

(please add features to the list if there's any additional ones)

In [1]:
import pandas as pd

### 1. Bond Yields ###
Please paste cleaning codes below

### 2. Interest Rate ###
Please paste cleaning codes below

### 3. Population / Age bands 
Please paste cleaning codes below

### 4. Construction
Please paste cleaning codes below

### 5. Weekly Income

In [2]:
# Read the raw census extract into a DataFrame, keeping only the columns we need
census_INCP = "Files/Census/POA (UR) by INCP Toal Personal Income (Weekly).csv"

# Shorter column names for easier referencing
incp_cols = {'POA (UR)':'postcode', 'INCP Total Personal Income (weekly)':'INCP_WK'}

incp_raw = pd.read_csv(
    census_INCP,
    skiprows=9,
    nrows=11142,
    usecols=['POA (UR)', 'INCP Total Personal Income (weekly)', 'Count'],
).rename(columns=incp_cols)

# Sum the counts per (postcode, income bracket), unstack so that each income
# bracket becomes its own column, then drop the last row (grand total)
incp = (
    incp_raw
    .groupby(['postcode','INCP_WK'])['Count']
    .sum()
    .unstack()
    .iloc[:-1]
)

incp.head(2)

INCP_WK,"$1,000-$1,249 ($52,000-$64,999)","$1,250-$1,499 ($65,000-$77,999)","$1,500-$1,749 ($78,000-$90,999)","$1,750-$1,999 ($91,000-$103,999)","$1-$149 ($1-$7,799)","$150-$299 ($7,800-$15,599)","$2,000-$2,999 ($104,000-$155,999)","$3,000 or more ($156,000 or more)","$300-$399 ($15,600-$20,799)","$400-$499 ($20,800-$25,999)","$500-$649 ($26,000-$33,799)","$650-$799 ($33,800-$41,599)","$800-$999 ($41,600-$51,999)",Negative income,Nil income,Not applicable,Not stated,Total
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
"2000, NSW",1676,1028,977,677,502,1112,1456,1796,1978,2201,1915,1599,1638,204,3297,1251,4115,27411
"2006, NSW",11,9,9,3,275,291,4,11,98,47,31,5,3,9,267,15,156,1261


In [3]:
# Remove 'NSW' from the postcode labels and cast postcode to int64.
# Move postcode out of the index so it can be edited as a regular column.
incp = incp.reset_index()

# Keep only the text before the first comma (e.g. "2000, NSW" -> "2000").
# The first piece is selected explicitly with .str[0]: assigning the
# two-column frame produced by split(..., expand=True) to a single column
# raises "Columns must be same length as key" on current pandas.
incp['postcode'] = incp['postcode'].str.split(",", n=1).str[0].astype('int64')

# Restore postcode as the index
incp = incp.set_index('postcode')

In [4]:
# Clean column names: map each source bracket label to a short weekly-range label.
# Labels carry no leading/trailing whitespace so they can be referenced reliably.
income_cols= {'$1,000-$1,249 ($52,000-$64,999)' : '$1000-1249', 
            '$1,250-$1,499 ($65,000-$77,999)' : '$1250-1499',
            '$1,500-$1,749 ($78,000-$90,999)' : '$1500-1749', 
            '$1,750-$1,999 ($91,000-$103,999)': '$1750-1999',
            '$1-$149 ($1-$7,799)': '$1-149', 
            '$150-$299 ($7,800-$15,599)' : '$150-299',
            '$2,000-$2,999 ($104,000-$155,999)':'$2000-2999',
            '$3,000 or more ($156,000 or more)':'>=$3000', 
            '$300-$399 ($15,600-$20,799)':'$300-399',
            '$400-$499 ($20,800-$25,999)':'$400-499', 
            '$500-$649 ($26,000-$33,799)':'$500-649',
            '$650-$799 ($33,800-$41,599)':'$650-799', 
            '$800-$999 ($41,600-$51,999)':'$800-999'}
incp = incp.rename(columns=income_cols)

# Combine 'Not applicable' and 'Not stated' into 'total_na'
incp['total_na'] = incp['Not applicable'] + incp['Not stated']

# Drop the two source columns now folded into 'total_na', plus the 'Total' column
incp = incp.drop(columns=['Not applicable', 'Not stated', 'Total'])

# Reorder columns: ascending income brackets first, then the special categories.
# The bucket sums in the next cell rely on this exact positional order.
cols = ['$1-149','$150-299','$300-399','$400-499','$500-649','$650-799',
        '$800-999','$1000-1249','$1250-1499','$1500-1749',
        '$1750-1999','$2000-2999','>=$3000',
        'Negative income','Nil income','total_na']
# .copy() gives an independent frame so later column assignments are not
# made on a selection of the previous frame
incp = incp[cols].copy()

incp.head(1)

INCP_WK,$1-149,$150-299,$300-399,$400-499,$500-649,$650-799,$800-999,$1000-1249,$1250-1499,$1500-1749,$1750-1999,$2000-2999,>=$3000,Negative income,Nil income,total_na
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2000,502,1112,1978,2201,1915,1599,1638,1676,1028,977,677,1456,1796,204,3297,5366


In [5]:
# Create income buckets and save into incp_gr.
# Each bucket is the row-wise sum over a positional range of incp's columns.
bucket_spans = {
    'INCP_LOW': slice(0, 6),
    'INCP_MID': slice(6, 10),
    'INCP_HIGH': slice(10, 13),
    'INCP_NEG_NIL': slice(13, 15),
}
for bucket_name, span in bucket_spans.items():
    incp[bucket_name] = incp.iloc[:, span].sum(axis=1)

# Keep only the bucket columns and turn postcode back into a regular column
incp_gr = incp[list(bucket_spans)].reset_index()

incp_gr.head(1)

INCP_WK,postcode,INCP_LOW,INCP_MID,INCP_HIGH,INCP_NEG_NIL
0,2000,9307,5319,3929,3501


*The resulting cleaned df is <b>incp_gr</b>*

----

### 6. Household size

In [6]:
# Location of the census household-size workbook
census_cprf = "Files/Census/POA by CPRF Count of Persons in Family by STATE.xlsx"

# Mapping from the source column headers to short names
cprf_cols= {'Unnamed: 1' : 'postcode', 
            'Two persons in family' : 'CPRF_2',
            'Three persons in family' : 'CPRF_3', 
            'Four persons in family': 'CPRF_4',
            'Five persons in family': 'CPRF_5', 
            'Six or more persons in family' : 'CPRF_6+',
            'Not applicable':'CPRF_na',
            'Total' :'CPRF_TOTAL_FAM_NO'}

# Read the sheet, strip the redundant first row and first column, then rename
cprf = (
    pd.read_excel(census_cprf, sheet_name="Data Sheet 0", skiprows=9, nrows=619)
    .iloc[1:]                                         # remove the first row
    .drop(columns='CPRF Count of Persons in Family')  # remove the first column
    .rename(columns=cprf_cols)
)

cprf.head(1)

Unnamed: 0,postcode,CPRF_2,CPRF_3,CPRF_4,CPRF_5,CPRF_6+,CPRF_na,CPRF_TOTAL_FAM_NO
1,"2000, NSW",3453.0,857.0,354.0,54.0,21.0,8125.0,12861.0


In [7]:
# Remove 'NSW' from the postcode labels, cast postcode to int64 and use it as the index.
# Only the text before the first comma is kept (e.g. "2000, NSW" -> "2000").
# The first piece is selected explicitly with .str[0]: assigning the
# two-column frame produced by split(..., expand=True) to a single column
# raises "Columns must be same length as key" on current pandas.
# set_index replaces the previous row index, so no reset_index / drop of a
# helper 'index' column is needed.
cprf = (
    cprf
    .assign(postcode=cprf['postcode'].str.split(",", n=1).str[0].astype('int64'))
    .set_index('postcode')
)

In [8]:
# Turn postcode back into a regular column so it can be used as a merge key
cprf = cprf.reset_index()
cprf.head(1)

Unnamed: 0,postcode,CPRF_2,CPRF_3,CPRF_4,CPRF_5,CPRF_6+,CPRF_na,CPRF_TOTAL_FAM_NO
0,2000,3453.0,857.0,354.0,54.0,21.0,8125.0,12861.0


*The resulting cleaned df is <b>cprf</b>*

----

### 7. Additional Feature 1 
Please paste cleaning codes below

### 8. Additional Feature 2
Please paste cleaning codes below

### 9. Additional Feature 3
Please paste cleaning codes below

## USE THE CELL BELOW TO MERGE FEATURES INTO THE MASTER DF (IGNORE FOR NOW)

--------