In [1]:
import pandas as pd

In [3]:
# Input CSV location and destination of the processed output
folder_path = "../Data/2016_GCP_SA2/2016_GCP_SA2/"
file_name = "2016Census_G10C_AUS_SA2.csv"
file_path = f"{folder_path}{file_name}"

output_folder = "../Data/"
output_name = "census_year_of_arrival.csv"

# Columns read from the file: the SA2 key followed by the arrival-period totals
key_columns = ["SA2_MAINCODE_2016"]
file_columns = [
    "Tot_Before_1946",
    "Tot_1946_1955",
    "Tot_1956_1965",
    "Tot_1966_1975",
    "Tot_1976_1985",
    "Tot_1986_1995",
    "Tot_1996_2005",
    "Tot_2006_2010",
    "Tot_2011",
    "Tot_2012",
    "Tot_2013",
    "Tot_2014",
    "Tot_2015",
    "Tot_2016",
    "Tot_Tot",
]
data_columns = [*key_columns, *file_columns]

In [2]:
def load_csv_data(data_path, columns = None, skip_rows = 0):
    """Read a CSV into a DataFrame whose column names are lower-cased.

    Parameters
    ----------
    data_path : str or file-like
        Passed straight to ``pd.read_csv``.
    columns : list, optional
        Column labels, spelled as in the file, to keep in the given order.
        ``None`` or an empty list keeps every column.
    skip_rows : int, optional
        Forwarded to ``pd.read_csv`` as ``skiprows``.

    Returns
    -------
    pandas.DataFrame
        The (optionally subset) data with every column label converted via
        ``str(label).lower()``.

    Notes
    -----
    The subset is taken with ``.loc`` before renaming, so ``columns`` must use
    the original spelling; pandas may raise ``KeyError`` for absent labels.
    """
    data = pd.read_csv(data_path, skiprows = skip_rows)
    # None replaces the former mutable default ([]); both mean "keep everything".
    if columns is not None and len(columns) > 0:
        data = data.loc[:, columns]
    # rename returns a new frame, so no in-place mutation is needed.
    return data.rename(columns = lambda label: str(label).lower())

def missing_values(data, columns = None):
    """Count the null entries in each requested column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    columns : list, optional
        Column labels to report on. ``None`` or an empty list yields an empty
        report.

    Returns
    -------
    pandas.DataFrame
        One row per requested column, in the order given, with the columns
        ``column_name`` and ``total_missing``. The index is a plain 0..n-1
        range, so each row has a unique label.
    """
    if columns is None:
        columns = []
    # Collect all records first and build the frame once; concatenating inside
    # the loop re-copies the accumulated rows on every iteration and leaves
    # every row with index 0.
    records = [
        {"column_name": column, "total_missing": data[column].isnull().sum()}
        for column in columns
    ]
    return pd.DataFrame(records, columns = ["column_name", "total_missing"])

In [22]:
# Read only the key and arrival-period columns from the source CSV
df = load_csv_data(data_path=file_path, columns=data_columns)

In [23]:
# Row count check: expect roughly 2310, one row per SA2.
df.shape[0]

2310

In [24]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,sa2_maincode_2016,tot_before_1946,tot_1946_1955,tot_1956_1965,tot_1966_1975,tot_1976_1985,tot_1986_1995,tot_1996_2005,tot_2006_2010,tot_2011,tot_2012,tot_2013,tot_2014,tot_2015,tot_2016,tot_tot
0,101021007,0,29,64,107,80,51,41,30,4,5,8,4,13,4,460
1,101021008,0,68,177,289,144,174,162,136,34,30,23,22,25,15,1322
2,101021009,3,173,260,299,152,197,269,544,101,105,124,142,136,79,2680
3,101021010,0,50,81,123,78,107,115,277,35,44,46,90,75,40,1189
4,101021011,0,126,299,490,287,256,266,199,45,50,38,30,30,22,2205


In [25]:
# Null count for every column currently in df
df_columns = df.columns.tolist()
missing_df = missing_values(data=df, columns=df_columns)
missing_df

Unnamed: 0,column_name,total_missing
0,sa2_maincode_2016,0
0,tot_before_1946,0
0,tot_1946_1955,0
0,tot_1956_1965,0
0,tot_1966_1975,0
0,tot_1976_1985,0
0,tot_1986_1995,0
0,tot_1996_2005,0
0,tot_2006_2010,0
0,tot_2011,0


In [26]:
# Share of each row's tot_tot that falls in each arrival-period group.
# A share column is only computed when it is not already present in df.
arrival_groups = {
    "home_percent": ["tot_before_1946", "tot_1946_1955", "tot_1956_1965", "tot_1966_1975"],
    "established_percent": ["tot_1976_1985", "tot_1986_1995"],
    "recent_percent": ["tot_1996_2005", "tot_2006_2010"],
    "new_percent": ["tot_2011", "tot_2012", "tot_2013", "tot_2014", "tot_2015", "tot_2016"],
}

for share_column, period_columns in arrival_groups.items():
    if share_column in df:
        continue
    # Built-in sum adds the Series left to right, like the chained "+".
    period_total = sum(df[period] for period in period_columns)
    df[share_column] = period_total / df["tot_tot"]

In [27]:
# Replace the NaNs created by the ratio calculation (a row whose tot_tot is 0
# gives 0/0). Only the derived columns are filled, so a missing value in the
# key or source columns is not silently turned into 0.
percent_columns = ["home_percent", "established_percent", "recent_percent", "new_percent"]
df[percent_columns] = df[percent_columns].fillna(0)

In [28]:
# Re-run the null count now that the derived columns exist and have been filled
df_columns = df.columns.tolist()
missing_df = missing_values(data=df, columns=df_columns)
missing_df

Unnamed: 0,column_name,total_missing
0,sa2_maincode_2016,0
0,tot_before_1946,0
0,tot_1946_1955,0
0,tot_1956_1965,0
0,tot_1966_1975,0
0,tot_1976_1985,0
0,tot_1986_1995,0
0,tot_1996_2005,0
0,tot_2006_2010,0
0,tot_2011,0


In [29]:
# Confirm the fill has worked by displaying the row for SA2 801101136
is_target_sa2 = df["sa2_maincode_2016"].eq(801101136)
df[is_target_sa2]

Unnamed: 0,sa2_maincode_2016,tot_before_1946,tot_1946_1955,tot_1956_1965,tot_1966_1975,tot_1976_1985,tot_1986_1995,tot_1996_2005,tot_2006_2010,tot_2011,tot_2012,tot_2013,tot_2014,tot_2015,tot_2016,tot_tot,home_percent,established_percent,recent_percent,new_percent
2296,801101136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [30]:
# Persist the processed frame as CSV, leaving out the index
output_path = f"{output_folder}{output_name}"
df.to_csv(output_path, index=False)