In [None]:
import pandas as pd

In [None]:
# --- Input: source CSV location -------------------------------------------
folder_path = "../Data/2016_GCP_SA2/2016_GCP_SA2/"
file_name = "2016Census_G01_AUS_SA2.csv"
file_path = f"{folder_path}{file_name}"

# --- Output: where the derived table is written ---------------------------
output_folder = "../Data/"
output_name = "census_selected_characteristics.csv"

# --- Columns to keep from the source file ---------------------------------
# Names are spelled exactly as in the CSV header; they are lower-cased later
# by the loader.
key_columns = ["SA2_MAINCODE_2016"]
file_columns = [
    "Tot_P_P",
    "Age_20_24_yr_P",
    "Age_25_34_yr_P",
    "Age_35_44_yr_P",
    "Age_45_54_yr_P",
    "Age_55_64_yr_P",
    "Age_65_74_yr_P",
    "Age_75_84_yr_P",
    "Age_85ov_P",
    "Indigenous_P_Tot_P",
    "Birthplace_Australia_P",
    "Lang_spoken_home_Eng_only_P",
    "High_yr_schl_comp_Yr_12_eq_P",
]
# Key column(s) first, then the value columns.
data_columns = [*key_columns, *file_columns]

In [None]:
def load_csv_data(data_path, columns=None, skip_rows=0):
    """Read a CSV file, optionally keep a subset of columns, and lower-case the header.

    Parameters
    ----------
    data_path : str, path object or file-like
        Passed unchanged to ``pandas.read_csv``.
    columns : list of str, optional
        Column labels to keep, spelled as they appear in the file (the
        selection happens before the names are lower-cased). ``None`` or an
        empty list keeps every column.
    skip_rows : int or list-like, default 0
        Forwarded to the ``skiprows`` argument of ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with every column label converted via ``str(label).lower()``.

    Raises
    ------
    KeyError
        Propagated from pandas when a requested column is not in the file.
    """
    data = pd.read_csv(data_path, skiprows=skip_rows)
    # ``None`` is accepted as "no filter" so callers need not pass an empty list.
    if columns is not None and len(columns) > 0:
        data = data.loc[:, columns]
    # Return a renamed frame instead of renaming in place: ``data`` may be a
    # selection of the frame that was read, and in-place edits on it are fragile.
    return data.rename(columns=lambda label: str(label).lower())

def missing_values(data, columns=None):
    """Count the null entries in each requested column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    columns : iterable of str, optional
        Column labels to report on. ``None`` or an empty iterable yields an
        empty report (only the two header columns).

    Returns
    -------
    pandas.DataFrame
        One row per requested column, in the order given, with the columns
        ``column_name`` and ``total_missing``. Rows are indexed 0..n-1.

    Raises
    ------
    KeyError
        Propagated from pandas when a requested column is not in ``data``.
    """
    requested = [] if columns is None else columns
    # Collect the rows first and build the frame once. Concatenating inside
    # the loop re-copies the accumulated frame on every pass and leaves every
    # row carrying the same index label (0).
    records = [
        {"column_name": column, "total_missing": data[column].isnull().sum()}
        for column in requested
    ]
    return pd.DataFrame(records, columns=["column_name", "total_missing"])

In [None]:
# Load only the key and value columns listed in data_columns; the loader
# lower-cases the column names, so later cells use lower-case labels.
df = load_csv_data(data_path=file_path, columns=data_columns)

In [None]:
# Sanity check: expect roughly 2310 rows, i.e. one row per SA2 area.
df.shape[0]

In [None]:
# Preview the first five rows to eyeball the loaded columns.
df.head(n=5)

In [None]:
# Report the number of null entries for every column currently in df.
df_columns = df.columns.tolist()
missing_df = missing_values(data=df, columns=df_columns)
missing_df

In [None]:
# Derived share column -> raw count column it is computed from.
# Each share is count / tot_p_p, i.e. a fraction of the total (not multiplied
# by 100). Dict order is the order in which the new columns are added to df.
percent_sources = {
    "age_20_24_percent": "age_20_24_yr_p",
    "age_25_34_percent": "age_25_34_yr_p",
    "age_35_44_percent": "age_35_44_yr_p",
    "age_45_54_percent": "age_45_54_yr_p",
    "age_55_64_percent": "age_55_64_yr_p",
    "age_65_74_percent": "age_65_74_yr_p",
    "age_75_84_percent": "age_75_84_yr_p",
    "age_85ov_percent": "age_85ov_p",
    "indigenous_percent": "indigenous_p_tot_p",
    "birthplace_australia_percent": "birthplace_australia_p",
    "lang_eng_only_percent": "lang_spoken_home_eng_only_p",
    "high_yr_12_comp_percent": "high_yr_schl_comp_yr_12_eq_p",
}

for percent_column, count_column in percent_sources.items():
    # Skip columns that already exist so re-running this cell does not
    # overwrite them. The guard tests the same name that is created; the
    # indigenous guard previously tested "indigenous_pop_percent", which is
    # never created, so that column was recomputed on every run.
    if percent_column not in df:
        df[percent_column] = df[count_column] / df["tot_p_p"]

In [None]:
# Zero-fill the NaNs produced by the share calculation (presumably 0/0 where
# tot_p_p is zero - TODO confirm). Note this fills every NaN in df, not only
# those in the derived columns.
df.fillna(value=0, inplace=True)

In [None]:
# Spot-check one SA2 code to confirm the fill worked
# (presumably a row that held NaN before the fill - TODO confirm).
is_check_row = df["sa2_maincode_2016"] == 801101136
df.loc[is_check_row]

In [None]:
# Persist the selected and derived columns as CSV, without the row index.
output_path = f"{output_folder}{output_name}"
df.to_csv(output_path, index=False)