In [2]:
import pandas as pd

In [3]:
# Input CSV: folder and file name are kept separate and joined into file_path.
folder_path = "../Data/2016_GCP_SA2/2016_GCP_SA2/"
file_name = "2016Census_G14_AUS_SA2.csv"
file_path = f"{folder_path}{file_name}"

# Output CSV: folder and file name of the table written at the end.
# NOTE(review): "affimiliation" looks like a typo for "affiliation"; the name is
# left untouched because other code may read this exact file name - confirm.
output_folder = "../Data/"
output_name = "census_religious_affimiliation.csv"

# Columns to keep from the input file: the key column first, then the measures.
key_columns = ["SA2_MAINCODE_2016"]
file_columns = [
    "Buddhism_P",
    "Christianity_Anglican_P",
    "Christianity_Baptist_P",
    "Christianity_Catholic_P",
    "Christianity_Eastrn_Orthdox_P",
    "Christianity_Uniting_Church_P",
    "Christianity_Tot_P",
    "Hinduism_P",
    "Islam_P",
    "Judaism_P",
    "Other_Religions_Tot_P",
    "SB_OSB_NRA_Tot_P",
    "Tot_P",
]
data_columns = [*key_columns, *file_columns]

In [4]:
def load_csv_data(data_path, columns=None, skip_rows=0):
    """Read a CSV into a DataFrame and lower-case its column names.

    Parameters
    ----------
    data_path : str, path object or file-like
        Passed straight to ``pandas.read_csv``.
    columns : sequence of str, optional
        Column names, spelled as in the file header, to keep. They are
        returned in the order given. ``None`` or an empty sequence keeps
        every column.
    skip_rows : int or list-like, optional
        Forwarded to ``read_csv`` as ``skiprows``. Defaults to 0.

    Returns
    -------
    pandas.DataFrame
        The selected data with every column label converted via
        ``str(label).lower()``.

    Raises
    ------
    KeyError
        If a requested column is not present in the file.
    """
    data = pd.read_csv(data_path, skiprows=skip_rows)
    # The None check guards the len() call so callers may pass None explicitly.
    if columns is not None and len(columns) > 0:
        # Select after reading so the caller's column order is preserved;
        # list() stops a tuple from being interpreted as a single key.
        data = data.loc[:, list(columns)]
    # Return a renamed frame instead of mutating one in place.
    return data.rename(columns=lambda label: str(label).lower())

def missing_values(data, columns=None):
    """Count the null entries in the requested columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    columns : iterable of column labels, optional
        Columns to report on, in the order they should appear in the result.
        ``None`` or an empty iterable checks nothing and yields an empty
        report.

    Returns
    -------
    pandas.DataFrame
        One row per requested column with the fields ``column_name`` and
        ``total_missing``, indexed 0..n-1.

    Raises
    ------
    KeyError
        If a requested column is not present in ``data``.
    """
    report_columns = ["column_name", "total_missing"]
    requested = [] if columns is None else columns
    # Collect plain records and build the frame once: this avoids repeated
    # pd.concat calls in a loop and gives each row its own index label.
    records = [
        {"column_name": column, "total_missing": data[column].isnull().sum()}
        for column in requested
    ]
    # Passing columns= keeps the two-column layout even when there are no rows.
    return pd.DataFrame(records, columns=report_columns)

In [5]:
# Read only the key and measure columns; labels come back lower-cased.
df = load_csv_data(data_path=file_path, columns=data_columns)

In [6]:
# Sanity check: expect roughly 2310 rows, one per SA2 area.
df.shape[0]

2310

In [7]:
# Preview the first five rows to check the selected, lower-cased columns.
df.head(n=5)

Unnamed: 0,sa2_maincode_2016,buddhism_p,christianity_anglican_p,christianity_baptist_p,christianity_catholic_p,christianity_eastrn_orthdox_p,christianity_uniting_church_p,christianity_tot_p,hinduism_p,islam_p,judaism_p,other_religions_tot_p,sb_osb_nra_tot_p,tot_p
0,101021007,34,731,11,821,21,109,1879,0,0,0,21,1365,3872
1,101021008,80,1467,56,2523,367,244,5299,54,54,3,62,2024,8247
2,101021009,135,1480,81,2644,422,226,5658,331,261,10,223,3088,10842
3,101021010,50,625,36,1197,110,106,2511,153,186,8,126,1313,4786
4,101021011,95,2954,178,4281,177,435,9085,51,38,4,99,5503,16946


In [8]:
# Report the number of missing values for every column currently in df.
df_columns = df.columns.tolist()
missing_df = missing_values(data=df, columns=df_columns)
missing_df

Unnamed: 0,column_name,total_missing
0,sa2_maincode_2016,0
0,buddhism_p,0
0,christianity_anglican_p,0
0,christianity_baptist_p,0
0,christianity_catholic_p,0
0,christianity_eastrn_orthdox_p,0
0,christianity_uniting_church_p,0
0,christianity_tot_p,0
0,hinduism_p,0
0,islam_p,0


In [11]:
# Share of each affiliation group in the area's total (tot_p).
# Each key is a new share column and each value is the count column it is
# derived from; the dict order is the order in which columns are added to df.
# NOTE: despite the "_percent" suffix the values are fractions (count / tot_p),
# not multiplied by 100. Rows where tot_p is 0 produce NaN from this division.
percent_sources = {
    "buddhism_percent": "buddhism_p",
    "christianity_anglican_percent": "christianity_anglican_p",
    "christianity_baptist_percent": "christianity_baptist_p",
    "christianity_catholic_percent": "christianity_catholic_p",
    "christianity_eastrn_orthdox_percent": "christianity_eastrn_orthdox_p",
    "christianity_uniting_church_percent": "christianity_uniting_church_p",
    "christianity_tot_percent": "christianity_tot_p",
    "hinduism_percent": "hinduism_p",
    "islam_percent": "islam_p",
    "judaism_percent": "judaism_p",
    "other_religions_percent": "other_religions_tot_p",
    "no_religion_percent": "sb_osb_nra_tot_p",
}

for percent_column, count_column in percent_sources.items():
    # Skip columns that already exist so re-running the cell changes nothing.
    if percent_column not in df:
        df[percent_column] = df[count_column] / df["tot_p"]

In [13]:
# Replace the NaNs produced by the share calculation with 0.
# Note: this fills every column of df, not only the share columns.
df = df.fillna(value=0)

In [14]:
# Spot-check a single SA2 to confirm the fill left zeros rather than NaN.
check_sa2_code = 801101136
df.loc[df["sa2_maincode_2016"].eq(check_sa2_code)]

Unnamed: 0,sa2_maincode_2016,buddhism_p,christianity_anglican_p,christianity_baptist_p,christianity_catholic_p,christianity_eastrn_orthdox_p,christianity_uniting_church_p,christianity_tot_p,hinduism_p,islam_p,...,christianity_baptist_percent,christianity_catholic_percent,christianity_eastrn_orthdox_percent,christianity_uniting_church_percent,christianity_tot_percent,hinduism_percent,islam_percent,judaism_percent,other_religions_percent,no_religion_percent
2296,801101136,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Persist the table (counts plus share columns) without the row index.
output_path = output_folder + output_name
df.to_csv(output_path, index=False)