In [1]:
import pandas as pd

In [2]:
# Input: folder and file name of the source CSV, joined into one path.
folder_path = "../Data/2016_GCP_SA2/2016_GCP_SA2/"
file_name = "2016Census_G06_AUS_SA2.csv"
file_path = f"{folder_path}{file_name}"

# Output: folder and file name the processed table is written to.
output_folder = "../Data/"
output_name = "census_social_marital_status.csv"

# Columns kept from the source file: the key column first, then the value columns.
key_columns = ["SA2_MAINCODE_2016"]
file_columns = [
    "P_Tot_Marrd_reg_marrge",
    "P_Tot_Married_de_facto",
    "P_Tot_Not_married",
    "P_Tot_Total",
]
data_columns = [*key_columns, *file_columns]

In [3]:
def load_csv_data(data_path, columns = [], skip_rows = 0):
    """Read a CSV file into a DataFrame with lower-cased column labels.

    Parameters
    ----------
    data_path : path or file-like object
        Passed straight to ``pd.read_csv``.
    columns : list-like of column labels, optional
        When non-empty, only these columns are kept, in the order given.
        Labels must match the file's header exactly (selection happens
        before lower-casing). The default is never mutated.
    skip_rows : int, optional
        Forwarded to ``pd.read_csv`` as ``skiprows``.

    Returns
    -------
    pandas.DataFrame
        The loaded (and optionally column-filtered) data, with every column
        label converted via ``str(label).lower()``.
    """
    frame = pd.read_csv(data_path, skiprows = skip_rows)
    if len(columns) == 0:
        selected = frame
    else:
        selected = frame.loc[:, columns]
    return selected.rename(columns = lambda label: str(label).lower())

def missing_values(data, columns = []):
    """Count null values in the requested columns of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    columns : list-like of column labels, optional
        Columns to report on, in output order. With the (never mutated)
        empty default the result has no rows.

    Returns
    -------
    pandas.DataFrame
        One row per requested column, with columns ``"column_name"`` and
        ``"total_missing"``, indexed 0..n-1.

    Raises
    ------
    KeyError
        If a requested column is not present in ``data``.
    """
    # Collect all rows first and build the frame once: avoids repeated
    # pd.concat inside the loop (quadratic, and concatenating onto an empty
    # frame is deprecated) and gives each row a distinct index label.
    records = [
        {"column_name": column, "total_missing": data[column].isnull().sum()}
        for column in columns
    ]
    return pd.DataFrame(records, columns = ["column_name", "total_missing"])

In [4]:
# Load only the key and value columns from the source CSV (labels come back lower-cased).
df = load_csv_data(file_path, columns = data_columns)

In [5]:
# Number of rows loaded
df.shape[0]

2310

In [6]:
# Preview the first five rows
df.head(n = 5)

Unnamed: 0,sa2_maincode_2016,p_tot_marrd_reg_marrge,p_tot_married_de_facto,p_tot_not_married,p_tot_total
0,101021007,1301,376,942,2616
1,101021008,2665,719,2657,6044
2,101021009,3036,916,3883,7837
3,101021010,1490,458,1589,3548
4,101021011,6740,1465,3303,11505


In [7]:
# Derive one ratio column per count column by dividing it by p_tot_total.
# The values are plain ratios (not multiplied by 100) despite the "_percent" names.
# Each column is only created when absent, so re-running this cell does not
# overwrite existing values.
# NOTE(review): rows where p_tot_total is 0 presumably produce NaN here — confirm;
# a later cell fills NaNs with 0.
ratio_sources = {
    "registered_marriage_percent": "p_tot_marrd_reg_marrge",
    "defacto_percent": "p_tot_married_de_facto",
    "not_married_percent": "p_tot_not_married",
}
for ratio_column, count_column in ratio_sources.items():
    if ratio_column not in df:
        df[ratio_column] = df[count_column] / df["p_tot_total"]

In [10]:
# Tally nulls for every column currently in the frame.
# NOTE(review): this cell's execution count (10) is later than that of the
# fillna cell (9), so the saved output may reflect the already-filled frame —
# re-run the notebook top to bottom to confirm.
df_columns = df.columns.tolist()
missing_df = missing_values(df, columns = df_columns)
missing_df

Unnamed: 0,column_name,total_missing
0,sa2_maincode_2016,0
0,p_tot_marrd_reg_marrge,0
0,p_tot_married_de_facto,0
0,p_tot_not_married,0
0,p_tot_total,0
0,registered_marriage_percent,0
0,defacto_percent,0
0,not_married_percent,0


In [9]:
# Replace the NaNs created by the ratio calculation with 0.
# Note that this fills NaNs in every column of the frame, not only the ratio columns.
df = df.fillna(value = 0)

In [11]:
# Re-check null counts after filling; every column should now report 0.
df_columns = df.columns.tolist()
missing_df = missing_values(df, columns = df_columns)
missing_df

Unnamed: 0,column_name,total_missing
0,sa2_maincode_2016,0
0,p_tot_marrd_reg_marrge,0
0,p_tot_married_de_facto,0
0,p_tot_not_married,0
0,p_tot_total,0
0,registered_marriage_percent,0
0,defacto_percent,0
0,not_married_percent,0


In [12]:
# Persist the processed table as CSV, without the row index.
df.to_csv(f"{output_folder}{output_name}", index = False)