# Import Libraries

In [1]:
import configparser
import pandas as pd
from pymongo import MongoClient
import urllib 

# Read Config File

In [2]:
# Load connection details and file names from the DEFAULT section of config.ini.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of a
# confusing KeyError on the first option lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini could not be read")
defaults = config['DEFAULT']
# MongoDB connection settings
Host_IP = defaults['IP']
MongoDB_Port = defaults['MongoDB-Port']
DB_Name = defaults['DB-Name']
User_Name = defaults['User-Name']
Psword = defaults['Psword']
Collection_Name = defaults['Collection-Name']
# Input/output file names (used without extension by the cells below)
Import_Option_File = defaults['Import-Option-File']
population_output = defaults['Output-Population-File']

# Download Collections from MongoDB

In [3]:
def connect_mongo(host, port, DB_Name, username, password):
    """Open a MongoDB connection and return a handle to one database.

    Args:
        host: Host name or IP address of the MongoDB server.
        port: Port the server listens on (int or numeric string). When it is
            ``None`` or empty, MongoDB's default port 27017 is used.
        DB_Name: Name of the database to select on the client.
        username: User name; inserted verbatim into the connection URI, so it
            must already be percent-escaped.
        password: Password; inserted verbatim into the connection URI, so it
            must already be percent-escaped.

    Returns:
        The result of indexing the ``MongoClient`` with ``DB_Name``.
    """
    # Honour the caller-supplied port; it used to be ignored in favour of a
    # hard-coded 27017, so a non-default port in config.ini had no effect.
    effective_port = port if port else 27017
    uri = "mongodb://{}:{}@{}:{}".format(username, password, host, effective_port)
    client = MongoClient(uri)
    return client[DB_Name]

In [4]:
def identify_extract_collection_names(import_option_df, db):
    """List the SA1 collections that match the requested table numbers.

    Args:
        import_option_df: DataFrame with a 'Table Number' column; each value
            is matched as a substring of the collection names.
        db: Database handle exposing ``list_collection_names()``.

    Returns:
        A list of collection names whose last '_'-separated token is 'SA1'
        and which contain a requested table number. Names are grouped by
        table number in the order the numbers appear in ``import_option_df``;
        a collection matching several table numbers is listed once per match.
    """
    # The collection list does not change between table numbers, so fetch it
    # once instead of issuing one server round trip per table number.
    sa1_collections = [
        name for name in db.list_collection_names()
        if name.split('_')[-1] == 'SA1'
    ]
    files = []
    for table_number in import_option_df['Table Number'].tolist():
        files.extend(name for name in sa1_collections if table_number in name)
    return files

In [5]:
def extract_collections(import_option_df, DB_Name, Host_IP, MongoDB_Port, username, password):
    """Download the collections named by ``import_option_df`` as DataFrames.

    Args:
        import_option_df: DataFrame with a 'Table Number' column that selects
            which collections to download.
        DB_Name: Name of the MongoDB database to read from.
        Host_IP: Host name or IP address of the MongoDB server.
        MongoDB_Port: Port of the MongoDB server.
        username: User name, percent-escaped for use in a connection URI.
        password: Password, percent-escaped for use in a connection URI.

    Returns:
        A dict mapping each matching collection name to a DataFrame built
        from every document in that collection.
    """
    census_db = connect_mongo(Host_IP, MongoDB_Port, DB_Name, username, password)
    try:
        collections = identify_extract_collection_names(import_option_df, census_db)
        # list() drains each cursor, so all data is in memory before the
        # connection is closed below.
        return {
            name: pd.DataFrame(list(census_db[name].find()))
            for name in collections
        }
    finally:
        # Every call opens its own client; release its sockets when done
        # instead of leaking one connection pool per call.
        census_db.client.close()

In [None]:
# `import urllib` alone does not guarantee that the `urllib.parse` submodule
# is loaded, so import it explicitly before using quote_plus below.
import urllib.parse

# Read the import options; text after '#' on a line is treated as a comment.
import_option_df = pd.read_csv('{}.txt'.format(Import_Option_File), delimiter=",", comment='#')
# Percent-escape the credentials because they are embedded in the MongoDB URI.
username = urllib.parse.quote_plus(User_Name)
password = urllib.parse.quote_plus(Psword)
# Download every collection selected by the import options.
collection_dictionary = extract_collections(import_option_df, DB_Name, Host_IP, MongoDB_Port, username, password)

In [None]:
# Collections needed for the population summary: G01 (total persons),
# G33 (total households) and G16B (persons aged 15 and over).
# Built in one step because DataFrame.append was removed in pandas 2.0.
pop_files_df = pd.DataFrame({
    'Table Number': [
        '2016Census_G01_AUS_SA1',
        '2016Census_G33_AUS_SA1',
        '2016Census_G16B_AUS_SA1',
    ],
})

# Fetch the population collections listed in pop_files_df from MongoDB.
pop_collection_dictionary = extract_collections(
    import_option_df=pop_files_df,
    DB_Name=DB_Name,
    Host_IP=Host_IP,
    MongoDB_Port=MongoDB_Port,
    username=username,
    password=password,
)

# Filtering Columns In The Collections

In [None]:
def extract_columns(import_option_df, collection_dictionary):
    """Keep only the configured columns of each downloaded collection.

    Args:
        import_option_df: DataFrame with 'Table Number' and 'Regex' columns.
            A row applies to every collection whose name contains its table
            number; single quotes are stripped from the regex before use.
        collection_dictionary: Dict mapping collection names to DataFrames,
            each containing a 'SA1_7DIGITCODE_2016' column.

    Returns:
        A dict mapping each matched collection name to a new DataFrame with
        the columns whose names match the regex plus 'SA1_7DIGITCODE_2016'.
        If several rows match the same collection, the last row wins.
        The input DataFrames are not modified.
    """
    sa1_column = 'SA1_7DIGITCODE_2016'
    extracted_collection_dictionary = {}
    option_rows = zip(import_option_df['Table Number'], import_option_df['Regex'])
    for table_number, raw_regex in option_rows:
        pattern = raw_regex.replace("'", "")
        for table, frame in collection_dictionary.items():
            if table_number not in table:
                continue
            # filter() hands back a frame flagged as a copy of `frame`; take an
            # explicit copy so adding the SA1 column below does not trigger
            # SettingWithCopyWarning.
            selected = frame.filter(regex=pattern).copy()
            selected[sa1_column] = frame[sa1_column]
            extracted_collection_dictionary[table] = selected
    return extracted_collection_dictionary

In [None]:
# Reduce every downloaded collection to the columns requested in the import options.
extracted_collection_dictionary = extract_columns(import_option_df, collection_dictionary)
# Write each reduced collection to "<collection name>.csv" in the working directory.
for table_name, table_df in extracted_collection_dictionary.items():
    csv_path = "{}.csv".format(table_name)
    table_df.to_csv(csv_path, sep=',', encoding='utf-8', index=False)

In [None]:
# Build one table keyed by SA1 code holding total population, total households
# and total people aged over 15.
SA1_code = 'SA1_7DIGITCODE_2016'
# (collection, column) pairs, merged in this order
population_sources = (
    ('2016Census_G01_AUS_SA1', 'Tot_P_P'),
    ('2016Census_G33_AUS_SA1', 'Total_Total'),
    ('2016Census_G16B_AUS_SA1', 'P_Tot_Tot'),
)
# Start from the SA1 codes of the G01 collection.
new_pop_df = pd.DataFrame({SA1_code: pop_collection_dictionary['2016Census_G01_AUS_SA1'][SA1_code]})
for source_name, total_column in population_sources:
    source_columns = pop_collection_dictionary[source_name][[SA1_code, total_column]]
    new_pop_df = new_pop_df.merge(source_columns, on=SA1_code)
# Use the same column names as "Aggregate" in "import_options2016.txt".
new_pop_df = new_pop_df.rename(columns={'Tot_P_P': 'pops', 'Total_Total': 'hhs', 'P_Tot_Tot': 'p15'})
# Save the population table as a CSV file.
new_pop_df.to_csv('{}.csv'.format(population_output), sep=',', encoding='utf-8', index=False)