# Import Libraries

In [2]:
import configparser
import pandas as pd
from pymongo import MongoClient

# Read Config File

In [3]:
config = configparser.ConfigParser()
config.read('config.ini')
HOST_IP = config['DEFAULT']['IP']
MONGO_PORT = config['DEFAULT']['MongoDB-Port']
DB_NAME = config['DEFAULT']['DB-Name']
USER_NAME = config['DEFAULT']['User-Name']
PASSWORD = config['DEFAULT']['Psword']
#Collection_Name = config['DEFAULT']['Collection-Name']
Import_Option_File = config['DEFAULT']['Import-Option-File']
population_output = config['DEFAULT']['Output-Population-File']

# Download Collections from MongoDB

In [4]:
def connect_mongo(HOST_IP, MONGO_PORT, DB_NAME, USER_NAME, PASSWORD):
    """ A util for making a connection to mongo """
    client = MongoClient("mongodb://{}:{}@{}:{}".format(USER_NAME, PASSWORD, HOST_IP, MONGO_PORT))
    db = client[DB_NAME]
    return db

In [5]:
def identify_extract_collection_names(import_option_df, db):
    """ read "import_options2016" text file and identify list of files for downloading data """
    files = []
    for t in import_option_df['Table Number'].tolist():
        for table_name in db.list_collection_names():
            if t in table_name and table_name.split('_')[-1] == 'SA1':
                files.append(table_name)
    return files

In [6]:
def extract_collections(import_option_df, DB_NAME, HOST_IP, MONGO_PORT, USER_NAME, PASSWORD):
    """ 
        Read from "import_options2016" text file to identify list of collection for data extraction
        and save the extracted collections as dictionary (key:name of collection, value: dataframe)       
    """
    # Connect to MongoDB
    census_db = connect_mongo(HOST_IP, MONGO_PORT, DB_NAME, USER_NAME, PASSWORD)
    collections = identify_extract_collection_names(import_option_df, census_db)
    # create a dictionary to store DFs
    collection_dictionary = {}
    for c in collections:
        cursor = census_db[c].find()
        df =  pd.DataFrame(list(cursor))
        collection_dictionary[c] = df
    return collection_dictionary

In [7]:
import_option_df = pd.read_csv('{}.txt'.format(Import_Option_File), delimiter = ",", comment='#')
collection_dictionary = extract_collections(import_option_df, DB_NAME, HOST_IP, MONGO_PORT, USER_NAME, PASSWORD)

In [10]:
# create DF that store list of collection's name for population data extraction
pop_files_df = pd.DataFrame(columns = ['Table Number'])
pop_files_df = pop_files_df.append({'Table Number':'2016Census_G01_AUS_SA1'}, ignore_index=True)
pop_files_df = pop_files_df.append({'Table Number':'2016Census_G33_AUS_SA1'}, ignore_index=True)
pop_files_df = pop_files_df.append({'Table Number':'2016Census_G16B_AUS_SA1'}, ignore_index=True)

# download population data (collections)
pop_collection_dictionary = extract_collections(pop_files_df, DB_NAME, HOST_IP, MONGO_PORT, USER_NAME, PASSWORD)

# Filtering Columns In The Collections

In [11]:
def extract_columns(import_option_df, collection_dictionary):
    """ extract columns from DFs """
    extracted_collection_dictionary = {}
    for index, row in import_option_df.iterrows():
        for table in collection_dictionary.keys():
            if row['Table Number'] in table:
                extracted_collection_dictionary[table] = collection_dictionary[table].filter(regex = row['Regex'].replace("'", ""))
                extracted_collection_dictionary[table]['SA1_7DIGITCODE_2016'] = collection_dictionary[table]['SA1_7DIGITCODE_2016']
    return extracted_collection_dictionary

In [12]:
extracted_collection_dictionary = extract_columns(import_option_df, collection_dictionary)
# save collections as csv files
for df in extracted_collection_dictionary.keys():
    extracted_collection_dictionary[df].to_csv("{}.csv".format(df), sep=',', encoding='utf-8', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_collection_dictionary[table]['SA1_7DIGITCODE_2016'] = collection_dictionary[table]['SA1_7DIGITCODE_2016']


In [None]:
# create a new DF to store total population, total household, and total people aged over 15
SA1_code = 'SA1_7DIGITCODE_2016'
new_pop_df = pd.DataFrame(columns = [SA1_code])
new_pop_df[SA1_code] = pop_collection_dictionary['2016Census_G01_AUS_SA1'][SA1_code]
new_pop_df = pd.merge(new_pop_df, pop_collection_dictionary['2016Census_G01_AUS_SA1'][[SA1_code, 'Tot_P_P']], on='SA1_7DIGITCODE_2016')
new_pop_df = pd.merge(new_pop_df, pop_collection_dictionary['2016Census_G33_AUS_SA1'][[SA1_code, 'Total_Total']], on='SA1_7DIGITCODE_2016')
new_pop_df = pd.merge(new_pop_df, pop_collection_dictionary['2016Census_G16B_AUS_SA1'][[SA1_code, 'P_Tot_Tot']], on='SA1_7DIGITCODE_2016')
# rename columns as same as "Aggregate" in "import_options2016.txt"
new_pop_df.rename(columns = {'Tot_P_P':'pops', 'Total_Total':'hhs', 'P_Tot_Tot':'p15'}, inplace=True)
# save the population dataframe as csv file
new_pop_df.to_csv('{}.csv'.format(population_output), sep=',', encoding='utf-8', index=False)