## Import Libraries

In [1]:
import os
import glob
import pandas as pd
import math

## Read Config File

In [3]:
import configparser

# Load the run settings from config.ini (DEFAULT section):
#   Input-Folder      - folder scanned for the source CSV files
#   output-Folder     - folder the combined CSV files are written to
#   Column-List       - comma-separated column names used for grouping
#   Column-Type-List  - comma-separated type labels, positionally matching
#                       Column-List ("string", "int" or "float" are the
#                       labels aggregate_csv acts on)
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it did parse, so fail loudly here instead of hitting an
# unrelated-looking KeyError on the first lookup below.
loaded_files = config.read('config.ini')
if not loaded_files:
    raise FileNotFoundError("config.ini could not be read from the current working directory")

input_folder = config['DEFAULT']['Input-Folder']
out_folder = config['DEFAULT']['output-Folder']
column_list = config['DEFAULT']['Column-List']
column_type_list = config['DEFAULT']['Column-Type-List']

# Strip whitespace around each entry so "a, b" yields ["a", "b"]; an entry
# such as " b" would otherwise never match a column name or a type label.
column_list = [name.strip() for name in column_list.split(',')]
column_type_list = [kind.strip() for kind in column_type_list.split(',')]

## Supporting Functions

In [None]:
def read_csv_to_df(li,file):
    """Load one CSV file and append the resulting DataFrame to ``li``.

    Args:
        li: list that collects the DataFrames; it is modified in place.
        file: path of the CSV file to load.

    Returns:
        The same list object that was passed in, now holding the new frame
        as its last element.
    """
    # The first row is the header, no column is used as the index, and rows
    # are split on "\n" only.
    frame = pd.read_csv(
        file,
        header=0,
        index_col=None,
        lineterminator="\n",
        encoding='UTF-8',
    )
    li.append(frame)
    return li

In [None]:
# read all files from the folder
def read_csv_from_folder(input_folder):
    """Return the paths of all ``*.csv`` files directly inside ``input_folder``.

    Only the file names are collected here; the files are not opened.

    Args:
        input_folder: folder to scan, with or without a trailing path
            separator.

    Returns:
        List of matching paths, in the order ``glob.glob`` yields them
        (not sorted).
    """
    print("reading files from the folder ...")
    extension = 'csv'
    # os.path.join inserts the separator only when it is missing, so both
    # "data" and "data/" scan the same folder. Plain string concatenation
    # turned "data" into the pattern "data*.csv".
    pattern = os.path.join(input_folder, '*.{}'.format(extension))
    all_filenames = glob.glob(pattern)
    print("reading files is finished.")
    return all_filenames

In [None]:
# create folder if not exist
def create_folder(folder):
    """Create ``folder`` (including missing parent folders) if it is absent.

    Args:
        folder: path of the directory to create.

    Returns:
        The ``folder`` argument, unchanged.

    Raises:
        FileExistsError: if ``folder`` already exists but is not a directory.
    """
    # exist_ok=True makes the call a no-op for an existing directory and
    # avoids the gap between a separate existence check and the creation.
    os.makedirs(folder, exist_ok=True)
    return folder

In [None]:
def sparate_australia__and_other(all_filenames):
    """Split file names into an "Australia" group and an "other" group.

    A name belongs to the Australia group when the case-sensitive substring
    "Australia" occurs anywhere in the string (the whole path is tested, not
    only the base name). Input order is preserved within each group.

    Args:
        all_filenames: iterable of file name / path strings.

    Returns:
        Tuple ``(file_australia, file_other)`` of two lists.
    """
    # Index by the boolean result of the substring test.
    groups = {True: [], False: []}
    for path in all_filenames:
        groups["Australia" in path].append(path)
    return groups[True], groups[False]

In [None]:
def aggregate_csv(li):
    """Concatenate the given DataFrames and group them by the configured columns.

    Uses the module-level ``column_list`` (grouping columns) and
    ``column_type_list`` (type label for the column at the same position).

    Args:
        li: list of DataFrames that each contain every column named in
            ``column_list``.

    Returns:
        The result of ``groupby(...).agg(...)``: grouped by the configured
        columns, with one output column per configured column whose type
        label is "string" (aggregated with 'first') or "int"/"float"
        (aggregated with 'sum'). Columns with any other label get no
        aggregation entry.
    """
    combined_df = pd.concat(li,axis=0, ignore_index=True)

    # Cast every configured column to str before grouping - presumably so
    # rows with missing values are kept as a group instead of being dropped.
    # Each column is cast from its own data; the previous code assigned the
    # user_location values to every configured column.
    # NOTE(review): columns labelled "int"/"float" are cast to str here too,
    # so their 'sum' below operates on strings - confirm this is intended.
    for column in column_list:
        combined_df[column] = combined_df[column].astype(str)

    print("aggregating csv files...")

    # setup aggregation dictionary; the type is looked up by position so a
    # repeated column name cannot pick up the type of its first occurrence
    agg_dic = {}
    for index, column in enumerate(column_list):
        column_type = column_type_list[index]
        if column_type == "string":
            agg_dic[column] = 'first'
        elif column_type in ['int','float']:
            agg_dic[column] = 'sum'

    # group by (a single column is passed as a scalar key, as before)
    if len(column_list) == 1:
        group_df = combined_df.groupby(column_list[0]).agg(agg_dic)
    else:
        group_df = combined_df.groupby(column_list).agg(agg_dic)

    return group_df

In [None]:
def rm_file(input_folder,out_folder,file_list,i):
    """Delete the (up to two) files that make up pair number ``i``.

    Args:
        input_folder: not used by this function.
        out_folder: not used by this function.
        file_list: list of file paths.
        i: pair index; the entries at positions ``2*i`` and ``2*i + 1`` of
            ``file_list`` are removed from disk.
    """
    first = 2 * i
    doomed = file_list[first:first + 2]
    for path in doomed:
        os.remove(path)
    print("original files are deleted")

In [None]:
# export to csv
def export_csv(out_folder,country, i,group_df) :
    """Write ``group_df`` to ``combined_user_location_<country><i>.csv``.

    Args:
        out_folder: destination folder, with or without a trailing path
            separator.
        country: label embedded in the file name.
        i: number appended to the label in the file name.
        group_df: DataFrame to write; its index is not written.
    """
    base_name = "combined_user_location_" + country + str(i) + ".csv"
    # os.path.join adds the separator only when out_folder lacks one; plain
    # concatenation wrote "<out_folder>combined_..." next to the folder.
    file_name = os.path.join(out_folder, base_name)
    group_df.to_csv(file_name, sep=',',index = False, encoding='UTF-8')
    print(base_name + " is finished")

In [None]:
def main(input_folder,out_folder,file_list,country,i):
    """Merge pair number ``i`` of ``file_list`` into one combined CSV.

    Steps, in this order: load the (up to two) files at positions ``2*i``
    and ``2*i + 1``, aggregate them, delete those source files, then write
    the aggregated result to ``out_folder``.

    Args:
        input_folder: only forwarded to ``rm_file``.
        out_folder: folder the combined CSV is exported to.
        file_list: list of CSV paths.
        country: label used in the exported file name.
        i: pair index into ``file_list``; also the number in the exported
            file name.
    """
    pair_start = 2 * i
    frames = []
    for path in file_list[pair_start:pair_start + 2]:
        # echo each source file before loading it
        print(path)
        frames = read_csv_to_df(frames, path)

    merged = aggregate_csv(frames)

    # The sources are deleted before the export, so an export to a path that
    # equals one of the sources is not removed afterwards.
    rm_file(input_folder, out_folder, file_list, i)
    export_csv(out_folder, country, i, merged)


## Main Code

In [None]:
# List the CSV files in the input folder and split them into the two groups.
all_filenames = read_csv_from_folder(input_folder)
file_australia, file_other = sparate_australia__and_other(all_filenames)

# Number of files in each group.
l_australia, l_other = len(file_australia), len(file_other)

# The round count is derived from the larger group: half its size, rounded up.
max_length = l_australia if l_australia >= l_other else l_other
max_round = math.ceil(max_length / 2)

# Make sure the output folder exists before any round writes to it.
create_folder(out_folder)

In [None]:
# Each round pairs up the files of each group and merges every pair into one
# combined CSV; a group holding a single file (or none) is left alone.
for r in range(0,max_round):

    print("Round: " + str(r))
    # read files
    # NOTE(review): the files are re-listed from input_folder in every round,
    # although main() is handed out_folder as the source folder from the
    # second round on - confirm which folder later rounds should scan.
    all_filenames = read_csv_from_folder(input_folder)
    file_australia,file_other = sparate_australia__and_other(all_filenames)

    # sort list (plain string order)
    file_australia.sort()
    file_other.sort()

    # length
    l_australia = len(file_australia)
    l_other = len(file_other)

    # number of pairs per group
    r_australia = math.ceil(l_australia/2)
    r_other = math.ceil(l_other/2)

    # folder handed to main() as the source: the input folder in the first
    # round, the output folder afterwards
    source_folder = input_folder if r == 0 else out_folder

    # Every group is processed with its own file list. Previously the
    # "Other" group was given file_australia in rounds after the first.
    for country, files, n_files, n_pairs in (
        ("Australia", file_australia, l_australia, r_australia),
        ("Other", file_other, l_other, r_other),
    ):
        if n_files > 1:
            for i in range(0,n_pairs):
                print("---------------------")
                main(source_folder,out_folder,files,country,i)
                print("---------------------")
    