In [None]:
import os
import glob
import pandas as pd
import configparser 
import math

# Load the notebook settings from config.ini (resolved against the current
# working directory).
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means config.ini was not loaded.
# Failing here gives a clear message instead of a KeyError on the lookups below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini could not be read from the current working directory")
# Folder that is scanned for the source CSV files.
input_folder = config['DEFAULT']['Input-Folder']
# Folder that receives the combined CSV files.
out_folder = config['DEFAULT']['output-Folder']


In [None]:
def read_csv_to_df(li, file):
    """Load one CSV file and add the resulting DataFrame to ``li``.

    Args:
        li: List that collects the loaded DataFrames; it is modified in place.
        file: Path of the CSV file to read. Its first row is used as the header.

    Returns:
        The same ``li`` object, now ending with the newly loaded DataFrame.
    """
    li.append(pd.read_csv(file, index_col=None, header=0))
    return li

In [None]:
# list all csv files from the folder
def read_csv_from_folder(input_folder):
    """Return the paths of all ``*.csv`` files directly inside ``input_folder``.

    Args:
        input_folder: Folder to scan, with or without a trailing path separator.

    Returns:
        list[str]: The matching paths in the order ``glob.glob`` yields them
        (the list is not sorted here).
    """
    print("reading files from the folder ...")
    extension = 'csv'
    # os.path.join supplies the separator when input_folder lacks one; plain
    # string concatenation would turn "data" into the pattern "data*.csv".
    pattern = os.path.join(input_folder, '*.{}'.format(extension))
    all_filenames = glob.glob(pattern)
    print("reading files is finished.")
    return all_filenames

In [None]:
# make sure the folder is there
def create_folder(folder):
    """Create ``folder`` (including missing parents) unless the path exists.

    Args:
        folder: Path of the directory to create.

    Returns:
        The ``folder`` argument, unchanged.
    """
    if os.path.exists(folder):
        # Nothing to do: the path is already present.
        return folder
    os.makedirs(folder)
    return folder

In [None]:
def sparate_australia__and_other(all_filenames):
    """Split file paths into Australia files and all other files.

    A path counts as an Australia file when its file name (the last path
    component) contains the text "Australia". Only the file name is inspected,
    so a parent folder whose name happens to contain "Australia" does not pull
    every file into the Australia group.

    Args:
        all_filenames: Iterable of file paths.

    Returns:
        tuple[list, list]: ``(file_australia, file_other)``, each keeping the
        order in which the paths appeared in ``all_filenames``.
    """
    file_australia = []
    file_other = []

    for filename in all_filenames:
        if "Australia" in os.path.basename(filename):
            file_australia.append(filename)
        else:
            file_other.append(filename)
    return file_australia, file_other

In [None]:
def aggregate_csv(li):
    """Stack the given frames and total ``count`` per hashtag/location pair.

    Args:
        li: DataFrames to combine. The code reads their ``hashtag``,
            ``user_location`` and ``count`` columns.

    Returns:
        DataFrame grouped by ``(hashtag, user_location)`` that holds, for each
        group, the first ``hashtag``, the first ``user_location`` and the sum
        of ``count``.
    """
    key_columns = ['hashtag', 'user_location']
    stacked = pd.concat(li, axis=0, ignore_index=True)

    # Cast both key columns to str before grouping; the stated intent is to
    # avoid losing data in the groupby step.
    for column in key_columns:
        stacked[column] = stacked[column].astype(str)

    print("aggregating csv files...")
    how = {'hashtag': 'first', 'user_location': 'first', 'count': "sum"}
    return stacked.groupby(key_columns).agg(how)
    

In [None]:
def rm_file(input_folder, out_folder, file_list, i):
    """Delete the i-th pair of files listed in ``file_list``.

    Args:
        input_folder: Not used by this function.
        out_folder: Not used by this function.
        file_list: Paths to pick the pair from.
        i: Zero-based pair number; the pair is ``file_list[2*i]`` and
            ``file_list[2*i + 1]`` (fewer entries if the list ends earlier).
    """
    start = 2 * i
    for path in file_list[start:start + 2]:
        os.remove(path)
    print("original files are deleted")

In [None]:
# export to csv
def export_csv(out_folder, country, i, group_df):
    """Write ``group_df`` to ``combined_csv_<country><i>.csv`` inside ``out_folder``.

    Args:
        out_folder: Destination folder, with or without a trailing separator.
        country: Label placed in the file name.
        i: Batch number appended to the file name.
        group_df: DataFrame to write; its index is not written.
    """
    base_name = "combined_csv_" + country + str(i) + ".csv"
    # os.path.join keeps the file inside out_folder even when the configured
    # folder has no trailing separator (concatenation would write next to it).
    file_name = os.path.join(out_folder, base_name)
    group_df.to_csv(file_name, sep=',', index=False, encoding='utf-8-sig')
    print(base_name + " is finished")

In [None]:
def main(input_folder, out_folder, file_list, country, i):
    """Combine the i-th pair of files from ``file_list`` into one CSV file.

    The pair is loaded, aggregated, deleted from disk and then the aggregated
    result is written to ``out_folder``.

    Args:
        input_folder: Passed through to ``rm_file``.
        out_folder: Folder handed to ``rm_file`` and ``export_csv``.
        file_list: Paths to pick the pair from.
        country: Label handed to ``export_csv`` for the output file name.
        i: Zero-based pair number; covers ``file_list[2*i : 2*i + 2]``.
    """
    start = 2 * i
    frames = []
    for path in file_list[start:start + 2]:
        # show which file is being loaded
        print(path)
        frames = read_csv_to_df(frames, path)

    # merge the loaded frames into one grouped frame
    group_df = aggregate_csv(frames)
    # NOTE: the source files are removed before the combined file is written
    rm_file(input_folder, out_folder, file_list, i)
    # write the combined result
    export_csv(out_folder, country, i, group_df)


In [None]:
# Collect the CSV paths found in the configured input folder.
all_filenames = read_csv_from_folder(input_folder)

In [None]:
# Partition the collected paths into the "Australia" files and everything else.
file_australia,file_other = sparate_australia__and_other(all_filenames)

In [None]:
# order both path lists before they are cut into pairs
file_australia.sort()
file_other.sort()

# number of files per group
l_australia = len(file_australia)
l_other = len(file_other)

# number of two-file batches per group, rounded up
r_australia = math.ceil(l_australia/2)
r_other = math.ceil(l_other/2)

create_folder(out_folder)

# Australia is handled first, then Other; a group is processed only when it
# holds more than one file.
for group_name, group_files, group_size, group_batches in (
    ("Australia", file_australia, l_australia, r_australia),
    ("Other", file_other, l_other, r_other),
):
    if group_size <= 1:
        continue
    for i in range(group_batches):
        print("---------------------")
        main(input_folder, out_folder, group_files, group_name, i)
        print("---------------------")