In [1]:
import glob
import os
import time
import pandas as pd
import configparser 

# Load the run settings (input/output folders and gazetteer file paths)
# from config.ini in the current working directory.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail loudly here instead of with a KeyError on
# the first option lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found or is not readable in the working directory")
input_folder = config['DEFAULT']['Input-Folder']
out_folder = config['DEFAULT']['output-Folder']
aus_file_path = config['DEFAULT']['Aus-File-Path']
world_file_path = config['DEFAULT']['World-File-Path']

In [2]:
# read all files from the folder
def read_csv_from_folder(input_folder):
    """Return the paths of every ``.csv`` file directly inside ``input_folder``.

    Args:
        input_folder: Directory to scan, with or without a trailing path
            separator.

    Returns:
        A sorted list of the matching paths as produced by ``glob``.
    """
    print("reading files from the folder ...")
    extension = 'csv'
    # os.path.join only inserts a separator when one is missing, so a folder
    # configured as "data" no longer turns into the pattern "data*.csv".
    pattern = os.path.join(input_folder, '*.{}'.format(extension))
    # glob yields entries in arbitrary order; sort so runs are reproducible.
    all_filenames = sorted(glob.glob(pattern))
    print("reading files is finished.")
    return all_filenames

In [3]:
# get list from csv file
def get_list_from_csv(data):
    """Extract the ``city``, ``state`` and ``country`` columns as Python lists.

    Args:
        data: Object exposing ``city``, ``state`` and ``country`` attributes
            that each support ``.tolist()`` (the driver passes a DataFrame).

    Returns:
        A ``(city, state, country)`` tuple of lists, one entry per row.
    """
    wanted = ("city", "state", "country")
    # Attribute access is kept (rather than data[...]) to mirror how the
    # columns have always been looked up.
    return tuple(getattr(data, column).tolist() for column in wanted)

In [4]:
# create folder if it does not exist
def create_folder(folder):
    """Ensure that the directory ``folder`` exists, creating parents as needed.

    Args:
        folder: Directory path. An empty value is treated as the current
            working directory and left alone.

    Raises:
        FileExistsError: If ``folder`` exists but is not a directory.
    """
    if not folder:
        # os.makedirs("") raises FileNotFoundError; the current directory
        # already exists, so there is nothing to create.
        return
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first.
    os.makedirs(folder, exist_ok=True)

In [5]:
# add value to new column if meet certain condition
def add_column_value(s_geo,i,df,index,city,state,country):
    """Write the matched gazetteer entry into row ``i`` of ``df``.

    Which columns are written depends on the level that matched:
    ``"city"`` writes city, state and country; ``"state"`` writes state and
    country; any other value writes country only. Values are taken from
    position ``index`` of the corresponding list and stored via ``str()``.

    Args:
        s_geo: Level that produced the match.
        i: Row label in ``df`` to update.
        df: DataFrame being annotated (modified in place).
        index: Position of the matched entry in the gazetteer lists.
        city, state, country: Parallel gazetteer lists.

    Returns:
        The same ``df`` object.
    """
    targets = []
    if s_geo == "city":
        targets.append(('city', city))
    if s_geo in ("city", "state"):
        targets.append(('state', state))
    targets.append(('country', country))

    # Written in city -> state -> country order.
    for column, values in targets:
        df.at[i, column] = str(values[index])
    return df

In [6]:
def update_location(geo,s_geo,row,i,df,exist,city,state,country,start_time,t):
    """Annotate row ``i`` with the first entry of ``geo`` found in its location.

    The row's ``user_location`` is lower-cased and scanned for each string
    entry of ``geo`` (case-insensitive substring match). For the ``"city"``
    level the word "australia" is stripped from the location first. On the
    first hit the matching gazetteer entry is written through
    ``add_column_value`` and the search stops.

    Args:
        geo: Gazetteer list to search (one of ``city``/``state``/``country``);
            non-string entries are skipped.
        s_geo: Level being searched: ``"city"``, ``"state"`` or ``"country"``.
        row: Mapping with a ``user_location`` entry.
        i: Row label in ``df``.
        df: DataFrame being annotated.
        exist: Incoming match flag, returned unchanged when nothing matches.
        city, state, country: Parallel gazetteer lists.
        start_time: Run start timestamp, forwarded to ``calculate_time``.
        t: Progress counter, forwarded to ``calculate_time``.

    Returns:
        ``(df, exist, t)`` where ``exist`` is 1 if a match was found.
    """
    location = row['user_location']
    if not isinstance(location, str):
        # A missing location is read as NaN (a float); there is nothing to
        # match and calling .lower() on it would raise AttributeError.
        return df,exist,t

    # Normalise once instead of once per gazetteer entry.
    loc = location.lower()
    if s_geo == "city":
        loc = loc.replace("australia","")

    # enumerate gives the position directly; because the loop stops at the
    # first hit, this is the same position geo.index(x) would return.
    for index, x in enumerate(geo):
        if isinstance(x,str) and x.lower() in loc:
            df = add_column_value(s_geo,i,df,index,city,state,country)
            exist = 1

            # periodically print a progress message if it's still running
            t = calculate_time(start_time, t)
            break
    return df,exist,t

In [7]:
# export to csv
def export_csv(file_name,df,out_folder) :
    """Write ``df`` to ``out_folder``/``file_name`` as a UTF-8 (with BOM) CSV.

    For files whose name contains "australia" (case-insensitive) the state
    value "Victoria" is abbreviated to "VIC" in ``df`` before writing; this
    modifies the caller's DataFrame.

    Args:
        file_name: Name of the output file.
        df: DataFrame to export.
        out_folder: Destination directory, with or without a trailing
            path separator.
    """
    print("exporting csv file...")
    # The 'state' column only exists if at least one row matched at city or
    # state level, so guard against it being absent.
    if "australia" in file_name.lower() and 'state' in df.columns:
        df['state'] = df['state'].replace({'Victoria': 'VIC'})
    # os.path.join adds the separator only when out_folder lacks one.
    out_path = os.path.join(out_folder, file_name)
    df.to_csv(out_path, sep=',',index = False, encoding='utf-8-sig')
    print(file_name + " is finished")

In [8]:
# calculate running time
def calculate_time(start_time, t, interval=10):
    """Print a progress message once another ``interval`` minutes have elapsed.

    Args:
        start_time: ``time.time()`` timestamp taken when processing started.
        t: Elapsed minutes covered by the last progress message (0 initially).
        interval: Minutes between progress messages; must be positive.

    Returns:
        The updated ``t``: unchanged if fewer than ``interval`` minutes have
        passed since the last message, otherwise advanced by every whole
        interval that has elapsed.
    """
    current_time = time.time()
    minutes = (current_time - start_time) / 60
    if minutes >= (t + interval):
        # Advance by all whole intervals elapsed, not just one. This function
        # is only called when a row matches, so several intervals may have
        # passed; stepping one at a time would print a stale figure and then
        # a burst of catch-up messages.
        t += interval * int((minutes - t) // interval)
        print("The program is still running, already run for about "+ str(t) + " minutes.")
    return t

In [9]:
def main(df,city,state,country,start_time,t):
    """Annotate every row of ``df`` with the most specific location match.

    Each row is tried against the city list first, then the state list, then
    the country list; the search for a row stops at the first level that
    reports a match.

    Args:
        df: DataFrame to annotate, iterated with ``iterrows()``.
        city, state, country: Parallel gazetteer lists.
        start_time: Run start timestamp used for progress reporting.
        t: Initial progress counter.

    Returns:
        The annotated DataFrame.
    """
    # Most specific level first.
    levels = (("city", city), ("state", state), ("country", country))
    for i, row in df.iterrows():
        exist = 0
        for s_geo, geo in levels:
            df,exist,t = update_location(geo,s_geo,row,i,df,exist,city,state,country,start_time,t)
            if exist != 0:
                # Matched at this level; skip the coarser ones.
                break
    print("New csv file is finished.")
    return df

In [10]:
# Driver: annotate every input CSV with city/state/country columns and write
# the result to the output folder under the same file name.
all_filenames = read_csv_from_folder(input_folder)
create_folder(out_folder)
for file in all_filenames:

    file_name = os.path.basename(file)
    print("--------------------")
    print("processing file: " + file_name)

    # progress-report state for this file
    start_time = time.time()
    t = 0

    # read city, state, country from csv file: files with "australia" in
    # their name use the Australian gazetteer, everything else the world one
    geo_data = pd.read_csv(
        aus_file_path if "australia" in file_name.lower() else world_file_path,
        encoding="ISO-8859-1",
    )
    city, state, country = get_list_from_csv(geo_data)

    df = pd.read_csv(file, encoding="ISO-8859-1")
    df = main(df,city,state,country,start_time,t)

    export_csv(file_name,df,out_folder)
    print("--------------------")

reading files from the folder ...
reading files is finished.
--------------------
processing file: combined_csv_Other0.csv
New csv file is finished.
exporting csv file...
combined_csv_Other0.csv is finished
--------------------
