# GeoName

## Import Libraries

In [1]:
import pandas
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import unidecode
import os
from multiprocessing import Pool
from functools import partial
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ResourceWarning)
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)

## Read Config File

In [2]:
import configparser

# Load run-time settings from the [DEFAULT] section of config.ini.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed, so an empty result means the settings
# below would otherwise fail with an unhelpful KeyError.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found or could not be read")

settings = config['DEFAULT']

# All values are kept as strings, exactly as stored in the file; numeric
# settings (Ratio-Value, Column-Number) are converted where they are used.
flag = settings['Flag']
input_file = settings['Input-File']
ratio_value = settings['Ratio-Value']
column_number = settings['Column-Number']
output_file = settings['Output-File']
world_cities_file = settings['World-Cities-File']
world_states_file = settings['World-States-File']
world_countries_file = settings['World-Countries-File']
au_cities_file = settings['AU-Cities-File']
au_states_file = settings['AU-States-File']
au_countries_file = settings['AU-Countries-File']

## Supporting Functions

In [3]:
def read_city_file(city_file):
    """Load a city gazetteer CSV into parallel lists.

    The CSV must have a header row containing the columns ``city``,
    ``state``, ``country`` and ``geonameid``.

    Args:
        city_file: CSV location, passed straight to ``pandas.read_csv``.

    Returns:
        A 4-tuple ``(city_list, city_state_list, city_country_list,
        city_gid_list)``. Position ``k`` in every list refers to the same
        CSV row. City names are passed through ``unidecode``; the other
        columns are returned as read.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    cities = pandas.read_csv(city_file, header=0, encoding="UTF-8")
    city_list = cities["city"].tolist()
    city_state_list = cities["state"].tolist()
    city_country_list = cities["country"].tolist()

    # str() keeps non-string cells (e.g. blank names that pandas reads as
    # float NaN) from crashing unidecode, matching read_state_file.
    city_list = [unidecode.unidecode(str(x)) for x in city_list]
    city_gid_list = cities["geonameid"].tolist()

    return city_list, city_state_list, city_country_list, city_gid_list

In [4]:
def read_state_file(state_file):
    """Load a state gazetteer CSV into parallel lists.

    The CSV must have a header row containing the columns ``state``,
    ``country`` and ``geonameid``.

    Args:
        state_file: CSV location, passed straight to ``pandas.read_csv``.

    Returns:
        A 3-tuple ``(state_list, state_country_list, state_gid_list)``.
        Position ``k`` in every list refers to the same CSV row. State
        names are stringified and passed through ``unidecode``.
    """
    frame = pandas.read_csv(state_file, header=0, encoding="UTF-8")

    raw_names = frame["state"].tolist()
    countries = frame["country"].tolist()

    # Stringify first so non-string cells do not break unidecode.
    names = []
    for raw in raw_names:
        names.append(unidecode.unidecode(str(raw)))

    gids = frame["geonameid"].tolist()

    return names, countries, gids

In [5]:
def read_country_file(country_file):
    """Load a country gazetteer CSV into parallel lists.

    The CSV must have a header row containing the columns ``country`` and
    ``geonameid``.

    Args:
        country_file: CSV location, passed straight to ``pandas.read_csv``.

    Returns:
        A 2-tuple ``(country_list, country_gid_list)``. Position ``k`` in
        both lists refers to the same CSV row. Country names are passed
        through ``unidecode``.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    countries = pandas.read_csv(country_file, header=0, encoding="UTF-8")
    country_list = countries["country"].tolist()

    # str() keeps non-string cells (e.g. blank names that pandas reads as
    # float NaN) from crashing unidecode, matching read_state_file.
    country_list = [unidecode.unidecode(str(x)) for x in country_list]
    country_gid_list = countries["geonameid"].tolist()

    return country_list, country_gid_list

In [6]:
def get_max_result(i,city_list,state_list,country_list):
    """Fuzzy-match one location string against cities, states and countries.

    Args:
        i: The location string to match.
        city_list: Candidate city names.
        state_list: Candidate state names.
        country_list: Candidate country names.

    Returns:
        A 4-tuple ``(max_result, ci_result, st_result, co_result)``. The
        last three are the ``process.extractOne`` results for the city,
        state and country lists; ``max_result`` is whichever of them has
        the highest score (element 1). On a tie the earlier one in
        city, state, country order is chosen.
    """
    # Convert the query to its ASCII form before comparing.
    query = unidecode.unidecode(i)

    def best_match(choices):
        # Best candidate in `choices` according to token_set_ratio.
        return process.extractOne(query, choices, scorer=fuzz.token_set_ratio)

    ci_result = best_match(city_list)
    st_result = best_match(state_list)
    co_result = best_match(country_list)

    # max() returns the first maximal element, so order here sets tie priority.
    candidates = [ci_result, st_result, co_result]
    max_result = max(candidates, key=lambda hit: hit[1])

    return max_result, ci_result, st_result, co_result

In [15]:
# get geo information for each value in the list "user_location"
def get_geo_info(i):
    """Resolve one raw location value to city/state/country/geonameid.

    Reads the module-level ``file_dict``, ``geo`` and ``ratio_value``.

    Args:
        i: One entry of ``user_location``. Expected to be a string; any
            other value (e.g. the float NaN pandas produces for a blank
            cell, or ``None``) is treated as "no location".

    Returns:
        ``[i, city, state, country, geonameid]``. Fields that could not be
        determined are empty strings. When the best fuzzy score is below
        ``int(ratio_value)`` all four geo fields are empty.
    """
    city = state = country = geonameid = ""

    # Non-string inputs cannot be matched and would crash unidecode inside
    # get_max_result, aborting the whole pool.map; report them as unmatched.
    if not isinstance(i, str):
        return [i, city, state, country, geonameid]

    city_table = file_dict["city"][geo]
    state_table = file_dict["state"][geo]
    country_table = file_dict["country"][geo]

    city_list = city_table["city"]
    state_list = state_table["state"]
    country_list = country_table["country"]

    max_result, ci_result, st_result, co_result = get_max_result(
        i, city_list, state_list, country_list)

    # match relevant geoname information
    if max_result[1] >= int(ratio_value):
        # Branch order gives cities priority over states over countries
        # when several results compare equal.
        if max_result == ci_result:
            city = ci_result[0]
            # list.index returns the first row carrying this name.
            index = city_list.index(city)
            state = city_table["state"][index]
            country = city_table["country"][index]
            geonameid = city_table["geonameid"][index]

        elif max_result == st_result:
            state = st_result[0]
            index = state_list.index(state)
            country = state_table["country"][index]
            geonameid = state_table["geonameid"][index]

        elif max_result == co_result:
            country = co_result[0]
            index = country_list.index(country)
            geonameid = country_table["geonameid"][index]

    output_list = [i, city, state, country, geonameid]

    return output_list

## Read "world" CSV Files

In [8]:
# World gazetteer: cities (name, state, country, geonameid)
(world_city_list,
 world_city_state_list,
 world_city_country_list,
 world_city_gid_list) = read_city_file(world_cities_file)

# World gazetteer: states (name, country, geonameid)
(world_state_list,
 world_state_country_list,
 world_state_gid_list) = read_state_file(world_states_file)

# World gazetteer: countries (name, geonameid)
(world_country_list,
 world_country_gid_list) = read_country_file(world_countries_file)

## Read "Australia" CSV Files

In [9]:
# Australia gazetteer: cities (name, state, country, geonameid)
(au_city_list,
 au_city_state_list,
 au_city_country_list,
 au_city_gid_list) = read_city_file(au_cities_file)

# Australia gazetteer: states (name, country, geonameid)
(au_state_list,
 au_state_country_list,
 au_state_gid_list) = read_state_file(au_states_file)

# Australia gazetteer: country (name, geonameid)
(au_country_list,
 au_country_gid_list) = read_country_file(au_countries_file)

## Read Collection Data

In [10]:
# Load the input CSV and pull out the location column, selected by its
# zero-based position (Column-Number from the config), as a plain list.
data = pandas.read_csv(input_file, encoding="UTF-8")
location_column = data.iloc[:, int(column_number)]
user_location = location_column.tolist()

## Main Code

In [11]:
# Lookup table of gazetteer columns, addressed as
# file_dict[level][region][column], where level is "city"/"state"/"country"
# and region is "au"/"world". Lists under the same level and region are
# parallel: index k in each refers to the same gazetteer row.
file_dict = {
    "city": {
        "au": {
            "city": au_city_list,
            "state": au_city_state_list,
            "country": au_city_country_list,
            "geonameid": au_city_gid_list,
        },
        "world": {
            "city": world_city_list,
            "state": world_city_state_list,
            "country": world_city_country_list,
            "geonameid": world_city_gid_list,
        },
    },
    "state": {
        "au": {
            "state": au_state_list,
            "country": au_state_country_list,
            "geonameid": au_state_gid_list,
        },
        "world": {
            "state": world_state_list,
            "country": world_state_country_list,
            "geonameid": world_state_gid_list,
        },
    },
    "country": {
        "au": {
            "country": au_country_list,
            "geonameid": au_country_gid_list,
        },
        "world": {
            "country": world_country_list,
            "geonameid": world_country_gid_list,
        },
    },
}

In [12]:
# check flag: select which gazetteer the matching runs against
# World
if flag == "1":
    geo = "world"

# Australia
elif flag == "2":
    geo = "au"

# Any other value would leave `geo` undefined and only surface later as a
# NameError inside the matching code, so reject it here with a clear message.
else:
    raise ValueError(
        "Unsupported Flag value {!r} in config: use '1' (world) or '2' (Australia)".format(flag)
    )

In [16]:
# multiprocessing: match every location in parallel, one worker per CPU.
# The context manager shuts the worker processes down once map() has
# returned (or raised), instead of leaving the pool open.
with Pool(processes=os.cpu_count()) as pool:
    # map() preserves input order, so rows line up with user_location.
    output_list = pool.map(get_geo_info, user_location)

# write into csv file
df_output = pandas.DataFrame(
    output_list,
    columns=['user_location', 'geoname_city', 'geoname_state', 'geoname_country', 'geoname_id'],
)
df_output.to_csv(output_file, sep=',', index=False, encoding='UTF-8')

5.340670824050903
