# GeoName

## Import Libraries

In [None]:
import pandas
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import unidecode
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ResourceWarning)
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)

## Read Config File

In [None]:
import configparser
# Load the run settings. Every option below is looked up in the [DEFAULT]
# section of config.ini, and configparser returns every value as a string.
config = configparser.ConfigParser()
# NOTE: ConfigParser.read() skips files it cannot open instead of raising, so a
# missing config.ini only shows up as a KeyError on the first lookup below.
config.read('config.ini')
# Selects the gazetteer used by the main cell: "1" = world, "2" = Australia.
flag = config['DEFAULT']['Flag']
# CSV file containing the user locations to resolve.
input_file = config['DEFAULT']['Input-File']
# Minimum fuzzy-match score for a match to be accepted (converted with int() later).
ratio_value = config['DEFAULT']['Ratio-Value']
# Positional index of the location column in the input file (converted with int() later).
column_number = config['DEFAULT']['Column-Number']
# CSV file the results are appended to.
output_file = config['DEFAULT']['Output-File']
# Gazetteer CSV paths for the world data set.
world_cities_file = config['DEFAULT']['World-Cities-File']
world_states_file = config['DEFAULT']['World-States-File']
world_countries_file = config['DEFAULT']['World-Countries-File']
# Gazetteer CSV paths for the Australia data set.
au_cities_file = config['DEFAULT']['AU-Cities-File']
au_states_file = config['DEFAULT']['AU-States-File']
au_countries_file = config['DEFAULT']['AU-Countries-File']

## Supporting Functions

In [None]:
def read_city_file(city_file):
    """Load a city gazetteer CSV into four parallel lists.

    The file must have a header row and the columns ``city``, ``state``,
    ``country`` and ``geonameid``.

    Args:
        city_file: Path to the UTF-8 encoded CSV file.

    Returns:
        A tuple ``(city_list, city_state_list, city_country_list,
        city_gid_list)``. All four lists are in file order, so the same index
        refers to the same row in each. City names are transliterated to
        ASCII with ``unidecode``; the other columns are returned as read.
    """
    cities = pandas.read_csv(city_file, header=0, encoding="UTF-8")
    raw_city_names = cities["city"].tolist()
    city_state_list = cities["state"].tolist()
    city_country_list = cities["country"].tolist()

    # str() keeps a blank or non-string cell (pandas yields float NaN for an
    # empty field) from crashing unidecode, which only accepts text. This
    # mirrors what read_state_file already does for state names.
    city_list = [unidecode.unidecode(str(name)) for name in raw_city_names]
    city_gid_list = cities["geonameid"].tolist()

    return city_list, city_state_list, city_country_list, city_gid_list

In [None]:
def read_state_file(state_file):
    """Load a state gazetteer CSV into three parallel lists.

    The file must have a header row and the columns ``state``, ``country``
    and ``geonameid``.

    Args:
        state_file: Path to the UTF-8 encoded CSV file.

    Returns:
        A tuple ``(state_list, state_country_list, state_gid_list)`` in file
        order. State names are converted to ``str`` and transliterated to
        ASCII with ``unidecode``; the other columns are returned as read.
    """
    frame = pandas.read_csv(state_file, header=0, encoding="UTF-8")
    raw_names = frame["state"].tolist()
    parent_countries = frame["country"].tolist()

    # Stringify first so non-text cells survive, then transliterate to ASCII.
    ascii_names = list(map(unidecode.unidecode, map(str, raw_names)))
    geoname_ids = frame["geonameid"].tolist()

    return ascii_names, parent_countries, geoname_ids

In [None]:
def read_country_file(country_file):
    """Load a country gazetteer CSV into two parallel lists.

    The file must have a header row and the columns ``country`` and
    ``geonameid``.

    Args:
        country_file: Path to the UTF-8 encoded CSV file.

    Returns:
        A tuple ``(country_list, country_gid_list)`` in file order. Country
        names are transliterated to ASCII with ``unidecode``; the ids are
        returned as read.
    """
    countries = pandas.read_csv(country_file, header=0, encoding="UTF-8")
    raw_country_names = countries["country"].tolist()

    # str() keeps a blank or non-string cell (pandas yields float NaN for an
    # empty field) from crashing unidecode, which only accepts text. This
    # mirrors what read_state_file already does for state names.
    country_list = [unidecode.unidecode(str(name)) for name in raw_country_names]
    country_gid_list = countries["geonameid"].tolist()

    return country_list, country_gid_list

In [None]:
# write csv file
def write_csv(output_file, header, user_location, city, state, country, geonameid):
    """Append one result row to the output CSV.

    Args:
        output_file: Path of the CSV file; it is opened in append mode, so
            existing content is kept.
        header: ``0`` to write the column header line before the row; any
            other value writes the row only.
        user_location: The original location text from the input data.
        city: Matched city name, or ``""`` when there is none.
        state: Matched state name, or ``""`` when there is none.
        country: Matched country name, or ``""`` when there is none.
        geonameid: Identifier of the matched entry, or ``""`` when there is none.

    Every field is written as text inside double quotes. Embedded double
    quotes are escaped by doubling them so a location such as ``The "Gong"``
    still produces a well-formed row.
    """
    # Imported locally so this function does not rely on the file's import cell.
    import csv

    # utf-8 keeps non-ASCII locations writable regardless of the platform
    # locale; newline='' is what the csv module expects so that line endings
    # are not translated a second time.
    with open(output_file, 'a', encoding='utf-8', newline='') as f:
        if header == 0:
            f.write("user_location,geoname_city,geoname_state,geoname_country,geoname_id" + "\n")

        writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\n")
        # str() reproduces the text "{}".format() produced for each value.
        writer.writerow([str(value) for value in (user_location, city, state, country, geonameid)])

In [None]:
def get_max_result(i, city_list, state_list, country_list):
    """Fuzzy-match one location string against cities, states and countries.

    Args:
        i: The location text to look up. It is passed straight to
            ``unidecode``, so it is expected to be a string.
        city_list: Candidate city names.
        state_list: Candidate state names.
        country_list: Candidate country names.

    Returns:
        A tuple ``(max_result, ci_result, st_result, co_result)``. The last
        three are the best ``process.extractOne`` match in each list, scored
        with ``fuzz.token_set_ratio``. ``max_result`` is whichever of them has
        the highest score (element ``[1]``); on a tie the earlier of city,
        state, country wins because ``max`` keeps the first maximal item.
    """
    # Transliterate to ASCII so the query is comparable with the gazetteer names.
    query = unidecode.unidecode(i)

    # Best match per gazetteer, evaluated in city -> state -> country order.
    ci_result, st_result, co_result = (
        process.extractOne(query, candidates, scorer=fuzz.token_set_ratio)
        for candidates in (city_list, state_list, country_list)
    )

    # Overall winner by score.
    max_result = max((ci_result, st_result, co_result), key=lambda match: match[1])

    return max_result, ci_result, st_result, co_result

In [None]:
def get_geo_info(user_location, city_list, city_state_list, city_country_list, city_gid_list,
                 state_list, state_country_list, state_gid_list, country_list, country_gid_list,
                 ratio_value):
    """Resolve each user location to a gazetteer entry and append it to the output CSV.

    For every location the best fuzzy match among cities, states and countries
    is taken from ``get_max_result``. If its score reaches ``ratio_value`` the
    matching entry's names and id are written; otherwise the geoname fields
    stay empty. Exactly one row is written per input location through
    ``write_csv``, and the header line is requested for the first row only.

    NOTE: rows are written to the module-level ``output_file``; it is not a
    parameter of this function.

    Args:
        user_location: Iterable of scalar location values (text, or a missing
            value such as NaN/None for an empty input cell).
        city_list, city_state_list, city_country_list, city_gid_list:
            Parallel lists describing the cities.
        state_list, state_country_list, state_gid_list:
            Parallel lists describing the states.
        country_list, country_gid_list:
            Parallel lists describing the countries.
        ratio_value: Minimum score to accept a match; anything ``int()`` accepts.

    Raises:
        ValueError: If ``ratio_value`` cannot be converted to an integer.
    """
    # Config values arrive as strings; convert once instead of on every row.
    threshold = int(ratio_value)

    header = 0
    for i in user_location:
        city = state = country = geonameid = ""
        location = i

        if pandas.isna(i):
            # An empty input cell is read by pandas as NaN. Matching it would
            # crash inside unidecode, so emit an unmatched row with an empty
            # location and keep the output aligned with the input rows.
            location = ""
        else:
            max_result, ci_result, st_result, co_result = get_max_result(
                i, city_list, state_list, country_list)

            # match relevant geoname information
            if max_result[1] >= threshold:
                # The equality checks run city -> state -> country, so a name
                # that ties across lists resolves to the most specific level.
                if max_result == ci_result:
                    city = ci_result[0]
                    index = city_list.index(city)
                    state = city_state_list[index]
                    country = city_country_list[index]
                    geonameid = city_gid_list[index]

                elif max_result == st_result:
                    state = st_result[0]
                    index = state_list.index(state)
                    country = state_country_list[index]
                    geonameid = state_gid_list[index]

                elif max_result == co_result:
                    country = co_result[0]
                    index = country_list.index(country)
                    geonameid = country_gid_list[index]

        write_csv(output_file, header, location, city, state, country, geonameid)
        # Only the first row asks write_csv for the header line.
        header = 1

## Read "world" CSV Files

In [None]:
# World gazetteer: cities (names plus their state, country and geoname id)
(
    world_city_list,
    world_city_state_list,
    world_city_country_list,
    world_city_gid_list,
) = read_city_file(world_cities_file)

# World gazetteer: states (names plus their country and geoname id)
(
    world_state_list,
    world_state_country_list,
    world_state_gid_list,
) = read_state_file(world_states_file)

# World gazetteer: countries (names plus geoname id)
(
    world_country_list,
    world_country_gid_list,
) = read_country_file(world_countries_file)

## Read "Australia" CSV Files

In [None]:
# Australia gazetteer: cities (names plus their state, country and geoname id)
(
    au_city_list,
    au_city_state_list,
    au_city_country_list,
    au_city_gid_list,
) = read_city_file(au_cities_file)

# Australia gazetteer: states (names plus their country and geoname id)
(
    au_state_list,
    au_state_country_list,
    au_state_gid_list,
) = read_state_file(au_states_file)

# Australia gazetteer: countries (names plus geoname id)
(
    au_country_list,
    au_country_gid_list,
) = read_country_file(au_countries_file)

## Read Collection Data

In [None]:
# Load the input collection from the configured CSV file.
data = pandas.read_csv(input_file, encoding="UTF-8")
# column_number comes from config.ini as a string, so it is converted to int;
# iloc then selects that column by position and tolist() turns it into a
# plain Python list of location values.
user_location = data.iloc[:,int(column_number)].tolist()

## Main Code

In [None]:
# Pick the gazetteer according to the configured Flag. configparser returns
# strings, hence the comparison against "1" and "2" rather than integers.
# World
if flag == "1":
    get_geo_info(
        user_location,
        world_city_list, world_city_state_list, world_city_country_list, world_city_gid_list,
        world_state_list, world_state_country_list, world_state_gid_list,
        world_country_list, world_country_gid_list,
        ratio_value,
    )
# Australia
elif flag == "2":
    get_geo_info(
        user_location,
        au_city_list, au_city_state_list, au_city_country_list, au_city_gid_list,
        au_state_list, au_state_country_list, au_state_gid_list,
        au_country_list, au_country_gid_list,
        ratio_value,
    )
else:
    # Any other value used to fall through silently and produce no output at
    # all; fail loudly so a misconfigured run is noticed.
    raise ValueError(
        'Unsupported Flag value {!r} in config.ini; expected "1" (world) or "2" (Australia)'.format(flag)
    )