In [72]:
import os
import pandas as pd
import seaborn as sns
from pathlib import Path
import numpy as np

def set_correct_working_dir(working_dir: str):
    """Make the directory named ``working_dir`` the current working directory.

    The current directory is checked first, then each of its ancestors from
    nearest to furthest; the process changes into the first ancestor whose
    name matches.

    Args:
        working_dir: Name (last path component) of the directory to switch to.

    Raises:
        FileNotFoundError: If neither the current directory nor any of its
            ancestors is named ``working_dir``.
    """
    current_working_dir = Path.cwd()
    if current_working_dir.name == working_dir:
        print('WD correct')
        return
    for directory in current_working_dir.parents:
        if directory.name == working_dir:
            target = str(directory.absolute())
            os.chdir(target)
            print('New WD:', target)
            return
    # Same exception type as before, but say what was searched for and from where.
    raise FileNotFoundError(
        'No directory named {!r} found at or above {}'.format(working_dir, current_working_dir)
    )

# Switch to the directory named 'geo-locations' so that later cells can build paths from os.getcwd().
set_correct_working_dir('geo-locations')

WD correct


### Reading in data

In [73]:
country_code = 'mex'
source_name = 'geonames'
country_path = Path(os.getcwd()) / 'data' / country_code
# Input file: data/<country_code>/source_<source_name>/MX.txt
source = country_path / '_'.join(['source', source_name]) / 'MX.txt'
# column description to be found at: https://download.geonames.org/export/zip/
# NOTE: the first name used to be 'country_code ' (trailing space); fixed so the
# column can be addressed by its real name.
GEONAMES_COLUMNS = [
    'country_code', 'postal_code', 'place_name',
    'admin_name_1', 'admin_code_1',
    'admin_name_2', 'admin_code_2',
    'admin_name_3', 'admin_code_3',
    'latitude', 'longitude', 'accuracy',
]
# The file is tab-separated without a header row; postal codes are read as text
# so values such as '02000' keep their leading zero.
geonames_df = pd.read_csv(source, sep='\t', names=GEONAMES_COLUMNS, converters={'postal_code': str})
display(geonames_df.head())

Unnamed: 0,country_code,postal_code,place_name,admin_name_1,admin_code_1,admin_name_2,admin_code_2,admin_name_3,admin_code_3,latitude,longitude,accuracy
0,MX,20000,Zona Centro,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.8734,-102.2806,1
1,MX,20010,San Cayetano,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1
2,MX,20010,Olivares Santana,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1
3,MX,20010,Las Brisas,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1
4,MX,20010,Ramon Romo Franco,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1


### Cleanup for Mexico City

In [74]:
# Exception for larger cities: for rows whose admin_name_1 is 'Distrito Federal',
# fold admin_name_3 into admin_name_1 and then blank admin_name_3.
is_federal_district = geonames_df['admin_name_1'] == 'Distrito Federal'
geonames_df.loc[is_federal_district, 'admin_name_1'] = geonames_df['admin_name_1'] + ' ' + geonames_df['admin_name_3']

is_mexico_city = geonames_df['admin_name_1'] == 'Distrito Federal Ciudad de México'
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
geonames_df.loc[is_mexico_city, 'admin_name_3'] = np.nan
display(geonames_df[is_mexico_city])

Unnamed: 0,country_code,postal_code,place_name,admin_name_1,admin_code_1,admin_name_2,admin_code_2,admin_name_3,admin_code_3,latitude,longitude,accuracy
27403,MX,02000,Centro de Azcapotzalco,Distrito Federal Ciudad de México,9,Azcapotzalco,2,,2.0,19.4815,-99.1862,4
27404,MX,02008,Delegación Política Azcapotzalco,Distrito Federal Ciudad de México,9,Azcapotzalco,2,,2.0,19.4815,-99.1862,1
27405,MX,02010,San Rafael,Distrito Federal Ciudad de México,9,Azcapotzalco,2,,2.0,19.3961,-99.1170,3
27406,MX,02010,Los Reyes,Distrito Federal Ciudad de México,9,Azcapotzalco,2,,2.0,19.4853,-99.1821,4
27407,MX,02010,Nuevo Barrio San Rafael,Distrito Federal Ciudad de México,9,Azcapotzalco,2,,2.0,19.3961,-99.1170,3
...,...,...,...,...,...,...,...,...,...,...,...,...
29416,MX,15960,Del Parque,Distrito Federal Ciudad de México,9,Venustiano Carranza,17,,15.0,19.4150,-99.0962,1
29417,MX,15968,Tribunales de La Suprema Corte de Justicia,Distrito Federal Ciudad de México,9,Venustiano Carranza,17,,15.0,19.4150,-99.0962,1
29418,MX,15970,Aeronáutica Militar,Distrito Federal Ciudad de México,9,Venustiano Carranza,17,,15.0,19.4244,-99.1174,4
29419,MX,15980,24 de Abril,Distrito Federal Ciudad de México,9,Venustiano Carranza,17,,15.0,19.4150,-99.0962,1


In [75]:
def _join_unique(values, sep=';'):
    """Join the distinct entries of ``values`` with ``sep``, in order of first appearance."""
    # Series.unique() keeps first-seen order, so the result is reproducible
    # between runs (a set-based de-duplication is not).
    return sep.join(values.unique())


# Drop the code/accuracy columns; errors='ignore' keeps the cell usable when
# they have already been removed.
geonames_df = geonames_df.drop(
    columns=['admin_code_1', 'admin_code_2', 'admin_code_3', 'accuracy'],
    errors='ignore',
)

# Empty string for missing values in the string columns. Each column is filled
# itself (previously only 'admin_name_3' was ever written), so the string joins
# below never meet a NaN.
name_columns = ['admin_name_1', 'admin_name_2', 'admin_name_3', 'place_name']
geonames_df[name_columns] = geonames_df[name_columns].fillna('')

# An admin_name_3 that merely repeats admin_name_2 carries no extra information.
geonames_df.loc[geonames_df['admin_name_2'] == geonames_df['admin_name_3'], 'admin_name_3'] = ''

# One row per postal code: distinct admin names joined with ';', all place
# names joined with '||', coordinates averaged.
geonames_df = (
    geonames_df
    .groupby('postal_code')
    .agg({'admin_name_1': _join_unique,
          'admin_name_2': _join_unique,
          'admin_name_3': _join_unique,
          'place_name': lambda col: '||'.join(col),
          'latitude': 'mean',
          'longitude': 'mean'})
    .reset_index()
)

# region_id = admin_name_1::admin_name_2[::admin_name_3]::place_name
# (the admin_name_3 segment is left out when it is empty).
has_admin_name_3 = geonames_df['admin_name_3'] != ''
admin_name_3_part = ('::' + geonames_df['admin_name_3']).where(has_admin_name_3, '')
geonames_df['region_id'] = (
    geonames_df['admin_name_1'] + '::' + geonames_df['admin_name_2']
    + admin_name_3_part
    + '::' + geonames_df['place_name']
)

geonames_df = geonames_df.sort_values('postal_code').reset_index(drop=True)
geonames_df.insert(loc=0, column='country_code', value=country_code)
geonames_df = geonames_df.round({'latitude': 4, 'longitude': 4})
display(geonames_df.head())

Unnamed: 0,country_code,postal_code,admin_name_1,admin_name_2,admin_name_3,place_name,latitude,longitude,region_id
0,mex,1000,Distrito Federal Ciudad de México,Álvaro Obregón,,San Ángel,19.3587,-99.2033,Distrito Federal Ciudad de México::Álvaro Obre...
1,mex,1010,Distrito Federal Ciudad de México,Álvaro Obregón,,Los Alpes,19.3569,-99.21,Distrito Federal Ciudad de México::Álvaro Obre...
2,mex,1020,Distrito Federal Ciudad de México,Álvaro Obregón,,Guadalupe Inn,19.5161,-99.1419,Distrito Federal Ciudad de México::Álvaro Obre...
3,mex,1029,Distrito Federal Ciudad de México,Álvaro Obregón,,INFONAVIT,19.3362,-99.2468,Distrito Federal Ciudad de México::Álvaro Obre...
4,mex,1030,Distrito Federal Ciudad de México,Álvaro Obregón,,Axotla||Florida,19.3567,-99.1784,Distrito Federal Ciudad de México::Álvaro Obre...


In [79]:
# Keep only the columns needed in the export. errors='ignore' drops whichever
# of these columns are still present instead of silently skipping the whole
# drop when a single one is missing.
geonames_df = geonames_df.drop(
    columns=['admin_name_1', 'admin_name_2', 'admin_name_3', 'place_name'],
    errors='ignore',
)
# Written to data/<country_code>/<country_code>_geocoding.csv without the index column.
file_path = country_path / '_'.join([country_code, 'geocoding.csv'])
geonames_df.to_csv(file_path, index=False)