In [2]:
import os
import pandas as pd
import seaborn as sns
from pathlib import Path
import numpy as np

def set_correct_working_dir(working_dir: str) -> None:
    """Make the directory named ``working_dir`` the current working directory.

    The current working directory is checked first; if its name already equals
    ``working_dir`` nothing is changed. Otherwise its ancestors are walked from
    the nearest to the filesystem root, and the process changes into the first
    one whose name equals ``working_dir``.

    Args:
        working_dir: Bare directory name (not a path) to look for.

    Raises:
        FileNotFoundError: If neither the current working directory nor any of
            its ancestors is named ``working_dir``.
    """
    current_working_dir = Path.cwd()
    if current_working_dir.name == working_dir:
        print('WD correct')
        return
    for directory in current_working_dir.parents:
        if directory.name == working_dir:
            os.chdir(directory)
            print('New WD:', str(directory))
            return
    # Say what was searched for and where, instead of raising an empty exception.
    raise FileNotFoundError(
        f"No directory named '{working_dir}' found at or above '{current_working_dir}'"
    )

# Switch to the directory named 'geo-locations' (the current directory or one of its
# ancestors); later cells build their data paths from os.getcwd().
set_correct_working_dir('geo-locations')

New WD: C:\Users\vzagolla\Documents\GitHub\geo-locations


### Reading in data

In [15]:
country_code = 'mex'
country_path = Path.cwd() / 'data' / country_code
source = country_path / 'source_1' / 'MX.txt'
# The file is read as tab-separated with no header row, so the column names are supplied here.
# Column description to be found at: https://download.geonames.org/export/zip/
geonames_columns = [
    'country_code', 'postal_code', 'place_name',
    'admin_name_1', 'admin_code_1',
    'admin_name_2', 'admin_code_2',
    'admin_name_3', 'admin_code_3',
    'latitude', 'longitude', 'accuracy',
]
# postal_code is converted with str so the values are kept as text
# (numeric parsing would drop any leading zeros).
geonames_df = pd.read_csv(source, sep='\t', names=geonames_columns, converters={'postal_code': str})
display(geonames_df.head())

Unnamed: 0,country_code,postal_code,place_name,admin_name_1,admin_code_1,admin_name_2,admin_code_2,admin_name_3,admin_code_3,latitude,longitude,accuracy
0,MX,20000,Zona Centro,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.8734,-102.2806,1
1,MX,20010,San Cayetano,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1
2,MX,20010,Olivares Santana,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1
3,MX,20010,Las Brisas,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1
4,MX,20010,Ramon Romo Franco,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1


### Combining Data
The target format is: country_code, npa_plz, town, state, latitude, longitude
- country_code: ISO 3166-1 alpha-3

#### Geonames cleanup

In [16]:
# Build a renamed copy so that geonames_df itself stays untouched.
# Note the first key carries a trailing space; rename() ignores keys that are not present.
column_renames = {
    'country_code ': 'country_code',
    'admin_name_1': 'region',
    'admin_name_2': 'town',
}
df = geonames_df.rename(columns=column_renames)

#### Source specific cleanup
The `place_name` column does not seem to be needed: information at that level would be too sparse.

In [17]:
# Exception for larger cities: rows whose region is 'Distrito Federal' get the
# admin_name_3 value appended as '<region>:<admin_name_3>'.
# Rows without an admin_name_3 are left alone, because concatenating a missing
# value would turn the whole region into NaN.
is_federal_district = (df['region'] == 'Distrito Federal') & df['admin_name_3'].notna()
df.loc[is_federal_district, 'region'] = (
    df.loc[is_federal_district, 'region'] + ':' + df.loc[is_federal_district, 'admin_name_3']
)
# Keep only the columns used from here on; .copy() detaches the result from df.
reduced_df = df[['country_code', 'postal_code', 'town', 'region', 'latitude', 'longitude']].copy()

#### Reduce to unique postal codes
- Postal codes should be unique (this makes merging data MUCH easier).
- The latitude/longitude values of duplicate postal codes are merged by taking their mean.

In [22]:
def _join_unique(values: pd.Series) -> str:
    """Join the distinct non-missing entries of ``values`` with ';'.

    Entries keep the order of their first appearance, so the result is the same
    on every run. Each entry is used as-is and never split, so names that
    contain a comma stay intact.
    """
    return ';'.join(values.dropna().unique())


# One row per postal code: town/region become ';'-separated lists of their distinct
# values, latitude/longitude become the mean over all rows sharing the code.
reduced_df = (
    reduced_df
    .groupby('postal_code')
    .agg({'town': _join_unique,
          'region': _join_unique,
          'latitude': 'mean',
          'longitude': 'mean'})
    .reset_index()
    .sort_values('postal_code')
    .reset_index(drop=True)
    .round({'latitude': 4, 'longitude': 4})
)
# The aggregation only keeps the columns listed above, so country_code is put back
# as the first column, filled with the notebook-level country_code value.
reduced_df.insert(loc=0, column='country_code', value=country_code)
display(reduced_df.head())

Unnamed: 0,country_code,postal_code,town,region,latitude,longitude
0,mex,1000,Álvaro Obregón,Distrito Federal:Ciudad de México,19.3587,-99.2033
1,mex,1010,Álvaro Obregón,Distrito Federal:Ciudad de México,19.3569,-99.21
2,mex,1020,Álvaro Obregón,Distrito Federal:Ciudad de México,19.5161,-99.1419
3,mex,1029,Álvaro Obregón,Distrito Federal:Ciudad de México,19.3362,-99.2468
4,mex,1030,Álvaro Obregón,Distrito Federal:Ciudad de México,19.3567,-99.1784


### Writing file

In [23]:
# The output file name is the country code and 'geocoding.csv' joined by an underscore;
# it is written into the country's data directory.
output_name = '_'.join((country_code, 'geocoding.csv'))
file_path = country_path / output_name
# index=False: the row index is not written to the file.
reduced_df.to_csv(path_or_buf=file_path, index=False)