In [8]:
import os
import pandas as pd
import seaborn as sns
from pathlib import Path
import numpy as np

def set_correct_working_dir(working_dir: str):
    current_working_dir = Path(os.getcwd())
    if working_dir == current_working_dir.name:
        print('WD correct')
        return
    for directory in current_working_dir.parents:
        if working_dir == directory.name:
            os.chdir(str(directory.absolute()))
            print('New WD:', str(directory.absolute()))
            return
    raise FileNotFoundError

set_correct_working_dir('geo-locations')

WD correct


### Reading in data

In [9]:
country_path = Path(os.getcwd()) / 'data' / 'americas' / 'MEX' 
source = country_path / 'source_1' / 'MX.txt'
# column description to be found at: https://download.geonames.org/export/zip/
mexico_df = pd.read_csv(source, sep='\t', header=0, names=['country_code ', 'postal_code', 'place_name', 'admin_name_1', 'admin_code_1', 'admin_name_2', 'admin_code_2', 'admin_name_3', 'admin_code_3', 'latitude', 'longitude', 'accuracy'])
display(mexico_df.head())

Unnamed: 0,country_code,postal_code,place_name,admin_name_1,admin_code_1,admin_name_2,admin_code_2,admin_name_3,admin_code_3,latitude,longitude,accuracy
0,MX,20010,San Cayetano,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1
1,MX,20010,Olivares Santana,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1
2,MX,20010,Las Brisas,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1
3,MX,20010,Ramon Romo Franco,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1
4,MX,20010,Colinas del Rio,Aguascalientes,1,Aguascalientes,1,Aguascalientes,1.0,21.9644,-102.3192,1


### Combining Data
The wanted format is: country_code, npa_plz, town, state, latitude, longitude
- country code: ISO 3166, alpha3

#### Source specific cleanup
Place name doesn't seem to be needed. Information at that level would be too sparse

In [12]:
df = mexico_df.copy()
df.rename(columns={'country_code ': 'country_code', 'admin_name_1': 'region', 'admin_name_2': 'town'}, inplace=True)
df.loc[(df['region'] == 'Distrito Federal'), 'region'] = df['admin_name_3'] + ', ' + df['region']  # Exception for larger cities
reduced_df = df[['country_code', 'postal_code', 'town', 'region', 'latitude', 'longitude']].copy()
reduced_df['country_code'] = 'MEX'
display(reduced_df.head())

Unnamed: 0,country_code,postal_code,town,region,latitude,longitude
0,MEX,20010,Aguascalientes,Aguascalientes,21.9644,-102.3192
1,MEX,20010,Aguascalientes,Aguascalientes,21.9644,-102.3192
2,MEX,20010,Aguascalientes,Aguascalientes,21.9644,-102.3192
3,MEX,20010,Aguascalientes,Aguascalientes,21.9644,-102.3192
4,MEX,20010,Aguascalientes,Aguascalientes,21.9644,-102.3192


#### Reduce to unique postal codes
- Postal codes should be unique (makes data merging MUCH easier)
- the lat/long data of duplicate postal_codes is merged by taking the mean

In [13]:
# display(reduced_df[reduced_df.duplicated()])  # shows duplicated lines
reduced_df.drop_duplicates(inplace=True)
display(reduced_df[reduced_df.duplicated('postal_code')])
df_long_lat = reduced_df.groupby('postal_code').agg('mean')
df_info = reduced_df[['country_code', 'postal_code', 'town', 'region']].copy()
df_m = df_info.merge(df_long_lat, how='left', on='postal_code')
df_m.drop_duplicates(inplace=True)
df_m = df_m.round({'latitude': 4, 'longitude': 4})
display(display(df_m[df_m.duplicated()]))

Unnamed: 0,country_code,postal_code,town,region,latitude,longitude
7,MEX,20016,Aguascalientes,Aguascalientes,21.8115,-102.2957
11,MEX,20020,Aguascalientes,Aguascalientes,21.7000,-102.3833
12,MEX,20020,Aguascalientes,Aguascalientes,21.8115,-102.2957
16,MEX,20030,Aguascalientes,Aguascalientes,21.9700,-102.3069
19,MEX,20040,Aguascalientes,Aguascalientes,21.8115,-102.2957
...,...,...,...,...,...,...
144648,MEX,99827,Santa María de la Paz,Zacatecas,21.4775,-103.3036
144649,MEX,99827,Santa María de la Paz,Zacatecas,21.5106,-103.2881
144651,MEX,99828,Santa María de la Paz,Zacatecas,21.5373,-103.2057
144652,MEX,99828,Santa María de la Paz,Zacatecas,21.5217,-103.2324


Unnamed: 0,country_code,postal_code,town,region,latitude,longitude


None

### Writing file

In [14]:
file_path = country_path / 'MEX_geocoding.csv'
df_m.to_csv(file_path, index=False)