In [1]:
import os
import pandas as pd
import seaborn as sns
from pathlib import Path
import numpy as np

def set_correct_working_dir(working_dir: str):
    """Make the nearest enclosing directory named ``working_dir`` the process CWD.

    If the current working directory is already named ``working_dir`` nothing
    changes. Otherwise its ancestors are searched from the closest to the
    filesystem root, and the process changes into the first one whose name
    matches.

    Args:
        working_dir: Directory *name* (a single path component, not a path)
            to look for.

    Raises:
        FileNotFoundError: If neither the current directory nor any of its
            ancestors is named ``working_dir``.
    """
    current_working_dir = Path(os.getcwd())
    if working_dir == current_working_dir.name:
        print('WD correct')
        return
    # `parents` is ordered nearest-first, so the closest match wins.
    for directory in current_working_dir.parents:
        if working_dir == directory.name:
            os.chdir(str(directory.absolute()))
            print('New WD:', str(directory.absolute()))
            return
    # Say what was searched for and from where, so the failure is actionable.
    raise FileNotFoundError(
        f"No directory named '{working_dir}' found at or above '{current_working_dir}'"
    )

# Switch to the 'geo-locations' directory so the paths built from os.getcwd() below resolve.
set_correct_working_dir('geo-locations')

New WD: C:\Users\vzagolla\Documents\GitHub\geo-locations


### Reading in data

In [20]:
country_code = 'ind'
country_path = Path(os.getcwd()) / 'data' / country_code
source = country_path / 'source' / 'IN.txt'
# Column description: https://download.geonames.org/export/zip/
geonames_columns = [
    'country_code', 'postal_code', 'place_name',
    'admin_name_1', 'admin_code_1',
    'admin_name_2', 'admin_code_2',
    'admin_name_3', 'admin_code_3',
    'latitude', 'longitude', 'accuracy',
]
# `names=` supplies the column labels; postal_code goes through `str` so the
# codes stay text instead of being parsed as numbers.
geonames_df = pd.read_csv(source, sep='\t', names=geonames_columns, converters={'postal_code': str})
display(geonames_df.head())

Unnamed: 0,country_code,postal_code,place_name,admin_name_1,admin_code_1,admin_name_2,admin_code_2,admin_name_3,admin_code_3,latitude,longitude,accuracy
0,IN,744101,Marine Jetty,Andaman & Nicobar Islands,1,South Andaman,,Portblair,,11.6667,92.75,3
1,IN,744101,Port Blair,Andaman & Nicobar Islands,1,South Andaman,,Port Blair,,11.6667,92.75,4
2,IN,744101,N.S.Building,Andaman & Nicobar Islands,1,South Andaman,,Portblair,,11.6667,92.75,3
3,IN,744102,Haddo,Andaman & Nicobar Islands,1,South Andaman,,Port Blair,,11.6833,92.7167,4
4,IN,744102,Chatham,Andaman & Nicobar Islands,1,South Andaman,,Portblair,,11.7,92.6667,3


### Combining Data
The target format is: country_code, npa_plz, town, state, latitude, longitude
- country code: ISO 3166-1 alpha-3

#### Geonames cleanup

In [21]:
# Build a renamed copy; geonames_df itself keeps the labels it was read with.
column_aliases = {
    'country_code ': 'country_code',
    'admin_name_1': 'region',
    'admin_name_2': 'town',
}
df = geonames_df.rename(columns=column_aliases)

#### Source specific cleanup
The place name does not seem to be needed: information at that level would be too sparse.

In [22]:
# No source-specific cleanup is needed for India.
pass

#### Reduce to unique postal codes
- Postal codes should be unique (this makes merging data MUCH easier).
- The latitude/longitude values of duplicate postal codes are merged by taking their mean; the distinct town and region names are joined with `;`.

In [23]:
def _join_unique(values: pd.Series) -> str:
    """Join the distinct non-null entries of ``values`` with ';' in first-seen order."""
    # Deduplicate on the values themselves rather than joining on ',' and
    # splitting again: that round trip breaks names that contain a comma.
    return ';'.join(values.dropna().astype(str).unique())


# Aggregate the cleaned frame `df` down to one row per postal code:
# names are merged into a ';'-separated list, coordinates are averaged.
reduced_df = (
    df.groupby('postal_code')
      .agg({'town': _join_unique,
            'region': _join_unique,
            'latitude': 'mean',
            'longitude': 'mean'})
      .reset_index()
      .sort_values('postal_code')
      .reset_index(drop=True)
)
# The output carries the notebook-level country code, not the source file's country column.
reduced_df.insert(loc=0, column='country_code', value=country_code)
display(reduced_df.head())

Unnamed: 0,country_code,postal_code,town,region,latitude,longitude
0,ind,110001,New Delhi;Central Delhi,Delhi,28.637095,77.2162
1,ind,110002,Central Delhi,Delhi,28.563486,77.271714
2,ind,110003,South Delhi;Central Delhi,Delhi,28.6105,77.2329
3,ind,110004,Central Delhi,Delhi,28.6105,77.2329
4,ind,110005,Central Delhi,Delhi,28.6516,77.1886


### Writing file

In [25]:
# Write the per-postal-code frame to <country_path>/<country_code>_geocoding.csv.
file_path = country_path / f'{country_code}_geocoding.csv'
# NOTE(review): the original wrote `df_m`, which this notebook never defines;
# `reduced_df` is the final frame produced above. Confirm no merge step is missing.
reduced_df.to_csv(file_path, index=False)