In [4]:
import os
import pandas as pd
import seaborn as sns
from pathlib import Path
import numpy as np

def set_correct_working_dir(working_dir: str):
    current_working_dir = Path(os.getcwd())
    if working_dir == current_working_dir.name:
        print('WD correct')
        return
    for directory in current_working_dir.parents:
        if working_dir == directory.name:
            os.chdir(str(directory.absolute()))
            print('New WD:', str(directory.absolute()))
            return
    raise FileNotFoundError

set_correct_working_dir('geo-locations')

WD correct


### Reading in data

In [18]:
country_code = 'ind'
source = 'geonames'
country_path = Path(os.getcwd()) / 'data' / country_code
source = country_path / '_'.join(['source', source]) / 'IN.txt'
# column description to be found at: https://download.geonames.org/export/zip/
geonames_df = pd.read_csv(source, sep='\t', names=['country_code ', 'postal_code', 'place_name', 'admin_name_1', 'admin_code_1', 'admin_name_2', 'admin_code_2', 'admin_name_3', 'admin_code_3', 'latitude', 'longitude', 'accuracy'], converters={'postal_code': str})
display(geonames_df.head())

Unnamed: 0,country_code,postal_code,place_name,admin_name_1,admin_code_1,admin_name_2,admin_code_2,admin_name_3,admin_code_3,latitude,longitude,accuracy
0,IN,744101,Marine Jetty,Andaman & Nicobar Islands,1,South Andaman,,Portblair,,11.6667,92.75,3
1,IN,744101,Port Blair,Andaman & Nicobar Islands,1,South Andaman,,Port Blair,,11.6667,92.75,4
2,IN,744101,N.S.Building,Andaman & Nicobar Islands,1,South Andaman,,Portblair,,11.6667,92.75,3
3,IN,744102,Haddo,Andaman & Nicobar Islands,1,South Andaman,,Port Blair,,11.6833,92.7167,4
4,IN,744102,Chatham,Andaman & Nicobar Islands,1,South Andaman,,Portblair,,11.7,92.6667,3


In [19]:
try:
    geonames_df.drop(columns=['admin_code_1', 'admin_code_2', 'admin_code_3', 'accuracy'], inplace=True)
except KeyError:
    pass
geonames_df.fillna(np.NaN)
for str_col in ['admin_name_1', 'admin_name_2', 'admin_name_3', 'place_name']:
    geonames_df.loc[(geonames_df[str_col].isnull(), 'admin_name_3')] = ''  # empty string for string columns

geonames_df.loc[(geonames_df['admin_name_2'] == geonames_df['admin_name_3'], 'admin_name_3')] = ''

geonames_df = geonames_df.groupby('postal_code').agg({'admin_name_1': lambda col: ';'.join(col),
                                                      'admin_name_2': lambda col: ';'.join(col),
                                                      'admin_name_3': lambda col: ';'.join(col), 
                                                      'place_name': lambda col: '||'.join(col),
                                                      'latitude': 'mean',
                                                      'longitude': 'mean'}).reset_index()

geonames_df['admin_name_1'] = geonames_df['admin_name_1'].apply(lambda x: ';'.join(list(set([y for y in x.split(';') if y]))) if ';' in x else x)
geonames_df['admin_name_2'] = geonames_df['admin_name_2'].apply(lambda x: ';'.join(list(set([y for y in x.split(';') if y]))) if ';' in x else x)
geonames_df['admin_name_3'] = geonames_df['admin_name_3'].apply(lambda x: ';'.join(list(set([y for y in x.split(';') if y]))) if ';' in x else x)
geonames_df.loc[(geonames_df['admin_name_3'] == '', 'region_id')] = geonames_df['admin_name_1'] + '::' + geonames_df['admin_name_2'] + '::' + geonames_df['place_name']
geonames_df.loc[(geonames_df['admin_name_3'] != '', 'region_id')] = geonames_df['admin_name_1'] + '::' + geonames_df['admin_name_2'] + '::' + geonames_df['admin_name_3'] + '::' + geonames_df['place_name']
geonames_df.reset_index(inplace=True, drop=True)
geonames_df.sort_values('postal_code', inplace=True)
geonames_df.insert(loc=0, column='country_code', value=country_code)
geonames_df = geonames_df.round({'latitude': 4, 'longitude': 4})
display(geonames_df.head(50))

Unnamed: 0,country_code,postal_code,admin_name_1,admin_name_2,admin_name_3,place_name,latitude,longitude,region_id
0,ind,110001,Delhi,New Delhi;Central Delhi,New Delhi,Connaught Place||North Avenue||New Delhi G.P.O...,28.6371,77.2162,Delhi::New Delhi;Central Delhi::New Delhi::Con...
1,ind,110002,Delhi,Central Delhi,New Delhi,Indraprastha||A.G.C.R.||Darya Ganj||Minto Road...,28.5635,77.2717,Delhi::Central Delhi::New Delhi::Indraprastha|...
2,ind,110003,Delhi,South Delhi;Central Delhi,New Delhi;Delhi,C G O Complex||Kasturba Nagar (South Delhi)||D...,28.6105,77.2329,Delhi::South Delhi;Central Delhi::New Delhi;De...
3,ind,110004,Delhi,Central Delhi,New Delhi,Rashtrapati Bhawan,28.6105,77.2329,Delhi::Central Delhi::New Delhi::Rashtrapati B...
4,ind,110005,Delhi,Central Delhi,New Delhi,Bank Street (Central Delhi)||Guru Gobind Singh...,28.6516,77.1886,Delhi::Central Delhi::New Delhi::Bank Street (...
5,ind,110006,Delhi,North Delhi;Central Delhi,Delhi,Delhi G.P.O.||Jama Masjid||Chawri Bazar||Chand...,28.6105,77.2329,Delhi::North Delhi;Central Delhi::Delhi::Delhi...
6,ind,110007,Delhi,North Delhi,Delhi,C.C.I.||Jawahar Nagar (North Delhi)||Kamla Nag...,28.6105,77.2329,Delhi::North Delhi::Delhi::C.C.I.||Jawahar Nag...
7,ind,110008,Delhi,Central Delhi,New Delhi,Dada Ghosh Bhawan||Patel Nagar West||Patel Nag...,28.6105,77.2329,Delhi::Central Delhi::New Delhi::Dada Ghosh Bh...
8,ind,110009,Delhi,North West Delhi,Delhi,Model Town II||Model Town III||Gujranwala Colo...,28.6105,77.2329,Delhi::North West Delhi::Delhi::Model Town II|...
9,ind,110010,Delhi,South West Delhi,,505 A B Workshop||COD (South West Delhi)||Dhau...,28.8047,77.6823,Delhi::South West Delhi::505 A B Workshop||COD...


In [20]:
try:
    geonames_df.drop(columns=['admin_name_1', 'admin_name_2', 'admin_name_3', 'place_name'], inplace=True)
except KeyError:
    pass
file_path = country_path / '_'.join([country_code, 'geocoding.csv'])
geonames_df.to_csv(file_path, index=False)