# Base

- DICT_long_province_alphanumerics `{'province alphanumeric': 'province key', ...}`
- DICT_long_district_alphanumerics `{'Province English': {'district alphanumeric': 'district key', ...}}`
- DICT_long_ward_alphanumerics `{'Province English': {'District English': {'long ward alphanumeric': 'ward key'}}}` 

- LIST_long_district_alphanumerics `['district alphanumeric', ...]`
- LIST_long_ward_alphanumerics `['long ward alphanumeric', ...]`

- LIST_province_keys_1 `['short province key', ...]`
- LIST_province_keys_2 `['short province key', ...]`
- LIST_province_keys_3 `['short province key', ...]`

- DICT_province_map `{'short province key': {'province': 'Hồ Chí Minh', 'long_province': 'Thành phố Hồ Chí Minh', ...}, ...}`
- DICT_district_map `{'Province English': {'district key': {'level english': {'district': 'Tân Bình', 'long_district': 'Quận Tân Bình', ...}}}}`
- DICT_ward_map `{'Province English':{'District English': {'ward key': {'level english': {'ward': 'Phú Trinh', 'long_ward': 'Phường Phú Trinh', ...}}}}}`

- LIST_duplicated_district_province_keys `['short province key', ...]`
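- DICT_duplicated_district_keys `{'Province English': {'district key': {'default': 'district level english', 'levels': {'district level english': ['ward key', ...]}}}}`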

- LIST_duplicated_ward_district_keys `['short district key', ...]`
- DICT_duplicated_ward_keys `['short ward key', ...]`


- DICT_unique_long_district_alphanumerics `{'long_district_alphanumeric': 'province_key'}`
- DICT_not_unique_long_district_alphanumerics
- LIST_contains_province_key_long_district_alphanumerics
- DICT_unique_district_keys `{'district_key': 'province_key'}`
- DICT_not_unique_district_keys `{'district_key': {'province_key 1': ['ward key', ...], 'province_key 2': ['ward key', ...]}}`

- DICT_alias_province_keys `{'alias province key': 'province key', ...}`


In [1]:
import numpy as np
import pandas as pd
import re
from unidecode import unidecode
import pickle
import sys
sys.path.append('../../vietadminunits')
from utils import to_alphanumeric, to_key, abbreviate_alphanumeric_prefix
from tqdm.auto import tqdm

In [2]:
# Load the administrative-unit table from which every lookup structure in this notebook is derived.
df = pd.read_csv('../../data/output/vietnam_administrative_units.csv')

In [3]:
df.shape

(10547, 20)

In [4]:
# Lookup keys for each administrative unit. The extra positional argument
# handed to to_key is 1 / 2 / 3 for province / district / ward
# (presumably the administrative level -- confirm against utils.to_key).
df['province_key'] = df['province'].apply(to_key, args=(1,))
df['district_key'] = df['short_district'].apply(to_key, args=(2,))
df['ward_key'] = df['short_ward'].apply(to_key, args=(3,))

# Alphanumeric form of the long Vietnamese names.
df['long_province_alphanumeric'] = df['long_province'].apply(to_alphanumeric)
df['long_district_alphanumeric'] = df['long_district'].apply(to_alphanumeric)
df['long_ward_alphanumeric'] = df['long_ward'].apply(to_alphanumeric)

# Alphanumeric form of the long English names.
df['long_province_alphanumeric_english'] = df['long_province_english'].apply(to_alphanumeric)
df['long_district_alphanumeric_english'] = df['long_district_english'].apply(to_alphanumeric)
df['long_ward_alphanumeric_english'] = df['long_ward_english'].apply(to_alphanumeric)

# Alphanumeric form of the short names.
df['province_alphanumeric'] = df['province'].apply(to_alphanumeric)
df['short_district_alphanumeric'] = df['short_district'].apply(to_alphanumeric)
df['short_ward_alphanumeric'] = df['short_ward'].apply(to_alphanumeric)

# Replace missing level labels with '' so they can be used as dictionary keys
# later on. The result is assigned back instead of calling
# fillna(..., inplace=True) on the column selection: that chained in-place
# form raises a FutureWarning in pandas 2.x and has no effect on df once
# copy-on-write is enabled.
df['district_level_english'] = df['district_level_english'].fillna('')
df['ward_level_english'] = df['ward_level_english'].fillna('')

## Create DICT long alphanumerics

In [5]:
# Map every spelling of a province's long name (Vietnamese, its "tp."
# abbreviation, and English) to the province key.
DICT_long_province_alphanumerics = {}

# Vietnamese long names: keep the first row of each distinct name.
first_vietnamese_rows = df.drop_duplicates(subset='long_province_alphanumeric')
for long_name, province_key in zip(first_vietnamese_rows['long_province_alphanumeric'],
                                   first_vietnamese_rows['province_key']):
    DICT_long_province_alphanumerics[long_name] = province_key
    if 'thanhpho' in long_name:
        # Register the abbreviated spelling as well ("thanhpho" -> "tp.").
        DICT_long_province_alphanumerics[long_name.replace('thanhpho', 'tp.')] = province_key

# English long names: same rule, first row of each distinct name.
first_english_rows = df.drop_duplicates(subset='long_province_alphanumeric_english')
for long_name_english, province_key in zip(first_english_rows['long_province_alphanumeric_english'],
                                           first_english_rows['province_key']):
    DICT_long_province_alphanumerics[long_name_english] = province_key

In [6]:
# Re-order the dictionary so the longest patterns come first
# (presumably so a longer pattern is tried before any shorter one it contains).
DICT_long_province_alphanumerics = dict(sorted(DICT_long_province_alphanumerics.items(), key=lambda x: len(x[0]), reverse=True))

In [7]:
DICT_long_province_alphanumerics

{'thuathienhueprovince': 'thuathienhue',
 'bariavungtauprovince': 'bariavungtau',
 'tuyenquangprovince': 'tuyenquang',
 'thainguyenprovince': 'thainguyen',
 'thanhphohochiminh': 'hochiminh',
 'quangninhprovince': 'quangninh',
 'quangbinhprovince': 'quangbinh',
 'quangngaiprovince': 'quangngai',
 'ninhthuanprovince': 'ninhthuan',
 'binhthuanprovince': 'binhthuan',
 'binhphuocprovince': 'binhphuoc',
 'binhduongprovince': 'binhduong',
 'tiengiangprovince': 'tiengiang',
 'kiengiangprovince': 'kiengiang',
 'thanhphohaiphong': 'haiphong',
 'tinhthuathienhue': 'thuathienhue',
 'tinhbariavungtau': 'bariavungtau',
 'dienbienprovince': 'dienbien',
 'bacgiangprovince': 'bacgiang',
 'vinhphucprovince': 'vinhphuc',
 'haiduongprovince': 'haiduong',
 'thaibinhprovince': 'thaibinh',
 'ninhbinhprovince': 'ninhbinh',
 'thanhhoaprovince': 'thanhhoa',
 'quangtriprovince': 'quangtri',
 'quangnamprovince': 'quangnam',
 'binhdinhprovince': 'binhdinh',
 'khanhhoaprovince': 'khanhhoa',
 'vinhlongprovince': 'vi

In [8]:
# {'Province English': {'district alphanumeric': 'district key', ...}}
# For every province, map each long district name (Vietnamese, its
# abbreviated-prefix alias, and English) to the district key.
DICT_long_district_alphanumerics = {}
for province_english in df['province_english'].unique().tolist():
    # Slice the province once instead of re-filtering the whole frame for
    # every district name.
    province_rows = df[df['province_english'] == province_english]
    district_data = {}

    # Vietnamese long names: the first row of each distinct name supplies the key.
    vietnamese_rows = province_rows.drop_duplicates(subset='long_district_alphanumeric')
    for long_district_alphanumeric, district_key in zip(vietnamese_rows['long_district_alphanumeric'],
                                                        vietnamese_rows['district_key']):
        district_data[long_district_alphanumeric] = district_key
        # Alias with the level prefix abbreviated by abbreviate_alphanumeric_prefix.
        district_data[abbreviate_alphanumeric_prefix(long_district_alphanumeric)] = district_key

    # English long names.
    english_rows = province_rows.drop_duplicates(subset='long_district_alphanumeric_english')
    for long_district_alphanumeric_english, district_key in zip(english_rows['long_district_alphanumeric_english'],
                                                                english_rows['district_key']):
        district_data[long_district_alphanumeric_english] = district_key

    # Longest patterns first.
    district_data = dict(sorted(district_data.items(), key=lambda x: len(x[0]), reverse=True))
    DICT_long_district_alphanumerics[province_english] = district_data

In [9]:
DICT_long_district_alphanumerics

{'Ha Noi': {'haibatrungdistrict': 'haibatrung',
  'thanhxuandistrict': 'thanhxuan',
  'namtuliemdistrict': 'namtuliem',
  'bactuliemdistrict': 'bactuliem',
  'danphuongdistrict': 'danphuong',
  'thachthatdistrict': 'thachthat',
  'thuongtindistrict': 'thuongtin',
  'hoankiemdistrict': 'hoankiem',
  'longbiendistrict': 'longbien',
  'hoangmaidistrict': 'hoangmai',
  'thanhtridistrict': 'thanhtri',
  'chuongmydistrict': 'chuongmy',
  'thanhoaidistrict': 'thanhoai',
  'phuxuyendistrict': 'phuxuyen',
  'caugiaydistrict': 'caugiay',
  'donganhdistrict': 'donganh',
  'phucthodistrict': 'phuctho',
  'hoaiducdistrict': 'hoaiduc',
  'quocoaidistrict': 'quocoai',
  'quanhaibatrung': 'haibatrung',
  'huyendanphuong': 'danphuong',
  'huyenthachthat': 'thachthat',
  'huyenthuongtin': 'thuongtin',
  'badinhdistrict': 'badinh',
  'dongdadistrict': 'dongda',
  'socsondistrict': 'socson',
  'gialamdistrict': 'gialam',
  'melinhdistrict': 'melinh',
  'hadongdistrict': 'hadong',
  'unghoadistrict': 'ungh

In [10]:
# {'Province English': {'District English': {'long ward alphanumeric': 'ward key'}}}
# For every district, map each long ward name (Vietnamese, its
# abbreviated-prefix alias, and English) to the ward key.
DICT_long_ward_alphanumerics = {}
for province_english in df['province_english'].unique().tolist():
    # Slice once per province / district: the original re-evaluated three
    # boolean masks over the whole frame for every single ward.
    province_rows = df[df['province_english'] == province_english]
    district_data = {}
    for district_english in province_rows['district_english'].unique().tolist():
        district_rows = province_rows[province_rows['district_english'] == district_english]
        ward_data = {}

        # Vietnamese long names; rows without a ward are skipped and the
        # first row of each distinct name supplies the key.
        vietnamese_rows = (district_rows
                           .dropna(subset=['long_ward_alphanumeric'])
                           .drop_duplicates(subset='long_ward_alphanumeric'))
        for long_ward_alphanumeric, ward_key in zip(vietnamese_rows['long_ward_alphanumeric'],
                                                    vietnamese_rows['ward_key']):
            ward_data[long_ward_alphanumeric] = ward_key
            # Alias with the level prefix abbreviated by abbreviate_alphanumeric_prefix.
            ward_data[abbreviate_alphanumeric_prefix(long_ward_alphanumeric)] = ward_key

        # English long names.
        english_rows = (district_rows
                        .dropna(subset=['long_ward_alphanumeric_english'])
                        .drop_duplicates(subset='long_ward_alphanumeric_english'))
        for long_ward_alphanumeric_english, ward_key in zip(english_rows['long_ward_alphanumeric_english'],
                                                            english_rows['ward_key']):
            ward_data[long_ward_alphanumeric_english] = ward_key

        # Longest patterns first.
        ward_data = dict(sorted(ward_data.items(), key=lambda x: len(x[0]), reverse=True))
        district_data[district_english] = ward_data

    DICT_long_ward_alphanumerics[province_english] = district_data

In [11]:
DICT_long_ward_alphanumerics

{'Ha Noi': {'Ba Dinh': {'phuongnguyentrungtruc': 'nguyentrungtruc',
   'nguyentrungtrucward': 'nguyentrungtruc',
   'p.nguyentrungtruc': 'nguyentrungtruc',
   'phuongquanthanh': 'quanthanh',
   'phuongngockhanh': 'ngockhanh',
   'phuongthanhcong': 'thanhcong',
   'phuongtrucbach': 'trucbach',
   'phuongvinhphuc': 'vinhphuc',
   'phuonglieugiai': 'lieugiai',
   'phuongdienbien': 'dienbien',
   'phuonggiangvo': 'giangvo',
   'quanthanhward': 'quanthanh',
   'ngockhanhward': 'ngockhanh',
   'thanhcongward': 'thanhcong',
   'phuongphucxa': 'phucxa',
   'phuongcongvi': 'congvi',
   'phuongngocha': 'ngocha',
   'phuongdoican': 'doican',
   'trucbachward': 'trucbach',
   'vinhphucward': 'vinhphuc',
   'lieugiaiward': 'lieugiai',
   'dienbienward': 'dienbien',
   'p.quanthanh': 'quanthanh',
   'p.ngockhanh': 'ngockhanh',
   'phuongkimma': 'kimma',
   'p.thanhcong': 'thanhcong',
   'giangvoward': 'giangvo',
   'p.trucbach': 'trucbach',
   'p.vinhphuc': 'vinhphuc',
   'p.lieugiai': 'lieugiai',
 

In [12]:
# Distinct long district / ward names across the whole country
# (rows without a ward are dropped from the ward list).
long_district_alphanumerics = df['long_district_alphanumeric'].unique().tolist()
long_ward_alphanumerics = df['long_ward_alphanumeric'].dropna().unique().tolist()

We use LIST_long_ward_alphanumerics to strip the ward part from an address before searching for a key, but ward-level patterns can remove the wrong text:
- E.g. "Thị xã Sơn Tây" is destroyed by the ward pattern "Xã Sơn Tây" -> the district cannot be parsed.
- E.g. "Nghĩa Xá Lê Chân Hải Phòng" loses "Xá Lê Chân H" because of the ward "Xã Lê Chánh" -> the province cannot be parsed.

In [13]:
len(long_ward_alphanumerics)

7366

In [14]:
# Build every ward/district/province concatenation an address may contain
# (long or short names, with or without commas, district and province in
# either order). These temporary address_* columns are scanned afterwards to
# find ward / district names that also occur inside other units' addresses.
ward_name_cols = ['long_ward_alphanumeric', 'short_ward_alphanumeric']
district_name_cols = ['long_district_alphanumeric', 'short_district_alphanumeric']
province_name_cols = ['long_province_alphanumeric', 'province_alphanumeric']

for ward in ward_name_cols:
    for district in district_name_cols:
        for province in province_name_cols:
            ward_part, district_part, province_part = df[ward], df[district], df[province]
            df[f"address_{ward}_{district}_{province}"] = ward_part + district_part + province_part
            df[f"address_comma_{ward}_{district}_{province}"] = ward_part + ',' + district_part + ',' + province_part
            df[f"address_{ward}_{province}_{district}"] = ward_part + province_part + district_part
            df[f"address_comma_{ward}_{province}_{district}"] = ward_part + ',' + province_part + ',' + district_part

for ward in ward_name_cols:
    for district in district_name_cols:
        df[f"address_{ward}_add_tinh_{district}"] = df[ward] + 'tinh' + df[district]
        # Example: in "xadongtinhvinhphuctamduong", removing "tinhvinhphuc"
        # leaves "xadongtamduong"; "xadongtam" is then removed, so "tamduong"
        # can no longer be found.
        ward_without_trailing_tinh = df[ward].str.replace(r'tinh$', '', regex=True)
        df[f"address_{ward}_remove_tinh_{district}"] = ward_without_trailing_tinh + df[district]

In [15]:
# Every column whose name contains 'address' (the combination columns built above).
address_cols = [col for col in df.columns if 'address' in col]

In [16]:
# A long ward name is "safe" when it does not occur as a substring of any
# address combination belonging to a row with a different long ward name.
LIST_safe_long_ward_alphanumerics = []
for candidate_ward in tqdm(long_ward_alphanumerics):
    other_rows = df.loc[df.long_ward_alphanumeric != candidate_ward, address_cols]
    # One big string holding every address combination of the other rows.
    other_addresses = str(other_rows.values.tolist())
    if candidate_ward in other_addresses:
        continue
    LIST_safe_long_ward_alphanumerics.append(candidate_ward)

  0%|          | 0/7366 [00:00<?, ?it/s]

In [17]:
len(LIST_safe_long_ward_alphanumerics)

6646

In [18]:
# Append the abbreviate_alphanumeric_prefix form of every safe ward name to the list.
LIST_safe_long_ward_alphanumerics = LIST_safe_long_ward_alphanumerics + [abbreviate_alphanumeric_prefix(i) for i in LIST_safe_long_ward_alphanumerics]

In [19]:
# A long district name is "safe" when it does not occur as a substring of any
# address combination belonging to a row with a different long district name.
LIST_safe_long_district_alphanumerics = []
for candidate_district in tqdm(long_district_alphanumerics):
    other_rows = df.loc[df.long_district_alphanumeric != candidate_district, address_cols]
    # One big string holding every address combination of the other rows.
    other_addresses = str(other_rows.values.tolist())
    if candidate_district in other_addresses:
        continue
    LIST_safe_long_district_alphanumerics.append(candidate_district)

  0%|          | 0/685 [00:00<?, ?it/s]

In [20]:
len(LIST_safe_long_district_alphanumerics)

671

In [21]:
# Append the abbreviate_alphanumeric_prefix form of every safe district name to the list.
LIST_safe_long_district_alphanumerics = LIST_safe_long_district_alphanumerics + [abbreviate_alphanumeric_prefix(i) for i in LIST_safe_long_district_alphanumerics]

In [22]:
# Remove the address combination columns from df now that the safe lists are built.
df.drop(columns=address_cols, inplace=True)

In [23]:
# Order both safe lists from the longest name to the shortest.
LIST_safe_long_district_alphanumerics = sorted(LIST_safe_long_district_alphanumerics, key=len, reverse=True)
LIST_safe_long_ward_alphanumerics = sorted(LIST_safe_long_ward_alphanumerics, key=len, reverse=True)

## Create sort key & map

### Sort province

In [24]:
# Distinct keys per administrative level, in order of first appearance in df.
province_keys = df['province_key'].unique().tolist()
district_keys = df['district_key'].unique().tolist()
ward_keys = df['ward_key'].unique().tolist()

In [25]:
# Report how many distinct keys exist at each level.
print('province_keys:', len(province_keys))
print('district_keys:', len(district_keys))
print('ward_keys:', len(ward_keys))

province_keys: 63
district_keys: 669
ward_keys: 6648


In [24]:
# def find_province_key_match(text):
#     for province_key in province_keys:
#         if province_key in text:
#             return province_key

In [25]:
# containing_province_districts = df[df.district_key.str.contains('|'.join(province_keys))][['province', 'district', 'province_key','district_key']].drop_duplicates()
# containing_province_wards = df[(~df.ward.isna()) & (df.ward_key.str.contains('|'.join(province_keys)))][['province', 'ward', 'province_key','ward_key']].drop_duplicates()
# 
# containing_province_districts['match_province_key'] = containing_province_districts['district_key'].apply(find_province_key_match)
# containing_province_wards['match_province_key'] = containing_province_wards['ward_key'].apply(find_province_key_match)
# 
# containing_province_districts = containing_province_districts[containing_province_districts.province_key != containing_province_districts.match_province_key]
# containing_province_wards = containing_province_wards[containing_province_wards.province_key != containing_province_wards.match_province_key]

In [26]:
# containing_province_districts

Unnamed: 0,province,district,province_key,district_key,match_province_key
757,Hà Giang,Quang Bình,hagiang,quangbinh,quangbinh
1649,Sơn La,Phù Yên,sonla,phuyen,phuyen
3824,Hưng Yên,Văn Giang,hungyen,vangiang,angiang
5972,Quảng Bình,Quảng Ninh,quangbinh,quangninh,quangninh
9009,Long An,Thạnh Hóa,longan,thanhhoa,thanhhoa
10438,Bạc Liêu,Hòa Bình,baclieu,hoabinh,hoabinh


In [27]:
# containing_province_wards

Unnamed: 0,province,ward,province_key,ward_key,match_province_key
2,Hà Nội,Vĩnh Phúc,hanoi,vinhphuc,vinhphuc
8,Hà Nội,Điện Biên,hanoi,dienbien,dienbien
142,Hà Nội,Xuân Giang,hanoi,xuangiang,angiang
316,Hà Nội,Yên Bài,hanoi,yenbai,yenbai
353,Hà Nội,Đồng Tháp,hanoi,dongthap,dongthap
...,...,...,...,...,...
10154,Cần Thơ,Thạnh Hòa,cantho,thanhhoa,thanhhoa
10236,Hậu Giang,Thạnh Hòa,haugiang,thanhhoa,thanhhoa
10363,Sóc Trăng,Khánh Hòa,soctrang,khanhhoa,khanhhoa
10438,Bạc Liêu,Hòa Bình,baclieu,hoabinh,hoabinh


In [28]:
# # Should not use list(set()) because it will re-order
# province_key_order = []
# 
# for row in containing_province_districts.itertuples():
#     if row.province_key not in province_key_order:
#         province_key_order.append(row.province_key)
#     if row.match_province_key not in province_key_order:
#         province_key_order.append(row.match_province_key)
#         
# for row in containing_province_wards.itertuples():
#     if row.province_key not in province_key_order:
#         province_key_order.append(row.province_key)
#     if row.match_province_key not in province_key_order:
#         province_key_order.append(row.match_province_key)

In [29]:
# province_key_zero = [province_key for province_key in province_keys if province_key not in province_key_order]
# province_key_zero = sorted(province_key_zero, key=lambda x: len(x), reverse=True)

In [30]:
# province_key_order = province_key_zero + province_key_order

In [31]:
# len(province_key_order)

63

In [32]:
# province_key_order_map = {}
# for index, province_key in enumerate(province_key_order):
#     province_key_order_map[province_key] = index + 1

In [33]:
# province_key_order_map

{'ninhthuan': 1,
 'laocai': 2,
 'bentre': 3,
 'hagiang': 4,
 'quangbinh': 5,
 'sonla': 6,
 'phuyen': 7,
 'hungyen': 8,
 'angiang': 9,
 'quangninh': 10,
 'longan': 11,
 'thanhhoa': 12,
 'baclieu': 13,
 'hoabinh': 14,
 'hanoi': 15,
 'vinhphuc': 16,
 'dienbien': 17,
 'yenbai': 18,
 'dongthap': 19,
 'caobang': 20,
 'binhduong': 21,
 'backan': 22,
 'phutho': 23,
 'tuyenquang': 24,
 'thaibinh': 25,
 'laichau': 26,
 'khanhhoa': 27,
 'binhthuan': 28,
 'thainguyen': 29,
 'langson': 30,
 'bacgiang': 31,
 'hanam': 32,
 'binhdinh': 33,
 'bacninh': 34,
 'haiphong': 35,
 'lamdong': 36,
 'vinhlong': 37,
 'tayninh': 38,
 'haugiang': 39,
 'namdinh': 40,
 'haiduong': 41,
 'ninhbinh': 42,
 'nghean': 43,
 'hatinh': 44,
 'kiengiang': 45,
 'quangtri': 46,
 'thuathienhue': 47,
 'danang': 48,
 'quangnam': 49,
 'travinh': 50,
 'quangngai': 51,
 'binhphuoc': 52,
 'kontum': 53,
 'daknong': 54,
 'gialai': 55,
 'daklak': 56,
 'dongnai': 57,
 'bariavungtau': 58,
 'hochiminh': 59,
 'tiengiang': 60,
 'cantho': 61,
 '

In [34]:
# df['province_key_order'] = df['province_key'].map(province_key_order_map).fillna(0)
# df.sort_values(by='province_key_order', inplace=True)
# df.drop(columns=['province_key_order'], inplace=True)
# province_keys = df['province_key'].unique().tolist() # Re-create after sorting

### Create key & map

In [35]:
# LIST_province_keys_1 = []
# LIST_province_keys_2 = []
# LIST_province_keys_3 = []
# 
# for province_key in province_keys:
#     # Unique province key
#     if province_key not in str(district_keys) and province_key not in str(ward_keys):
#         LIST_province_keys_1.append(province_key)
#     # Same key with district
#     elif province_key in str(district_keys):
#         LIST_province_keys_2.append(province_key)
#     # Same key with ward
#     else:
#         LIST_province_keys_3.append(province_key)

In [26]:
# One row per distinct combination of the columns whose name contains 'province'.
df_provinces = df[[col for col in df.columns if 'province' in col]].drop_duplicates()

In [27]:
df_provinces

Unnamed: 0,province,long_province,province_english,long_province_english,province_key,long_province_alphanumeric,long_province_alphanumeric_english,province_alphanumeric
0,Hà Nội,Thành phố Hà Nội,Ha Noi,Ha Noi City,hanoi,thanhphohanoi,hanoicity,hanoi
579,Hà Giang,Tỉnh Hà Giang,Ha Giang,Ha Giang Province,hagiang,tinhhagiang,hagiangprovince,hagiang
772,Cao Bằng,Tỉnh Cao Bằng,Cao Bang,Cao Bang Province,caobang,tinhcaobang,caobangprovince,caobang
933,Bắc Kạn,Tỉnh Bắc Kạn,Bac Kan,Bac Kan Province,backan,tinhbackan,backanprovince,backan
1041,Tuyên Quang,Tỉnh Tuyên Quang,Tuyen Quang,Tuyen Quang Province,tuyenquang,tinhtuyenquang,tuyenquangprovince,tuyenquang
...,...,...,...,...,...,...,...,...
10116,Cần Thơ,Thành phố Cần Thơ,Can Tho,Can Tho City,cantho,thanhphocantho,canthocity,cantho
10199,Hậu Giang,Tỉnh Hậu Giang,Hau Giang,Hau Giang Province,haugiang,tinhhaugiang,haugiangprovince,haugiang
10274,Sóc Trăng,Tỉnh Sóc Trăng,Soc Trang,Soc Trang Province,soctrang,tinhsoctrang,soctrangprovince,soctrang
10382,Bạc Liêu,Tỉnh Bạc Liêu,Bac Lieu,Bac Lieu Province,baclieu,tinhbaclieu,baclieuprovince,baclieu


In [28]:
# {'province key': {column name: value, ...}}
# Each province key maps to the first df_provinces record carrying that key.
DICT_province_map = {
    province_key: df_provinces[df_provinces['province_key'] == province_key].to_dict(orient='records')[0]
    for province_key in df_provinces.province_key.unique()
}

In [33]:
# Re-order the map so the longest province keys come first.
DICT_province_map = dict(sorted(DICT_province_map.items(), key=lambda x: len(x[0]), reverse=True))

In [34]:
DICT_province_map

{'thuathienhue': {'province': 'Thừa Thiên Huế',
  'long_province': 'Tỉnh Thừa Thiên Huế',
  'province_english': 'Thua Thien Hue',
  'long_province_english': 'Thua Thien Hue Province',
  'province_key': 'thuathienhue',
  'long_province_alphanumeric': 'tinhthuathienhue',
  'long_province_alphanumeric_english': 'thuathienhueprovince',
  'province_alphanumeric': 'thuathienhue'},
 'bariavungtau': {'province': 'Bà Rịa - Vũng Tàu',
  'long_province': 'Tỉnh Bà Rịa - Vũng Tàu',
  'province_english': 'Ba Ria - Vung Tau',
  'long_province_english': 'Ba Ria - Vung Tau Province',
  'province_key': 'bariavungtau',
  'long_province_alphanumeric': 'tinhbariavungtau',
  'long_province_alphanumeric_english': 'bariavungtauprovince',
  'province_alphanumeric': 'bariavungtau'},
 'tuyenquang': {'province': 'Tuyên Quang',
  'long_province': 'Tỉnh Tuyên Quang',
  'province_english': 'Tuyen Quang',
  'long_province_english': 'Tuyen Quang Province',
  'province_key': 'tuyenquang',
  'long_province_alphanumeric'

In [35]:
# One row per distinct combination of province_english and the columns whose name contains 'district'.
df_districts = df[['province_english'] + [col for col in df.columns if 'district' in col]].drop_duplicates()

In [36]:
df_districts

Unnamed: 0,province_english,district,long_district,short_district,district_english,long_district_english,short_district_english,district_level,district_level_english,district_key,long_district_alphanumeric,long_district_alphanumeric_english,short_district_alphanumeric
0,Ha Noi,Ba Đình,Quận Ba Đình,Ba Đình,Ba Dinh,Ba Dinh District,Ba Dinh,Quận,District,badinh,quanbadinh,badinhdistrict,badinh
14,Ha Noi,Hoàn Kiếm,Quận Hoàn Kiếm,Hoàn Kiếm,Hoan Kiem,Hoan Kiem District,Hoan Kiem,Quận,District,hoankiem,quanhoankiem,hoankiemdistrict,hoankiem
32,Ha Noi,Tây Hồ,Quận Tây Hồ,Tây Hồ,Tay Ho,Tay Ho District,Tay Ho,Quận,District,tayho,quantayho,tayhodistrict,tayho
40,Ha Noi,Long Biên,Quận Long Biên,Long Biên,Long Bien,Long Bien District,Long Bien,Quận,District,longbien,quanlongbien,longbiendistrict,longbien
54,Ha Noi,Cầu Giấy,Quận Cầu Giấy,Cầu Giấy,Cau Giay,Cau Giay District,Cau Giay,Quận,District,caugiay,quancaugiay,caugiaydistrict,caugiay
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10496,Ca Mau,Cái Nước,Huyện Cái Nước,Cái Nước,Cai Nuoc,Cai Nuoc District,Cai Nuoc,Huyện,District,cainuoc,huyencainuoc,cainuocdistrict,cainuoc
10507,Ca Mau,Đầm Dơi,Huyện Đầm Dơi,Đầm Dơi,Dam Doi,Dam Doi District,Dam Doi,Huyện,District,damdoi,huyendamdoi,damdoidistrict,damdoi
10523,Ca Mau,Năm Căn,Huyện Năm Căn,Năm Căn,Nam Can,Nam Can District,Nam Can,Huyện,District,namcan,huyennamcan,namcandistrict,namcan
10531,Ca Mau,Phú Tân,Huyện Phú Tân,Phú Tân,Phu Tan,Phu Tan District,Phu Tan,Huyện,District,phutan,huyenphutan,phutandistrict,phutan


In [37]:
# {'Province English': {'district key': {'level english': {district record}}}}
# A district key can carry several levels inside one province, hence the
# extra nesting by district_level_english.
DICT_district_map = {}
for province_english in df_districts['province_english'].unique():
    province_districts = df_districts[df_districts['province_english'] == province_english]
    district_data = {}
    for district_key in province_districts['district_key'].unique():
        df_district = (province_districts[province_districts['district_key'] == district_key]
                       .drop(columns=['province_english']))

        # First record of each level found for this key.
        level_data = {
            level: df_district[df_district['district_level_english'] == level].to_dict(orient='records')[0]
            for level in df_district['district_level_english'].unique()
        }

        district_data[district_key] = level_data
        # Numbered districts ("quan1", "quan10", ...) are also registered
        # under their "q." and "district" spellings, sharing the same record.
        if re.search(r'^quan\d{1,2}', district_key):
            for alias_prefix in ('q.', 'district'):
                district_data[district_key.replace('quan', alias_prefix)] = level_data

    # Longest keys first.
    DICT_district_map[province_english] = dict(
        sorted(district_data.items(), key=lambda item: len(item[0]), reverse=True)
    )

In [38]:
DICT_district_map

{'Ha Noi': {'haibatrung': {'District': {'district': 'Hai Bà Trưng',
    'long_district': 'Quận Hai Bà Trưng',
    'short_district': 'Hai Bà Trưng',
    'district_english': 'Hai Ba Trung',
    'long_district_english': 'Hai Ba Trung District',
    'short_district_english': 'Hai Ba Trung',
    'district_level': 'Quận',
    'district_level_english': 'District',
    'district_key': 'haibatrung',
    'long_district_alphanumeric': 'quanhaibatrung',
    'long_district_alphanumeric_english': 'haibatrungdistrict',
    'short_district_alphanumeric': 'haibatrung'}},
  'thanhxuan': {'District': {'district': 'Thanh Xuân',
    'long_district': 'Quận Thanh Xuân',
    'short_district': 'Thanh Xuân',
    'district_english': 'Thanh Xuan',
    'long_district_english': 'Thanh Xuan District',
    'short_district_english': 'Thanh Xuan',
    'district_level': 'Quận',
    'district_level_english': 'District',
    'district_key': 'thanhxuan',
    'long_district_alphanumeric': 'quanthanhxuan',
    'long_district

In [39]:
# {'Province English': {'District English': {'ward key': {'level english': {ward record}}}}}
# A ward key can carry several levels inside one district, hence the extra
# nesting by ward_level_english.
DICT_ward_map = {}
# Loop-invariant work hoisted out of the ward loop: the list of ward columns
# and the per-province / per-district slices (the original rebuilt all of
# them over the whole frame for every ward).
ward_cols = [col for col in df.columns if 'ward' in col]
for province_english in df['province_english'].unique():
    province_rows = df[df['province_english'] == province_english]
    district_data = {}
    for district_english in province_rows['district_english'].unique():
        district_rows = province_rows[province_rows['district_english'] == district_english]
        ward_data = {}
        for ward_key in district_rows['ward_key'].dropna().unique():
            df_ward = district_rows.loc[district_rows['ward_key'] == ward_key, ward_cols]
            ward_levels = df_ward['ward_level_english'].unique()
            level_data = {}
            for level in ward_levels:
                # First record of this level for the ward key.
                ward_record = df_ward[df_ward['ward_level_english'] == level].to_dict(orient='records')[0]
                level_data[level] = ward_record

            ward_data[ward_key] = level_data
            # Numbered wards ("phuong1", "phuong13", ...) are also registered
            # under their "p." and "ward" spellings, sharing the same record.
            if re.search(r'^phuong\d{1,2}', ward_key):
                ward_data[ward_key.replace('phuong', 'p.')] = level_data
                ward_data[ward_key.replace('phuong', 'ward')] = level_data

        # Longest keys first.
        ward_data = dict(sorted(ward_data.items(), key=lambda x: len(x[0]), reverse=True))
        district_data[district_english] = ward_data
    DICT_ward_map[province_english] = district_data

In [40]:
DICT_ward_map['Ho Chi Minh']['Binh Thanh']

{'phuong13': {'Ward': {'ward': 'Phường 13',
   'long_ward': 'Phường 13',
   'short_ward': 'Phường 13',
   'ward_english': 'Ward 13',
   'long_ward_english': 'Ward 13',
   'short_ward_english': 'Ward 13',
   'ward_level': 'Phường',
   'ward_level_english': 'Ward',
   'ward_key': 'phuong13',
   'long_ward_alphanumeric': 'phuong13',
   'long_ward_alphanumeric_english': 'ward13',
   'short_ward_alphanumeric': 'phuong13'}},
 'phuong11': {'Ward': {'ward': 'Phường 11',
   'long_ward': 'Phường 11',
   'short_ward': 'Phường 11',
   'ward_english': 'Ward 11',
   'long_ward_english': 'Ward 11',
   'short_ward_english': 'Ward 11',
   'ward_level': 'Phường',
   'ward_level_english': 'Ward',
   'ward_key': 'phuong11',
   'long_ward_alphanumeric': 'phuong11',
   'long_ward_alphanumeric_english': 'ward11',
   'short_ward_alphanumeric': 'phuong11'}},
 'phuong27': {'Ward': {'ward': 'Phường 27',
   'long_ward': 'Phường 27',
   'short_ward': 'Phường 27',
   'ward_english': 'Ward 27',
   'long_ward_english

## Create duplicate district keys checking data

### District

In [41]:
# Count, per (province, district_key), how many distinct long district names
# share that key; a count above 1 means the key is ambiguous inside the province.
distinct_district_names = df[['province', 'long_district', 'district_key']].drop_duplicates()
district_count = distinct_district_names[['province', 'district_key']].value_counts().reset_index()
is_ambiguous_key = district_count['count'] > 1
duplicated_districts = district_count.loc[is_ambiguous_key, 'district_key'].tolist()

In [42]:
# Distinct (province, district) rows whose district_key is in duplicated_districts.
# NOTE(review): the filter matches on district_key alone, so a district in another
# province that happens to share one of these keys would be included too -- confirm
# that this cannot happen with the current data.
df_duplicated_districts = df[df.district_key.isin(duplicated_districts)][['long_province', 'long_district' ,'province_key','district_key', 'province_english']].drop_duplicates()

In [43]:
df_duplicated_districts

Unnamed: 0,long_province,long_district,province_key,district_key,province_english
5835,Tỉnh Hà Tĩnh,Huyện Kỳ Anh,hatinh,kyanh,Ha Tinh
5867,Tỉnh Hà Tĩnh,Thị xã Kỳ Anh,hatinh,kyanh,Ha Tinh
9160,Tỉnh Tiền Giang,Thị xã Cai Lậy,tiengiang,cailay,Tien Giang
9213,Tỉnh Tiền Giang,Huyện Cai Lậy,tiengiang,cailay,Tien Giang
9552,Tỉnh Trà Vinh,Huyện Duyên Hải,travinh,duyenhai,Tra Vinh
9559,Tỉnh Trà Vinh,Thị xã Duyên Hải,travinh,duyenhai,Tra Vinh
9673,Tỉnh Đồng Tháp,Thành phố Cao Lãnh,dongthap,caolanh,Dong Thap
9697,Tỉnh Đồng Tháp,Thành phố Hồng Ngự,dongthap,hongngu,Dong Thap
9713,Tỉnh Đồng Tháp,Huyện Hồng Ngự,dongthap,hongngu,Dong Thap
9748,Tỉnh Đồng Tháp,Huyện Cao Lãnh,dongthap,caolanh,Dong Thap


My idea: We will use their wards to decide their district level.

To make sure this idea is valid, I will check whether the districts sharing a district_key have any ward_key in common.

In [44]:
# Sanity check: districts that share a district_key must not share any
# ward_key, otherwise the ward cannot be used to tell them apart.
# The original intersected two lists (`a`, `b`) that were never filled, so
# the check could never report anything; the real ward-key sets are compared
# here. Missing ward keys are ignored, and every pair of districts is
# compared rather than only the first two.
for district_key in duplicated_districts:
    district_rows = df[df.district_key == district_key]
    long_districts = district_rows['long_district'].unique().tolist()
    ward_key_sets = [
        set(district_rows.loc[district_rows.long_district == long_district, 'ward_key'].dropna())
        for long_district in long_districts
    ]
    common = set()
    for position, ward_key_set in enumerate(ward_key_sets):
        for other_ward_key_set in ward_key_sets[position + 1:]:
            common |= ward_key_set & other_ward_key_set
    if common:
        print(district_key, 'has duplicated ward_key')

Nice! We also need a default option for addresses that contain no ward; I used Google Trends to decide the default district level.

In [45]:
# LIST_duplicated_district_province_keys = df_duplicated_districts['province_key'].unique().tolist()
# 
# # Use Google Trend and search to decide level
# DICT_duplicated_district_keys = {
#     'kyanh': {'default':'Town'},
#     'cailay': {'default':'Town'},
#     'duyenhai': {'default':'District'},
#     'caolanh': {'default':'City'},
#     'hongngu': {'default':'City'},
#     'longmy': {'default':'District'}
# }
# 
# for district_key in duplicated_districts:
#     df_temp = df[df.district_key == district_key]
#     level_data = {}
#     levels = df_temp.district_level_english.unique().tolist()
#     for district_level_english in levels:
#         tmp_ward_keys = df_temp[df_temp.district_level_english==district_level_english]['ward_key'].unique().tolist()
#         level_data[district_level_english] = tmp_ward_keys
#         DICT_duplicated_district_keys[district_key]['levels'] = level_data

In [46]:
# Default district level (district_level_english value) to use for each
# ambiguous district key; these are hand-picked values.
default_option_for_duplicated_district_keys = {
    'kyanh': 'Town',
    'cailay': 'Town',
    'duyenhai': 'District',
    'caolanh': 'City',
    'hongngu': 'City',
    'longmy': 'District'
}

In [51]:
# {'Province English': {'district key': {'default': 'level english',
#                                        'levels': {'level english': ['ward key', ...]}}}}
# For every ambiguous district key, record the default level and the ward
# keys (longest first) that belong to each of its levels.
DICT_duplicated_district_keys = {}
for province_english in df_duplicated_districts.province_english.unique():
    in_province = df_duplicated_districts.province_english == province_english
    district_data = {}
    for district_key in df_duplicated_districts.loc[in_province, 'district_key'].unique().tolist():
        district_rows = df[(df.district_key == district_key) & (df.province_english == province_english)]
        level_data = {
            district_level_english: sorted(
                district_rows.loc[district_rows.district_level_english == district_level_english, 'ward_key'].unique().tolist(),
                key=len,
                reverse=True,
            )
            for district_level_english in district_rows.district_level_english.unique().tolist()
        }
        district_data[district_key] = {
            'default': default_option_for_duplicated_district_keys[district_key],
            'levels': level_data,
        }

    # Longest district keys first.
    DICT_duplicated_district_keys[province_english] = dict(
        sorted(district_data.items(), key=lambda item: len(item[0]), reverse=True)
    )

In [52]:
DICT_duplicated_district_keys['Ha Tinh']['kyanh']['default']

'Town'

In [53]:
DICT_duplicated_district_keys['Ha Tinh']['kyanh']['levels']

{'District': ['kythuong',
  'kyphong',
  'kygiang',
  'kykhang',
  'kytrung',
  'kyxuan',
  'kytien',
  'kydong',
  'kychau',
  'lamhop',
  'kybac',
  'kyphu',
  'kyvan',
  'kytho',
  'kytay',
  'kyhai',
  'kythu',
  'kytan',
  'kyson',
  'kylac'],
 'Town': ['kyphuong',
  'hungtri',
  'kytrinh',
  'kythinh',
  'kyninh',
  'kylong',
  'kylien',
  'kyloi',
  'kyhoa',
  'kynam',
  'kyha']}

### Ward

In [54]:
# Count how often each (province, district, short ward name) combination
# occurs; a count above 1 means one district holds that short ward name more
# than once (the rows then differ by ward level).
ward_identity_columns = ['long_province', 'long_district', 'short_ward']
ward_count = df[ward_identity_columns].value_counts().reset_index()
duplicated_ward_identities = ward_count.loc[ward_count['count'] > 1, ward_identity_columns]

# Match on the whole (province, district, ward) tuple. Filtering the three
# columns independently with isin() would also keep rows whose province,
# district and ward each occur in *some* duplicated combination even though
# their own combination is unique.
is_duplicated_ward = pd.MultiIndex.from_frame(df[ward_identity_columns]).isin(
    pd.MultiIndex.from_frame(duplicated_ward_identities)
)
df_duplicated_wards = df.loc[
    is_duplicated_ward,
    ['province', 'long_district', 'ward', 'ward_level_english', 'district_key', 'ward_key', 'province_english', 'district_english'],
]
# Sort by every column; assigned back instead of sorting a slice in place.
df_duplicated_wards = df_duplicated_wards.sort_values(by=df_duplicated_wards.columns.tolist())

In [55]:
df_duplicated_wards

Unnamed: 0,province,long_district,ward,ward_level_english,district_key,ward_key,province_english,district_english
10401,Bạc Liêu,Huyện Phước Long,Thị trấn Phước Long,Town,phuoclong,phuoclong,Bac Lieu,Phuoc Long
10404,Bạc Liêu,Huyện Phước Long,Xã Phước Long,Commune,phuoclong,phuoclong,Bac Lieu,Phuoc Long
10471,Cà Mau,Huyện Thới Bình,Thị trấn Thới Bình,Town,thoibinh,thoibinh,Ca Mau,Thoi Binh
10477,Cà Mau,Huyện Thới Bình,Xã Thới Bình,Commune,thoibinh,thoibinh,Ca Mau,Thoi Binh
10456,Cà Mau,Thành phố Cà Mau,Phường Tân Thành,Ward,camau,tanthanh,Ca Mau,Ca Mau
10457,Cà Mau,Thành phố Cà Mau,Xã Tân Thành,Commune,camau,tanthanh,Ca Mau,Ca Mau
176,Hà Nội,Huyện Gia Lâm,Thị trấn Yên Viên,Town,gialam,yenvien,Ha Noi,Gia Lam
178,Hà Nội,Huyện Gia Lâm,Xã Yên Viên,Commune,gialam,yenvien,Ha Noi,Gia Lam
10084,Kiên Giang,Huyện Vĩnh Thuận,Thị trấn Vĩnh Thuận,Town,vinhthuan,vinhthuan,Kien Giang,Vinh Thuan
10088,Kiên Giang,Huyện Vĩnh Thuận,Xã Vĩnh Thuận,Commune,vinhthuan,vinhthuan,Kien Giang,Vinh Thuan


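According to Google Trends, searches for "xã ..." are significantly more frequent than searches for "thị trấn ...". "phường ..." is searched more often than "xã ...", but only one ward-level ("phường") entry appears in the list. The default level is therefore chosen in the order Ward > Commune > Town.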

In [56]:
# LIST_duplicated_ward_district_keys = df_duplicated_wards.ward_key.unique().tolist()

In [57]:
# DICT_duplicated_ward_keys = {}
# for ward_key in df_duplicated_wards['ward_key'].unique():
#     ward_levels = df_duplicated_wards[df_duplicated_wards.ward_key == ward_key]['ward_level_english'].tolist()
#     if 'Ward' in ward_levels:
#         default_ward_level = 'Ward'
#     elif 'Commune' in ward_levels:
#         default_ward_level = 'Commune'
#     else:
#         default_ward_level = 'Town'
#     DICT_duplicated_ward_keys[ward_key] = default_ward_level

In [58]:
# Build {province: {district: {ward key: default ward level}}} for ward keys
# that exist at more than one level inside a district.
# The default level is 'Ward' if present, otherwise 'Commune', otherwise 'Town'.
DICT_duplicated_ward_keys = {}

for province_english in df_duplicated_wards.province_english.unique():
    district_data = {}
    tmp_df = df_duplicated_wards[df_duplicated_wards.province_english == province_english]
    for district_english in tmp_df.district_english.unique():
        tmp_df_district = tmp_df[tmp_df.district_english == district_english]
        ward_data = {}
        for ward_key in tmp_df_district['ward_key'].unique():
            # Only the levels recorded for this ward key in *this* district
            # count; the same ward key in another district or province must not
            # influence the default chosen here.
            ward_levels = set(
                tmp_df_district.loc[tmp_df_district.ward_key == ward_key, 'ward_level_english']
            )
            if 'Ward' in ward_levels:
                default_ward_level = 'Ward'
            elif 'Commune' in ward_levels:
                default_ward_level = 'Commune'
            else:
                default_ward_level = 'Town'
            ward_data[ward_key] = default_ward_level

        # Longest ward key first.
        ward_data = dict(sorted(ward_data.items(), key=lambda x: len(x[0]), reverse=True))
        district_data[district_english] = ward_data

    DICT_duplicated_ward_keys[province_english] = district_data

In [62]:
DICT_duplicated_ward_keys

{'Bac Lieu': {'Phuoc Long': {'phuoclong': 'Commune'}},
 'Ca Mau': {'Thoi Binh': {'thoibinh': 'Commune'},
  'Ca Mau': {'tanthanh': 'Ward'}},
 'Ha Noi': {'Gia Lam': {'yenvien': 'Commune'}},
 'Kien Giang': {'Vinh Thuan': {'vinhthuan': 'Commune'}},
 'Lai Chau': {'Muong Te': {'muongte': 'Commune'}},
 'Long An': {'Duc Hoa': {'hiephoa': 'Commune'}},
 'Lam Dong': {'Da Huoai': {'madaguoi': 'Commune'}},
 'Lang Son': {'Chi Lang': {'chilang': 'Commune'},
  'Dinh Lap': {'dinhlap': 'Commune'}},
 'Quang Ninh': {'Dam Ha': {'damha': 'Commune'}},
 'Soc Trang': {'Long Phu': {'longphu': 'Commune'},
  'Tran De': {'lichhoithuong': 'Commune'}},
 'Son La': {'Mai Son': {'hatlot': 'Commune'}},
 'Tra Vinh': {'Tra Cu': {'dinhan': 'Commune'}},
 'Yen Bai': {'Tram Tau': {'tramtau': 'Commune'},
  'Yen Binh': {'yenbinh': 'Commune'}},
 'Dong Thap': {'Cao Lanh District': {'mytho': 'Commune'},
  'Thap Muoi': {'myan': 'Commune'}}}

## Find province from district

- DICT_unique_long_district_alphanumerics `{'long_district_alphanumeric': 'province_key'}`
- DICT_unique_district_keys `{'district_key': 'province_key'}`
- DICT_not_unique_district_keys `{'district_key': {'province_key 1': ['ward key', ...], 'province_key 2': ['ward key', ...]}}`


Điều kiện 1: district_key not in ward_keys.
Điều kiện 2: district_count = 1, sau khi đã drop_duplicated [province_key, district_key]
Điều kiện 3: district_key not in province_keys của tỉnh khác.

Nếu district_key = province_key của chính nó:
>> Lấy danh sách long_district_alphanumerices (bao gồm Việt và English), nếu tồn tại trong address thì sau khi tìm được province_key sẽ không bị xóa province_key trong địa chỉ.


Nếu district_key != province_key của chính nó, tức là unique tuyệt đối:
>> Lấy danh sách unique_district_keys, nếu không tìm được province_key thì tìm unique_district_key để suy ngược ra province_key

In [63]:
# Report the size of each key collection.
for key_label, key_collection in (
    ('province_keys', province_keys),
    ('district_keys', district_keys),
    ('ward_keys', ward_keys),
):
    print(f'{key_label}:', len(key_collection))

province_keys: 63
district_keys: 669
ward_keys: 6648


### Use long_district_alphanumeric

Check if long_district_alphanumeric is in long_provinces_alphanumerics and long_ward_alphanumerics

In [64]:
# Shape of the rows whose long_district_alphanumeric does not occur anywhere in
# the string form of DICT_long_province_alphanumerics. The string is built once
# here instead of once per row inside the lambda.
province_alphanumerics_text = str(DICT_long_province_alphanumerics)
df[~df['long_district_alphanumeric'].apply(lambda x: x in province_alphanumerics_text)].shape

(10547, 32)

In [65]:
# Shape of the rows whose long_district_alphanumeric does not occur anywhere in
# the string form of long_ward_alphanumerics. The string is built once here
# instead of once per row inside the lambda.
long_ward_alphanumerics_text = str(long_ward_alphanumerics)
df[~df['long_district_alphanumeric'].apply(lambda x: x in long_ward_alphanumerics_text)].shape

(10547, 32)

Yes, a long_district_alphanumeric cannot be mistaken for a province or ward name. However, the same long_district_alphanumeric can occur in more than one province.

In [66]:
# Number of provinces in which each long district alphanumeric occurs.
province_district_name_pairs = df[['province_key', 'long_district_alphanumeric']].drop_duplicates()
df_long_district_alphanumeric_count = (
    province_district_name_pairs['long_district_alphanumeric'].value_counts().reset_index()
)

In [67]:
df_long_district_alphanumeric_count

Unnamed: 0,long_district_alphanumeric,count
0,huyenchauthanh,10
1,huyentamnong,2
2,huyenthanhtri,2
3,huyentamduong,2
4,huyenphutan,2
...,...,...
680,huyenquynhphu,1
681,huyenhungha,1
682,huyendonghung,1
683,huyenthaithuy,1


In [68]:
# Split the district names by whether they occur in exactly one province.
long_district_occurrences = df_long_district_alphanumeric_count['count']
unique_long_district_alphanumerics = df_long_district_alphanumeric_count.loc[
    long_district_occurrences == 1, 'long_district_alphanumeric'
].tolist()
not_unique_long_district_alphanumerics = df_long_district_alphanumeric_count.loc[
    long_district_occurrences > 1, 'long_district_alphanumeric'
].tolist()

In [69]:
# One row per distinct (province, district key, Vietnamese name, English name),
# separated into single-province and multi-province district names.
long_district_name_columns = [
    'province_key',
    'district_key',
    'long_district_alphanumeric',
    'long_district_alphanumeric_english',
]
df_unique_long_district_alphanumerics = df.loc[
    df['long_district_alphanumeric'].isin(unique_long_district_alphanumerics),
    long_district_name_columns,
].drop_duplicates()
df_not_unique_long_district_alphanumerics = df.loc[
    df['long_district_alphanumeric'].isin(not_unique_long_district_alphanumerics),
    long_district_name_columns,
].drop_duplicates()

In [70]:
df_unique_long_district_alphanumerics

Unnamed: 0,province_key,district_key,long_district_alphanumeric,long_district_alphanumeric_english
0,hanoi,badinh,quanbadinh,badinhdistrict
14,hanoi,hoankiem,quanhoankiem,hoankiemdistrict
32,hanoi,tayho,quantayho,tayhodistrict
40,hanoi,longbien,quanlongbien,longbiendistrict
54,hanoi,caugiay,quancaugiay,caugiaydistrict
...,...,...,...,...
10483,camau,tranvanthoi,huyentranvanthoi,tranvanthoidistrict
10496,camau,cainuoc,huyencainuoc,cainuocdistrict
10507,camau,damdoi,huyendamdoi,damdoidistrict
10523,camau,namcan,huyennamcan,namcandistrict


In [71]:
df_not_unique_long_district_alphanumerics

Unnamed: 0,province_key,district_key,long_district_alphanumeric,long_district_alphanumeric_english
208,hanoi,thanhtri,huyenthanhtri,thanhtridistrict
783,caobang,baolam,huyenbaolam,baolamdistrict
1010,backan,chomoi,huyenchomoi,chomoidistrict
1466,laichau,tamduong,huyentamduong,tamduongdistrict
2948,phutho,phuninh,huyenphuninh,phuninhdistrict
3006,phutho,tamnong,huyentamnong,tamnongdistrict
3120,vinhphuc,tamduong,huyentamduong,tamduongdistrict
3697,haiphong,anlao,huyenanlao,anlaodistrict
6191,thuathienhue,phongdien,huyenphongdien,phongdiendistrict
6577,quangnam,phuninh,huyenphuninh,phuninhdistrict


In [72]:
# Boolean flag: the district key is identical to the key of its own province.
df_unique_long_district_alphanumerics['district_key_equal_province_key'] = (
    df_unique_long_district_alphanumerics['province_key']
    .eq(df_unique_long_district_alphanumerics['district_key'])
    .to_numpy()
)
df_not_unique_long_district_alphanumerics['district_key_equal_province_key'] = (
    df_not_unique_long_district_alphanumerics['province_key']
    .eq(df_not_unique_long_district_alphanumerics['district_key'])
    .to_numpy()
)

In [73]:
df_unique_long_district_alphanumerics['district_key_equal_province_key'].value_counts()

district_key_equal_province_key
False    642
True      32
Name: count, dtype: int64

In [74]:
df_not_unique_long_district_alphanumerics['district_key_equal_province_key'].value_counts()

district_key_equal_province_key
False    30
Name: count, dtype: int64

In [75]:
# Map every spelling of a single-province district name (full Vietnamese form,
# abbreviated-prefix alias, English form) to that province's key.
# NOTE(review): different Vietnamese prefixes can share one English form (both
# 'quan...' and 'huyen...' end up as '...district'), so an English key may be
# overwritten by a later row — confirm this cannot happen in the data.
DICT_unique_long_district_alphanumerics = {}
for record in df_unique_long_district_alphanumerics.itertuples():
    full_name = record.long_district_alphanumeric
    spellings = (
        full_name,
        abbreviate_alphanumeric_prefix(full_name),
        record.long_district_alphanumeric_english,
    )
    for spelling in spellings:
        DICT_unique_long_district_alphanumerics[spelling] = record.province_key

In [76]:
# Reorder the mapping so that the longest keys come first.
DICT_unique_long_district_alphanumerics = {
    name: DICT_unique_long_district_alphanumerics[name]
    for name in sorted(DICT_unique_long_district_alphanumerics, key=len, reverse=True)
}

In [77]:
DICT_unique_long_district_alphanumerics

{'thanhphophanrangthapcham': 'ninhthuan',
 'duongminhchaudistrict': 'tayninh',
 'dienbiendongdistrict': 'dienbien',
 'phanrangthapchamcity': 'ninhthuan',
 'muongkhuongdistrict': 'laocai',
 'thanhphodienbienphu': 'dienbien',
 'thanhchuongdistrict': 'nghean',
 'tp.phanrangthapcham': 'ninhthuan',
 'hamthuanbacdistrict': 'binhthuan',
 'hamthuannamdistrict': 'binhthuan',
 'thanhphobuonmathuot': 'daklak',
 'uminhthuongdistrict': 'kiengiang',
 'tranvanthoidistrict': 'camau',
 'haibatrungdistrict': 'hanoi',
 'hoangsuphidistrict': 'hagiang',
 'trungkhanhdistrict': 'caobang',
 'nguyenbinhdistrict': 'caobang',
 'thanhphotuyenquang': 'tuyenquang',
 'mucangchaidistrict': 'yenbai',
 'thanhphothainguyen': 'thainguyen',
 'thuynguyendistrict': 'haiphong',
 'bachlongvidistrict': 'haiphong',
 'xuantruongdistrict': 'namdinh',
 'thachthanhdistrict': 'thanhhoa',
 'thuongxuandistrict': 'thanhhoa',
 'quangxuongdistrict': 'thanhhoa',
 'tuongduongdistrict': 'nghean',
 'hungnguyendistrict': 'nghean',
 'quangtrac

In [78]:
# Tag each row with "<long district alphanumeric>_<ward key>" and collect the
# tags that occur exactly once in the whole table.
df['long_district_alphanumeric_ward_key_unit'] = df['long_district_alphanumeric'] + '_' + df['ward_key']
long_district_alphanumeric_ward_key_unit_count = (
    df['long_district_alphanumeric_ward_key_unit'].value_counts().reset_index()
)
long_unit_occurs_once = long_district_alphanumeric_ward_key_unit_count['count'] == 1
unique_long_district_alphanumeric_ward_key_units = long_district_alphanumeric_ward_key_unit_count.loc[
    long_unit_occurs_once, 'long_district_alphanumeric_ward_key_unit'
].tolist()

In [79]:
long_district_alphanumeric_ward_key_unit_count

Unnamed: 0,long_district_alphanumeric_ward_key_unit,count
0,huyenchauthanh_chauthanh,4
1,huyenchauthanh_anhiep,3
2,huyenchauthanh_tanphu,3
3,huyenlongphu_longphu,2
4,thixasapa_sapa,2
...,...,...
10478,huyengialoc_toanthang,1
10479,huyengialoc_hoangdieu,1
10480,huyengialoc_honghung,1
10481,huyengialoc_phamtran,1


In [80]:
# For district names shared by several provinces, map every spelling of the
# name to {province key: [ward keys that occur only once under that name]}.
DICT_not_unique_long_district_alphanumerics = {}

# The source frame has one row per (province, district) pair, so each name
# shows up several times; process every distinct name only once.
distinct_district_names = df_not_unique_long_district_alphanumerics[
    ['long_district_alphanumeric', 'long_district_alphanumeric_english']
].drop_duplicates()

for row in distinct_district_names.itertuples():
    long_district_alphanumeric = row.long_district_alphanumeric
    alias_district_alphanumeric = abbreviate_alphanumeric_prefix(long_district_alphanumeric)
    province_data = {}
    tmp_df_district = df[df.long_district_alphanumeric == long_district_alphanumeric]
    for province_key in tmp_df_district['province_key'].unique():
        tmp_ward_keys = tmp_df_district[
            (tmp_df_district.province_key == province_key)
            & (tmp_df_district.long_district_alphanumeric_ward_key_unit.isin(unique_long_district_alphanumeric_ward_key_units))
        ]['ward_key'].dropna().tolist()
        # Iterate over a snapshot: aliases are appended to the list itself and
        # a list must not grow while it is being iterated.
        for ward_key in list(tmp_ward_keys):
            if re.search(r'^phuong\d{1,2}', ward_key):
                # Numbered wards also get their English and abbreviated spellings.
                tmp_ward_keys.append(ward_key.replace('phuong', 'ward'))
                tmp_ward_keys.append(ward_key.replace('phuong', 'p.'))
        province_data[province_key] = tmp_ward_keys
    DICT_not_unique_long_district_alphanumerics[long_district_alphanumeric] = province_data
    DICT_not_unique_long_district_alphanumerics[alias_district_alphanumeric] = province_data
    DICT_not_unique_long_district_alphanumerics[row.long_district_alphanumeric_english] = province_data

In [85]:
# Reorder the mapping so that the longest keys come first.
DICT_not_unique_long_district_alphanumerics = {
    name: DICT_not_unique_long_district_alphanumerics[name]
    for name in sorted(DICT_not_unique_long_district_alphanumerics, key=len, reverse=True)
}

In [86]:
DICT_not_unique_long_district_alphanumerics

{'phongdiendistrict': {'thuathienhue': ['dienhuong',
   'dienmon',
   'dienloc',
   'phongbinh',
   'dienhoa',
   'phongchuong',
   'phonghai',
   'dienhai',
   'phonghoa',
   'phongthu',
   'phonghien',
   'phongmy',
   'phongan',
   'phongxuan',
   'phongson'],
  'cantho': ['nhonai',
   'giaixuan',
   'tanthoi',
   'truonglong',
   'mykhanh',
   'nhonnghia']},
 'vinhthanhdistrict': {'binhdinh': ['vinhson',
   'vinhkim',
   'vinhhiep',
   'vinhhao',
   'vinhhoa',
   'vinhthinh',
   'vinhthuan',
   'vinhquang'],
  'cantho': ['vinhbinh',
   'thanhmy',
   'vinhtrinh',
   'thanhtien',
   'thanhthang',
   'thanhloi',
   'thanhquoi',
   'thanhloc']},
 'chauthanhdistrict': {'tayninh': ['haoduoc',
   'phuocvinh',
   'dongkhoi',
   'thaibinh',
   'anco',
   'biengioi',
   'hoathanh',
   'tribinh',
   'hoahoi',
   'anbinh',
   'thanhdien',
   'thanhlong',
   'ninhdien',
   'longvinh'],
  'longan': ['tamvu',
   'binhquoi',
   'hoaphu',
   'phungaitri',
   'vinhcong',
   'thuanmy',
   'hiepthanh'

In [87]:
# Long district names whose district key equals the key of their own province.
# Names from the single-province frame are stored together with their
# abbreviated-prefix alias; names from the multi-province frame are stored as is.
same_key_unique_rows = df_unique_long_district_alphanumerics[
    df_unique_long_district_alphanumerics['district_key_equal_province_key']
]
same_key_not_unique_rows = df_not_unique_long_district_alphanumerics[
    df_not_unique_long_district_alphanumerics['district_key_equal_province_key']
]

LIST_contains_province_key_long_district_alphanumerics = []
for district_name in same_key_unique_rows['long_district_alphanumeric'].unique():
    LIST_contains_province_key_long_district_alphanumerics.extend(
        [district_name, abbreviate_alphanumeric_prefix(district_name)]
    )
LIST_contains_province_key_long_district_alphanumerics.extend(
    same_key_not_unique_rows['long_district_alphanumeric'].unique().tolist()
)

In [88]:
# Longest names first.
LIST_contains_province_key_long_district_alphanumerics = list(LIST_contains_province_key_long_district_alphanumerics)
LIST_contains_province_key_long_district_alphanumerics.sort(key=len, reverse=True)

In [89]:
LIST_contains_province_key_long_district_alphanumerics

['thanhphotuyenquang',
 'thanhphothainguyen',
 'thanhphoquangngai',
 'thanhphobacgiang',
 'thanhphohaiduong',
 'thanhphothaibinh',
 'thanhphoninhbinh',
 'thanhphothanhhoa',
 'thanhphovinhlong',
 'thanhphosoctrang',
 'thanhphohagiang',
 'thanhphocaobang',
 'thanhpholaichau',
 'thanhphohoabinh',
 'thanhpholangson',
 'thanhphobacninh',
 'thanhphohungyen',
 'thanhphonamdinh',
 'thanhphotayninh',
 'thanhphotravinh',
 'thanhphobaclieu',
 'thanhphobackan',
 'thanhpholaocai',
 'thanhphoyenbai',
 'thanhphohatinh',
 'thanhphokontum',
 'thanhphobentre',
 'tp.tuyenquang',
 'huyendienbien',
 'thanhphosonla',
 'tp.thainguyen',
 'thixaquangtri',
 'thanhphocamau',
 'tp.quangngai',
 'tp.bacgiang',
 'thixaphutho',
 'tp.haiduong',
 'tp.thaibinh',
 'tp.ninhbinh',
 'tp.thanhhoa',
 'tt.quangtri',
 'tp.vinhlong',
 'tp.soctrang',
 'tp.hagiang',
 'tp.caobang',
 'h.dienbien',
 'tp.laichau',
 'tp.hoabinh',
 'tp.langson',
 'tp.bacninh',
 'tp.hungyen',
 'tp.namdinh',
 'tp.tayninh',
 'tp.travinh',
 'tp.baclieu',
 '

### Use district_key

In [90]:
# Keep only the rows whose district key does not occur as a substring of the
# string form of long_ward_alphanumerics. The string is built once here rather
# than once per row inside the lambda.
ward_alphanumerics_text = str(long_ward_alphanumerics)
df_district_filter = df[~df.district_key.apply(lambda x: x in ward_alphanumerics_text)].copy()

In [91]:
def check_district_equal_other_province_keys(province_key, district_key):
    """Return True when ``district_key`` equals the key of a different province.

    Args:
        province_key: Key of the province the district belongs to.
        district_key: Key of the district being checked.

    Returns:
        bool: True if ``district_key`` is one of the module-level
        ``province_keys`` other than ``province_key``; False otherwise.
    """
    # A district that shares the key of its own province is not a clash with
    # another province. Comparing directly avoids copying and mutating the
    # key collection on every call and cannot fail when ``province_key`` is
    # missing from ``province_keys``.
    if district_key == province_key:
        return False
    return district_key in province_keys

In [92]:
# Flag, row by row, district keys that equal the key of another province.
df_district_filter['district_key_equal_other_province_keys'] = df_district_filter.apply(
    lambda record: check_district_equal_other_province_keys(record.province_key, record.district_key),
    axis=1,
)

In [93]:
df_district_filter['district_key_equal_other_province_keys'].value_counts()

district_key_equal_other_province_keys
False    2893
Name: count, dtype: int64

In [94]:
# Discard the rows flagged as clashing with another province's key.
clashes_with_other_province = df_district_filter['district_key_equal_other_province_keys']
df_district_filter = df_district_filter.loc[~clashes_with_other_province]

In [95]:
df_district_filter[df_district_filter.district_key=='quan5']

Unnamed: 0,province,district,ward,long_province,long_district,long_ward,short_district,short_ward,province_english,district_english,...,long_district_alphanumeric,long_ward_alphanumeric,long_province_alphanumeric_english,long_district_alphanumeric_english,long_ward_alphanumeric_english,province_alphanumeric,short_district_alphanumeric,short_ward_alphanumeric,long_district_alphanumeric_ward_key_unit,district_key_equal_other_province_keys
8818,Hồ Chí Minh,Quận 5,Phường 4,Thành phố Hồ Chí Minh,Quận 5,Phường 4,Quận 5,Phường 4,Ho Chi Minh,District 5,...,quan5,phuong4,hochiminhcity,district5,ward4,hochiminh,quan5,phuong4,quan5_phuong4,False
8819,Hồ Chí Minh,Quận 5,Phường 9,Thành phố Hồ Chí Minh,Quận 5,Phường 9,Quận 5,Phường 9,Ho Chi Minh,District 5,...,quan5,phuong9,hochiminhcity,district5,ward9,hochiminh,quan5,phuong9,quan5_phuong9,False
8820,Hồ Chí Minh,Quận 5,Phường 3,Thành phố Hồ Chí Minh,Quận 5,Phường 3,Quận 5,Phường 3,Ho Chi Minh,District 5,...,quan5,phuong3,hochiminhcity,district5,ward3,hochiminh,quan5,phuong3,quan5_phuong3,False
8821,Hồ Chí Minh,Quận 5,Phường 12,Thành phố Hồ Chí Minh,Quận 5,Phường 12,Quận 5,Phường 12,Ho Chi Minh,District 5,...,quan5,phuong12,hochiminhcity,district5,ward12,hochiminh,quan5,phuong12,quan5_phuong12,False
8822,Hồ Chí Minh,Quận 5,Phường 2,Thành phố Hồ Chí Minh,Quận 5,Phường 2,Quận 5,Phường 2,Ho Chi Minh,District 5,...,quan5,phuong2,hochiminhcity,district5,ward2,hochiminh,quan5,phuong2,quan5_phuong2,False
8823,Hồ Chí Minh,Quận 5,Phường 8,Thành phố Hồ Chí Minh,Quận 5,Phường 8,Quận 5,Phường 8,Ho Chi Minh,District 5,...,quan5,phuong8,hochiminhcity,district5,ward8,hochiminh,quan5,phuong8,quan5_phuong8,False
8824,Hồ Chí Minh,Quận 5,Phường 7,Thành phố Hồ Chí Minh,Quận 5,Phường 7,Quận 5,Phường 7,Ho Chi Minh,District 5,...,quan5,phuong7,hochiminhcity,district5,ward7,hochiminh,quan5,phuong7,quan5_phuong7,False
8825,Hồ Chí Minh,Quận 5,Phường 1,Thành phố Hồ Chí Minh,Quận 5,Phường 1,Quận 5,Phường 1,Ho Chi Minh,District 5,...,quan5,phuong1,hochiminhcity,district5,ward1,hochiminh,quan5,phuong1,quan5_phuong1,False
8826,Hồ Chí Minh,Quận 5,Phường 11,Thành phố Hồ Chí Minh,Quận 5,Phường 11,Quận 5,Phường 11,Ho Chi Minh,District 5,...,quan5,phuong11,hochiminhcity,district5,ward11,hochiminh,quan5,phuong11,quan5_phuong11,False
8827,Hồ Chí Minh,Quận 5,Phường 14,Thành phố Hồ Chí Minh,Quận 5,Phường 14,Quận 5,Phường 14,Ho Chi Minh,District 5,...,quan5,phuong14,hochiminhcity,district5,ward14,hochiminh,quan5,phuong14,quan5_phuong14,False


In [96]:
# Number of provinces in which each remaining district key occurs.
province_district_key_pairs = df_district_filter[['province_key', 'district_key']].drop_duplicates()
df_district_filter_count = province_district_key_pairs['district_key'].value_counts().reset_index()

In [97]:
df_district_filter_count

Unnamed: 0,district_key,count
0,hoangmai,2
1,tamnong,2
2,hoankiem,1
3,dahuoai,1
4,cukuin,1
...,...,...
200,cualo,1
201,quychau,1
202,tuongduong,1
203,nghiloc,1


In [98]:
# Split the district keys by whether they occur in exactly one province.
district_key_occurrences = df_district_filter_count['count']
unique_district_keys = df_district_filter_count.loc[district_key_occurrences == 1, 'district_key'].tolist()
not_unique_district_keys = df_district_filter_count.loc[district_key_occurrences > 1, 'district_key'].tolist()

In [99]:
# Distinct (province key, district key) pairs for each group of district keys.
province_district_columns = ['province_key', 'district_key']
df_unique_district_keys = df.loc[
    df['district_key'].isin(unique_district_keys), province_district_columns
].drop_duplicates()
df_not_unique_district_keys = df.loc[
    df['district_key'].isin(not_unique_district_keys), province_district_columns
].drop_duplicates()

In [100]:
df_unique_district_keys

Unnamed: 0,province_key,district_key
14,hanoi,hoankiem
54,hanoi,caugiay
198,hanoi,namtuliem
224,hanoi,bactuliem
396,hanoi,thachthat
...,...,...
10382,baclieu,baclieu
10392,baclieu,hongdan
10417,baclieu,giarai
10446,camau,camau


In [101]:
df_not_unique_district_keys

Unnamed: 0,province_key,district_key
101,hanoi,hoangmai
3006,phutho,tamnong
5652,nghean,hoangmai
9723,dongthap,tamnong


In [102]:
# Map each single-province district key to its province key.
DICT_unique_district_keys = {}
for row in df_unique_district_keys.itertuples():
    district_key = row.district_key
    DICT_unique_district_keys[district_key] = row.province_key
    # Numbered districts ('quan' followed by one or two digits) are also stored
    # under their 'q.<n>' and 'district<n>' spellings. The whole key has to
    # match so that a key merely containing 'quan<digit>' somewhere is not
    # rewritten by accident.
    if re.fullmatch(r'quan\d{1,2}', district_key):
        district_number = district_key[len('quan'):]
        DICT_unique_district_keys['q.' + district_number] = row.province_key
        DICT_unique_district_keys['district' + district_number] = row.province_key

In [104]:
# Reorder the mapping so that the longest keys come first.
DICT_unique_district_keys = {
    key: DICT_unique_district_keys[key]
    for key in sorted(DICT_unique_district_keys, key=len, reverse=True)
}

In [105]:
DICT_unique_district_keys 

{'phanrangthapcham': 'ninhthuan',
 'dienbienphu': 'dienbien',
 'hamthuanbac': 'binhthuan',
 'hamthuannam': 'binhthuan',
 'buonmathuot': 'daklak',
 'uminhthuong': 'kiengiang',
 'hoangsuphi': 'hagiang',
 'tuyenquang': 'tuyenquang',
 'thuynguyen': 'haiphong',
 'bachlongvi': 'haiphong',
 'thachthanh': 'thanhhoa',
 'quangxuong': 'thanhhoa',
 'tuongduong': 'nghean',
 'trieuphong': 'quangtri',
 'nguhanhson': 'danang',
 'bactanuyen': 'binhduong',
 'district12': 'hochiminh',
 'district10': 'hochiminh',
 'district11': 'hochiminh',
 'gocongdong': 'tiengiang',
 'giangthanh': 'kiengiang',
 'chauthanha': 'haugiang',
 'namtuliem': 'hanoi',
 'bactuliem': 'hanoi',
 'thachthat': 'hanoi',
 'bachthong': 'backan',
 'quynhnhai': 'sonla',
 'trangdinh': 'langson',
 'duongkinh': 'haiphong',
 'thanhliem': 'hanam',
 'lienchieu': 'danang',
 'tienphuoc': 'quangnam',
 'krongbong': 'daklak',
 'chonthanh': 'binhphuoc',
 'thudaumot': 'binhduong',
 'nhontrach': 'dongnai',
 'district1': 'hochiminh',
 'district3': 'hochi

In [106]:
# Tag each row with "<district key>_<ward key>" and collect the tags that occur
# exactly once in the whole table.
df['district_key_ward_key_unit'] = df['district_key'] + '_' + df['ward_key']
district_key_ward_key_unit_count = df['district_key_ward_key_unit'].value_counts().reset_index()
district_unit_occurs_once = district_key_ward_key_unit_count['count'] == 1
unique_district_key_ward_key_units = district_key_ward_key_unit_count.loc[
    district_unit_occurs_once, 'district_key_ward_key_unit'
].tolist()

In [115]:
# For district keys shared by several provinces, map the key to
# {province key: [ward keys that occur only once under that district key]}.
DICT_not_unique_district_keys = {}

# df_not_unique_district_keys holds one row per (province, district) pair, so
# every key appears several times; walk the distinct keys only.
for district_key in df_not_unique_district_keys['district_key'].unique():
    province_data = {}
    tmp_df_district = df[df.district_key == district_key]
    for province_key in tmp_df_district['province_key'].unique():
        tmp_ward_keys = tmp_df_district[
            (tmp_df_district.province_key == province_key)
            & (tmp_df_district.district_key_ward_key_unit.isin(unique_district_key_ward_key_units))
        ]['ward_key'].unique().tolist()
        # Iterate over a snapshot: aliases are appended to the list itself and
        # a list must not grow while it is being iterated.
        for ward_key in list(tmp_ward_keys):
            if re.search(r'^phuong\d{1,2}', ward_key):
                # Numbered wards also get their abbreviated and English spellings.
                tmp_ward_keys.append(ward_key.replace('phuong', 'p.'))
                tmp_ward_keys.append(ward_key.replace('phuong', 'ward'))
        province_data[province_key] = tmp_ward_keys

    DICT_not_unique_district_keys[district_key] = province_data
    # Numbered districts are also stored under 'q.<n>' and 'district<n>'; the
    # whole key has to match, not just a fragment of it.
    if re.fullmatch(r'quan\d{1,2}', district_key):
        district_number = district_key[len('quan'):]
        DICT_not_unique_district_keys['q.' + district_number] = province_data
        DICT_not_unique_district_keys['district' + district_number] = province_data

In [116]:
# Reorder the mapping so that the longest keys come first.
DICT_not_unique_district_keys = {
    key: DICT_not_unique_district_keys[key]
    for key in sorted(DICT_not_unique_district_keys, key=len, reverse=True)
}

In [117]:
DICT_not_unique_district_keys

{'hoangmai': {'hanoi': ['thanhtri',
   'vinhhung',
   'dinhcong',
   'maidong',
   'tuongmai',
   'daikim',
   'tanmai',
   'hoangvanthu',
   'giapbat',
   'linhnam',
   'thinhliet',
   'tranphu',
   'hoangliet',
   'yenso'],
  'nghean': ['quynhvinh',
   'quynhloc',
   'quynhthien',
   'quynhlap',
   'quynhtrang',
   'maihung',
   'quynhdi',
   'quynhxuan',
   'quynhphuong',
   'quynhlien']},
 'tamnong': {'phutho': ['hunghoa',
   'hienquan',
   'bacson',
   'thanhuyen',
   'lamson',
   'vanxuan',
   'quanghuc',
   'huongnon',
   'tele',
   'thovan',
   'dinau',
   'danquyen'],
  'dongthap': ['tramchim',
   'hoabinh',
   'tancongsinh',
   'phuhiep',
   'phuduc',
   'phuthanhb',
   'anhoa',
   'anlong',
   'phucuong',
   'phuninh',
   'phutho',
   'phuthanha']}}

## Create alias province_keys

In [118]:
# Alternative spellings of province keys, mapped to the real province key.
DICT_alias_province_keys = dict(hcm='hochiminh')

## Save data


- DICT_long_province_alphanumerics `{'province alphanumeric': 'province key', ...}`
- DICT_long_district_alphanumerics `{'Province English': {'district alphanumeric': 'district key', ...}}`
- DICT_long_ward_alphanumerics `{'Province English': {'District English': {'long ward alphanumeric': 'ward key'}}}` 

- LIST_long_district_alphanumerics `['district alphanumeric', ...]`
- LIST_long_ward_alphanumerics `['long ward alphanumeric', ...]`

- LIST_province_keys_1 `['short province key', ...]`
- LIST_province_keys_2 `['short province key', ...]`
- LIST_province_keys_3 `['short province key', ...]`

- DICT_province_map `{'short province key': {'province': 'Hồ Chí Minh', 'long_province': 'Thành phố Hồ Chí Minh', ...}, ...}`
- DICT_district_map `{'Province English': {'district key': {'level english': {'district': 'Tân Bình', 'long_district': 'Quận Tân Bình', ...}}}}`
- DICT_ward_map `{'Province English':{'District English': {'ward key': {'level english': {'ward': 'Phú Trinh', 'long_ward': 'Phường Phú Trinh', ...}}}}}`

- LIST_duplicated_district_province_keys `['short province key', ...]`
- DICT_duplicated_district_keys

- LIST_duplicated_ward_district_keys `['short district key', ...]`
- DICT_duplicated_ward_keys

In [119]:
# Bundle of the lookup structures built in this notebook, keyed by their
# variable names; the following cell pickles it to parse.pkl.
# Commented-out entries are not part of the saved bundle.
data = {
    # Long-name lookups for provinces, districts and wards.
    'DICT_long_province_alphanumerics': DICT_long_province_alphanumerics,
    'DICT_long_district_alphanumerics': DICT_long_district_alphanumerics,
    'DICT_long_ward_alphanumerics': DICT_long_ward_alphanumerics,
    
    'LIST_safe_long_district_alphanumerics': LIST_safe_long_district_alphanumerics,
    'LIST_safe_long_ward_alphanumerics': LIST_safe_long_ward_alphanumerics,
    
    # 'LIST_province_keys_1': LIST_province_keys_1,
    # 'LIST_province_keys_2': LIST_province_keys_2,
    # 'LIST_province_keys_3': LIST_province_keys_3,
    
    # Key -> attribute maps for each administrative level.
    'DICT_province_map': DICT_province_map,
    'DICT_district_map': DICT_district_map,
    'DICT_ward_map': DICT_ward_map,
    
    # Keys that exist at more than one level, with the default level to use.
    # 'LIST_duplicated_district_province_keys': LIST_duplicated_district_province_keys,
    'DICT_duplicated_district_keys': DICT_duplicated_district_keys,
    
    # 'LIST_duplicated_ward_district_keys': LIST_duplicated_ward_district_keys,
    'DICT_duplicated_ward_keys': DICT_duplicated_ward_keys,
    
    # Structures for deriving the province from a district name or key.
    'DICT_unique_long_district_alphanumerics': DICT_unique_long_district_alphanumerics,
    'DICT_not_unique_long_district_alphanumerics': DICT_not_unique_long_district_alphanumerics,
    'LIST_contains_province_key_long_district_alphanumerics': LIST_contains_province_key_long_district_alphanumerics,
    'DICT_unique_district_keys': DICT_unique_district_keys,
    'DICT_not_unique_district_keys': DICT_not_unique_district_keys,
    
    # Alternative province key spellings.
    'DICT_alias_province_keys': DICT_alias_province_keys
    
}

In [120]:
# Serialize the bundled lookup structures into the package's data directory.
with open('../../vietadminunits/data/parse.pkl', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)