# Build parse data

In [1]:
import numpy as np
import pandas as pd
import re
from unidecode import unidecode
import pickle

## Functions

In [2]:
def create_province_key(text):
    """Normalise a province name into a compact ASCII lookup key.

    The name is transliterated to ASCII, lower-cased and stripped; a leading
    "Tinh " / "Thanh Pho " or a trailing " Province" / " City" marker is
    removed; hyphens, apostrophes and spaces are then dropped.

    Parameters
    ----------
    text : str
        Province name.

    Returns
    -------
    str
        The normalised key. A non-string input (for example NaN from an empty
        cell) is returned unchanged, the same way ``create_district_key``
        treats it, instead of failing inside ``unidecode``.
    """
    if not isinstance(text, str):
        return text
    u_text = unidecode(text)  # Transliterate first so accented letters become plain ASCII
    c_text = u_text.lower().strip()  # Case folding must come after transliteration
    c_text = re.sub(r'\sProvince$|^Tinh\s|^Thanh\sPho\s|\sCity$', '', c_text, flags=re.IGNORECASE)
    c_text = re.sub(r"\-|\'", ' ', c_text)
    c_text = c_text.replace(' ', '')
    return c_text

In [3]:
def create_district_key(text):
    """Normalise a district (or ward) name into a compact ASCII lookup key.

    Strings are transliterated to ASCII, lower-cased, stripped, and have
    hyphens, apostrophes and spaces removed. Anything that is not a string
    is handed back untouched.
    """
    if isinstance(text, str):
        ascii_text = str(unidecode(text))  # transliterate before changing case
        spaced = re.sub(r"\-|\'", ' ', ascii_text.lower().strip())
        return spaced.replace(' ', '')
    return text

In [4]:
def add_province_key(df_province, province_english, province_key):
    """Return ``df_province`` plus one alias row for an existing province.

    The alias row is a copy of the first row whose ``province_english`` equals
    the given name, with ``province_key`` replaced by the alias. The input
    frame is not modified.

    Parameters
    ----------
    df_province : pandas.DataFrame
        Must contain ``province_english`` and ``province_key`` columns.
    province_english : str
        Name of the province that receives the alias.
    province_key : str
        Alias key to add.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the alias row.

    Raises
    ------
    ValueError
        If the province is unknown, or it already has this key.
    """
    is_province = df_province.province_english == province_english
    if not is_province.any():
        raise ValueError(f'{province_english} is not exist in province_english')
    if (is_province & (df_province.province_key == province_key)).any():
        raise ValueError(f'{province_key} is exist in province_key')

    # assign() builds a new frame, so the alias is never written into a slice
    # of the caller's data (which used to trigger SettingWithCopyWarning).
    df_alias = df_province.loc[is_province].head(1).assign(province_key=province_key)
    return pd.concat([df_province, df_alias])

In [5]:
def add_district_key(df_district, province_english, district_english, district_key):
    """Return ``df_district`` plus one alias row for an existing district.

    The alias row is a copy of the first row matching both ``province_english``
    and ``district_english``, with ``district_key`` replaced by the alias. The
    input frame is not modified.

    Parameters
    ----------
    df_district : pandas.DataFrame
        Must contain ``province_english``, ``district_english`` and
        ``district_key`` columns.
    province_english : str
        Province that owns the district.
    district_english : str
        District that receives the alias.
    district_key : str
        Alias key to add.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the alias row.

    Raises
    ------
    ValueError
        If the province is unknown, the district is unknown within that
        province, or the district already has this key.
    """
    in_province = df_district.province_english == province_english
    if not in_province.any():
        raise ValueError(f'{province_english} is not exist in province_english')

    is_district = in_province & (df_district.district_english == district_english)
    if not is_district.any():
        raise ValueError(f'{district_english} is not exist in district_english of {province_english}')
    if (is_district & (df_district.district_key == district_key)).any():
        raise ValueError(f'{district_key} is exist in district_key of {province_english}, {district_english}')

    # assign() builds a new frame, so the alias is never written into a slice
    # of the caller's data (which used to trigger SettingWithCopyWarning).
    df_alias = df_district.loc[is_district].head(1).assign(district_key=district_key)
    return pd.concat([df_district, df_alias])

## Building

In [6]:
# Load the province / district / ward table; the relative path resolves against the kernel's working directory.
df = pd.read_csv('../../data/output/vietnam_administrative_units.csv')

In [7]:
df.head()

Unnamed: 0,province,district,ward,long_province,long_district,long_ward,short_district,short_ward,province_english,district_english,ward_english,long_province_english,long_district_english,long_ward_english,short_district_english,short_ward_english,district_level,ward_level,district_level_english,ward_level_english
0,Hà Nội,Ba Đình,Phúc Xá,Thành phố Hà Nội,Quận Ba Đình,Phường Phúc Xá,Ba Đình,Phúc Xá,Ha Noi,Ba Dinh,Phuc Xa,Ha Noi City,Ba Dinh District,Phuc Xa Ward,Ba Dinh,Phuc Xa,Quận,Phường,District,Ward
1,Hà Nội,Ba Đình,Trúc Bạch,Thành phố Hà Nội,Quận Ba Đình,Phường Trúc Bạch,Ba Đình,Trúc Bạch,Ha Noi,Ba Dinh,Truc Bach,Ha Noi City,Ba Dinh District,Truc Bach Ward,Ba Dinh,Truc Bach,Quận,Phường,District,Ward
2,Hà Nội,Ba Đình,Vĩnh Phúc,Thành phố Hà Nội,Quận Ba Đình,Phường Vĩnh Phúc,Ba Đình,Vĩnh Phúc,Ha Noi,Ba Dinh,Vinh Phuc,Ha Noi City,Ba Dinh District,Vinh Phuc Ward,Ba Dinh,Vinh Phuc,Quận,Phường,District,Ward
3,Hà Nội,Ba Đình,Cống Vị,Thành phố Hà Nội,Quận Ba Đình,Phường Cống Vị,Ba Đình,Cống Vị,Ha Noi,Ba Dinh,Cong Vi,Ha Noi City,Ba Dinh District,Cong Vi Ward,Ba Dinh,Cong Vi,Quận,Phường,District,Ward
4,Hà Nội,Ba Đình,Liễu Giai,Thành phố Hà Nội,Quận Ba Đình,Phường Liễu Giai,Ba Đình,Liễu Giai,Ha Noi,Ba Dinh,Lieu Giai,Ha Noi City,Ba Dinh District,Lieu Giai Ward,Ba Dinh,Lieu Giai,Quận,Phường,District,Ward


In [8]:
df.shape

(10547, 20)

### Create keyword columns and fillna NULL columns

In [9]:
# Normalised lookup keys for each administrative level (wards reuse the district normaliser).
df['province_key'] = df['province'].apply(create_province_key)
df['district_key'] = df['short_district'].apply(create_district_key)
df['ward_key'] = df['short_ward'].apply(create_district_key)
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that is chained assignment, which warns on recent pandas
# and leaves df unchanged once Copy-on-Write is active.
df['district_level_english'] = df['district_level_english'].fillna('')
df['ward_level_english'] = df['ward_level_english'].fillna('')

### Create keyword lists that are duplicated district_keyword

In [10]:
# Count, per (province, district_key), how many distinct long district names map to that key.
unique_districts = df[['province', 'long_district', 'district_key']].drop_duplicates()
district_count = unique_districts[['province', 'district_key']].value_counts().reset_index()
# A count above 1 means differently-titled districts of one province collapse to the same key.
shares_key = district_count['count'] > 1
duplicated_districts = district_count.loc[shares_key, 'district_key'].tolist()

In [11]:
# Every distinct (province, district) row whose district_key is one of the duplicated keys.
has_duplicated_key = df.district_key.isin(duplicated_districts)
duplicated_columns = ['long_province', 'long_district' ,'province_key','district_key']
df_duplicated_districts = df.loc[has_duplicated_key, duplicated_columns].drop_duplicates()

In [12]:
df_duplicated_districts

Unnamed: 0,long_province,long_district,province_key,district_key
5835,Tỉnh Hà Tĩnh,Huyện Kỳ Anh,hatinh,kyanh
5867,Tỉnh Hà Tĩnh,Thị xã Kỳ Anh,hatinh,kyanh
9160,Tỉnh Tiền Giang,Thị xã Cai Lậy,tiengiang,cailay
9213,Tỉnh Tiền Giang,Huyện Cai Lậy,tiengiang,cailay
9552,Tỉnh Trà Vinh,Huyện Duyên Hải,travinh,duyenhai
9559,Tỉnh Trà Vinh,Thị xã Duyên Hải,travinh,duyenhai
9673,Tỉnh Đồng Tháp,Thành phố Cao Lãnh,dongthap,caolanh
9697,Tỉnh Đồng Tháp,Thành phố Hồng Ngự,dongthap,hongngu
9713,Tỉnh Đồng Tháp,Huyện Hồng Ngự,dongthap,hongngu
9748,Tỉnh Đồng Tháp,Huyện Cao Lãnh,dongthap,caolanh


In [13]:
# Idea: Use their ward_key if possible
# Districts that share a district_key can only be told apart by their wards if
# no ward_key occurs under more than one of them, so report any overlap.
for district_key in duplicated_districts:
    same_key_rows = df[df.district_key == district_key]
    seen_ward_keys = set()
    common = set()
    # Compare the ward keys of every district carrying this key. Looking wards up
    # inside same_key_rows keeps unrelated districts that merely share a
    # long_district name out of the comparison.
    for _, ward_keys in same_key_rows.groupby('long_district', sort=False)['ward_key']:
        ward_keys = set(ward_keys.dropna())
        common |= seen_ward_keys & ward_keys
        seen_ward_keys |= ward_keys
    if common:
        print(district_key, 'has duplicated ward_key')

In [15]:
# Pickle
# Use Google Trend and search to decide level
duplicated_district_province_keys = df_duplicated_districts['province_key'].unique().tolist()

# Hand-maintained: one entry per duplicated district_key with the level chosen as
# its 'default' (presumably the fallback used by the parser -- confirm in parse.py).
# Every key in duplicated_districts must be listed here, otherwise the loop
# below raises KeyError.
duplicated_district_keys = {
    'kyanh': {'default':'Town'},
    'cailay': {'default':'Town'},
    'duyenhai': {'default':'District'},
    'caolanh': {'default':'City'},
    'hongngu': {'default':'City'},
    'longmy': {'default':'District'}
}

for district_key in duplicated_districts:
    same_key_rows = df[df.district_key == district_key]
    # For each administrative level found under this key, list the ward keys that belong to it.
    wards_by_level = {
        level: same_key_rows.loc[same_key_rows.district_level_english == level, 'ward_key'].unique().tolist()
        for level in same_key_rows.district_level_english.unique().tolist()
    }
    duplicated_district_keys[district_key]['levels'] = wards_by_level

In [17]:
print('Provinces:', duplicated_district_province_keys)
print('Districts:', duplicated_district_keys)

Provinces: ['hatinh', 'tiengiang', 'travinh', 'dongthap', 'haugiang']
Districts: {'kyanh': {'default': 'Town', 'levels': {'District': ['kyxuan', 'kybac', 'kyphu', 'kyphong', 'kytien', 'kygiang', 'kydong', 'kykhang', 'kyvan', 'kytrung', 'kytho', 'kytay', 'kythuong', 'kyhai', 'kythu', 'kychau', 'kytan', 'lamhop', 'kyson', 'kylac'], 'Town': ['hungtri', 'kyninh', 'kyloi', 'kyha', 'kytrinh', 'kythinh', 'kyhoa', 'kyphuong', 'kylong', 'kylien', 'kynam']}}, 'cailay': {'default': 'Town', 'levels': {'Town': ['phuong1', 'phuong2', 'phuong3', 'phuong4', 'phuong5', 'myphuoctay', 'myhanhdong', 'myhanhtrung', 'tanphu', 'tanbinh', 'tanhoi', 'nhimy', 'nhiquy', 'thanhhoa', 'phuquy', 'longkhanh'], 'District': ['thanhloc', 'mythanhbac', 'phucuong', 'mythanhnam', 'phunhuan', 'binhphu', 'camson', 'phuan', 'mylong', 'longtien', 'hiepduc', 'longtrung', 'hoixuan', 'tanphong', 'tambinh', 'nguhiep']}}, 'duyenhai': {'default': 'District', 'levels': {'District': ['donxuan', 'donchau', 'longthanh', 'longkhanh', 'ng

### Create keyword lists that are duplicated ward_keyword

In [18]:
# Ward names that occur more than once inside the same district, ordered by province, district and ward.
ward_id_columns = ['long_province', 'long_district', 'short_ward']
ward_count = df[ward_id_columns].value_counts().reset_index()
df_duplicated_wards = ward_count[ward_count['count'] > 1].sort_values(by=ward_id_columns)

In [19]:
# Select rows by the full (province, district, ward name) triple. Three separate
# isin() tests would also keep any row whose province, district and ward name
# each occur somewhere in the duplicate list, even if that combination is unique.
ward_id_columns = ['long_province', 'long_district', 'short_ward']
duplicated_ward_index = pd.MultiIndex.from_frame(df_duplicated_wards[ward_id_columns])
is_duplicated_ward = pd.MultiIndex.from_frame(df[ward_id_columns]).isin(duplicated_ward_index)
df_duplicated_wards = df.loc[is_duplicated_ward, ['province', 'long_district', 'ward','ward_level_english', 'district_key', 'ward_key']]

In [20]:
df_duplicated_wards

Unnamed: 0,province,long_district,ward,ward_level_english,district_key,ward_key
176,Hà Nội,Huyện Gia Lâm,Thị trấn Yên Viên,Town,gialam,yenvien
178,Hà Nội,Huyện Gia Lâm,Xã Yên Viên,Commune,gialam,yenvien
1479,Lai Châu,Huyện Mường Tè,Thị trấn Mường Tè,Town,muongte,muongte
1484,Lai Châu,Huyện Mường Tè,Xã Mường Tè,Commune,muongte,muongte
1706,Sơn La,Huyện Mai Sơn,Thị trấn Hát Lót,Town,maison,hatlot
1716,Sơn La,Huyện Mai Sơn,Xã Hát Lót,Commune,maison,hatlot
1882,Yên Bái,Huyện Trạm Tấu,Thị trấn Trạm Tấu,Town,tramtau,tramtau
1887,Yên Bái,Huyện Trạm Tấu,Xã Trạm Tấu,Commune,tramtau,tramtau
1918,Yên Bái,Huyện Yên Bình,Thị trấn Yên Bình,Town,yenbinh,yenbinh
1937,Yên Bái,Huyện Yên Bình,Xã Yên Bình,Commune,yenbinh,yenbinh


According to Google Trends, searches for "xã ..." are significantly more frequent than searches for "thị trấn ...". Searches for "phường ..." are more frequent than for "xã ...", but only one ward ("phường") appears in the list.

In [21]:
# Pickle
duplicated_ward_district_keys = df_duplicated_wards.district_key.unique().tolist()

# For each duplicated ward key, keep one level: 'Ward' if present, otherwise
# 'Commune' if present, otherwise 'Town'.
level_priority = ('Ward', 'Commune')
preferred_levels = {}
for ward_key in df_duplicated_wards.ward_key.unique().tolist():
    has_key = df_duplicated_wards.ward_key == ward_key
    levels = df_duplicated_wards.loc[has_key, 'ward_level_english'].tolist()
    preferred_levels[ward_key] = next((level for level in level_priority if level in levels), 'Town')

duplicated_ward_keys = preferred_levels.copy()

In [22]:
print('Districts:' ,duplicated_ward_district_keys)
print('Wards:', duplicated_ward_keys)

Districts: ['gialam', 'muongte', 'maison', 'tramtau', 'yenbinh', 'chilang', 'dinhlap', 'damha', 'dahuoai', 'duchoa', 'tracu', 'thapmuoi', 'caolanh', 'vinhthuan', 'longphu', 'trande', 'phuoclong', 'camau', 'thoibinh']
Wards: {'yenvien': 'Commune', 'muongte': 'Commune', 'hatlot': 'Commune', 'tramtau': 'Commune', 'yenbinh': 'Commune', 'chilang': 'Commune', 'dinhlap': 'Commune', 'damha': 'Commune', 'madaguoi': 'Commune', 'hiephoa': 'Commune', 'dinhan': 'Commune', 'myan': 'Commune', 'mytho': 'Commune', 'vinhthuan': 'Commune', 'longphu': 'Commune', 'lichhoithuong': 'Commune', 'phuoclong': 'Commune', 'tanthanh': 'Ward', 'thoibinh': 'Commune'}


### Sort province_key to prioritize some provinces
- Some provinces have districts or wards whose keyword is the same as, or contains, another province's keyword.
- We need to run module_testing to check whether any address is parsed incorrectly > use ChatGPT to sort the keywords > manually add any newly found wrong keyword.

For example:
- 'Huyện Quang Bình, Tỉnh Hà Giang' -> quangbinh
- 'Huyện Phù Yên, Tỉnh Sơn La' -> phuyen
- 'Huyện Văn Giang, Tỉnh Hưng Yên' -> angiang
- 'Huyện Quảng Ninh, Tỉnh Quảng Bình' -> quangninh
- Bac Lieu, Hoa Binh District -> hoabinh

In [23]:
# Distinct province keys in order of first appearance; find_province_key_match() scans this list.
province_keys = df.province_key.unique().tolist()

In [24]:
def find_province_key_match(text):
    """Return the first entry of the global ``province_keys`` that is a substring of ``text``.

    Keys are tried in list order; ``None`` is returned when none of them occurs.
    """
    return next((candidate for candidate in province_keys if candidate in text), None)

In [25]:
# Districts whose key contains any province key (the joined keys are used as a regex alternation).
province_key_pattern = '|'.join(province_keys)
contains_province_key = df.district_key.str.contains(province_key_pattern)
match_district = df.loc[contains_province_key, ['province', 'district', 'province_key','district_key']].drop_duplicates()

In [26]:
# Wards (rows with a ward name) whose key contains any province key.
has_ward = ~df.ward.isna()
contains_province_key = df.ward_key.str.contains('|'.join(province_keys))
match_ward = df.loc[has_ward & contains_province_key, ['province', 'ward', 'province_key','ward_key']].drop_duplicates()

In [27]:
# Record which province key occurs inside each district key (first hit in province_keys order).
match_district['match_province_key'] = match_district['district_key'].apply(find_province_key_match)

In [28]:
# Record which province key occurs inside each ward key (first hit in province_keys order).
match_ward['match_province_key'] = match_ward['ward_key'].apply(find_province_key_match)

In [29]:
# Keep only districts whose key matched a province other than the one they belong to.
match_district = match_district[match_district.province_key != match_district.match_province_key]

In [30]:
match_district

Unnamed: 0,province,district,province_key,district_key,match_province_key
757,Hà Giang,Quang Bình,hagiang,quangbinh,quangbinh
1649,Sơn La,Phù Yên,sonla,phuyen,phuyen
3824,Hưng Yên,Văn Giang,hungyen,vangiang,angiang
5972,Quảng Bình,Quảng Ninh,quangbinh,quangninh,quangninh
9009,Long An,Thạnh Hóa,longan,thanhhoa,thanhhoa
10438,Bạc Liêu,Hòa Bình,baclieu,hoabinh,hoabinh


In [31]:
# Keep only wards whose key matched a province other than the one they belong to.
match_ward = match_ward[match_ward.province_key != match_ward.match_province_key]

In [32]:
match_ward

Unnamed: 0,province,ward,province_key,ward_key,match_province_key
2,Hà Nội,Vĩnh Phúc,hanoi,vinhphuc,vinhphuc
8,Hà Nội,Điện Biên,hanoi,dienbien,dienbien
142,Hà Nội,Xuân Giang,hanoi,xuangiang,angiang
316,Hà Nội,Yên Bài,hanoi,yenbai,yenbai
353,Hà Nội,Đồng Tháp,hanoi,dongthap,dongthap
...,...,...,...,...,...
10154,Cần Thơ,Thạnh Hòa,cantho,thanhhoa,thanhhoa
10236,Hậu Giang,Thạnh Hòa,haugiang,thanhhoa,thanhhoa
10363,Sóc Trăng,Khánh Hòa,soctrang,khanhhoa,khanhhoa
10438,Bạc Liêu,Hòa Bình,baclieu,hoabinh,hoabinh


In [33]:
# Should not use list(set()) because it will re-order
# Collect province keys in order of first appearance: district conflicts first,
# then ward conflicts; within a row the owning province precedes the matched one.
province_order = []

for matches in (match_district, match_ward):
    for row in matches.itertuples():
        for key in (row.province_key, row.match_province_key):
            if key not in province_order:
                province_order.append(key)

In [34]:
# 1-based position of every province key in province_order.
province_key_order = {province_key: position for position, province_key in enumerate(province_order, start=1)}
    

In [35]:
print(province_key_order)

{'hagiang': 1, 'quangbinh': 2, 'sonla': 3, 'phuyen': 4, 'hungyen': 5, 'angiang': 6, 'quangninh': 7, 'longan': 8, 'thanhhoa': 9, 'baclieu': 10, 'hoabinh': 11, 'hanoi': 12, 'vinhphuc': 13, 'dienbien': 14, 'yenbai': 15, 'dongthap': 16, 'caobang': 17, 'binhduong': 18, 'backan': 19, 'phutho': 20, 'tuyenquang': 21, 'thaibinh': 22, 'laichau': 23, 'khanhhoa': 24, 'binhthuan': 25, 'thainguyen': 26, 'langson': 27, 'bacgiang': 28, 'hanam': 29, 'binhdinh': 30, 'bacninh': 31, 'haiphong': 32, 'lamdong': 33, 'vinhlong': 34, 'tayninh': 35, 'haugiang': 36, 'namdinh': 37, 'haiduong': 38, 'ninhbinh': 39, 'nghean': 40, 'hatinh': 41, 'kiengiang': 42, 'quangtri': 43, 'thuathienhue': 44, 'danang': 45, 'quangnam': 46, 'travinh': 47, 'quangngai': 48, 'binhphuoc': 49, 'kontum': 50, 'daknong': 51, 'gialai': 52, 'daklak': 53, 'dongnai': 54, 'bariavungtau': 55, 'hochiminh': 56, 'tiengiang': 57, 'cantho': 58, 'soctrang': 59, 'camau': 60}


In [36]:


# province_key_order = {
#     'yenbai': 1,
#     'khanhhoa': 2,
#     'binhthuan': 3,
#     'thainguyen': 4,
#     'thaibinh': 5,
#     'longan': 5,
#     'thanhhoa': 6,
#     'hagiang': 6,
#     'quangbinh': 7,
#     'quangninh': 8,
#     'quangngai': 8,
#     'quangtri': 9,
#     'binhdinh': 10,
#     'binhduong': 11,
#     'binhphuoc': 12,
#     'dongnai': 13,
#     'lamdong': 14,
#     'phuyen': 15,
#     'vinhlong': 16,
#     'soctrang': 18,
#     'bacgiang': 19,
#     'camau': 20,
#     'angiang': 21,
#     'haiphong': 22,
#     'hoabinh': 23,
#     'ninhbinh': 25,
#     'cambang': 26,
#     'backan': 27,
#     'caobang': 28
# 
# 
# }


# Provinces missing from province_key_order get 0, which sorts them ahead of the ranked ones.
province_rank = df['province_key'].map(province_key_order).fillna(0)
df = df.assign(province_key_order=province_rank).sort_values(by='province_key_order')

### Add alias province_keyword
- We should not add "hn" as an alias because it would cause many wrong matches. Instead, `\bhn\b` in the address is replaced with `\bha noi\b`; this step is done in `parse.py`.

In [37]:
# One row per province: keep every column whose name mentions 'province'.
province_columns = [col for col in df.columns if 'province' in col]
df_province = df[province_columns].drop_duplicates()

# (province_english, alias key) pairs to register on top of the generated keys.
province_alias_keys = [
    ('Ho Chi Minh', 'hcm')
]

for province_english, province_key in province_alias_keys:
    df_province = add_province_key(df_province, province_english, province_key)

### Add alias district_keyword
Many districts have been renamed or merged with other districts. For instance: Quan 9 > Thanh pho Thu Duc.

Use `find_district_alias_keywords.ipynb` to create a list of tuples.

In [38]:
# One row per district: province_english plus every column whose name mentions 'district'.
district_columns = [col for col in df.columns if 'district' in col]
df_district = df[['province_english'] + district_columns].drop_duplicates()

# (province_english, district_english, alias key) triples; each one adds an
# extra district_key under which the existing district can be found.
district_alias_keys = [
    ('An Giang', 'Chau Thanh', 'hueduc'),
    ('Ba Ria - Vung Tau', 'Phu My', 'tanthanh'),
    ('Bac Kan', 'Ba Be', 'chora'),
    ('Bac Lieu', 'Bac Lieu', 'minhhai'),
    ('Binh Duong', 'Tan Uyen', 'chauthanh'),
    ('Binh Duong', 'Thuan An', 'laithieu'),
    ('Binh Phuoc', 'Phuoc Long', 'phuocbinh'),
    ('Ca Mau', 'Dam Doi', 'ngochien'),
    ('Cao Bang', 'Quang Hoa', 'phuchoa'),
    ('Cao Bang', 'Quang Hoa', 'quanguyen'),
    ('Cao Bang', 'Ha Quang', 'thongnong'),
    ('Cao Bang', 'Trung Khanh', 'tralinh'),
    ('Dien Bien', 'Muong Lay', 'laichau'),
    ('Dien Bien', 'Muong Cha', 'muonglay'),
    ('Dong Nai', 'Vinh Cuu', 'vinhan'),
    ('Dong Thap', 'Lap Vo', 'thanhhung'),
    ('Ha Nam', 'Phu Ly', 'hanam'),
    ('Ha Noi', 'Soc Son', 'daphuc'),
    ('Ha Noi', 'Soc Son', 'kimanh'),
    ('Hai Phong', 'Cat Hai', 'catba'),
    ('Hau Giang', 'Vi Thanh', 'mythanh'),
    ('Hau Giang', 'Nga Bay', 'tanhiep'),
    ('Hoa Binh', 'Hoa Binh', 'kyson'),
    ('Khanh Hoa', 'Dien Khanh', 'khanhxuong'),
    ('Kien Giang', 'Kien Luong', 'hatien'),
    ('Lao Cai', 'Lao Cai', 'camduong'),
    ('Nam Dinh', 'Nam Dinh', 'myloc'),
    ('Ninh Binh', 'Hoa Lu', 'giakhanh'),
    ('Ninh Binh', 'Nho Quan', 'hoanglong'),
    ('Ninh Binh', 'Yen Mo', 'tamdiep'),
    ('Phu Tho', 'Cam Khe', 'songthao'),
    ('Quang Nam', 'Nam Giang', 'giang'),
    ('Quang Ngai', 'Tra Bong', 'taytra'),
    ('Quang Ninh', 'Ha Long', 'honggai'),
    ('Quang Ninh', 'Van Don', 'campha'),
    ('Quang Ninh', 'Mong Cai', 'haininh'),
    ('Quang Ninh', 'Quang Yen', 'yenhung'),
    ('Quang Ninh', 'Ha Long', 'hoanhbo'),
    ('Tay Ninh', 'Hoa Thanh', 'phukhuong'),
    ('Thanh Hoa', 'Dong Son', 'dongthieu'),
    ('Thanh Hoa', 'Yen Dinh', 'thieuyen'),
    ('Thanh Hoa', 'Nghi Son', 'tinhgia'),
    ('Ho Chi Minh', 'Can Gio', 'duyenhai'),
    ('Ho Chi Minh', 'Thu Duc', 'quan2'),
    ('Ho Chi Minh', 'Thu Duc', 'quan9'),
    # ('Ho Chi Minh', 'Thu Duc', 'thuduc(quan)'),
    ('Tra Vinh', 'Cang Long', 'chauthanhdong'),
    ('Vinh Long', 'Long Ho', 'cainhum'),
    ('Vinh Long', 'Long Ho', 'chauthanhtay'),
    ('Ben Tre', 'Mo Cay Nam', 'mocay'),
    ('Binh Thuan', 'Ham Thuan Nam', 'hamthuan'),
    ('Ha Noi', 'Nam Tu Liem', 'tuliem'),
    ('Quang Nam', 'Nam Tra My', 'tramy'),
    ('Tien Giang', 'Go Cong Tay', 'gocong')

]

for province_english, district_english, district_key in district_alias_keys:
    df_district = add_district_key(df_district, province_english, district_english, district_key)

Some district keys consist of "quan" followed by a number. We copy those rows and replace "quan" with "district" in the key.

In [39]:
# Duplicate every row whose key contains 'quan' + 1-2 digits and store the copy under the 'district<N>' spelling.
is_numbered_district = df_district['district_key'].str.contains(r'quan\d{1,2}')
hcm_districts = df_district[is_numbered_district].copy()
hcm_districts = hcm_districts.assign(district_key=hcm_districts['district_key'].str.replace('quan', 'district'))
df_district = pd.concat([df_district, hcm_districts])

### Create alias ward_key
Some ward keys consist of "phuong" followed by a number. We copy those rows and replace "phuong" with "ward" in the key.


In [40]:
# One row per distinct combination of province_english, district_english and every column whose name mentions 'ward'.
df_ward = df[['province_english', 'district_english'] + [col for col in df.columns if 'ward' in col]].drop_duplicates()

In [41]:
# Duplicate every row whose key contains 'phuong' + 1-2 digits and store the copy under the 'ward<N>' spelling.
# Missing ward keys are treated as empty strings so they simply do not match.
is_numbered_ward = df_ward['ward_key'].fillna('').str.contains(r'phuong\d{1,2}')
number_wards = df_ward[is_numbered_ward].copy()
number_wards = number_wards.assign(ward_key=number_wards['ward_key'].str.replace('phuong', 'ward'))
df_ward = pd.concat([df_ward, number_wards])

### Create province_keys lists
We will prioritize provinces that are not in district_key or ward_key when searching a province_key in address.

In [42]:
province_keys = df_province['province_key'].tolist()

# Pickle
province_keys_1 = []  # occurs in neither a district key nor a ward key
province_keys_2 = []  # occurs in a district key but in no ward key
province_keys_3 = []  # occurs in a ward key

# The key lists are flattened to text on purpose: the `in` tests below are
# substring checks, so a province key embedded in a longer key also counts.
district_keys = str(df.district_key.unique().tolist())
ward_keys = str(df.ward_key.unique().tolist())
for province_key in province_keys:
    if province_key in ward_keys:
        province_keys_3.append(province_key)
    elif province_key in district_keys:
        province_keys_2.append(province_key)
    else:
        province_keys_1.append(province_key)

### Create mapping dictionaries

In [43]:
# Pickle
# province_key -> record (dict of all df_province columns) of the first row carrying that key.
first_row_per_key = df_province.drop_duplicates(subset='province_key')
province_map = {
    record['province_key']: record
    for record in first_row_per_key.to_dict(orient='records')
}

In [44]:
print(province_map)

{'laocai': {'province': 'Lào Cai', 'long_province': 'Tỉnh Lào Cai', 'province_english': 'Lao Cai', 'long_province_english': 'Lao Cai Province', 'province_key': 'laocai', 'province_key_order': 0.0}, 'bentre': {'province': 'Bến Tre', 'long_province': 'Tỉnh Bến Tre', 'province_english': 'Ben Tre', 'long_province_english': 'Ben Tre Province', 'province_key': 'bentre', 'province_key_order': 0.0}, 'ninhthuan': {'province': 'Ninh Thuận', 'long_province': 'Tỉnh Ninh Thuận', 'province_english': 'Ninh Thuan', 'long_province_english': 'Ninh Thuan Province', 'province_key': 'ninhthuan', 'province_key_order': 0.0}, 'hagiang': {'province': 'Hà Giang', 'long_province': 'Tỉnh Hà Giang', 'province_english': 'Ha Giang', 'long_province_english': 'Ha Giang Province', 'province_key': 'hagiang', 'province_key_order': 1.0}, 'quangbinh': {'province': 'Quảng Bình', 'long_province': 'Tỉnh Quảng Bình', 'province_english': 'Quang Binh', 'long_province_english': 'Quang Binh Province', 'province_key': 'quangbinh', 

In [45]:
# Pickle
# province_english -> district_key -> district_level_english -> record of the district columns.
district_map = {}
district_record_columns = [col for col in df_district.columns if 'district' in col]

for province_english in df_province.province_english.unique():
    province_districts = df_district[df_district.province_english == province_english]

    district_keys = {}
    for district_key in province_districts['district_key'].unique():
        same_key = province_districts[province_districts.district_key == district_key]
        # One record per administrative level that uses this key (first matching row wins).
        district_keys[district_key] = {
            level: same_key.loc[same_key.district_level_english == level, district_record_columns].to_dict('records')[0]
            for level in same_key['district_level_english'].unique()
        }

    # Longest keys first (presumably so a long key is tried before a shorter key it contains -- confirm in parse.py).
    district_keys = dict(sorted(district_keys.items(), key=lambda item: len(item[0]), reverse=True))

    district_map[province_english] = district_keys

In [46]:
print(district_map['Ho Chi Minh'])

{'district10': {'District': {'district': 'Quận 10', 'long_district': 'Quận 10', 'short_district': 'Quận 10', 'district_english': 'District 10', 'long_district_english': 'District 10', 'short_district_english': 'District 10', 'district_level': 'Quận', 'district_level_english': 'District', 'district_key': 'district10'}}, 'district11': {'District': {'district': 'Quận 11', 'long_district': 'Quận 11', 'short_district': 'Quận 11', 'district_english': 'District 11', 'long_district_english': 'District 11', 'short_district_english': 'District 11', 'district_level': 'Quận', 'district_level_english': 'District', 'district_key': 'district11'}}, 'district12': {'District': {'district': 'Quận 12', 'long_district': 'Quận 12', 'short_district': 'Quận 12', 'district_english': 'District 12', 'long_district_english': 'District 12', 'short_district_english': 'District 12', 'district_level': 'Quận', 'district_level_english': 'District', 'district_key': 'district12'}}, 'binhthanh': {'District': {'district': 

In [47]:
# Pickle
# province_english -> district_english -> ward_key -> ward_level_english -> record of the ward columns.
ward_map = {}
ward_record_columns = [col for col in df_ward.columns if 'ward' in col]

for province_english in df_province.province_english.unique():
    province_districts = df_district[df_district.province_english == province_english]
    province_wards = df_ward[df_ward.province_english == province_english]

    districts = {}
    for district_english in province_districts.district_english.unique():
        district_wards = province_wards[province_wards.district_english == district_english]

        wards = {}
        for ward_key in district_wards.ward_key.dropna().unique():
            same_key = district_wards[district_wards.ward_key == ward_key]
            # One record per administrative level that uses this key (first matching row wins).
            wards[ward_key] = {
                level: same_key.loc[same_key.ward_level_english == level, ward_record_columns].to_dict('records')[0]
                for level in same_key.ward_level_english.unique()
            }

        # Longest keys first, as for the district map.
        districts[district_english] = dict(sorted(wards.items(), key=lambda item: len(str(item[0])), reverse=True))
    ward_map[province_english] = districts

In [48]:
print(ward_map['Ho Chi Minh']['Tan Binh'])

{'phuong15': {'Ward': {'ward': 'Phường 15', 'long_ward': 'Phường 15', 'short_ward': 'Phường 15', 'ward_english': 'Ward 15', 'long_ward_english': 'Ward 15', 'short_ward_english': 'Ward 15', 'ward_level': 'Phường', 'ward_level_english': 'Ward', 'ward_key': 'phuong15'}}, 'phuong14': {'Ward': {'ward': 'Phường 14', 'long_ward': 'Phường 14', 'short_ward': 'Phường 14', 'ward_english': 'Ward 14', 'long_ward_english': 'Ward 14', 'short_ward_english': 'Ward 14', 'ward_level': 'Phường', 'ward_level_english': 'Ward', 'ward_key': 'phuong14'}}, 'phuong10': {'Ward': {'ward': 'Phường 10', 'long_ward': 'Phường 10', 'short_ward': 'Phường 10', 'ward_english': 'Ward 10', 'long_ward_english': 'Ward 10', 'short_ward_english': 'Ward 10', 'ward_level': 'Phường', 'ward_level_english': 'Ward', 'ward_key': 'phuong10'}}, 'phuong11': {'Ward': {'ward': 'Phường 11', 'long_ward': 'Phường 11', 'short_ward': 'Phường 11', 'ward_english': 'Ward 11', 'long_ward_english': 'Ward 11', 'short_ward_english': 'Ward 11', 'ward_l

### Create double check dictionaries
Some addresses are mapped incorrectly because one of their ward_keys is the same as another province's province_key.

In [49]:
# Hand-maintained mapping from a province_key to a list of other province_keys.
# NOTE(review): how parse.py consumes it is not visible in this notebook --
# presumably a match on the key is re-checked against the listed provinces; confirm.
double_check_provinces = {
    'angiang': ['thainguyen', 'ninhbinh', 'thanhhoa'],
    'hoabinh': ['thainguyen',
    'langson',
    'haiphong',
    'thaibinh',
    'dongnai',
    'vinhlong',
    'dongthap'],
    'binhthuan': ['thainguyen', 'quangngai', 'binhdinh'],
    'thanhhoa': ['langson', 'binhphuoc', 'travinh', 'kiengiang', 'haugiang'],
    'thaibinh': ['langson', 'tayninh'],
    'hagiang': ['thaibinh', 'thanhhoa'],
    'quangbinh': ['thaibinh', 'thanhhoa'],
    'sonla': ['ninhbinh', 'quangngai', 'khanhhoa'],
    'khanhhoa': ['ninhbinh', 'kiengiang'],
    'longan': ['thanhhoa', 'dongnai', 'vinhlong'],
    'quangninh': ['thanhhoa'],
    'vinhlong': ['quangtri'],
    'haiduong': ['quangtri'],
    'haiphong': ['quangtri'],
    'binhduong': ['quangnam', 'quangngai', 'binhdinh'],
    'binhdinh': ['quangnam'],
    'hungyen': ['kiengiang']
}

# double_check_provinces = {} # testing

Some addresses are mapped incorrectly because one of their ward_keys is the same as another district's district_key.

In [50]:
# Hand-maintained mapping from a district_key to a list of other district_keys.
# NOTE(review): how parse.py consumes it is not visible in this notebook --
# presumably a match on the key is re-checked against the listed districts; confirm.
double_check_districts = {'unghoa': ['caugiay', 'chuongmy'],
 'thanhtri': ['hoangmai'],
 'thanhxuan': ['socson'],
 'thanhoai': ['thanhtri'],
 'thongnong': ['haquang'],
 'tralinh': ['trungkhanh'],
 'quanguyen': ['quanghoa'],
 'chora': ['babe'],
 'yenson': ['sonduong'],
 'camduong': ['laocai'],
 'muongla': ['phuyen', 'songma', 'sopcop'],
 'tranyen': ['lucyen', 'yenbinh'],
 'kyson': ['hoabinh', 'tanky'],
 'chilang': ['langson', 'trangdinh'],
 'honggai': ['halong'],
 'hoanhbo': ['halong'],
 'uongbi': ['mongcai'],
 'halong': ['vandon'],
 'haiduong': ['binhgiang'],
 'anduong': ['lechan', 'duongkinh', 'thuynguyen', 'vinhbao'],
 'catba': ['cathai'],
 'phucu': ['hungyen'],
 'myloc': ['namdinh'],
 'dongson': ['thanhhoa', 'bimson'],
 'hatrung': ['bathuoc'],
 'thanhhoa': ['nhuxuan', 'benluc'],
 'vinh': ['anhson', 'yenthanh', 'hoangmai'],
 'yenthanh': ['thanhchuong'],
 'anhson': ['thanhchuong', 'namdan'],
 'thachha': ['hatinh'],
 'huongthuy': ['hue'],
 'giang': ['thangbinh', 'bactramy', 'nuithanh'],
 'tramy': ['bactramy'],
 'sontinh': ['sontay'],
 'ducpho': ['moduc'],
 'dakto': ['konray', 'tumorong'],
 'dakha': ['tumorong'],
 'dakpo': ['kongchro'],
 'iapa': ['chuse'],
 'krongbuk': ['krongpac'],
 'phuocbinh': ['phuoclong'],
 'hoathanh': ['chauthanh'],
 'laithieu': ['thuanan'],
 'thongnhat': ['bienhoa'],
 'vinhan': ['vinhcuu'],
 'binhchanh': ['thuduc'],
 'tanphu': ['thuduc', 'quan7', 'cuchi'],
 'tanthanh': ['mochoa', 'thuthua'],
 'tanan': ['canduoc'],
 'tanhung': ['chauthanh'],
 'tanphuoc': ['gocongdong'],
 'mocay': ['mocaynam'],
 'thanhphu': ['giongtrom', 'binhdai'],
 'cainhum': ['mangthit'],
 'longho': ['mangthit'],
 'chauphu': ['chaudoc'],
 'anphu': ['phutan', 'tinhbien', 'chauthanh', 'thoaison'],
 'anminh': ['chauthanh', 'uminhthuong'],
 'vithanh': ['vithuy']}

# double_check_districts = {} # testing

In [52]:
# Everything the parser needs, bundled as one tuple. The position of each
# element is part of the file format, so keep this order when adding fields.
parse_data = (
    duplicated_district_keys,
    duplicated_district_province_keys,
    duplicated_ward_keys,
    duplicated_ward_district_keys,
    province_keys_1,
    province_keys_2,
    province_keys_3,
    province_map,
    district_map,
    ward_map,
    double_check_provinces,
    double_check_districts,
)

with open('../../vietadminunits/data/parse.pkl', 'wb') as f:
    pickle.dump(parse_data, f)