# Build parse data

In [1]:
import numpy as np
import pandas as pd
import re
from unidecode import unidecode
import pickle

## Functions

In [2]:
def create_province_key(text):
    """Build the normalized lookup key for a province name.

    The name is transliterated to ASCII with ``unidecode``, lower-cased and
    stripped. A trailing " Province" / " City" or a leading "Tinh " /
    "Thanh Pho " marker is removed, then hyphens, apostrophes and spaces are
    dropped (for example 'Hà Nội' becomes 'hanoi').
    """
    # Transliterate first so accented characters become plain ASCII letters.
    ascii_text = str(unidecode(text))
    # Case folding comes second, after transliteration.
    key = ascii_text.lower().strip()
    key = re.sub(r'\sProvince$|^Tinh\s|^Thanh\sPho\s|\sCity$', '', key, flags=re.IGNORECASE)
    # Hyphens and apostrophes are turned into spaces, then every space is removed.
    return re.sub(r"\-|\'", ' ', key).replace(' ', '')

In [3]:
def create_district_key(text):
    """Build the normalized lookup key for a district (or ward) name.

    String input is transliterated to ASCII with ``unidecode``, lower-cased,
    stripped, and has its hyphens, apostrophes and spaces removed.
    Anything that is not a ``str`` is returned unchanged.
    """
    if isinstance(text, str):
        # Transliterate first, then fold case and trim.
        key = str(unidecode(text)).lower().strip()
        # Hyphens and apostrophes are turned into spaces, then every space is removed.
        return re.sub(r"\-|\'", ' ', key).replace(' ', '')
    return text

In [4]:
def add_province_key(df_province, province_english, province_key):
    """Return ``df_province`` plus one row that maps an alias key to a province.

    The first existing row for ``province_english`` is duplicated, its
    ``province_key`` is replaced by the alias, and the new row is appended.
    The input frame is left untouched.

    Parameters
    ----------
    df_province : pandas.DataFrame
        Frame with at least ``province_english`` and ``province_key`` columns.
    province_english : str
        Value of ``province_english`` identifying the province.
    province_key : str
        Alias key to register for that province.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original rows followed by the alias row (the alias
        row keeps the index label of the row it was copied from).

    Raises
    ------
    ValueError
        If ``province_english`` is not present, or if that province already
        has ``province_key``.
    """
    is_province = df_province['province_english'] == province_english
    if not is_province.any():
        raise ValueError(f'{province_english} is not exist in province_english')
    if (is_province & (df_province['province_key'] == province_key)).any():
        raise ValueError(f'{province_key} is exist in province_key')

    # assign() returns a fresh frame, so the alias is never written into a
    # slice of df_province (which triggered SettingWithCopyWarning).
    df_new = df_province.loc[is_province].head(1).assign(province_key=province_key)
    return pd.concat([df_province, df_new])

In [5]:
def add_district_key(df_district, province_english, district_english, district_key):
    """Return ``df_district`` plus one row that maps an alias key to a district.

    The first existing row for the (``province_english``, ``district_english``)
    pair is duplicated, its ``district_key`` is replaced by the alias, and the
    new row is appended. The input frame is left untouched.

    Parameters
    ----------
    df_district : pandas.DataFrame
        Frame with at least ``province_english``, ``district_english`` and
        ``district_key`` columns.
    province_english : str
        Value of ``province_english`` identifying the province.
    district_english : str
        Value of ``district_english`` identifying the district in that province.
    district_key : str
        Alias key to register for that district.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original rows followed by the alias row (the alias
        row keeps the index label of the row it was copied from).

    Raises
    ------
    ValueError
        If the province is not present, if the district is not present in
        that province, or if that district already has ``district_key``.
    """
    is_province = df_district['province_english'] == province_english
    if not is_province.any():
        raise ValueError(f'{province_english} is not exist in province_english')

    is_district = is_province & (df_district['district_english'] == district_english)
    if not is_district.any():
        raise ValueError(f'{district_english} is not exist in district_english of {province_english}')
    if (is_district & (df_district['district_key'] == district_key)).any():
        raise ValueError(f'{district_key} is exist in district_key of {province_english}, {district_english}')

    # assign() returns a fresh frame, so the alias is never written into a
    # slice of df_district (which triggered SettingWithCopyWarning).
    df_new = df_district.loc[is_district].head(1).assign(district_key=district_key)
    return pd.concat([df_district, df_new])

## Building

In [6]:
# Administrative-unit table that every later cell works from.
ADMIN_UNITS_CSV = '../data/output/vietnam_administrative_units.csv'
df = pd.read_csv(ADMIN_UNITS_CSV)

In [7]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,province,district,ward,long_province,long_district,long_ward,short_district,short_ward,province_english,district_english,ward_english,long_province_english,long_district_english,long_ward_english,short_district_english,short_ward_english,district_level,ward_level,district_level_english,ward_level_english
0,Hà Nội,Ba Đình,Phúc Xá,Thành phố Hà Nội,Quận Ba Đình,Phường Phúc Xá,Ba Đình,Phúc Xá,Ha Noi,Ba Dinh,Phuc Xa,Ha Noi City,Ba Dinh District,Phuc Xa Ward,Ba Dinh,Phuc Xa,Quận,Phường,District,Ward
1,Hà Nội,Ba Đình,Trúc Bạch,Thành phố Hà Nội,Quận Ba Đình,Phường Trúc Bạch,Ba Đình,Trúc Bạch,Ha Noi,Ba Dinh,Truc Bach,Ha Noi City,Ba Dinh District,Truc Bach Ward,Ba Dinh,Truc Bach,Quận,Phường,District,Ward
2,Hà Nội,Ba Đình,Vĩnh Phúc,Thành phố Hà Nội,Quận Ba Đình,Phường Vĩnh Phúc,Ba Đình,Vĩnh Phúc,Ha Noi,Ba Dinh,Vinh Phuc,Ha Noi City,Ba Dinh District,Vinh Phuc Ward,Ba Dinh,Vinh Phuc,Quận,Phường,District,Ward
3,Hà Nội,Ba Đình,Cống Vị,Thành phố Hà Nội,Quận Ba Đình,Phường Cống Vị,Ba Đình,Cống Vị,Ha Noi,Ba Dinh,Cong Vi,Ha Noi City,Ba Dinh District,Cong Vi Ward,Ba Dinh,Cong Vi,Quận,Phường,District,Ward
4,Hà Nội,Ba Đình,Liễu Giai,Thành phố Hà Nội,Quận Ba Đình,Phường Liễu Giai,Ba Đình,Liễu Giai,Ha Noi,Ba Dinh,Lieu Giai,Ha Noi City,Ba Dinh District,Lieu Giai Ward,Ba Dinh,Lieu Giai,Quận,Phường,District,Ward


In [8]:
# (rows, columns) of the loaded table.
df.shape

(10547, 20)

### Create key columns and fill missing level values

In [9]:
# Normalized lookup keys for each administrative level.
df['province_key'] = df['province'].apply(create_province_key)
df['district_key'] = df['short_district'].apply(create_district_key)
# Ward names go through the same normalization as district names.
df['ward_key'] = df['short_ward'].apply(create_district_key)

# Assign the filled columns back. `df[col].fillna(..., inplace=True)` operates
# on a temporary Series and does not update `df` under pandas copy-on-write,
# which would leave missing level values in place.
df['district_level_english'] = df['district_level_english'].fillna('')
df['ward_level_english'] = df['ward_level_english'].fillna('')

### Create key lists for district names that are duplicated within a province

In [10]:
# One row per distinct (province, long_district, short_district). A
# (province, short_district) pair that still occurs more than once is a short
# name shared by several districts of the same province.
unique_districts = df[['province', 'long_district', 'short_district']].drop_duplicates()
district_count = unique_districts[['province', 'short_district']].value_counts().reset_index()
is_repeated = district_count['count'] > 1
duplicated_districts = district_count.loc[is_repeated, 'short_district'].tolist()

In [11]:
# Show the short district names found to be duplicated.
print(duplicated_districts)

['Duyên Hải', 'Long Mỹ', 'Hồng Ngự', 'Cao Lãnh', 'Kỳ Anh', 'Cai Lậy']


In [12]:
# Pickle
# Rows are selected by short district name only; the province is not part of the match.
in_duplicated_district = df['short_district'].isin(duplicated_districts)
duplicated_district_keys = df.loc[in_duplicated_district, 'district_key'].unique().tolist()
duplicated_district_province_keys = df.loc[in_duplicated_district, 'province_key'].unique().tolist()

In [13]:
# Show the province keys and district keys collected for the duplicated district names.
print('Provinces:', duplicated_district_province_keys)
print('Districts:', duplicated_district_keys)

Provinces: ['hatinh', 'tiengiang', 'travinh', 'dongthap', 'haugiang']
Districts: ['kyanh', 'cailay', 'duyenhai', 'caolanh', 'hongngu', 'longmy']


### Create key lists for ward names that are duplicated within a district

In [14]:
# Count rows per (province, district, short ward name); a count above one
# means the same short ward name appears more than once in that district.
ward_name_columns = ['long_province', 'long_district', 'short_ward']
ward_count = df[ward_name_columns].value_counts().reset_index()
df_duplicated_wards = ward_count[ward_count['count'] > 1].sort_values(by=ward_name_columns)

In [15]:
# Keep the rows of df whose (province, district, short ward name) combination
# is one of the duplicated combinations. The three columns are matched
# together as a single key: three independent isin() filters would also keep
# rows that combine a province, a district and a ward name coming from
# different duplicated combinations.
duplicated_ward_columns = ['long_province', 'long_district', 'short_ward']
duplicated_ward_index = pd.MultiIndex.from_frame(df_duplicated_wards[duplicated_ward_columns])
is_duplicated_ward = pd.MultiIndex.from_frame(df[duplicated_ward_columns]).isin(duplicated_ward_index)
df_duplicated_wards = df[is_duplicated_ward]

In [16]:
# Display the rows selected as duplicated wards for a visual check.
df_duplicated_wards

Unnamed: 0,province,district,ward,long_province,long_district,long_ward,short_district,short_ward,province_english,district_english,...,long_ward_english,short_district_english,short_ward_english,district_level,ward_level,district_level_english,ward_level_english,province_key,district_key,ward_key
176,Hà Nội,Gia Lâm,Thị trấn Yên Viên,Thành phố Hà Nội,Huyện Gia Lâm,Thị trấn Yên Viên,Gia Lâm,Yên Viên,Ha Noi,Gia Lam,...,Yen Vien Town,Gia Lam,Yen Vien,Huyện,Thị trấn,District,Town,hanoi,gialam,yenvien
178,Hà Nội,Gia Lâm,Xã Yên Viên,Thành phố Hà Nội,Huyện Gia Lâm,Xã Yên Viên,Gia Lâm,Yên Viên,Ha Noi,Gia Lam,...,Yen Vien Commune,Gia Lam,Yen Vien,Huyện,Xã,District,Commune,hanoi,gialam,yenvien
1479,Lai Châu,Mường Tè,Thị trấn Mường Tè,Tỉnh Lai Châu,Huyện Mường Tè,Thị trấn Mường Tè,Mường Tè,Mường Tè,Lai Chau,Muong Te,...,Muong Te Town,Muong Te,Muong Te,Huyện,Thị trấn,District,Town,laichau,muongte,muongte
1484,Lai Châu,Mường Tè,Xã Mường Tè,Tỉnh Lai Châu,Huyện Mường Tè,Xã Mường Tè,Mường Tè,Mường Tè,Lai Chau,Muong Te,...,Muong Te Commune,Muong Te,Muong Te,Huyện,Xã,District,Commune,laichau,muongte,muongte
1706,Sơn La,Mai Sơn,Thị trấn Hát Lót,Tỉnh Sơn La,Huyện Mai Sơn,Thị trấn Hát Lót,Mai Sơn,Hát Lót,Son La,Mai Son,...,Hat Lot Town,Mai Son,Hat Lot,Huyện,Thị trấn,District,Town,sonla,maison,hatlot
1716,Sơn La,Mai Sơn,Xã Hát Lót,Tỉnh Sơn La,Huyện Mai Sơn,Xã Hát Lót,Mai Sơn,Hát Lót,Son La,Mai Son,...,Hat Lot Commune,Mai Son,Hat Lot,Huyện,Xã,District,Commune,sonla,maison,hatlot
1882,Yên Bái,Trạm Tấu,Thị trấn Trạm Tấu,Tỉnh Yên Bái,Huyện Trạm Tấu,Thị trấn Trạm Tấu,Trạm Tấu,Trạm Tấu,Yen Bai,Tram Tau,...,Tram Tau Town,Tram Tau,Tram Tau,Huyện,Thị trấn,District,Town,yenbai,tramtau,tramtau
1887,Yên Bái,Trạm Tấu,Xã Trạm Tấu,Tỉnh Yên Bái,Huyện Trạm Tấu,Xã Trạm Tấu,Trạm Tấu,Trạm Tấu,Yen Bai,Tram Tau,...,Tram Tau Commune,Tram Tau,Tram Tau,Huyện,Xã,District,Commune,yenbai,tramtau,tramtau
1918,Yên Bái,Yên Bình,Thị trấn Yên Bình,Tỉnh Yên Bái,Huyện Yên Bình,Thị trấn Yên Bình,Yên Bình,Yên Bình,Yen Bai,Yen Binh,...,Yen Binh Town,Yen Binh,Yen Binh,Huyện,Thị trấn,District,Town,yenbai,yenbinh,yenbinh
1937,Yên Bái,Yên Bình,Xã Yên Bình,Tỉnh Yên Bái,Huyện Yên Bình,Xã Yên Bình,Yên Bình,Yên Bình,Yen Bai,Yen Binh,...,Yen Binh Commune,Yen Binh,Yen Binh,Huyện,Xã,District,Commune,yenbai,yenbinh,yenbinh


In [17]:
# Pickle
# Distinct ward keys and district keys of the duplicated-ward rows, in order of first appearance.
duplicated_ward_keys = df_duplicated_wards['ward_key'].drop_duplicates().tolist()
duplicated_ward_district_keys = df_duplicated_wards['district_key'].drop_duplicates().tolist()

In [18]:
# Show the district keys and ward keys collected for the duplicated wards.
print(f'Districts: {duplicated_ward_district_keys}')
print(f'Wards: {duplicated_ward_keys}')

Districts: ['gialam', 'muongte', 'maison', 'tramtau', 'yenbinh', 'chilang', 'dinhlap', 'damha', 'dahuoai', 'duchoa', 'tracu', 'thapmuoi', 'caolanh', 'vinhthuan', 'longphu', 'trande', 'phuoclong', 'camau', 'thoibinh']
Wards: ['yenvien', 'muongte', 'hatlot', 'tramtau', 'yenbinh', 'chilang', 'dinhlap', 'damha', 'madaguoi', 'hiephoa', 'dinhan', 'myan', 'mytho', 'vinhthuan', 'longphu', 'lichhoithuong', 'phuoclong', 'tanthanh', 'thoibinh']


### Sort province_key to prioritize some provinces
- Some provinces have districts or wards whose keyword is the same as another province's keyword.
- Workflow: run module_testing to check whether any address is parsed wrongly → use ChatGPT to sort the keywords → manually add any new wrong keyword that is found.

In [19]:
# Added after testing the module. Examples of addresses and the province key
# they were involved with:
# 'Huyện Quang Bình, Tỉnh Hà Giang' -> quangbinh
# 'Huyện Phù Yên, Tỉnh Sơn La' -> phuyen
# 'Huyện Văn Giang, Tỉnh Hưng Yên' -> angiang
# 'Huyện Quảng Ninh, Tỉnh Quảng Bình' -> quangninh
# 'Bac Lieu, Hoa Binh District' -> hoabinh

# Sort rank per province key. Several keys share a rank and some ranks are unused.
# NOTE(review): 'cambang' does not look like a province key ('caobang' is
# listed separately) - confirm whether it is a typo.
province_key_order = {
    'yenbai': 1,
    'khanhhoa': 2,
    'binhthuan': 3,
    'thainguyen': 4,
    'thaibinh': 5,
    'longan': 5,
    'thanhhoa': 6,
    'hagiang': 6,
    'quangbinh': 7,
    'quangninh': 8,
    'quangngai': 8,
    'quangtri': 9,
    'binhdinh': 10,
    'binhduong': 11,
    'binhphuoc': 12,
    'dongnai': 13,
    'lamdong': 14,
    'phuyen': 15,
    'vinhlong': 16,
    'soctrang': 18,
    'bacgiang': 19,
    'camau': 20,
    'angiang': 21,
    'haiphong': 22,
    'hoabinh': 23,
    'ninhbinh': 25,
    'cambang': 26,
    'backan': 27,
    'caobang': 28,
}

# Province keys without an entry get rank 0, so they come first in the ascending sort.
df['province_key_order'] = df['province_key'].map(province_key_order).fillna(0)
df = df.sort_values(by='province_key_order')

### Add alias province_keyword
- We should not add "hn" as an alias because it would cause many wrong matches. Instead, I replace `\bhn\b` with `\bha noi\b` in the address; this step is in `parse.py`.

In [20]:
# Every column whose name contains "province", one row per distinct combination.
province_columns = [col for col in df.columns if 'province' in col]
df_province = df[province_columns].drop_duplicates()

# (province_english, alias province_key) pairs to register.
province_alias_keys = [
    ('Ho Chi Minh', 'hcm')
]

for province_english, province_key in province_alias_keys:
    df_province = add_province_key(df_province, province_english, province_key)

### Add alias district_keyword
Many districts have been renamed or merged with other districts. For instance: Quan 9 > Thanh pho Thu Duc.

Use `find_district_alias_keywords.ipynb` to create a list of tuples.

In [21]:
# province_english plus every column whose name contains "district",
# one row per distinct combination.
district_columns = [col for col in df.columns if 'district' in col]
df_district = df[['province_english'] + district_columns].drop_duplicates()

# (province_english, district_english, alias district_key) triples to register.
district_alias_keys = [
    ('An Giang', 'Chau Thanh', 'hueduc'),
    ('Ba Ria - Vung Tau', 'Phu My', 'tanthanh'),
    ('Bac Kan', 'Ba Be', 'chora'),
    ('Bac Lieu', 'Bac Lieu', 'minhhai'),
    ('Binh Duong', 'Tan Uyen', 'chauthanh'),
    ('Binh Duong', 'Thuan An', 'laithieu'),
    ('Binh Phuoc', 'Phuoc Long', 'phuocbinh'),
    ('Ca Mau', 'Dam Doi', 'ngochien'),
    ('Cao Bang', 'Quang Hoa', 'phuchoa'),
    ('Cao Bang', 'Quang Hoa', 'quanguyen'),
    ('Cao Bang', 'Ha Quang', 'thongnong'),
    ('Cao Bang', 'Trung Khanh', 'tralinh'),
    ('Dien Bien', 'Muong Lay', 'laichau'),
    ('Dien Bien', 'Muong Cha', 'muonglay'),
    ('Dong Nai', 'Vinh Cuu', 'vinhan'),
    ('Dong Thap', 'Lap Vo', 'thanhhung'),
    ('Ha Nam', 'Phu Ly', 'hanam'),
    ('Ha Noi', 'Soc Son', 'daphuc'),
    ('Ha Noi', 'Soc Son', 'kimanh'),
    ('Hai Phong', 'Cat Hai', 'catba'),
    ('Hau Giang', 'Vi Thanh', 'mythanh'),
    ('Hau Giang', 'Nga Bay', 'tanhiep'),
    ('Hoa Binh', 'Hoa Binh', 'kyson'),
    ('Khanh Hoa', 'Dien Khanh', 'khanhxuong'),
    ('Kien Giang', 'Kien Luong', 'hatien'),
    ('Lao Cai', 'Lao Cai', 'camduong'),
    ('Nam Dinh', 'Nam Dinh', 'myloc'),
    ('Ninh Binh', 'Hoa Lu', 'giakhanh'),
    ('Ninh Binh', 'Nho Quan', 'hoanglong'),
    ('Ninh Binh', 'Yen Mo', 'tamdiep'),
    ('Phu Tho', 'Cam Khe', 'songthao'),
    ('Quang Nam', 'Nam Giang', 'giang'),
    ('Quang Ngai', 'Tra Bong', 'taytra'),
    ('Quang Ninh', 'Ha Long', 'honggai'),
    ('Quang Ninh', 'Van Don', 'campha'),
    ('Quang Ninh', 'Mong Cai', 'haininh'),
    ('Quang Ninh', 'Quang Yen', 'yenhung'),
    ('Quang Ninh', 'Ha Long', 'hoanhbo'),
    ('Tay Ninh', 'Hoa Thanh', 'phukhuong'),
    ('Thanh Hoa', 'Dong Son', 'dongthieu'),
    ('Thanh Hoa', 'Yen Dinh', 'thieuyen'),
    ('Thanh Hoa', 'Nghi Son', 'tinhgia'),
    ('Ho Chi Minh', 'Can Gio', 'duyenhai'),
    ('Ho Chi Minh', 'Thu Duc', 'quan2'),
    ('Ho Chi Minh', 'Thu Duc', 'quan9'),
    # ('Ho Chi Minh', 'Thu Duc', 'thuduc(quan)'),
    ('Tra Vinh', 'Cang Long', 'chauthanhdong'),
    ('Vinh Long', 'Long Ho', 'cainhum'),
    ('Vinh Long', 'Long Ho', 'chauthanhtay'),
    ('Ben Tre', 'Mo Cay Nam', 'mocay'),
    ('Binh Thuan', 'Ham Thuan Nam', 'hamthuan'),
    ('Ha Noi', 'Nam Tu Liem', 'tuliem'),
    ('Quang Nam', 'Nam Tra My', 'tramy'),
    ('Tien Giang', 'Go Cong Tay', 'gocong'),
]

for province_english, district_english, district_key in district_alias_keys:
    df_district = add_district_key(df_district, province_english, district_english, district_key)

Some district keys contain "quan" followed by a number. We make a copy of those rows and replace "quan" with "district".

In [22]:
# Rows whose district key contains "quan" followed by one or two digits.
has_numbered_quan = df_district['district_key'].str.contains(r'quan\d{1,2}')
# Duplicate those rows with "quan" swapped for "district" in the key, then append them.
hcm_districts = df_district[has_numbered_quan].assign(
    district_key=lambda rows: rows['district_key'].str.replace('quan', 'district')
)
df_district = pd.concat([df_district, hcm_districts])

### Create alias ward_key
Some ward keys contain "phuong" followed by a number. We make a copy of those rows and replace "phuong" with "ward".


In [23]:
# province_english, district_english and every column whose name contains
# "ward", one row per distinct combination.
ward_columns = [col for col in df.columns if 'ward' in col]
df_ward = df[['province_english', 'district_english'] + ward_columns].drop_duplicates()

In [24]:
# Rows whose ward key contains "phuong" followed by one or two digits
# (missing keys are treated as empty strings for the match).
has_numbered_phuong = df_ward['ward_key'].fillna('').str.contains(r'phuong\d{1,2}')
# Duplicate those rows with "phuong" swapped for "ward" in the key, then append them.
number_wards = df_ward[has_numbered_phuong].assign(
    ward_key=lambda rows: rows['ward_key'].str.replace('phuong', 'ward')
)
df_ward = pd.concat([df_ward, number_wards])

### Create province_keys lists
When searching for a province_key in an address, we prioritize provinces whose key does not appear in any district_key or ward_key.

In [25]:
province_keys = df_province['province_key'].tolist()

# Pickle
province_keys_1 = []  # key found in neither the district keys nor the ward keys
province_keys_2 = []  # key found in the district keys but not in the ward keys
province_keys_3 = []  # key found in the ward keys

# Each list of keys is rendered as one long string, so the `in` checks below
# are substring checks against the text of all district / ward keys.
district_keys = str(df['district_key'].unique().tolist())
ward_keys = str(df['ward_key'].unique().tolist())
for province_key in province_keys:
    if province_key in ward_keys:
        province_keys_3.append(province_key)
    elif province_key in district_keys:
        province_keys_2.append(province_key)
    else:
        province_keys_1.append(province_key)

### Create mapping dictionaries

In [26]:
# Pickle
# province_key -> first matching row of df_province as a plain dict.
province_map = {
    province_key: df_province[df_province['province_key'] == province_key].to_dict(orient='records')[0]
    for province_key in df_province.province_key.unique()
}

In [27]:
# Pickle
# province_english -> district_key -> district_level_english -> district record.
district_map = {}

for province_english in df_province.province_english.unique():
    # Narrow the frame one level at a time instead of re-filtering df_district.
    province_rows = df_district[df_district.province_english == province_english]
    levels_by_key = {}
    for district_key in province_rows['district_key'].unique():
        key_rows = province_rows[province_rows.district_key == district_key]
        district_levels = {}
        for district_level_english in key_rows['district_level_english'].unique():
            level_rows = key_rows[key_rows.district_level_english == district_level_english]
            record_columns = [col for col in level_rows.columns if 'district' in col]
            # Only the first matching row is kept for each level.
            district_levels[district_level_english] = level_rows[record_columns].to_dict('records')[0]
        levels_by_key[district_key] = district_levels

    # Store the keys ordered from longest to shortest.
    district_map[province_english] = dict(
        sorted(levels_by_key.items(), key=lambda item: len(item[0]), reverse=True)
    )

In [28]:
# Pickle
# province_english -> district_english -> ward_key -> ward_level_english -> ward record.
ward_map = {}

for province_english in df_province.province_english.unique():
    # Narrow the frames one level at a time instead of re-filtering df_ward.
    province_wards = df_ward[df_ward.province_english == province_english]
    province_districts = df_district[df_district.province_english == province_english]
    wards_by_district = {}
    for district_english in province_districts.district_english.unique():
        district_wards = province_wards[province_wards.district_english == district_english]
        levels_by_ward = {}
        for ward_key in district_wards.ward_key.unique():
            key_rows = district_wards[district_wards.ward_key == ward_key]
            ward_levels = {}
            for ward_level_english in key_rows.ward_level_english.unique():
                level_rows = key_rows[key_rows.ward_level_english == ward_level_english]
                record_columns = [col for col in level_rows.columns if 'ward' in col]
                # Only the first matching row is kept for each level.
                ward_levels[ward_level_english] = level_rows[record_columns].to_dict('records')[0]
            levels_by_ward[ward_key] = ward_levels
        # Longest keys first; str() lets the length be taken even when a key is not a string.
        wards_by_district[district_english] = dict(
            sorted(levels_by_ward.items(), key=lambda item: len(str(item[0])), reverse=True)
        )
    ward_map[province_english] = wards_by_district

### Create double check dictionaries
Some addresses cause a wrong mapping because their ward_key is the same as another province's province_key.

In [29]:
double_check_provinces = {'vinhphuc': ['hagiang', 'thanhhoa'],
 'thaibinh': ['hoabinh', 'longan'],
 'binhthuan': ['thainguyen', 'quangngai', 'binhdinh'],
 'lamdong': ['haiphong'],
 'vinhlong': ['haiphong'],
 'thainguyen': ['thaibinh'],
 'tayninh': ['thaibinh'],
 'angiang': ['ninhbinh'],
 'sonla': ['ninhbinh', 'quangngai', 'khanhhoa'],
 'khanhhoa': ['ninhbinh', 'angiang'],
 'dienbien': ['thanhhoa'],
 'thanhhoa': ['quangbinh', 'binhphuoc'],
 'kiengiang': ['quangbinh'],
 'haiduong': ['quangtri'],
 'quangnam': ['phuyen'],
 'quangtri': ['lamdong'],
 'quangngai': ['lamdong'],
 'dongnai': ['lamdong'],
 'phutho': ['binhduong', 'angiang'],
 'longan': ['dongnai', 'vinhlong', 'angiang'],
 'binhphuoc': ['vinhlong', 'angiang'],
 'hungyen': ['kiengiang']}

Some addresses cause a wrong mapping because their ward_key is the same as another district's district_key.

In [30]:
double_check_districts = {'unghoa': ['caugiay', 'chuongmy'],
 'thanhtri': ['hoangmai'],
 'thanhxuan': ['socson'],
 'thanhoai': ['thanhtri'],
 'muongla': ['phuyen', 'songma', 'sopcop'],
 'tranyen': ['lucyen', 'yenbinh'],
 'uongbi': ['mongcai'],
 'halong': ['vandon'],
 'haiduong': ['binhgiang'],
 'phucu': ['hungyen'],
 'dongson': ['thanhhoa', 'bimson'],
 'thanhhoa': ['nhuxuan'],
 'kyson': ['tanky'],
 'vinh': ['anhson', 'yenthanh', 'hoangmai'],
 'huongthuy': ['hue', 'hue', 'hue'],
 'giang': ['thangbinh', 'bactramy', 'nuithanh'],
 'tramy': ['bactramy'],
 'sontinh': ['sontay'],
 'ducpho': ['moduc'],
 'iapa': ['chuse'],
 'krongbuk': ['krongpac'],
 'hoathanh': ['chauthanh'],
 'thongnhat': ['bienhoa'],
 'tanphu': ['thuduc', 'quan7'],
 'tanan': ['canduoc'],
 'tanphuoc': ['gocongdong'],
 'cainhum': ['mangthit'],
 'chauphu': ['chaudoc', 'chaudoc'],
 'anminh': ['chauthanh', 'uminhthuong']}

In [31]:
# All objects are stored as one tuple; a reader has to unpack them in this same order.
parse_data = (
    duplicated_district_keys,
    duplicated_district_province_keys,
    duplicated_ward_keys,
    duplicated_ward_district_keys,
    province_keys_1,
    province_keys_2,
    province_keys_3,
    province_map,
    district_map,
    ward_map,
    double_check_provinces,
    double_check_districts,
)
with open('../vietadminunits/data/parse.pkl', 'wb') as pickle_file:
    pickle.dump(parse_data, pickle_file)