# Build parse data

In [1]:
import numpy as np
import pandas as pd
import re
from unidecode import unidecode
import pickle
import sys
sys.path.append('../../vietadminunits')
from utils import to_alphanumeric, to_key

## Functions

In [2]:
def add_province_key(df_province, province_english, province_key):
    """Return ``df_province`` with one extra alias row for a province.

    The first row whose ``province_english`` matches is duplicated and its
    ``province_key`` is replaced by the alias, so the alias resolves to the
    same province record as the original key.

    Args:
        df_province: Frame with at least ``province_english`` and
            ``province_key`` columns.
        province_english: English name of the province to alias.
        province_key: Alias key to register for that province.

    Returns:
        A new DataFrame: the input rows followed by the alias row. The input
        frame is left untouched.

    Raises:
        ValueError: If the province is unknown, or if the alias is already
            registered for that province.
    """
    is_province = df_province.province_english == province_english
    if not is_province.any():
        raise ValueError(f'{province_english} is not exist in province_english')
    if (is_province & (df_province.province_key == province_key)).any():
        raise ValueError(f'{province_key} is exist in province_key')

    # Copy before assigning: writing a column into the filtered slice directly
    # raises SettingWithCopyWarning and depends on view/copy semantics.
    df_new = df_province.loc[is_province].head(1).copy()
    df_new['province_key'] = province_key
    return pd.concat([df_province, df_new])

In [3]:
def add_district_key(df_district, province_english, district_english, district_key):
    """Return ``df_district`` with one extra alias row for a district.

    The first row matching both ``province_english`` and ``district_english``
    is duplicated and its ``district_key`` is replaced by the alias.

    Args:
        df_district: Frame with at least ``province_english``,
            ``district_english`` and ``district_key`` columns.
        province_english: English name of the province the district is in.
        district_english: English name of the district to alias.
        district_key: Alias key to register for that district.

    Returns:
        A new DataFrame: the input rows followed by the alias row. The input
        frame is left untouched.

    Raises:
        ValueError: If the province is unknown, if the district is unknown
            within that province, or if the alias is already registered for
            that district.
    """
    is_province = df_district.province_english == province_english
    is_district = is_province & (df_district.district_english == district_english)
    if not is_province.any():
        raise ValueError(f'{province_english} is not exist in province_english')
    if not is_district.any():
        raise ValueError(f'{district_english} is not exist in district_english of {province_english}')
    if (is_district & (df_district.district_key == district_key)).any():
        raise ValueError(f'{district_key} is exist in district_key of {province_english}, {district_english}')

    # Copy before assigning: writing a column into the filtered slice directly
    # raises SettingWithCopyWarning and depends on view/copy semantics.
    df_new = df_district.loc[is_district].head(1).copy()
    df_new['district_key'] = district_key
    return pd.concat([df_district, df_new])

In [4]:
def add_ward_key(df_ward, province_english, district_english, ward_english, ward_key):
    """Return ``df_ward`` with one extra alias row for a ward.

    The first row matching ``province_english``, ``district_english`` and
    ``ward_english`` is duplicated and its ``ward_key`` is replaced by the
    alias.

    Args:
        df_ward: Frame with at least ``province_english``,
            ``district_english``, ``ward_english`` and ``ward_key`` columns.
        province_english: English name of the province the ward is in.
        district_english: English name of the district the ward is in.
        ward_english: English name of the ward to alias.
        ward_key: Alias key to register for that ward.

    Returns:
        A new DataFrame: the input rows followed by the alias row. The input
        frame is left untouched.

    Raises:
        ValueError: If the province, the district within that province, or the
            ward within that district is unknown, or if the alias is already
            registered for that ward.
    """
    is_province = df_ward.province_english == province_english
    is_district = is_province & (df_ward.district_english == district_english)
    is_ward = is_district & (df_ward.ward_english == ward_english)

    if not is_province.any():
        raise ValueError(f'{province_english} is not exist in province_english')
    if not is_district.any():
        raise ValueError(f'{district_english} is not exist in district_english of {province_english}')
    if not is_ward.any():
        raise ValueError(f'{ward_english} is not exist in ward_english of {province_english}, {district_english}')
    if (is_ward & (df_ward.ward_key == ward_key)).any():
        raise ValueError(f'{ward_key} is exist in ward_key of {province_english}, {district_english}, {ward_english}')

    # Copy before assigning: writing a column into the filtered slice directly
    # raises SettingWithCopyWarning and depends on view/copy semantics.
    df_new = df_ward.loc[is_ward].head(1).copy()
    df_new['ward_key'] = ward_key
    return pd.concat([df_ward, df_new])

## Base dataset

In [5]:
df = pd.read_csv('../../data/output/vietnam_administrative_units.csv')

In [6]:
df.head()

Unnamed: 0,province,district,ward,long_province,long_district,long_ward,short_district,short_ward,province_english,district_english,ward_english,long_province_english,long_district_english,long_ward_english,short_district_english,short_ward_english,district_level,ward_level,district_level_english,ward_level_english
0,Hà Nội,Ba Đình,Phúc Xá,Thành phố Hà Nội,Quận Ba Đình,Phường Phúc Xá,Ba Đình,Phúc Xá,Ha Noi,Ba Dinh,Phuc Xa,Ha Noi City,Ba Dinh District,Phuc Xa Ward,Ba Dinh,Phuc Xa,Quận,Phường,District,Ward
1,Hà Nội,Ba Đình,Trúc Bạch,Thành phố Hà Nội,Quận Ba Đình,Phường Trúc Bạch,Ba Đình,Trúc Bạch,Ha Noi,Ba Dinh,Truc Bach,Ha Noi City,Ba Dinh District,Truc Bach Ward,Ba Dinh,Truc Bach,Quận,Phường,District,Ward
2,Hà Nội,Ba Đình,Vĩnh Phúc,Thành phố Hà Nội,Quận Ba Đình,Phường Vĩnh Phúc,Ba Đình,Vĩnh Phúc,Ha Noi,Ba Dinh,Vinh Phuc,Ha Noi City,Ba Dinh District,Vinh Phuc Ward,Ba Dinh,Vinh Phuc,Quận,Phường,District,Ward
3,Hà Nội,Ba Đình,Cống Vị,Thành phố Hà Nội,Quận Ba Đình,Phường Cống Vị,Ba Đình,Cống Vị,Ha Noi,Ba Dinh,Cong Vi,Ha Noi City,Ba Dinh District,Cong Vi Ward,Ba Dinh,Cong Vi,Quận,Phường,District,Ward
4,Hà Nội,Ba Đình,Liễu Giai,Thành phố Hà Nội,Quận Ba Đình,Phường Liễu Giai,Ba Đình,Liễu Giai,Ha Noi,Ba Dinh,Lieu Giai,Ha Noi City,Ba Dinh District,Lieu Giai Ward,Ba Dinh,Lieu Giai,Quận,Phường,District,Ward


In [7]:
df.shape

(10547, 20)

In [8]:
# Lookup keys built from the short names; the second argument passed to
# to_key is 1, 2 and 3 for the province, district and ward columns.
df['province_key'] = df['province'].apply(to_key, args=(1,))
df['district_key'] = df['short_district'].apply(to_key, args=(2,))
df['ward_key'] = df['short_ward'].apply(to_key, args=(3,))

# Alphanumeric forms of the long Vietnamese names.
df['province_alphanumeric'] = df['long_province'].apply(to_alphanumeric)
df['district_alphanumeric'] = df['long_district'].apply(to_alphanumeric)
df['ward_alphanumeric'] = df['long_ward'].apply(to_alphanumeric)

# Alphanumeric forms of the long English names.
df['province_alphanumeric_english'] = df['long_province_english'].apply(to_alphanumeric)
df['district_alphanumeric_english'] = df['long_district_english'].apply(to_alphanumeric)
df['ward_alphanumeric_english'] = df['long_ward_english'].apply(to_alphanumeric)

# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# the in-place form acts on a temporary Series, is deprecated as chained
# assignment and leaves `df` unchanged under pandas copy-on-write.
df['district_level_english'] = df['district_level_english'].fillna('')
df['ward_level_english'] = df['ward_level_english'].fillna('')

### Create keyword lists that are duplicated district_keyword

Eg: Hà Tĩnh has Huyện Kỳ Anh and Thị xã Kỳ Anh.

In [9]:
district_count = df[['province', 'long_district', 'district_key']].drop_duplicates()[['province', 'district_key']].value_counts().reset_index()
duplicated_districts = district_count[district_count['count'] > 1]['district_key'].tolist()

In [10]:
df_duplicated_districts = df[df.district_key.isin(duplicated_districts)][['long_province', 'long_district' ,'province_key','district_key']].drop_duplicates()

In [11]:
df_duplicated_districts

Unnamed: 0,long_province,long_district,province_key,district_key
5835,Tỉnh Hà Tĩnh,Huyện Kỳ Anh,hatinh,kyanh
5867,Tỉnh Hà Tĩnh,Thị xã Kỳ Anh,hatinh,kyanh
9160,Tỉnh Tiền Giang,Thị xã Cai Lậy,tiengiang,cailay
9213,Tỉnh Tiền Giang,Huyện Cai Lậy,tiengiang,cailay
9552,Tỉnh Trà Vinh,Huyện Duyên Hải,travinh,duyenhai
9559,Tỉnh Trà Vinh,Thị xã Duyên Hải,travinh,duyenhai
9673,Tỉnh Đồng Tháp,Thành phố Cao Lãnh,dongthap,caolanh
9697,Tỉnh Đồng Tháp,Thành phố Hồng Ngự,dongthap,hongngu
9713,Tỉnh Đồng Tháp,Huyện Hồng Ngự,dongthap,hongngu
9748,Tỉnh Đồng Tháp,Huyện Cao Lãnh,dongthap,caolanh


My idea: We will use their wards to decide their district level.

To make sure this idea is valid, I will check whether they have some wards with the same ward_key

In [12]:
# For every district_key that maps to two differently named districts, check
# that the two districts do not share any ward_key.
for district_key in duplicated_districts:
    # Restrict to rows carrying this key so wards of unrelated districts that
    # merely share a long name are not pulled in.
    df_same_key = df[df.district_key == district_key]
    long_districts = df_same_key['long_district'].unique().tolist()
    long_district_a = long_districts[0]
    long_district_b = long_districts[1]
    ward_key_a = df_same_key[df_same_key.long_district == long_district_a]['ward_key'].dropna().unique().tolist()
    ward_key_b = df_same_key[df_same_key.long_district == long_district_b]['ward_key'].dropna().unique().tolist()
    # Intersect the two ward lists. The previous version intersected two
    # always-empty placeholder lists, so the check could never report anything.
    common = sorted(set(ward_key_a) & set(ward_key_b))
    if common:
        print(district_key, 'has duplicated ward_key')

Nice! We also need a default option for addresses that contain no ward; I use Google Trends to decide the default district level.

In [13]:
# Pickle
duplicated_district_province_keys = df_duplicated_districts['province_key'].unique().tolist()

# Use Google Trend and search to decide level
duplicated_district_keys = {
    'kyanh': {'default':'Town'},
    'cailay': {'default':'Town'},
    'duyenhai': {'default':'District'},
    'caolanh': {'default':'City'},
    'hongngu': {'default':'City'},
    'longmy': {'default':'District'}
}

# Attach, for every duplicated district_key, the ward_keys found under each
# district level: {'levels': {district_level_english: [ward_key, ...]}}.
for district_key in duplicated_districts:
    if district_key not in duplicated_district_keys:
        # Same exception type as the bare lookup, but says what to do about it.
        raise KeyError(f'{district_key} has no default level; add it to duplicated_district_keys')
    df_temp = df[df.district_key == district_key]
    level_data = {}
    for district_level_english in df_temp.district_level_english.unique().tolist():
        is_level = df_temp.district_level_english == district_level_english
        level_data[district_level_english] = df_temp.loc[is_level, 'ward_key'].unique().tolist()
    # Assigned once per district rather than on every inner iteration.
    duplicated_district_keys[district_key]['levels'] = level_data

In [14]:
print('Provinces:', duplicated_district_province_keys, '\n')
print('Districts:', duplicated_district_keys)

Provinces: ['hatinh', 'tiengiang', 'travinh', 'dongthap', 'haugiang'] 

Districts: {'kyanh': {'default': 'Town', 'levels': {'District': ['kyxuan', 'kybac', 'kyphu', 'kyphong', 'kytien', 'kygiang', 'kydong', 'kykhang', 'kyvan', 'kytrung', 'kytho', 'kytay', 'kythuong', 'kyhai', 'kythu', 'kychau', 'kytan', 'lamhop', 'kyson', 'kylac'], 'Town': ['hungtri', 'kyninh', 'kyloi', 'kyha', 'kytrinh', 'kythinh', 'kyhoa', 'kyphuong', 'kylong', 'kylien', 'kynam']}}, 'cailay': {'default': 'Town', 'levels': {'Town': ['phuong1', 'phuong2', 'phuong3', 'phuong4', 'phuong5', 'myphuoctay', 'myhanhdong', 'myhanhtrung', 'tanphu', 'tanbinh', 'tanhoi', 'nhimy', 'nhiquy', 'thanhhoa', 'phuquy', 'longkhanh'], 'District': ['thanhloc', 'mythanhbac', 'phucuong', 'mythanhnam', 'phunhuan', 'binhphu', 'camson', 'phuan', 'mylong', 'longtien', 'hiepduc', 'longtrung', 'hoixuan', 'tanphong', 'tambinh', 'nguhiep']}}, 'duyenhai': {'default': 'District', 'levels': {'District': ['donxuan', 'donchau', 'longthanh', 'longkhanh', '

At this step, we collect these data to save to module data:
- `duplicated_district_province_keys`
- `duplicated_district_keys`

## Find district alias keywords

Data source: [Danh sách các đơn vị hành chính cấp huyện không còn tồn tại](https://vi.wikipedia.org/wiki/Danh_s%C3%A1ch_%C4%91%C6%A1n_v%E1%BB%8B_h%C3%A0nh_ch%C3%ADnh_c%E1%BA%A5p_huy%E1%BB%87n_c%E1%BB%A7a_Vi%E1%BB%87t_Nam#Danh_s%C3%A1ch_c%C3%A1c_%C4%91%C6%A1n_v%E1%BB%8B_h%C3%A0nh_ch%C3%ADnh_c%E1%BA%A5p_huy%E1%BB%87n_kh%C3%B4ng_c%C3%B2n_t%E1%BB%93n_t%E1%BA%A1i)

In [None]:
def check_containing_keyword(district, change):
    """Return 1 if ``district`` occurs exactly twice in ``change``, else 0.

    Occurrences are counted with ``str.count`` (non-overlapping substrings).
    """
    return int(change.count(district) == 2)

In [None]:
df_district_change = pd.read_csv('../../data/input/district_change.csv')

In [None]:
df_district_change.dropna(inplace=True)

In [None]:
df_district_change['containing_keyword_two_times'] = df_district_change.apply(lambda x: check_containing_keyword(x.old_district, x.change), axis=1)

In [None]:
df_change = df_district_change[(df_district_change.change.fillna('').str.contains('sáp nhập|đổi tên')) & ~(df_district_change.change.fillna('').str.contains('một phần'))].copy()

In [None]:
df_change

In [None]:
df_split = df_district_change[df_district_change.containing_keyword_two_times==1].copy()

In [None]:
df_split

In [None]:
# This is a particular case that is duplicated district_key
df_split = df_split[df_split.old_district!='Tân Uyên'].copy()

In [None]:
# Use Google Trend to decide new district
df_split.loc[df_split.old_district=='Mỏ Cày', 'district_english'] = 'Mo Cay Nam'
df_split.loc[df_split.old_district=='Hàm Thuận', 'district_english'] = 'Ham Thuan Nam'
df_split.loc[df_split.old_district=='Từ Liêm', 'district_english'] = 'Nam Tu Liem'
df_split.loc[df_split.old_district=='Trà My', 'district_english'] = 'Nam Tra My'
df_split.loc[df_split.old_district=='Gò Công', 'district_english'] = 'Go Cong Tay'

In [None]:
df_split

In [None]:
def find_new_district_english(text):
    """Return ``district_english`` of ``parse_address(text, 2)``.

    NOTE(review): ``parse_address`` is not imported anywhere in the visible
    part of this notebook; confirm where it comes from so the cell runs on a
    fresh kernel.
    """
    return parse_address(text, 2).district_english

In [None]:
def prepare_address(province, change):
    """Build an address string from a change description and its province.

    Everything up to and including the last ``' thành '`` on a line of
    ``change`` is removed (greedy match); the remainder is joined to
    ``province`` with ``', '``.
    """
    new_unit = re.sub('.* thành ', '', change)
    return ', '.join((new_unit, province))

In [None]:
def create_province_english(text):
    """Return ``province_english`` of ``parse_address(text, 1)``.

    NOTE(review): ``parse_address`` is not imported anywhere in the visible
    part of this notebook; confirm where it comes from so the cell runs on a
    fresh kernel.
    """
    return parse_address(text, 1).province_english

In [None]:
df_change['address'] = df_change.apply(lambda row: prepare_address(row.province, row.change), axis=1)

In [None]:
df_change

In [None]:
df_change = df_change[df_change.old_district!='Thủ Đức (quận)'].copy()

In [None]:
df_change['district_english'] = df_change.address.apply(find_new_district_english)

After doing some research, I found that rows where district_english is `None` are fine to drop, because those districts were either moved to another province or had their change reverted.

In [None]:
df_change.dropna(inplace=True)

In [None]:
df_change.dropna(inplace=True)
df_all = pd.concat([df_change, df_split])
df_all['province_english'] = df_all.province.apply(create_province_english)
df_all['district_key'] = df_all.old_district.apply(to_key, args=(2,))

In [None]:
df_all

In [None]:
district_alias_keys = [tuple(i) for i in  df_all[['province_english', 'district_english', 'district_key']].values.tolist()]

In [None]:
district_alias_keys

In [None]:
df_split['old_district_key'] = df_split['old_district'].apply(to_key, args=(2,))

In [None]:
# Pickle
# For each old (split) district key: every current district_key that contains
# it, mapped to the ward_keys listed under that district_key.
half_district_keys = {}
for old_district_key in df_split['old_district_key'].unique().tolist():
    matching_rows = df[df.district_key.str.contains(old_district_key)]
    half_district_keys[old_district_key] = {
        current_key: matching_rows.loc[matching_rows.district_key == current_key, 'ward_key'].unique().tolist()
        for current_key in matching_rows.district_key.unique().tolist()
    }

In [None]:
half_district_keys

After this step, we got `half_district_keys` for module data

### Create keyword lists that are duplicated ward_keyword

In [None]:
ward_count = df[['long_province', 'long_district', 'short_ward']].value_counts().reset_index()
df_duplicated_wards = ward_count[ward_count['count'] > 1].copy()
df_duplicated_wards.sort_values(by=['long_province', 'long_district', 'short_ward'], inplace=True)

In [None]:
# Select the rows of `df` whose (long_province, long_district, short_ward)
# triple is one of the duplicated triples. Three independent `isin` filters
# would also accept rows that combine a province, a district name and a ward
# name taken from *different* duplicated triples.
# NOTE: this cell rebinds `df_duplicated_wards`; re-run the previous cell first
# if this one has to be executed again.
duplicate_key_columns = ['long_province', 'long_district', 'short_ward']
duplicated_triples = pd.MultiIndex.from_frame(df_duplicated_wards[duplicate_key_columns])
is_duplicated_ward = pd.MultiIndex.from_frame(df[duplicate_key_columns]).isin(duplicated_triples)
df_duplicated_wards = df.loc[is_duplicated_ward, ['province', 'long_district', 'ward','ward_level_english', 'district_key', 'ward_key']]

In [None]:
df_duplicated_wards

According to Google Trends, "xã ..." is searched significantly more often than "thị trấn ...". "phường ..." is searched more often than "xã ...", but there is only one ward (phường) in the list.

In [None]:
# Pickle
duplicated_ward_keys = df_duplicated_wards.ward_key.unique().tolist()
duplicated_ward_district_keys = df_duplicated_wards.district_key.unique().tolist()

# Default level per duplicated ward_key: 'Ward' if present among its rows,
# otherwise 'Commune' if present, otherwise 'Town'.
level_priority = ('Ward', 'Commune')
new_data = {}

for ward_key in duplicated_ward_keys:
    is_key = df_duplicated_wards.ward_key == ward_key
    levels = df_duplicated_wards.loc[is_key, 'ward_level_english'].tolist()
    new_data[ward_key] = next((level for level in level_priority if level in levels), 'Town')

duplicated_ward_keys = new_data.copy()

In [None]:
print('Districts:' ,duplicated_ward_district_keys)
print('Wards:', duplicated_ward_keys)

After this step, we got these data for module:
- `duplicated_ward_keys`
- `duplicated_ward_district_keys`

### Sort province_key to prioritize some provinces
- Some provinces have districts or wards whose keyword is the same as another province's keyword.
- We need to run module_testing to check whether any address is parsed wrongly > use ChatGPT to sort the keywords > add keywords manually if we find new wrong ones.

For example:
- 'Huyện Quang Bình, Tỉnh Hà Giang' -> quangbinh
- 'Huyện Phù Yên, Tỉnh Sơn La' -> phuyen
- 'Huyện Văn Giang, Tỉnh Hưng Yên' -> angiang
- 'Huyện Quảng Ninh, Tỉnh Quảng Bình' -> quangninh
- Bac Lieu, Hoa Binh District -> hoabinh

In [None]:
province_keys = df.province_key.unique().tolist()

In [None]:
def find_province_key_match(text):
    """Return the first entry of the global ``province_keys`` contained in ``text``.

    Matching is plain substring containment, tried in list order; returns
    ``None`` when no key matches.
    """
    return next((key for key in province_keys if key in text), None)

In [None]:
match_district = df[df.district_key.str.contains('|'.join(province_keys))][['province', 'district', 'province_key','district_key']].drop_duplicates()
match_ward = df[(~df.ward.isna()) & (df.ward_key.str.contains('|'.join(province_keys)))][['province', 'ward', 'province_key','ward_key']].drop_duplicates()

match_district['match_province_key'] = match_district['district_key'].apply(find_province_key_match)
match_ward['match_province_key'] = match_ward['ward_key'].apply(find_province_key_match)

match_district = match_district[match_district.province_key != match_district.match_province_key]
match_ward = match_ward[match_ward.province_key != match_ward.match_province_key]

In [None]:
match_district

In [None]:
match_ward

In [None]:
# Should not use list(set()) because it will re-order
province_order = []

# District matches are processed before ward matches; within each row the
# owning province is recorded before the province whose key was matched.
for matches in (match_district, match_ward):
    for row in matches.itertuples():
        for key in (row.province_key, row.match_province_key):
            if key not in province_order:
                province_order.append(key)

In [None]:
province_key_order = {}
for index, province_key in enumerate(province_order):
    province_key_order[province_key] = index + 1
    

In [None]:
print(province_key_order)

In [None]:
# Provinces absent from `province_key_order` get 0 and therefore sort first.
df['province_key_order'] = df['province_key'].map(province_key_order).fillna(0)

# Use a stable sort: the default quicksort may reorder rows that tie on
# `province_key_order`, and later cells depend on row order (`.head(1)`,
# `to_dict('records')[0]`, `.unique()`).
df = df.sort_values(by='province_key_order', kind='mergesort')

### Add alias province_keyword
- We should not add "hn" because it will cause many wrong matches. I will replace `\bhn\b` to `\bha noi\b` of the address, this step is in `parse.py`

In [None]:
df_province = df[[col for col in df.columns if 'province' in col]].drop_duplicates()

province_alias_keys = [
    ('Ho Chi Minh', 'hcm')
]

for key in province_alias_keys:
    province_english, province_key = key
    df_province = add_province_key(df_province, province_english, province_key)

### Add alias district_keyword
Many districts have changed their names or have been merged with other districts. For instance: Quan 9 > Thanh pho Thu Duc.

Use `find_district_alias_keywords.ipynb` to create a list of tuples.

In [None]:
df_district = df[['province_english'] + [col for col in df.columns if 'district' in col]].drop_duplicates()

for key in district_alias_keys:
    province_english, district_english, district_key = key
    df_district = add_district_key(df_district, province_english, district_english, district_key)

In [None]:
district_alias_keys

Some districts contain "quan" followed by a number in their keywords. We need to make copies of them and replace "quan" with "district" and with "q.".

In [None]:
# Rows whose district_key contains "quan" followed by one or two digits get two
# extra copies: one with "quan" -> "district" and one with "quan" -> "q.".
numbered_districts = df_district[df_district['district_key'].str.contains(r'quan\d{1,2}')]
english_districts = numbered_districts.assign(
    district_key=numbered_districts['district_key'].str.replace('quan', 'district')
)
hcm_districts = english_districts.assign(
    district_key=english_districts['district_key'].str.replace('district', 'q.')
)
df_district = pd.concat([df_district, english_districts, hcm_districts])

### Create alias ward_key
Some wards contain "phuong" followed by a number in their keywords. We need to make copies of them and replace "phuong" with "ward" and with "p.".


In [None]:
df_ward = df[['province_english', 'district_english'] + [col for col in df.columns if 'ward' in col]].drop_duplicates()

In [None]:
# Rows whose ward_key contains "phuong" followed by one or two digits get two
# extra copies: one with "phuong" -> "ward" and one with "phuong" -> "p.".
numbered_wards = df_ward[df_ward['ward_key'].fillna('').str.contains(r'phuong\d{1,2}')]
english_wards = numbered_wards.assign(
    ward_key=numbered_wards['ward_key'].str.replace('phuong', 'ward')
)
number_wards = english_wards.assign(
    ward_key=english_wards['ward_key'].str.replace('ward', 'p.')
)
df_ward = pd.concat([df_ward, english_wards, number_wards])

### Create province_keys lists
We will prioritize provinces that are not in district_key or ward_key when searching a province_key in address.

In [None]:
province_keys = df_province['province_key'].tolist()

# Pickle
# Tier 1: key appears in neither the district keys nor the ward keys.
# Tier 2: key appears in the district keys but not in the ward keys.
# Tier 3: key appears in the ward keys.
province_keys_1, province_keys_2, province_keys_3 = [], [], []

# Containment is tested against the string form of the whole key list, i.e. a
# province key counts as "in" when it is a substring of that text.
district_keys = str(df.district_key.unique().tolist())
ward_keys = str(df.ward_key.unique().tolist())
for province_key in province_keys:
    if province_key in ward_keys:
        province_keys_3.append(province_key)
    elif province_key in district_keys:
        province_keys_2.append(province_key)
    else:
        province_keys_1.append(province_key)

In [None]:
province_alphanumerics = df.province_alphanumeric.unique().tolist() + df.province_alphanumeric_english.unique().tolist()

After this step we got these data for module:
- `province_keys_1`
- `province_keys_2`
- `province_keys_3`

### Create mapping dictionaries


In [None]:
# Pickle
province_map = {}

# Register every province under its key, its alphanumeric long name and its
# alphanumeric English long name, in that order. Each lookup value maps to the
# first matching row of df_province; a value that occurs in more than one
# column is overwritten by the later column.
for lookup_column in ('province_key', 'province_alphanumeric', 'province_alphanumeric_english'):
    for lookup_value in df_province[lookup_column].unique():
        matching_rows = df_province[df_province[lookup_column] == lookup_value]
        province_map[lookup_value] = matching_rows.to_dict(orient='records')[0]

In [None]:
print(province_map)

In [None]:
# Pickle
# district_map[province_english][district_key][district_level_english] is the
# first matching df_district row, restricted to its district columns.
district_map = {}
district_columns = [col for col in df_district.columns if 'district' in col]

for province_english in df_province.province_english.unique():
    # Narrow the frame once per nesting level instead of re-filtering all of
    # df_district with the full compound condition in every inner iteration.
    province_rows = df_district[df_district.province_english == province_english]
    district_keys = {}
    for district_key in province_rows['district_key'].unique():
        key_rows = province_rows[province_rows.district_key == district_key]
        district_levels = {}
        for district_level_english in key_rows['district_level_english'].unique():
            level_rows = key_rows[key_rows.district_level_english == district_level_english]
            district_levels[district_level_english] = level_rows[district_columns].to_dict('records')[0]
        district_keys[district_key] = district_levels

    # Longest keys first (stable for equal lengths), presumably so that a
    # longer key is tried before a shorter key it contains.
    district_keys = dict(sorted(district_keys.items(), key=lambda item: len(item[0]), reverse=True))

    district_map[province_english] = district_keys

In [None]:
print(district_map['Ho Chi Minh'])

In [None]:
# Pickle
# ward_map[province_english][district_english][ward_key][ward_level_english]
# is the first matching df_ward row, restricted to its ward columns.
ward_map = {}
ward_columns = [col for col in df_ward.columns if 'ward' in col]

for province_english in df_province.province_english.unique():
    # Narrow the frame once per nesting level instead of re-filtering all of
    # df_ward with the full compound condition in every inner iteration.
    province_wards = df_ward[df_ward.province_english == province_english]
    districts = {}
    for district_english in df_district[df_district.province_english==province_english].district_english.unique():
        district_wards = province_wards[province_wards.district_english == district_english]
        wards = {}
        for ward_key in district_wards.ward_key.dropna().unique():
            key_rows = district_wards[district_wards.ward_key == ward_key]
            ward_levels = {}
            for ward_level_english in key_rows.ward_level_english.unique():
                level_rows = key_rows[key_rows.ward_level_english == ward_level_english]
                ward_levels[ward_level_english] = level_rows[ward_columns].to_dict('records')[0]
            wards[ward_key] = ward_levels
        # Longest keys first (stable for equal lengths).
        wards = dict(sorted(wards.items(), key=lambda item: len(str(item[0])), reverse=True))
        districts[district_english] = wards
    ward_map[province_english] = districts

In [None]:
print(ward_map['Ho Chi Minh']['Tan Binh'])

After this step, we got these data for module:
- `province_map`
- `district_map`
- `ward_map`

### Create double check dictionaries
Some addresses are mapped wrongly because their ward_key is the same as another province's province_key.
- Use **sub_2_create_double_check_keywords.ipynb** to get the data.
- Note that after saving this data to module, we can not create the data again.

In [None]:
double_check_provinces = {
    'angiang': ['thainguyen', 'ninhbinh', 'thanhhoa'],
    'hoabinh': ['thainguyen',
    'langson',
    'haiphong',
    'thaibinh',
    'dongnai',
    'vinhlong',
    'dongthap'],
    'binhthuan': ['thainguyen', 'quangngai', 'binhdinh'],
    'thanhhoa': ['langson', 'binhphuoc', 'travinh', 'kiengiang', 'haugiang'],
    'thaibinh': ['langson', 'tayninh'],
    'hagiang': ['thaibinh', 'thanhhoa'],
    'quangbinh': ['thaibinh', 'thanhhoa'],
    'sonla': ['ninhbinh', 'quangngai', 'khanhhoa'],
    'khanhhoa': ['ninhbinh', 'kiengiang'],
    'longan': ['thanhhoa', 'dongnai', 'vinhlong'],
    'quangninh': ['thanhhoa'],
    'vinhlong': ['quangtri'],
    'haiduong': ['quangtri'],
    'haiphong': ['quangtri'],
    'binhduong': ['quangnam', 'quangngai', 'binhdinh'],
    'binhdinh': ['quangnam'],
    'hungyen': ['kiengiang']
}

# double_check_provinces = {} # testing

Some addresses are mapped wrongly because their ward_key is the same as another district's district_key.

In [None]:
double_check_districts = {'unghoa': ['caugiay', 'chuongmy'],
 'thanhtri': ['hoangmai'],
 'thanhxuan': ['socson'],
 'thanhoai': ['thanhtri'],
 'thongnong': ['haquang'],
 'tralinh': ['trungkhanh'],
 'quanguyen': ['quanghoa'],
 'chora': ['babe'],
 'yenson': ['sonduong'],
 'camduong': ['laocai'],
 'muongla': ['phuyen', 'songma', 'sopcop'],
 'tranyen': ['lucyen', 'yenbinh'],
 'kyson': ['hoabinh', 'tanky'],
 'chilang': ['langson', 'trangdinh'],
 'honggai': ['halong'],
 'hoanhbo': ['halong'],
 'uongbi': ['mongcai'],
 'halong': ['vandon'],
 'haiduong': ['binhgiang'],
 'anduong': ['lechan', 'duongkinh', 'thuynguyen', 'vinhbao'],
 'catba': ['cathai'],
 'phucu': ['hungyen'],
 'myloc': ['namdinh'],
 'dongson': ['thanhhoa', 'bimson'],
 'hatrung': ['bathuoc'],
 'thanhhoa': ['nhuxuan', 'benluc'],
 'vinh': ['anhson', 'yenthanh', 'hoangmai'],
 'yenthanh': ['thanhchuong'],
 'anhson': ['thanhchuong', 'namdan'],
 'thachha': ['hatinh'],
 'huongthuy': ['hue'],
 'giang': ['thangbinh', 'bactramy', 'nuithanh'],
 'tramy': ['bactramy'],
 'sontinh': ['sontay'],
 'ducpho': ['moduc'],
 'dakto': ['konray', 'tumorong'],
 'dakha': ['tumorong'],
 'dakpo': ['kongchro'],
 'iapa': ['chuse'],
 'krongbuk': ['krongpac'],
 'phuocbinh': ['phuoclong'],
 'hoathanh': ['chauthanh'],
 'laithieu': ['thuanan'],
 'thongnhat': ['bienhoa'],
 'vinhan': ['vinhcuu'],
 'binhchanh': ['thuduc'],
 'tanphu': ['thuduc', 'quan7', 'cuchi'],
 'tanthanh': ['mochoa', 'thuthua'],
 'tanan': ['canduoc'],
 'tanhung': ['chauthanh'],
 'tanphuoc': ['gocongdong'],
 'mocay': ['mocaynam'],
 'thanhphu': ['giongtrom', 'binhdai'],
 'cainhum': ['mangthit'],
 'longho': ['mangthit'],
 'chauphu': ['chaudoc'],
 'anphu': ['phutan', 'tinhbien', 'chauthanh', 'thoaison'],
 'anminh': ['chauthanh', 'uminhthuong'],
 'vithanh': ['vithuy']}

# double_check_districts = {} # testing

## Support find province from unique districts

Điều kiện 1: district_key not in ward_keys.
Điều kiện 2: district_count = 1, sau khi đã drop_duplicated [province_key, district_key]
Điều kiện 3: district_key not in province_keys của tỉnh khác.

Nếu district_key = province_key của chính nó:
>> Lấy danh sách long_district_alphanumerices (bao gồm Việt và English), nếu tồn tại trong address thì sau khi tìm được province_key sẽ không bị xóa province_key trong địa chỉ.


Nếu district_key != province_key của chính nó, tức là unique tuyệt đối:
>> Lấy danh sách unique_district_keys, nếu không tìm được province_key thì tìm unique_district_key để suy ngược ra province_key

In [None]:
df = pd.read_csv('../../data/output/vietnam_administrative_units.csv')
df['province_key'] = df['province'].apply(lambda x: to_key(x, 1))
df['district_key'] = df['short_district'].apply(lambda x: to_key(x, 2))
df['ward_key'] = df['short_ward'].apply(lambda x: to_key(x, 3))
df['ward_alphanumeric'] = df['long_ward'].apply(to_alphanumeric)


province_keys = df['province_key'].unique().tolist()
district_keys = df['district_key'].unique().tolist()
ward_keys = df['ward_key'].unique().tolist()
ward_alphanumerics = df['ward_alphanumeric'].unique().tolist()
str_ward_alphanumerics = str(ward_alphanumerics)

In [None]:
# Add alias key. Eg: Quan 2, Quan 9 are Thu Duc
for key in district_alias_keys:
    province_english, district_english, district_key = key
    df = add_district_key(df, province_english, district_english, district_key)

In [None]:
df_district_filter = df[~df.district_key.apply(lambda x: x in str_ward_alphanumerics)]

In [None]:
df_district_filter_count = df_district_filter[['province_key', 'district_key']].drop_duplicates()['district_key'].value_counts().reset_index()
one_district_keys = df_district_filter_count[df_district_filter_count['count']==1]['district_key'].tolist()
df_district_filter = df_district_filter[df_district_filter.district_key.isin(one_district_keys)][['province_key', 'district_key', 'long_district', 'long_district_english']].drop_duplicates()

def check_valid_district_key(district_key, province_key):
    """Return False when ``district_key`` equals some *other* province's key.

    A district key is rejected only if it appears in the global
    ``province_keys`` and differs from the ``province_key`` of its own row.
    """
    clashes_with_other_province = district_key in province_keys and district_key != province_key
    return not clashes_with_other_province
    
df_district_filter['is_valid_district_key'] = df_district_filter.apply(lambda x: check_valid_district_key(x['district_key'], x['province_key']), axis=1)
df_district_filter = df_district_filter[df_district_filter.is_valid_district_key]
df_district_filter['same_key'] = np.where(df_district_filter.province_key == df_district_filter.district_key, True, False)

long_district_alphanumerics = []
for row in df_district_filter[df_district_filter.same_key].itertuples():
    long_district_alphanumerics.append(to_alphanumeric(row.long_district))
    long_district_alphanumerics.append(to_alphanumeric(row.long_district).replace('thanhpho', 'tp.'))
    long_district_alphanumerics.append(to_alphanumeric(row.long_district_english))
    
    
# district_key -> province_key for districts whose key differs from their
# province's key; keys starting with "quan" + digits also get a "q." variant.
unique_district_keys = {}
strictly_unique = df_district_filter.loc[~df_district_filter.same_key, ['province_key', 'district_key']].drop_duplicates()
for row in strictly_unique.itertuples():
    unique_district_keys[row.district_key] = row.province_key
    if re.match(r'quan\d{1,2}', row.district_key):
        district_key = re.sub('^quan', 'q.', row.district_key)
        unique_district_keys[district_key] = row.province_key

In [None]:
len(long_district_alphanumerics)

In [None]:
long_district_alphanumerics

In [None]:
len(unique_district_keys)

In [None]:
unique_district_keys

## Support find province from unique "long" districts

In [None]:
df['province_alphanumeric'] = df['long_province'].apply(to_alphanumeric)
df['district_alphanumeric'] = df['long_district'].apply(to_alphanumeric)
df['ward_alphanumeric'] = df['long_ward'].apply(to_alphanumeric)
df['province_alphanumeric_english'] = df['long_province_english'].apply(to_alphanumeric)
df['district_alphanumeric_english'] = df['long_district_english'].apply(to_alphanumeric)
df['ward_alphanumeric_english'] = df['long_ward_english'].apply(to_alphanumeric)

In [None]:
province_alphanumerics = df['province_alphanumeric'].unique().tolist()
district_alphanumerics = df['district_alphanumeric'].unique().tolist()
ward_alphanumerics = df['ward_alphanumeric'].unique().tolist()

In [None]:
df_district_filter = df[~df.district_alphanumeric.isin(ward_alphanumerics)]
df_district_filter_count = df_district_filter[['province_alphanumeric', 'district_alphanumeric']].drop_duplicates()['district_alphanumeric'].value_counts().reset_index()
one_district_alphanumerics = df_district_filter_count[df_district_filter_count['count']==1]['district_alphanumeric'].tolist()
df_district_filter = df_district_filter[df_district_filter.district_alphanumeric.isin(one_district_alphanumerics)][['province_key','province_alphanumeric', 'district_alphanumeric', 'long_district', 'long_district_english', 'district_key', 'district_alphanumeric_english']].drop_duplicates()

def check_valid_district_alphanumeric(district_alphanumeric, province_alphanumeric):
    """Tell whether a district name is safe to use for province lookup.

    A district name is rejected when it equals the name of some province in the
    module-level ``province_alphanumerics`` list other than its own province.

    Args:
        district_alphanumeric: alphanumeric name of the district.
        province_alphanumeric: alphanumeric name of the province the district belongs to.

    Returns:
        False if the district name is a province name different from
        ``province_alphanumeric``; True otherwise.
    """
    clashes_with_other_province = (
        district_alphanumeric in province_alphanumerics
        and district_alphanumeric != province_alphanumeric
    )
    return not clashes_with_other_province
    
# Flag every row with check_valid_district_alphanumeric(district name, province name).
is_valid_district = df_district_filter.apply(
    lambda record: check_valid_district_alphanumeric(record['district_alphanumeric'], record['province_alphanumeric']),
    axis=1,
)
# assign() returns a new frame, so the new column is not written into a frame obtained by slicing.
df_district_filter = df_district_filter.assign(is_valid_district_alphanumeric=is_valid_district)
# .copy() detaches the filtered rows; adding a column to a bare boolean-filtered
# frame is what triggers pandas' SettingWithCopyWarning.
df_district_filter = df_district_filter[df_district_filter['is_valid_district_alphanumeric']].copy()
# True where the district and its province have the same alphanumeric name.
df_district_filter['same_alphanumeric'] = np.where(
    df_district_filter['province_alphanumeric'] == df_district_filter['district_alphanumeric'], True, False
)

# Districts whose alphanumeric name equals their province's: record the alphanumeric
# long_district, the same string with 'thanhpho' shortened to 'tp.', and the
# alphanumeric long_district_english.
for same_name_row in df_district_filter[df_district_filter['same_alphanumeric']].itertuples():
    long_form = to_alphanumeric(same_name_row.long_district)
    long_district_alphanumerics.extend([
        long_form,
        long_form.replace('thanhpho', 'tp.'),
        to_alphanumeric(same_name_row.long_district_english),
    ])
    
    
# Leading unit words and their short forms, substituted one after another in this order.
prefix_abbreviations = (
    ('^quan', 'q.'),
    ('^huyen', 'h.'),
    ('^thanhpho', 'tp.'),
    ('^thixa', 'tx.'),
)
remaining_districts = df_district_filter.loc[
    ~df_district_filter['same_alphanumeric'],
    ['province_key', 'district_alphanumeric', 'district_key', 'district_alphanumeric_english'],
].drop_duplicates()
for district_row in remaining_districts.itertuples():
    # Skip districts whose district_key is already a key of unique_district_keys.
    if district_row.district_key in unique_district_keys:
        continue
    # Register the long name, its English variant and the abbreviated-prefix form.
    unique_district_keys[district_row.district_alphanumeric] = district_row.province_key
    unique_district_keys[district_row.district_alphanumeric_english] = district_row.province_key
    abbreviated_name = district_row.district_alphanumeric
    for prefix_pattern, short_prefix in prefix_abbreviations:
        abbreviated_name = re.sub(prefix_pattern, short_prefix, abbreviated_name)
    unique_district_keys[abbreviated_name] = district_row.province_key

In [None]:
# Inspection: size of unique_district_keys after the long-name variants were added.
len(unique_district_keys)

In [None]:
# Inspection: size of long_district_alphanumerics after the same-name districts were appended.
len(long_district_alphanumerics)

In [None]:
# Inspection: print() writes the dict's plain-text form to stdout rather than using the cell's rich output.
print(unique_district_keys)

## Support finding the province from a "not unique" district_key

At this point we can parse "Phuong 15 Quan Tan Binh", but not "Phuong 15 Tan Binh", because "tanbinh" is also a ward_key in another province.

To support this, we check whether any ward_key of the district appears in the address; the ward_key must come before the district_key.

- Condition 1: ward_key_district_key is unique.
- Condition 2: district_key is not in `unique_district_keys`.

In [None]:
# Target shape of the data built in this section (illustration only; the value is
# not assigned to any name):
#   district_key -> province_key -> list of ward keys and their aliases.
{
    'tanbinh': {
        'hochiminh': ['phuong1', 'phuong2', 'p.1', 'p.2', 'district1', 'district2'],
        'otherprovincekey': [...]
    }
}

In [None]:
# Composite "<ward_key>_<district_key>" label for each row.
df['ward_key_district_key'] = df['ward_key'].add('_').add(df['district_key'])

In [None]:
# Number of rows per ward_key_district_key value, turned into a DataFrame by reset_index().
ward_key_district_key_count = df['ward_key_district_key'].value_counts().reset_index()

In [None]:
# Ward/district labels that occur exactly once. The counts are read from the
# value_counts() Series itself instead of a 'count' column, because that column
# name only exists after reset_index() on pandas >= 2.0.
ward_district_label_counts = df['ward_key_district_key'].value_counts()
unique_ward_key_district_keys = ward_district_label_counts[ward_district_label_counts == 1].index.tolist()

In [None]:
# Rows whose ward/district label is unique and whose district_key is not a key of unique_district_keys.
has_unique_label = df['ward_key_district_key'].isin(unique_ward_key_district_keys)
district_already_unique = df['district_key'].isin(unique_district_keys)
df_unique_ward_key_district_keys = df[has_unique_label & ~district_already_unique]

In [None]:
# Build {district_key: {province_key: [ward keys + aliases]}} from the filtered rows.
not_unique_district_keys = {}
for district_key in df_unique_ward_key_district_keys['district_key'].unique().tolist():
    # Select this district's rows once instead of re-filtering the whole frame per province.
    rows_of_district = df_unique_ward_key_district_keys[
        df_unique_ward_key_district_keys['district_key'] == district_key
    ]
    data = {}
    for province_key in rows_of_district['province_key'].unique().tolist():
        tmp_ward_keys = rows_of_district.loc[
            rows_of_district['province_key'] == province_key, 'ward_key'
        ].unique().tolist()
        # Raw string: '\d' inside a plain literal is an invalid escape sequence
        # (a warning on current Python versions, planned to become an error).
        tmp_number_wards = [ward_key for ward_key in tmp_ward_keys if re.search(r'^phuong\d{1,2}', ward_key)]
        # Numbered wards get two aliases, appended after the original ward keys.
        for num_ward_key in tmp_number_wards:
            tmp_ward_keys.append(num_ward_key.replace('phuong', 'district'))
            tmp_ward_keys.append(num_ward_key.replace('phuong', 'p.'))
        data[province_key] = tmp_ward_keys
    not_unique_district_keys[district_key] = data

In [None]:
# Spot check of the 'tanbinh' entry (raises KeyError if the key is absent).
not_unique_district_keys['tanbinh']

In [None]:
# Spot check of the 'chauthanh' entry (raises KeyError if the key is absent).
not_unique_district_keys['chauthanh']

In [None]:
# Inspection: province_keys is defined outside this section of the notebook.
len(province_keys)

In [None]:
# Wards whose ward_key is also a province key: collect their distinct alphanumeric
# long names, followed by the distinct English variants.
wards_matching_province_keys = df[df['ward_key'].isin(province_keys)]
alphanumeric_long_wards = (
    wards_matching_province_keys['ward_alphanumeric'].unique().tolist()
    + wards_matching_province_keys['ward_alphanumeric_english'].unique().tolist()
)

In [None]:
# Inspection: display the whole list for a manual check.
alphanumeric_long_wards

## Support 

In [None]:
# Everything is stored as one positional tuple, so the order below is the layout
# of parse.pkl; code that unpacks the file must use the same order.
parse_payload = (
    duplicated_district_keys,
    duplicated_district_province_keys,
    duplicated_ward_keys,
    duplicated_ward_district_keys,
    province_keys_1,
    province_keys_2,
    province_keys_3,
    province_map,
    district_map,
    ward_map,
    double_check_provinces,
    double_check_districts,
    half_district_keys,
    long_district_alphanumerics,
    unique_district_keys,
    not_unique_district_keys,
    alphanumeric_long_wards,
    province_alphanumerics,
)
with open('../../vietadminunits/data/parse.pkl', 'wb') as parse_file:
    pickle.dump(parse_payload, parse_file)

In [1]:
# NOTE(review): this name is not assigned anywhere in the visible notebook and the
# recorded output of this cell is a NameError. It looks like leftover scratch; confirm and remove.
DICT_long_district_alphanumerics

NameError: name 'DICT_long_district_alphanumerics' is not defined