In [1]:
import pandas as pd
import re
from unidecode import unidecode
import pickle

In [2]:
def create_province_key(text):
    """Build the compact lookup key for a province name.

    Steps, in the order the original author required:
      1. pass the text through ``unidecode`` (strips special characters),
      2. lower-case and trim it,
      3. drop a leading "Tinh " / "Thanh Pho " or a trailing " Province" /
         " City" label,
      4. turn hyphens that touch a space (and double spaces) into a single
         space, then remove every space.

    Args:
        text: Province name as found in the source table.

    Returns:
        The normalised key as a string with no spaces.
    """
    # Transliteration has to happen before case folding.
    normalized = str(unidecode(text)).lower().strip()
    without_label = re.sub(
        r'\sProvince$|^Tinh\s|^Thanh\sPho\s|\sCity$',
        '',
        normalized,
        flags=re.IGNORECASE,
    )
    # A hyphen with no neighbouring space is intentionally left alone.
    single_spaced = re.sub(r' \- |  |\- | \-', ' ', without_label)
    return ''.join(single_spaced.split(' '))

In [3]:
def create_district_key(text):
    """Build the compact lookup key for a district name.

    The text goes through ``unidecode`` first (strips special characters),
    is then lower-cased and trimmed, and finally loses its spaces together
    with any hyphen that sits next to a space.

    Args:
        text: District name as found in the source table.

    Returns:
        The normalised key as a string with no spaces.
    """
    separator = re.compile(r' \- |  |\- | \-')
    # Transliterate before folding case; the order matters.
    folded = str(unidecode(text)).lower().strip()
    pieces = separator.sub(' ', folded).split(' ')
    return ''.join(pieces)

In [4]:
def add_province_key(df_province, province_english, province_key):
    """Return ``df_province`` with one extra row that gives a province an alias key.

    The first row whose ``province_english`` equals the given name is copied,
    its ``province_key`` is replaced by the alias, and the copy is appended.
    The input frame is not modified.

    Args:
        df_province: Frame with at least ``province_english`` and
            ``province_key`` columns.
        province_english: Existing value of ``province_english`` to alias.
        province_key: New key; must not already be present in ``province_key``.

    Returns:
        A new DataFrame: the original rows followed by the alias row.

    Raises:
        ValueError: If ``province_english`` is unknown or ``province_key``
            is already taken.
    """
    is_target = df_province['province_english'] == province_english
    if not is_target.any():
        raise ValueError(f'{province_english} is not exist in province_english')
    if (df_province['province_key'] == province_key).any():
        raise ValueError(f'{province_key} is exist in province_key')

    # .assign builds a fresh frame, so nothing is written into a slice of
    # df_province (the original column assignment on `.loc[...].head(1)`
    # triggered SettingWithCopyWarning).
    alias_row = df_province.loc[is_target].head(1).assign(province_key=province_key)
    return pd.concat([df_province, alias_row])

In [5]:
def add_district_key(df, province_english, district_english, district_key):
    """Return ``df`` with one extra row that gives a district an alias key.

    The first row matching both ``province_english`` and ``district_english``
    is copied, its ``district_key`` is replaced by the alias, and the copy is
    appended. The input frame is not modified.

    Args:
        df: Frame with at least ``province_english``, ``district_english``
            and ``district_key`` columns.
        province_english: Province the district belongs to.
        district_english: Existing value of ``district_english`` to alias.
        district_key: New key; must not already be present in ``district_key``.

    Returns:
        A new DataFrame: the original rows followed by the alias row.

    Raises:
        ValueError: If the district or province is unknown, if the key is
            already taken, or if the district does not occur in the given
            province.
    """
    in_district = df['district_english'] == district_english
    if not in_district.any():
        raise ValueError(f'{district_english} is not exist in district_english')
    in_province = df['province_english'] == province_english
    if not in_province.any():
        raise ValueError(f'{province_english} is not exist in province_english')
    if (df['district_key'] == district_key).any():
        raise ValueError(f'{district_key} is exist in district_key')

    # Both names can exist separately without ever sharing a row; without this
    # check the function would append nothing and report no problem.
    is_target = in_province & in_district
    if not is_target.any():
        raise ValueError(f'{district_english} is not exist in {province_english}')

    # .assign builds a fresh frame instead of writing into a slice of df
    # (the original column assignment triggered SettingWithCopyWarning).
    alias_row = df.loc[is_target].head(1).assign(district_key=district_key)
    return pd.concat([df, alias_row])

In [6]:
# Source table: every structure pickled at the end of the notebook is derived from this frame.
df = pd.read_csv(
    filepath_or_buffer='../data/output/district_administrative_unit_of_vietnam.csv',
)

In [7]:
# Number of rows per (province, district) pair, most frequent first.
district_count = (
    df.loc[:, ['province', 'district']]
    .value_counts()
    .reset_index()
)
# District names of the pairs that occur on more than one row.
duplicated_districts = district_count.loc[
    district_count['count'].gt(1), 'district'
].tolist()

In [9]:
# Derive the normalised lookup keys from the raw province / district names.
df['province_key'] = df['province'].apply(create_province_key)
df['district_key'] = df['district'].apply(create_district_key)
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which warns on pandas 2.x and
# leaves df untouched under Copy-on-Write. Later cells compare level_english
# with ==, so missing levels must really become ''.
df['level_english'] = df['level_english'].fillna('')

In [10]:
# Rows whose key looks like "quan" followed by one or two digits get a second
# copy whose key says "district" instead, so both spellings can be looked up.
is_numbered = df['district_key'].str.contains(r'quan\d{1,2}')
hcm_districts = df.loc[is_numbered].assign(
    district_key=lambda rows: rows['district_key'].str.replace('quan', 'district')
)
df = pd.concat([df, hcm_districts])

In [11]:
# Added after testing the module. Addresses that were resolved to the wrong key:
#   'Huyện Quang Bình, Tỉnh Hà Giang'   -> quangbinh
#   'Huyện Phù Yên, Tỉnh Sơn La'        -> phuyen
#   'Huyện Văn Giang, Tỉnh Hưng Yên'    -> angiang
#   'Huyện Quảng Ninh, Tỉnh Quảng Bình' -> quangninh
# NOTE(review): presumably the ranking puts each intended province key
# (e.g. hagiang) ahead of the key it was confused with (quangbinh) — confirm
# against the parser that consumes the pickled key lists.
province_key_order = {
    key: rank
    for rank, key in enumerate(
        ['hagiang', 'quangbinh', 'sonla', 'phuyen', 'hungyen', 'angiang', 'quangninh'],
        start=1,
    )
}

# Provinces without a rank map to NaN and therefore sort after the ranked ones.
df['province_key_order'] = df['province_key'].map(province_key_order)
df = df.sort_values(by='province_key_order')

In [12]:
# Both lists below are written to data.pkl by the final cell.
# Rows whose district name was flagged as duplicated earlier in the notebook.
duplicated_rows = df.loc[df['district'].isin(duplicated_districts)]
duplicated_district_keys = duplicated_rows['district_key'].unique().tolist()
duplicated_district_province_keys = duplicated_rows['province_key'].unique().tolist()

In [13]:
# One row per distinct (province, province_english, province_key) combination.
df_province = df.loc[:, ['province', 'province_english', 'province_key']].drop_duplicates()

# (province_english, alias key) pairs appended as extra rows.
province_alias_keys = [
    ('Ho Chi Minh', 'hcm'),
    # 'hn' caused wrong results with "Tinh Nghe An"; fixed by having it end up
    # at the end of province_keys_2.
    ('Ha Noi', 'hn'),
]

for province_english, province_key in province_alias_keys:
    df_province = add_province_key(df_province, province_english, province_key)

In [14]:
# (province_english, district_english, alias key) triples appended as extra rows of df.
district_alias_keys = [
    ('Ha Noi', 'Nam Tu Liem', 'tuliem'),
]

for province_english, district_english, district_key in district_alias_keys:
    df = add_district_key(df, province_english, district_english, district_key)

In [15]:
province_keys = df_province['province_key'].tolist()

# The haystack is the text form of the list of unique district keys, so the
# `in` tests below are substring checks: a province key counts as a hit when it
# appears inside any district key.
district_keys = str(df.district_key.unique().tolist())

# Both lists are written to data.pkl by the final cell; each keeps the order of
# province_keys.
province_keys_1 = [key for key in province_keys if key not in district_keys]  # no overlap with district keys
province_keys_2 = [key for key in province_keys if key in district_keys]  # contained in some district key

In [16]:
# Written to data.pkl by the final cell.
# province_key -> the two display names of that province; when a key repeats,
# the last row wins.
province_map = {
    key: {'province': name, 'province_english': name_english}
    for key, name, name_english in zip(
        df_province['province_key'],
        df_province['province'],
        df_province['province_english'],
    )
}

In [17]:
# Written to data.pkl by the final cell.
# Nested lookup: province_english -> district_key -> level_english ->
# {'district_long', 'district_long_english'} taken from the first matching row.
district_map = {}
for province_english in df_province.province_english.unique():
    province_rows = df[df.province_english == province_english]

    district_keys = {}
    for district_key in province_rows['district_key'].unique():
        district_rows = province_rows[province_rows.district_key == district_key]

        levels = {}
        for level_english in district_rows['level_english'].unique():
            level_rows = district_rows[district_rows.level_english == level_english]
            levels[level_english] = {
                'district_long': level_rows['district_long'].values[0],
                'district_long_english': level_rows['district_long_english'].values[0],
            }
        district_keys[district_key] = levels

    district_map[province_english] = district_keys

In [18]:
# Everything is stored as a single tuple; whatever unpickles the file has to
# unpack the elements in exactly this order.
with open('../vnau_parser/data/data.pkl', 'wb') as pickle_file:
    pickle.dump(
        (
            duplicated_district_keys,
            duplicated_district_province_keys,
            province_keys_1,
            province_keys_2,
            province_map,
            district_map,
        ),
        pickle_file,
    )