In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path

from vietnamadminunits.parser.utils import key_normalize

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas deprecation / chained-assignment warnings.
warnings.filterwarnings("ignore")

# Two levels above the current working directory (Path() is the CWD); used as the
# root for every data path below. Presumably the repository root when the notebook
# is run from its own folder - TODO confirm.
BASE_DIR = Path().resolve().parent.parent

## Reading data

In [2]:
# Conversion table (legacy units -> 2025 units)
convert_csv = BASE_DIR / 'data/processed/convert_legacy_2025_with_location_and_default_ward.csv'
df = pd.read_csv(convert_csv)

# "From" side: legacy units, keeping only the name and key columns
legacy_csv = BASE_DIR / 'data/interim/legacy_63-province-10040-ward_with_location_and_key.csv'
legacy_columns = ['province', 'district', 'ward', 'provinceKey', 'districtKey', 'wardKey', 'wardKeyDuplicated']
df_legacy = pd.read_csv(legacy_csv)[legacy_columns]

# "To" side: the distinct (newProvince, newWard) pairs found in the conversion table
df_from2025 = df.loc[:, ['newProvince', 'newWard']].drop_duplicates().copy()

## Enriching data

### Adding basic columns

In [3]:
# "To" side: add a normalized "<column>Key" next to each unit-name column
unit_cols = ['newProvince', 'newWard']
df_from2025 = df_from2025.assign(
    **{f"{col}Key": df_from2025[f"{col}"].apply(key_normalize) for col in unit_cols}
)

### Checking duplication

In [4]:
# "To" side - newWardKey: count rows per (newProvinceKey, newWardKey), largest first
count_ward_key = (
    df_from2025
    .groupby(['newProvinceKey', 'newWardKey'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
)

# Keep only the keys shared by more than one ward and flag them
duplicated_ward_key = (
    count_ward_key[count_ward_key['count'] > 1]
    .drop(columns=['count'])
    .reset_index(drop=True)
    .assign(newWardKeyDuplicated=True)
)

print(duplicated_ward_key.shape[0])
duplicated_ward_key

# Some newWardKey values become duplicated once Vietnamese diacritics are stripped

8


Unnamed: 0,newProvinceKey,newWardKey,newWardKeyDuplicated
0,thanhphohochiminh,xathanhan,True
1,tinhdongthap,xatanthanh,True
2,thanhphohaiphong,xacamgiang,True
3,tinhdongnai,xalocthanh,True
4,tinhthainguyen,xavanlang,True
5,tinhtayninh,xatanthanh,True
6,tinhquangngai,xabato,True
7,tinhquangngai,xasonha,True


In [5]:
# "To" side - add the newWardKeyDuplicated flag (NaN for wards whose key is unique)
df_from2025 = pd.merge(df_from2025, duplicated_ward_key, on=['newProvinceKey', 'newWardKey'], how='left')

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df_from2025[...]: under pandas Copy-on-Write that chained in-place call only
# updates a temporary and the NaNs stay in the frame. The cast to bool keeps the
# column usable as a boolean mask below.
df_from2025['newWardKeyDuplicated'] = df_from2025['newWardKeyDuplicated'].fillna(False).astype(bool)

# Switch newWardKey to the version that keeps Vietnamese diacritics for flagged wards.
# NOTE(review): presumably the second positional argument (False) tells key_normalize
# not to strip diacritics - confirm against its signature.
df_from2025['newWardKey'] = np.where(
    df_from2025['newWardKeyDuplicated'],
    df_from2025['newWard'].apply(key_normalize, args=([], False)),
    df_from2025['newWardKey'],
)

df_from2025[df_from2025['newWardKeyDuplicated']]

Unnamed: 0,newProvince,newWard,newProvinceKey,newWardKey,newWardKeyDuplicated
287,Tỉnh Thái Nguyên,Xã Văn Lang,tinhthainguyen,xãvănlang,True
671,Tỉnh Thái Nguyên,Xã Văn Lăng,tinhthainguyen,xãvănlăng,True
1059,Thành phố Hải Phòng,Xã Cẩm Giang,thanhphohaiphong,xãcẩmgiang,True
1060,Thành phố Hải Phòng,Xã Cẩm Giàng,thanhphohaiphong,xãcẩmgiàng,True
1969,Tỉnh Quảng Ngãi,Xã Sơn Hà,tinhquangngai,xãsơnhà,True
1970,Tỉnh Quảng Ngãi,Xã Sơn Hạ,tinhquangngai,xãsơnhạ,True
1992,Tỉnh Quảng Ngãi,Xã Ba Tơ,tinhquangngai,xãbatơ,True
1998,Tỉnh Quảng Ngãi,Xã Ba Tô,tinhquangngai,xãbatô,True
2478,Tỉnh Đồng Nai,Xã Lộc Thạnh,tinhdongnai,xãlộcthạnh,True
2482,Tỉnh Đồng Nai,Xã Lộc Thành,tinhdongnai,xãlộcthành,True


In [6]:
# Merge all data into the conversion table: legacy-side keys first, then 2025-side keys
df = (
    df
    .merge(df_legacy, how='left', on=['province', 'district', 'ward'])
    .merge(df_from2025, how='left', on=['newProvince', 'newWard'])
)

# Build provinceDistrictWardKey - the main key used to infer newWardKey.
# wardKey is filled with '' because the islands have no ward.
df = df.assign(
    provinceDistrictWardKey=df['provinceKey'] + '_' + df['districtKey'] + '_' + df['wardKey'].fillna('')
)

## Creating dictionaries

In [7]:
# Rows whose isDividedWard flag is False
df_no_divided = df[df['isDividedWard'].eq(False)]

# Row count per (newProvinceKey, provinceDistrictWardKey), largest first
(
    df_no_divided
    .groupby(['newProvinceKey', 'provinceDistrictWardKey'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
)

Unnamed: 0,newProvinceKey,provinceDistrictWardKey,count
0,thanhphocantho,thanhphocantho_huyencodo_thitrancodo,1
6374,tinhninhbinh,tinhninhbinh_huyennhoquan_xadongphong,1
6376,tinhninhbinh,tinhninhbinh_huyennhoquan_xagialam,1
6377,tinhninhbinh,tinhninhbinh_huyennhoquan_xagiason,1
6378,tinhninhbinh,tinhninhbinh_huyennhoquan_xagiathuy,1
...,...,...,...
3190,tinhdongnai,tinhdongnai_huyenthongnhat_xalo25,1
3191,tinhdongnai,tinhdongnai_huyenthongnhat_xaquangtrung,1
3192,tinhdongnai,tinhdongnai_huyenthongnhat_xaxuanthien,1
3193,tinhdongnai,tinhdongnai_huyentrangbom_thitrantrangbom,1


In [8]:
# Rows whose isDividedWard flag is True
df_divided = df[df['isDividedWard'].eq(True)]

# Row count per (newProvinceKey, provinceDistrictWardKey), largest first
(
    df_divided
    .groupby(['newProvinceKey', 'provinceDistrictWardKey'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
)

Unnamed: 0,newProvinceKey,provinceDistrictWardKey,count
265,thanhphohanoi,thanhphohanoi_quannamtuliem_phuongdaimo,5
260,thanhphohanoi,thanhphohanoi_quanlongbien_phuongphucdong,4
168,thanhphohanoi,thanhphohanoi_huyenthanhtri_xatantrieu,4
262,thanhphohanoi,thanhphohanoi_quanlongbien_phuongthachban,4
213,thanhphohanoi,thanhphohanoi_quanhadong_phuongduongnoi,4
...,...,...,...
152,thanhphohanoi,thanhphohanoi_huyenthachthat_xabinhyen,2
151,thanhphohanoi,thanhphohanoi_huyensocson_xaquangtien,2
150,thanhphohanoi,thanhphohanoi_huyensocson_xaphuminh,2
149,thanhphohanoi,thanhphohanoi_huyensocson_xamaidinh,2


In [9]:
# Province - the list of provinceKey values alone is enough to infer newProvinceKey
df_province = df.loc[:, ['newProvinceKey', 'provinceKey']].drop_duplicates().reset_index(drop=True)

# newProvinceKey -> list of legacy provinceKey values that map to it
DICT_PROVINCE = {
    new_key: legacy_rows['provinceKey'].values.tolist()
    for new_key, legacy_rows in df_province.groupby('newProvinceKey')
}

In [10]:
# Ward (NO_DIVIDED) - the list of provinceDistrictWardKey values alone is enough to infer newWardKey
# Shape: newProvinceKey -> newWardKey -> list of provinceDistrictWardKey
DICT_PROVINCE_WARD_NO_DIVIDED = {
    province_key: {
        ward_key: ward_rows['provinceDistrictWardKey'].values.tolist()
        for ward_key, ward_rows in province_rows.groupby('newWardKey')
    }
    for province_key, province_rows in df_no_divided.groupby('newProvinceKey')
}

In [11]:
# Ward (DIVIDED) - also needs isDefaultNewWard, newWardLat, newWardLon, newWardAreaKm2
# to decide which newWardKey to choose.
# Shape: newProvinceKey -> provinceDistrictWardKey -> list of candidate new wards


def _to_native(value):
    """Return numpy scalars as plain Python values; pass anything else through.

    `.iloc[0]` on a typed column yields numpy scalars, and json.dump raises
    TypeError on some of them (e.g. numpy.bool_, numpy.int64).
    """
    return value.item() if isinstance(value, np.generic) else value


# Fields stored for every candidate new ward, in output order
DIVIDED_WARD_FIELDS = ('newWardKey', 'isDefaultNewWard', 'newWardLat', 'newWardLon', 'newWardAreaKm2')

DICT_PROVINCE_WARD_DIVIDED = {}
for new_province_key, province_rows in df_divided.groupby('newProvinceKey'):
    candidates_by_legacy_key = {}

    for legacy_key, legacy_rows in province_rows.groupby('provinceDistrictWardKey'):
        candidates = []

        # One entry per distinct newWardKey, taken from the first row of that group
        for _, ward_rows in legacy_rows.groupby('newWardKey'):
            candidates.append(
                {field: _to_native(ward_rows[field].iloc[0]) for field in DIVIDED_WARD_FIELDS}
            )

        candidates_by_legacy_key[legacy_key] = candidates

    DICT_PROVINCE_WARD_DIVIDED[new_province_key] = candidates_by_legacy_key

## Saving package data

In [12]:
# Bundle the three lookup dictionaries and write them to the package data folder as JSON
converter_data = dict(
    DICT_PROVINCE=DICT_PROVINCE,
    DICT_PROVINCE_WARD_NO_DIVIDED=DICT_PROVINCE_WARD_NO_DIVIDED,
    DICT_PROVINCE_WARD_DIVIDED=DICT_PROVINCE_WARD_DIVIDED,
)

output_path = BASE_DIR / 'vietnamadminunits/data/converter_2025.json'
with output_path.open('w') as fp:
    json.dump(converter_data, fp)

In [13]:
# Sanity check: rows flagged both as a divided ward and as having a duplicated wardKey
is_divided_and_duplicated = df['isDividedWard'] & df['wardKeyDuplicated']
df[is_divided_and_duplicated]
# Luckily there are none =))

Unnamed: 0,provinceCode,isMergedProvince,districtCode,districtType,districtShortDuplicated,wardCode,wardType,wardShortDuplicated,isMergedWard,isDividedWard,...,isDefaultNewWard,newProvinceShort,provinceKey,districtKey,wardKey,wardKeyDuplicated,newProvinceKey,newWardKey,newWardKeyDuplicated,provinceDistrictWardKey
