In [1]:
import pandas as pd
from vietadminunits import parse_address, get_data
from vietadminunits.utils import to_key, to_alphanumeric
import pickle

In [2]:
def find_new_district_english_module(text):
    """Return the ``district_english`` attribute of the parsed address.

    ``text`` is handed to ``parse_address`` together with ``2`` as the second
    positional argument (presumably the district level -- confirm against the
    ``parse_address`` signature).
    """
    return parse_address(text, 2).district_english

In [3]:
def create_province_english_module(text):
    """Return the ``province_english`` attribute of the parsed address.

    ``text`` is handed to ``parse_address`` together with ``1`` as the second
    positional argument (presumably the province level -- confirm against the
    ``parse_address`` signature).
    """
    return parse_address(text, 1).province_english

In [4]:
def create_district_key_module(text):
    """Return the ``district_key`` attribute of ``parse_address(text)``.

    The address is parsed with ``parse_address``'s default arguments.
    """
    return parse_address(text).district_key

In [5]:
def create_province_key_module(text):
    """Return the ``province_key`` attribute of ``parse_address(text)``.

    The address is parsed with ``parse_address``'s default arguments.
    """
    return parse_address(text).province_key

In [6]:
# Load the records returned by vietadminunits' ``get_data()`` into a DataFrame.
df = pd.DataFrame(data=get_data())

In [7]:
# Add a ``<column>_alphanumeric`` twin for every name column by passing its
# values through ``to_alphanumeric``. Only the ward columns are blank-filled
# first, exactly as before; the flag below marks which ones.
for name_col, blank_fill in (
    ('long_province', False),
    ('long_district', False),
    ('long_ward', True),
    ('province', False),
    ('short_district', False),
    ('short_ward', True),
):
    names = df[name_col].fillna('') if blank_fill else df[name_col]
    df[f'{name_col}_alphanumeric'] = names.apply(to_alphanumeric)

In [8]:
# Build the synthetic test addresses: every combination of long/short ward,
# district and province spelling, once concatenated directly and once joined
# with commas. The parts are joined as ward -> province -> district, i.e. the
# province sits in the middle of the string.
ward_columns = ('long_ward_alphanumeric', 'short_ward_alphanumeric')
district_columns = ('long_district_alphanumeric', 'short_district_alphanumeric')
province_columns = ('long_province_alphanumeric', 'province_alphanumeric')

# Same iteration order as nested loops: ward outermost, province innermost.
column_triples = [
    (ward_col, province_col, district_col)
    for ward_col in ward_columns
    for district_col in district_columns
    for province_col in province_columns
]

for ward, province, district in column_triples:
    suffix = f"{ward}_{province}_{district}"
    ward_part, province_part, district_part = df[ward], df[province], df[district]
    df[f"address_{suffix}"] = ward_part + province_part + district_part
    df[f"address_comma_{suffix}"] = ward_part + ',' + province_part + ',' + district_part

In [9]:
# Derive the ``*_key`` columns with ``to_key``; its extra positional argument
# is 1 for provinces, 2 for districts and 3 for wards.
for unit_level, unit_col in enumerate(('province', 'district', 'ward'), start=1):
    df[f'{unit_col}_key'] = df[unit_col].apply(to_key, args=(unit_level,))

In [10]:
# Names of all synthetic address columns: every column whose name contains 'address'.
address_cols = list(filter(lambda column_name: 'address' in column_name, df.columns))

## Find wards that share a keyword with provinces

In [11]:
# Parse every synthetic address at province level and record each address for
# which the parser finds no province, or a province key that differs from the
# row's own ``province_key``.
wrong_provinces = []

for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)
        unit = parse_address(address, level=1)

        # "No province found" and "different province found" are recorded
        # identically, so they are tested in a single condition.
        if unit.province is None or unit.province_key != row.province_key:
            print(address)
            wrong_provinces.append({
                'address': address,
                'province': row.province,
                'province_key': row.province_key,
                'district_key': row.district_key,
                'ward_key': row.ward_key,
                'wrong_province_key': unit.province_key,
            })

xatinhhatinhquangngaihuyensontinh
thitrancaitauhadongthaphuyenchauthanh
xaanhiepdongthaphuyenchauthanh
xaannhondongthaphuyenchauthanh
xatannhuandongdongthaphuyenchauthanh
xatanbinhdongthaphuyenchauthanh
xatanphutrungdongthaphuyenchauthanh
xaphulongdongthaphuyenchauthanh
xaanphuthuandongthaphuyenchauthanh
xaphuhuudongthaphuyenchauthanh
xaankhanhdongthaphuyenchauthanh
xatanphudongthaphuyenchauthanh
xahoatandongthaphuyenchauthanh
xatinhhatinhquangngaisontinh
xasonhatinhphuyensonhoa
xaxuanminhhagiangquangbinh
xatiennguyenhagiangquangbinh
xatannamhagiangquangbinh
xabanriahagiangquangbinh
xayenthanhhagiangquangbinh
thitranyenbinhhagiangquangbinh
xatantrinhhagiangquangbinh
xatanbachagiangquangbinh
xabanglanghagiangquangbinh
xayenhahagiangquangbinh
xahuongsonhagiangquangbinh
xaxuangianghagiangquangbinh
xanakhuonghagiangquangbinh
xatienyenhagiangquangbinh
xavithuonghagiangquangbinh
thitranphuyensonlaphuyen
xasuoitosonlaphuyen
xamuongthaisonlaphuyen
xamuongcoisonlaphuyen
xaquanghuysonlaphuyen
xa

In [12]:
# Tabulate the recorded province mismatches, one row per offending address.
df_wrong_provinces = pd.DataFrame(data=wrong_provinces)

In [13]:
# Print a summary of the mismatch table (row count, columns, non-null counts, dtypes).
df_wrong_provinces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   address             432 non-null    object
 1   province            432 non-null    object
 2   province_key        432 non-null    object
 3   district_key        432 non-null    object
 4   ward_key            432 non-null    object
 5   wrong_province_key  432 non-null    object
dtypes: object(6)
memory usage: 20.4+ KB


In [14]:
# Display the province mismatches collected above.
df_wrong_provinces

Unnamed: 0,address,province,province_key,district_key,ward_key,wrong_province_key
0,xatinhhatinhquangngaihuyensontinh,Quảng Ngãi,quangngai,sontinh,tinhha,hatinh
1,thitrancaitauhadongthaphuyenchauthanh,Đồng Tháp,dongthap,chauthanh,caitauha,phuyen
2,xaanhiepdongthaphuyenchauthanh,Đồng Tháp,dongthap,chauthanh,anhiep,phuyen
3,xaannhondongthaphuyenchauthanh,Đồng Tháp,dongthap,chauthanh,annhon,phuyen
4,xatannhuandongdongthaphuyenchauthanh,Đồng Tháp,dongthap,chauthanh,tannhuandong,phuyen
...,...,...,...,...,...,...
427,"vinhmyb,baclieu,hoabinh",Bạc Liêu,baclieu,hoabinh,vinhmyb,hoabinh
428,"vinhhau,baclieu,hoabinh",Bạc Liêu,baclieu,hoabinh,vinhhau,hoabinh
429,"vinhhaua,baclieu,hoabinh",Bạc Liêu,baclieu,hoabinh,vinhhaua,hoabinh
430,"vinhmya,baclieu,hoabinh",Bạc Liêu,baclieu,hoabinh,vinhmya,hoabinh


In [15]:
# Build the lookup
#     {wrong_province_key: {province_key: [[ward_key, district_key], ...]}}
# i.e. for each province key the parser returned by mistake, the real
# provinces and the (ward, district) pairs that triggered the mistake.
DICT_double_check_inverted_provinces = {}

# With no mismatch rows there is nothing to build; the lookup stays empty.
# (The guard also covers a frame created from an empty list, which has no
# columns to index.)
if not df_wrong_provinces.empty:
    # Visit each wrongly returned key once instead of once per mismatch row;
    # ``unique()`` keeps first-occurrence order, so the dict order is the same
    # as when iterating row by row.
    for wrong_province_key in df_wrong_provinces['wrong_province_key'].unique():
        tmp_df = df_wrong_provinces[df_wrong_provinces['wrong_province_key'] == wrong_province_key]
        province_data = {}
        for province_key in tmp_df['province_key'].unique().tolist():
            ward_district_keys = (
                tmp_df.loc[tmp_df['province_key'] == province_key, ['ward_key', 'district_key']]
                .drop_duplicates()
                .values.tolist()
            )
            province_data[province_key] = ward_district_keys

        DICT_double_check_inverted_provinces[wrong_province_key] = province_data

## Find wards that share a keyword with districts

We have to add `double_check_provinces` to the module to resolve the province level before doing this step.

In [16]:
# Parse every synthetic address at district level and record each address for
# which the parser finds no province, no district, or a district key that
# differs from the row's own ``district_key``.
wrong_districts = []

for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)
        unit = parse_address(address, level=2)

        # All three failure modes are recorded identically, so they are
        # tested in a single condition (same order as the former if/elif chain).
        if (
            unit.province is None
            or unit.district is None
            or unit.district_key != row.district_key
        ):
            print(address)
            wrong_districts.append({
                'address': address,
                'province_english': row.province_english,
                'province_key': row.province_key,
                'wrong_province_key': unit.province_key,
                'district_key': row.district_key,
                'ward_key': row.ward_key,
                'wrong_district_key': unit.district_key,
            })



xatinhhatinhquangngaihuyensontinh
thitrancaitauhadongthaphuyenchauthanh
xaanhiepdongthaphuyenchauthanh
xaannhondongthaphuyenchauthanh
xatannhuandongdongthaphuyenchauthanh
xatanbinhdongthaphuyenchauthanh
xatanphutrungdongthaphuyenchauthanh
xaphulongdongthaphuyenchauthanh
xaanphuthuandongthaphuyenchauthanh
xaphuhuudongthaphuyenchauthanh
xaankhanhdongthaphuyenchauthanh
xatanphudongthaphuyenchauthanh
xahoatandongthaphuyenchauthanh
xatinhhatinhquangngaisontinh
xasonhatinhphuyensonhoa
xaxuanminhhagiangquangbinh
xatiennguyenhagiangquangbinh
xatannamhagiangquangbinh
xabanriahagiangquangbinh
xayenthanhhagiangquangbinh
thitranyenbinhhagiangquangbinh
xatantrinhhagiangquangbinh
xatanbachagiangquangbinh
xabanglanghagiangquangbinh
xayenhahagiangquangbinh
xahuongsonhagiangquangbinh
xaxuangianghagiangquangbinh
xanakhuonghagiangquangbinh
xatienyenhagiangquangbinh
xavithuonghagiangquangbinh
thitranphuyensonlaphuyen
xasuoitosonlaphuyen
xamuongthaisonlaphuyen
xamuongcoisonlaphuyen
xaquanghuysonlaphuyen
xa

In [17]:
# Tabulate the recorded district mismatches. The columns are listed explicitly
# (same names and order as the keys of each recorded dict) so that the frame
# keeps its schema even when no mismatch was recorded; a frame built from an
# empty list would otherwise have no columns and the column-based filtering
# that follows would fail.
df_wrong_districts = pd.DataFrame(
    wrong_districts,
    columns=[
        'address',
        'province_english',
        'province_key',
        'wrong_province_key',
        'district_key',
        'ward_key',
        'wrong_district_key',
    ],
)

In [18]:
# Keep only the rows whose province was parsed correctly, so that what remains
# are mismatches at the district level alone.
province_parsed_ok = df_wrong_districts.province_key.eq(df_wrong_districts.wrong_province_key)
df_wrong_districts = df_wrong_districts.loc[province_parsed_ok]

In [19]:
# Display the district mismatches that remain after the province filter.
df_wrong_districts

Unnamed: 0,address,province_english,province_key,wrong_province_key,district_key,ward_key,wrong_district_key


In [20]:
# Show the remaining rows for which the parser produced no district key at all.
df_wrong_districts.loc[df_wrong_districts.wrong_district_key.isna()]

Unnamed: 0,address,province_english,province_key,wrong_province_key,district_key,ward_key,wrong_district_key


In [21]:
# Spot check: rows in province 'Vinh Phuc' whose English ward name starts with 'Dong'.
in_vinh_phuc = df.province_english == 'Vinh Phuc'
ward_starts_with_dong = df.ward_english.str.startswith('Dong')
df[in_vinh_phuc & ward_starts_with_dong]

Unnamed: 0,province,district,ward,long_province,long_district,long_ward,short_district,short_ward,province_english,district_english,...,address_comma_short_ward_alphanumeric_long_province_alphanumeric_long_district_alphanumeric,address_short_ward_alphanumeric_province_alphanumeric_long_district_alphanumeric,address_comma_short_ward_alphanumeric_province_alphanumeric_long_district_alphanumeric,address_short_ward_alphanumeric_long_province_alphanumeric_short_district_alphanumeric,address_comma_short_ward_alphanumeric_long_province_alphanumeric_short_district_alphanumeric,address_short_ward_alphanumeric_province_alphanumeric_short_district_alphanumeric,address_comma_short_ward_alphanumeric_province_alphanumeric_short_district_alphanumeric,province_key,district_key,ward_key
3084,Vĩnh Phúc,Vĩnh Yên,Đống Đa,Tỉnh Vĩnh Phúc,Thành phố Vĩnh Yên,Phường Đống Đa,Vĩnh Yên,Đống Đa,Vinh Phuc,Vinh Yen,...,"dongda,tinhvinhphuc,thanhphovinhyen",dongdavinhphucthanhphovinhyen,"dongda,vinhphuc,thanhphovinhyen",dongdatinhvinhphucvinhyen,"dongda,tinhvinhphuc,vinhyen",dongdavinhphucvinhyen,"dongda,vinhphuc,vinhyen",vinhphuc,vinhyen,dongda
3086,Vĩnh Phúc,Vĩnh Yên,Đồng Tâm,Tỉnh Vĩnh Phúc,Thành phố Vĩnh Yên,Phường Đồng Tâm,Vĩnh Yên,Đồng Tâm,Vinh Phuc,Vinh Yen,...,"dongtam,tinhvinhphuc,thanhphovinhyen",dongtamvinhphucthanhphovinhyen,"dongtam,vinhphuc,thanhphovinhyen",dongtamtinhvinhphucvinhyen,"dongtam,tinhvinhphuc,vinhyen",dongtamvinhphucvinhyen,"dongtam,vinhphuc,vinhyen",vinhphuc,vinhyen,dongtam
3095,Vĩnh Phúc,Phúc Yên,Đồng Xuân,Tỉnh Vĩnh Phúc,Thành phố Phúc Yên,Phường Đồng Xuân,Phúc Yên,Đồng Xuân,Vinh Phuc,Phuc Yen,...,"dongxuan,tinhvinhphuc,thanhphophucyen",dongxuanvinhphucthanhphophucyen,"dongxuan,vinhphuc,thanhphophucyen",dongxuantinhvinhphucphucyen,"dongxuan,tinhvinhphuc,phucyen",dongxuanvinhphucphucyen,"dongxuan,vinhphuc,phucyen",vinhphuc,phucyen,dongxuan
3114,Vĩnh Phúc,Lập Thạch,Đồng Ích,Tỉnh Vĩnh Phúc,Huyện Lập Thạch,Xã Đồng Ích,Lập Thạch,Đồng Ích,Vinh Phuc,Lap Thach,...,"dongich,tinhvinhphuc,huyenlapthach",dongichvinhphuchuyenlapthach,"dongich,vinhphuc,huyenlapthach",dongichtinhvinhphuclapthach,"dongich,tinhvinhphuc,lapthach",dongichvinhphuclapthach,"dongich,vinhphuc,lapthach",vinhphuc,lapthach,dongich
3122,Vĩnh Phúc,Tam Dương,Đồng Tĩnh,Tỉnh Vĩnh Phúc,Huyện Tam Dương,Xã Đồng Tĩnh,Tam Dương,Đồng Tĩnh,Vinh Phuc,Tam Duong,...,"dongtinh,tinhvinhphuc,huyentamduong",dongtinhvinhphuchuyentamduong,"dongtinh,vinhphuc,huyentamduong",dongtinhtinhvinhphuctamduong,"dongtinh,tinhvinhphuc,tamduong",dongtinhvinhphuctamduong,"dongtinh,vinhphuc,tamduong",vinhphuc,tamduong,dongtinh
3156,Vĩnh Phúc,Yên Lạc,Đồng Cương,Tỉnh Vĩnh Phúc,Huyện Yên Lạc,Xã Đồng Cương,Yên Lạc,Đồng Cương,Vinh Phuc,Yen Lac,...,"dongcuong,tinhvinhphuc,huyenyenlac",dongcuongvinhphuchuyenyenlac,"dongcuong,vinhphuc,huyenyenlac",dongcuongtinhvinhphucyenlac,"dongcuong,tinhvinhphuc,yenlac",dongcuongvinhphucyenlac,"dongcuong,vinhphuc,yenlac",vinhphuc,yenlac,dongcuong
3157,Vĩnh Phúc,Yên Lạc,Đồng Văn,Tỉnh Vĩnh Phúc,Huyện Yên Lạc,Xã Đồng Văn,Yên Lạc,Đồng Văn,Vinh Phuc,Yen Lac,...,"dongvan,tinhvinhphuc,huyenyenlac",dongvanvinhphuchuyenyenlac,"dongvan,vinhphuc,huyenyenlac",dongvantinhvinhphucyenlac,"dongvan,tinhvinhphuc,yenlac",dongvanvinhphucyenlac,"dongvan,vinhphuc,yenlac",vinhphuc,yenlac,dongvan
3204,Vĩnh Phúc,Sông Lô,Đồng Quế,Tỉnh Vĩnh Phúc,Huyện Sông Lô,Xã Đồng Quế,Sông Lô,Đồng Quế,Vinh Phuc,Song Lo,...,"dongque,tinhvinhphuc,huyensonglo",dongquevinhphuchuyensonglo,"dongque,vinhphuc,huyensonglo",dongquetinhvinhphucsonglo,"dongque,tinhvinhphuc,songlo",dongquevinhphucsonglo,"dongque,vinhphuc,songlo",vinhphuc,songlo,dongque
3213,Vĩnh Phúc,Sông Lô,Đồng Thịnh,Tỉnh Vĩnh Phúc,Huyện Sông Lô,Xã Đồng Thịnh,Sông Lô,Đồng Thịnh,Vinh Phuc,Song Lo,...,"dongthinh,tinhvinhphuc,huyensonglo",dongthinhvinhphuchuyensonglo,"dongthinh,vinhphuc,huyensonglo",dongthinhtinhvinhphucsonglo,"dongthinh,tinhvinhphuc,songlo",dongthinhvinhphucsonglo,"dongthinh,vinhphuc,songlo",vinhphuc,songlo,dongthinh


In [22]:
# Build the lookup
#     {province_english: {wrong_district_key: [district_key, ...]}}
# i.e. per province, each district key the parser returned by mistake mapped
# to the real district keys of the rows that produced it.
DICT_double_check_inverted_districts = {}

for province_english in df_wrong_districts['province_english'].unique():
    # All mismatch rows belonging to this province.
    province_rows = df_wrong_districts[df_wrong_districts['province_english'] == province_english]

    DICT_double_check_inverted_districts[province_english] = {
        wrong_key: (
            province_rows.loc[province_rows['wrong_district_key'] == wrong_key, 'district_key']
            .unique()
            .tolist()
        )
        for wrong_key in province_rows['wrong_district_key'].unique()
    }

In [23]:
# Display the district lookup built above.
DICT_double_check_inverted_districts

{}

In [25]:
# Store the province lookup in the package's parser data file.
PARSE_PKL_PATH = '../../vietadminunits/data/parse.pkl'

# NOTE(security): ``pickle.load`` can execute arbitrary code embedded in the
# file -- only run this against the repository's own, trusted parse.pkl.
with open(PARSE_PKL_PATH, 'rb') as f:
    data = pickle.load(f)

data['DICT_double_check_inverted_provinces'] = DICT_double_check_inverted_provinces
# The district lookup is not written; this line was left disabled in the notebook.
# data['DICT_double_check_inverted_districts'] = DICT_double_check_inverted_districts

# Serialise first: opening with 'wb' truncates the file immediately, so a
# pickling error raised after that point would leave parse.pkl corrupted.
serialized = pickle.dumps(data)
with open(PARSE_PKL_PATH, 'wb') as f:
    f.write(serialized)