In [1]:
import pandas as pd
from vietadminunits import parse_address, get_data
from vietadminunits.utils import to_key, to_alphanumeric
import pickle


In [2]:
def find_new_district_english_module(text):
    """Return the ``district_english`` attribute of the unit parsed from ``text``.

    ``parse_address`` is called with ``2`` as its second positional argument
    (presumably the district level -- confirm against the library signature).
    """
    parsed_unit = parse_address(text, 2)
    district_name = parsed_unit.district_english
    return district_name

In [3]:
def create_province_english_module(text):
    """Return the ``province_english`` attribute of the unit parsed from ``text``.

    ``parse_address`` is called with ``1`` as its second positional argument
    (presumably the province level -- confirm against the library signature).
    """
    parsed_unit = parse_address(text, 1)
    province_name = parsed_unit.province_english
    return province_name

In [4]:
def create_district_key_module(text):
    """Return the ``district_key`` of the unit parsed from ``text``.

    The address is parsed with the library's default second argument.
    """
    return parse_address(text).district_key

In [5]:
def create_province_key_module(text):
    """Return the ``province_key`` of the unit parsed from ``text``.

    The address is parsed with the library's default second argument.
    """
    return parse_address(text).province_key

In [6]:
# Build the working frame from the records returned by vietadminunits.get_data().
admin_unit_records = get_data()
df = pd.DataFrame(admin_unit_records)

In [7]:
# (target column, source column, blank-fill missing values first?)
# Only the two ward columns are blank-filled before normalisation; the order of
# this table fixes the order in which the new columns are appended to `df`.
alphanumeric_specs = [
    ('long_province_alphanumeric', 'long_province', False),
    ('long_district_alphanumeric', 'long_district', False),
    ('long_ward_alphanumeric', 'long_ward', True),
    ('province_alphanumeric', 'province', False),
    ('short_district_alphanumeric', 'short_district', False),
    ('short_ward_alphanumeric', 'short_ward', True),
]

for target_col, source_col, fill_missing in alphanumeric_specs:
    source_values = df[source_col]
    if fill_missing:
        source_values = source_values.fillna('')
    df[target_col] = source_values.apply(to_alphanumeric)

In [8]:
# Generate one address column per (ward, district, province) spelling combination,
# in two flavours: parts glued together directly, and parts joined by commas.
ward_variants = ['long_ward_alphanumeric', 'short_ward_alphanumeric']
district_variants = ['long_district_alphanumeric', 'short_district_alphanumeric']
province_variants = ['long_province_alphanumeric', 'province_alphanumeric']

for ward in ward_variants:
    for district in district_variants:
        for province in province_variants:
            variant_name = f"{ward}_{district}_{province}"
            ward_part = df[ward]
            district_part = df[district]
            province_part = df[province]
            df[f"address_{variant_name}"] = ward_part + district_part + province_part
            df[f"address_comma_{variant_name}"] = ward_part + ',' + district_part + ',' + province_part

In [9]:
# Expected lookup keys: to_key is called with 1 for provinces and 2 for districts
# as its second argument.
df['province_key'] = df['province'].apply(lambda name: to_key(name, 1))
df['district_key'] = df['district'].apply(lambda name: to_key(name, 2))

In [10]:
# Every column whose name contains 'address' is treated as a test-input column.
address_cols = list(filter(lambda column_name: 'address' in column_name, df.columns))

## Find wards that share a keyword with provinces

In [11]:
wrong_provinces = []

# Re-parse every generated address spelling at province level and record each
# row whose parsed province is missing or does not match the expected key.
for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)
        unit = parse_address(address, level=1)

        # Both failure modes are recorded identically, so test them together;
        # `is None` is the correct identity check for a missing province.
        if unit.province is None or unit.province_key != row.province_key:
            print(address)
            # Build the record only for failures instead of for every row.
            wrong_provinces.append({
                'address': address,
                'province': row.province,
                'province_key': row.province_key,
                'wrong_province_key': unit.province_key,
            })

In [12]:
# Pin the column set explicitly: when no mismatch was found `wrong_provinces` is
# an empty list, and a bare pd.DataFrame([]) has no columns at all, so any
# column lookup on the result would fail.
df_wrong_provinces = pd.DataFrame(
    wrong_provinces,
    columns=['address', 'province', 'province_key', 'wrong_province_key'],
)

In [13]:
# Print a summary (row count, columns, dtypes) of the province mismatches.
df_wrong_provinces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


In [14]:
# Display the province-level mismatches collected above.
df_wrong_provinces

In [15]:
# df_wrong_provinces[df_wrong_provinces.wrong_province_key.isna()]

In [16]:
# Map each wrongly parsed province key to the list of province keys that were
# actually expected for the addresses that produced it.
DICT_double_check_provinces = {}

# An empty mismatch frame may have no columns at all, so only look the column
# up when there is at least one row.
if not df_wrong_provinces.empty:
    # Visit each distinct wrong key once instead of re-running the same filter
    # for every row; unique() keeps first-appearance order, so the dict is
    # populated in the same order as a row-by-row pass would produce.
    for wrong_province_key in df_wrong_provinces['wrong_province_key'].unique():
        same_wrong_key = df_wrong_provinces['wrong_province_key'] == wrong_province_key
        DICT_double_check_provinces[wrong_province_key] = (
            df_wrong_provinces.loc[same_wrong_key, 'province_key'].unique().tolist()
        )

In [17]:
# Display the wrong-key -> expected-keys mapping for provinces.
DICT_double_check_provinces

{}

## Find wards that share a keyword with districts

We have to add `double_check_provinces` to the module first, so that the province level is resolved before running this step.

In [18]:
wrong_districts = []

# Re-parse every generated address spelling at district level and record each
# row whose province or district is missing, or whose district key differs
# from the expected one.
for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)
        unit = parse_address(address, level=2)

        # All three failure modes are recorded identically, so test them in one
        # condition; `is None` is the correct identity check for a missing value.
        parse_failed = (
            unit.province is None
            or unit.district is None
            or unit.district_key != row.district_key
        )
        if parse_failed:
            print(address)
            # Build the record only for failures instead of for every row.
            wrong_districts.append({
                'address': address,
                'province_english': row.province_english,
                'province_key': row.province_key,
                'wrong_province_key': unit.province_key,
                'district_key': row.district_key,
                'wrong_district_key': unit.district_key,
            })



thitrandienbiendonghuyendienbiendongtinhdienbien
xanasonhuyendienbiendongtinhdienbien
xaphinhuhuyendienbiendongtinhdienbien
xachiengsohuyendienbiendongtinhdienbien
xamuongluanhuyendienbiendongtinhdienbien
xapunhihuyendienbiendongtinhdienbien
xanonguhuyendienbiendongtinhdienbien
xaxadunghuyendienbiendongtinhdienbien
xakeolomhuyendienbiendongtinhdienbien
xaluangioihuyendienbiendongtinhdienbien
xaphinhgianghuyendienbiendongtinhdienbien
xapuhonghuyendienbiendongtinhdienbien
xatiadinhhuyendienbiendongtinhdienbien
xahangliahuyendienbiendongtinhdienbien
phuongthanhxuanquan12thanhphohochiminh
phuongthanhlocquan12thanhphohochiminh
phuonghiepthanhquan12thanhphohochiminh
phuongthoianquan12thanhphohochiminh
phuongtanchanhhiepquan12thanhphohochiminh
phuonganphudongquan12thanhphohochiminh
phuongtanthoihiepquan12thanhphohochiminh
phuongtrungmytayquan12thanhphohochiminh
phuongtanhungthuanquan12thanhphohochiminh
phuongdonghungthuanquan12thanhphohochiminh
phuongtanthoinhatquan12thanhphohochiminh
phuong1

In [19]:
# Pin the column set explicitly: when no mismatch was found `wrong_districts` is
# an empty list, and a bare pd.DataFrame([]) has no columns at all, so any
# column lookup on the result (e.g. `.province_key`) would raise.
df_wrong_districts = pd.DataFrame(
    wrong_districts,
    columns=[
        'address',
        'province_english',
        'province_key',
        'wrong_province_key',
        'district_key',
        'wrong_district_key',
    ],
)

In [20]:
# Keep only rows whose province was parsed correctly, so what remains are
# district-level mismatches.
province_parsed_ok = df_wrong_districts['province_key'] == df_wrong_districts['wrong_province_key']
df_wrong_districts = df_wrong_districts[province_parsed_ok]

In [21]:
# Display the district-level mismatches.
df_wrong_districts

Unnamed: 0,address,province_english,province_key,wrong_province_key,district_key,wrong_district_key
0,thitrandienbiendonghuyendienbiendongtinhdienbien,Dien Bien,dienbien,dienbien,dienbiendong,dienbien
1,xanasonhuyendienbiendongtinhdienbien,Dien Bien,dienbien,dienbien,dienbiendong,dienbien
2,xaphinhuhuyendienbiendongtinhdienbien,Dien Bien,dienbien,dienbien,dienbiendong,dienbien
3,xachiengsohuyendienbiendongtinhdienbien,Dien Bien,dienbien,dienbien,dienbiendong,dienbien
4,xamuongluanhuyendienbiendongtinhdienbien,Dien Bien,dienbien,dienbien,dienbiendong,dienbien
...,...,...,...,...,...,...
1051,"binhan,gocongdong,tiengiang",Tien Giang,tiengiang,tiengiang,gocongdong,gocong
1052,"tandien,gocongdong,tiengiang",Tien Giang,tiengiang,tiengiang,gocongdong,gocong
1053,"binhnghi,gocongdong,tiengiang",Tien Giang,tiengiang,tiengiang,gocongdong,gocong
1054,"phuoctrung,gocongdong,tiengiang",Tien Giang,tiengiang,tiengiang,gocongdong,gocong


In [22]:
# Print a summary (row count, columns, dtypes) of the district mismatches.
df_wrong_districts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056 entries, 0 to 1055
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   address             1056 non-null   object
 1   province_english    1056 non-null   object
 2   province_key        1056 non-null   object
 3   wrong_province_key  1056 non-null   object
 4   district_key        1056 non-null   object
 5   wrong_district_key  1056 non-null   object
dtypes: object(6)
memory usage: 49.6+ KB


In [23]:
# Show the rows for which parsing produced no district key at all.
district_key_missing = df_wrong_districts['wrong_district_key'].isna()
df_wrong_districts[district_key_missing]

Unnamed: 0,address,province_english,province_key,wrong_province_key,district_key,wrong_district_key


In [24]:
# Spot-check: rows whose district was parsed as 'quangngai'.
parsed_as_quangngai = df_wrong_districts['wrong_district_key'] == 'quangngai'
df_wrong_districts[parsed_as_quangngai]

Unnamed: 0,address,province_english,province_key,wrong_province_key,district_key,wrong_district_key


In [25]:
# For each province (by English name), map every wrongly parsed district key to
# the list of district keys that were actually expected for those addresses.
DICT_double_check_districts = {}

for province_english in df_wrong_districts['province_english'].unique():
    # Restrict to this province once, then group its rows by the wrong key.
    province_rows = df_wrong_districts[df_wrong_districts['province_english'] == province_english]
    DICT_double_check_districts[province_english] = {
        wrong_district_key: province_rows.loc[
            province_rows['wrong_district_key'] == wrong_district_key, 'district_key'
        ].unique().tolist()
        for wrong_district_key in province_rows['wrong_district_key'].unique()
    }

In [26]:
# Display the per-province wrong-key -> expected-keys mapping for districts.
DICT_double_check_districts

{'Dien Bien': {'dienbien': ['dienbiendong']},
 'Ho Chi Minh': {'quan1': ['quan12', 'quan10', 'quan11']},
 'Binh Duong': {'tanuyen': ['bactanuyen']},
 'Tien Giang': {'gocong': ['gocongtay', 'gocongdong']}}

In [27]:
# Single definition of the data file that is read and then rewritten below.
PARSE_PKL_PATH = '../../vietadminunits/data/parse.pkl'

# NOTE(review): pickle.load can execute arbitrary code from the file; this is
# only acceptable because the path points at the project's own data file.
with open(PARSE_PKL_PATH, 'rb') as f:
    data = pickle.load(f)

data['DICT_double_check_provinces'] = DICT_double_check_provinces
data['DICT_double_check_districts'] = DICT_double_check_districts

# Serialise fully in memory before touching the file: opening with 'wb'
# truncates it immediately, so a pickling error raised mid-dump would
# otherwise leave parse.pkl empty or half-written.
serialized_data = pickle.dumps(data)
with open(PARSE_PKL_PATH, 'wb') as f:
    f.write(serialized_data)