In [1]:
import pandas as pd
from vietadminunits import parse_address, get_data
from vietadminunits.utils import to_key, to_alphanumeric
import pickle

In [2]:
# Load whatever get_data() returns into a DataFrame; every later cell works on this frame.
df = pd.DataFrame(get_data())

In [3]:
# Derive a "<column>_alphanumeric" variant of each name column via to_alphanumeric.
# Only the ward columns are blanked with fillna('') first
# (presumably some rows have no ward -- TODO confirm against the data).
for source_col in [
    'long_province', 'long_district', 'long_ward',
    'province', 'short_district', 'short_ward',
]:
    names = df[source_col]
    if source_col.endswith('_ward'):
        names = names.fillna('')
    df[f'{source_col}_alphanumeric'] = names.apply(to_alphanumeric)

In [4]:
# Build one synthetic address column per combination of long/short ward,
# district and province names: once run together and once comma-separated.
# The parts are joined as ward + province + district, i.e. the province sits
# before the district in every variant.
ward_cols = ['long_ward_alphanumeric', 'short_ward_alphanumeric']
district_cols = ['long_district_alphanumeric', 'short_district_alphanumeric']
province_cols = ['long_province_alphanumeric', 'province_alphanumeric']

combos = [
    (ward_col, district_col, province_col)
    for ward_col in ward_cols
    for district_col in district_cols
    for province_col in province_cols
]

for ward_col, district_col, province_col in combos:
    suffix = f"{ward_col}_{province_col}_{district_col}"
    ward_part, province_part, district_part = df[ward_col], df[province_col], df[district_col]
    df[f"address_{suffix}"] = ward_part + province_part + district_part
    df[f"address_comma_{suffix}"] = ward_part + ',' + province_part + ',' + district_part

In [5]:
# Add a "<unit>_key" column per administrative level by calling
# to_key(value, level) with level 1 = province, 2 = district, 3 = ward.
for level, unit_col in enumerate(['province', 'district', 'ward'], start=1):
    df[f'{unit_col}_key'] = df[unit_col].apply(to_key, args=(level,))

In [6]:
# Collect every column whose name contains "address" (the generated variants).
address_cols = list(filter(lambda name: 'address' in name, df.columns))

## Find wards that share a keyword with provinces

In [7]:
# Parse every generated address variant at province level (level=1) and record
# each row where the parser either finds no province or returns a province key
# different from the one stored in the data.
wrong_provinces = []

for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)
        unit = parse_address(address, level=1)

        # Both failure modes are recorded the same way, so test them together.
        # `is None` is the identity check; `== None` can be hijacked by __eq__.
        if unit.province is None or unit.province_key != row.province_key:
            print(address)
            # Build the record only for failures instead of for every row.
            wrong_provinces.append({
                'address': address,
                'province': row.province,
                'province_key': row.province_key,
                'district_key': row.district_key,
                'ward_key': row.ward_key,
                'wrong_province_key': unit.province_key,
            })

In [8]:
# Pin the column list so the frame keeps its schema even when no mismatches
# were recorded; an empty list alone would give a frame with no columns.
# The order matches the keys of the records appended to wrong_provinces.
df_wrong_provinces = pd.DataFrame(
    wrong_provinces,
    columns=[
        'address',
        'province',
        'province_key',
        'district_key',
        'ward_key',
        'wrong_province_key',
    ],
)

In [9]:
# Summarise the province mismatch table; zero entries means every variant parsed to the right province.
df_wrong_provinces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


In [None]:
# Display the recorded province mismatches for manual review.
df_wrong_provinces

In [None]:
# Build {wrong_province_key: {province_key: [[ward_key, district_key], ...]}}:
# for each province key the parser returned by mistake, list the real provinces
# involved and the distinct ward/district key pairs whose addresses triggered it.
DICT_double_check_inverted_provinces = {}

# The frame may have no columns at all when no mismatches were recorded, so
# only index into it when there is at least one row.
if not df_wrong_provinces.empty:
    # Visit each wrong key once. Iterating over rows would rebuild the very
    # same entry for every row that shares the key.
    for wrong_province_key in df_wrong_provinces['wrong_province_key'].unique():
        rows_for_wrong_key = df_wrong_provinces[
            df_wrong_provinces['wrong_province_key'] == wrong_province_key
        ]
        province_data = {}
        for province_key in rows_for_wrong_key['province_key'].unique().tolist():
            rows_for_province = rows_for_wrong_key[rows_for_wrong_key['province_key'] == province_key]
            province_data[province_key] = (
                rows_for_province[['ward_key', 'district_key']].drop_duplicates().values.tolist()
            )

        DICT_double_check_inverted_provinces[wrong_province_key] = province_data

## Find wards that share a keyword with districts

We have to add `double_check_provinces` to the module to resolve the province level before running this step.

In [None]:
# Parse every generated address variant at district level (level=2) and record
# each row where the parser finds no province, finds no district, or returns a
# district key different from the one stored in the data.
wrong_districts = []

for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)
        unit = parse_address(address, level=2)

        # All three failure modes are recorded identically, so test them together.
        # `is None` is the identity check; `== None` can be hijacked by __eq__.
        parse_failed = (
            unit.province is None
            or unit.district is None
            or unit.district_key != row.district_key
        )
        if parse_failed:
            print(address)
            # Build the record only for failures instead of for every row.
            wrong_districts.append({
                'address': address,
                'province_english': row.province_english,
                'province_key': row.province_key,
                'wrong_province_key': unit.province_key,
                'district_key': row.district_key,
                'ward_key': row.ward_key,
                'wrong_district_key': unit.district_key,
            })



In [None]:
# Pin the column list so the frame keeps its schema even when no mismatches
# were recorded. Without it an empty list produces a frame with no columns and
# the later `df_wrong_districts.province_key` access raises AttributeError.
# The order matches the keys of the records appended to wrong_districts.
df_wrong_districts = pd.DataFrame(
    wrong_districts,
    columns=[
        'address',
        'province_english',
        'province_key',
        'wrong_province_key',
        'district_key',
        'ward_key',
        'wrong_district_key',
    ],
)

In [None]:
# Keep only rows whose province was parsed correctly, so what remains are
# mismatches at the district level alone.
df_wrong_districts = df_wrong_districts.loc[
    df_wrong_districts.province_key.eq(df_wrong_districts.wrong_province_key)
]

In [None]:
# Display the remaining district-level mismatches for manual review.
df_wrong_districts

In [None]:
# Show the mismatches where the parser returned no district key at all.
df_wrong_districts.loc[df_wrong_districts.wrong_district_key.isna()]

In [None]:
# Build {province_english: {wrong_district_key: [district_key, ...]}}:
# per province, map each district key the parser returned by mistake to the
# distinct real district keys of the rows that produced it.
DICT_double_check_inverted_districts = {}

for province_english in df_wrong_districts['province_english'].unique():
    province_rows = df_wrong_districts[df_wrong_districts['province_english'] == province_english]

    DICT_double_check_inverted_districts[province_english] = {
        wrong_district_key: (
            province_rows[province_rows['wrong_district_key'] == wrong_district_key]['district_key']
            .unique()
            .tolist()
        )
        for wrong_district_key in province_rows['wrong_district_key'].unique()
    }

In [None]:
# Display the resulting province -> wrong district -> real districts mapping.
DICT_double_check_inverted_districts