In [1]:
import pandas as pd
from vietadminunits import parse_address, get_data
from vietadminunits.utils import to_key, to_alphanumeric
import pickle

In [2]:
# Load the reference records returned by vietadminunits.get_data() into a
# DataFrame; every later cell derives its columns and test addresses from it.
df = pd.DataFrame(get_data())

In [3]:
# Add a "<column>_alphanumeric" companion for each name column by running it
# through to_alphanumeric. The ward columns are passed through fillna('')
# first (presumably because some rows have no ward -- confirm against the
# data returned by get_data()).
for source_col, fill_missing in [
    ('long_province', False),
    ('long_district', False),
    ('long_ward', True),
    ('province', False),
    ('short_district', False),
    ('short_ward', True),
]:
    values = df[source_col].fillna('') if fill_missing else df[source_col]
    df[f'{source_col}_alphanumeric'] = values.apply(to_alphanumeric)

In [4]:
# Generate synthetic test addresses from every combination of the two ward,
# district and province name columns. Each combination yields four columns:
# district-before-province and province-before-district, each once with the
# parts concatenated directly and once joined by commas.
for ward in ['long_ward', 'ward']:
    # Missing ward values become an empty string so concatenation still works.
    ward_text = df[ward].fillna('')
    for district in ['long_district', 'district']:
        for province in ['long_province', 'province']:
            for first, second in ((district, province), (province, district)):
                suffix = f"{ward}_{first}_{second}"
                df[f"address_{suffix}"] = ward_text + df[first] + df[second]
                df[f"address_comma_{suffix}"] = ward_text + ',' + df[first] + ',' + df[second]

In [5]:
# Derive "<unit>_key" columns with to_key. The extra positional argument is
# 1 for province, 2 for district and 3 for ward (presumably the
# administrative level -- confirm against vietadminunits.utils.to_key).
for level, unit_col in enumerate(['province', 'district', 'ward'], start=1):
    df[f'{unit_col}_key'] = df[unit_col].apply(to_key, args=(level,))

In [6]:
# Names of all columns containing 'address', i.e. the generated test-address
# columns that the checks below iterate over.
address_cols = [name for name in df.columns if 'address' in name]

In [7]:
# Sanity check on the number of generated address columns:
# 2 ward forms x 2 district forms x 2 province forms x 4 layouts = 32.
len(address_cols)

32

## Check for wrongly parsed provinces

In [8]:
# Province check: parse every generated address with level=1 and record each
# row whose parsed province differs from the province the address was built
# from. Mismatching addresses are also printed as they are found.
wrong_provinces = []
for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)
        unit = parse_address(address, level=1)

        if unit.province != row.province:
            # The record is only needed for mismatches, so it is built here
            # instead of once for every (row, column) pair.
            data = {
                'address': address,
                'province': row.province,
                'province_key': row.province_key,
                'district_key': row.district_key,
                'ward_key': row.ward_key,
                # Key the parser returned instead of the expected one.
                'wrong_province_key': unit.province_key,
            }
            print(address)
            wrong_provinces.append(data)

In [9]:
# One row per province mismatch; the frame is empty when every address parsed
# to the expected province.
df_wrong_provinces = pd.DataFrame(wrong_provinces)

In [10]:
# Summary of the mismatch frame; "0 entries" means no province mismatches.
df_wrong_provinces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


In [11]:
# Display the province mismatches for manual inspection.
df_wrong_provinces

## Check for wrongly parsed districts

In [12]:
# District check: parse every generated address with level=2 and record each
# row whose parsed district differs from the district the address was built
# from. Mismatching addresses are also printed as they are found.
wrong_districts = []
for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)
        unit = parse_address(address, level=2)

        if unit.district != row.district:
            # The record is only needed for mismatches, so it is built here
            # instead of once for every (row, column) pair.
            data = {
                'address': address,
                'province_english': row.province_english,
                'province_key': row.province_key,
                # Parsed province key, kept next to the expected one.
                'wrong_province_key': unit.province_key,
                'district_key': row.district_key,
                'ward_key': row.ward_key,
                # District key the parser returned instead of the expected one.
                'wrong_district_key': unit.district_key,
            }
            print(address)
            wrong_districts.append(data)



In [13]:
# One row per district mismatch; the frame is empty when every address parsed
# to the expected district.
df_wrong_districts = pd.DataFrame(wrong_districts)

In [14]:
# Display the district mismatches for manual inspection.
df_wrong_districts

In [15]:
# Summary of the mismatch frame; "0 entries" means no district mismatches.
df_wrong_districts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


## Check for wrongly parsed wards

In [16]:
# Ward check: parse every generated address with level=3 and record each row
# whose parsed ward differs from the ward the address was built from. Each
# mismatch is printed with the expected and the parsed ward.
wrong_wards = []
for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)
        unit = parse_address(address, level=3)

        if unit.ward != row.ward:
            # The record is only needed for mismatches, so it is built here
            # instead of once for every (row, column) pair.
            data = {
                'address': address,
                'province_english': row.province_english,
                'province_key': row.province_key,
                # Parsed province key, kept next to the expected one.
                'wrong_province_key': unit.province_key,
                'district_english': row.district_english,
                'district_key': row.district_key,
                # Parsed district key, kept next to the expected one.
                'wrong_district_key': unit.district_key,
                'ward_key': row.ward_key,
                # Ward key the parser returned instead of the expected one.
                'wrong_ward_key': unit.ward_key,
            }
            print(address, row.ward, unit.ward)
            wrong_wards.append(data)

In [17]:
# One row per ward mismatch; the frame is empty when every address parsed to
# the expected ward.
df_wrong_wards = pd.DataFrame(wrong_wards)

In [18]:
# Display the ward mismatches for manual inspection.
df_wrong_wards

In [19]:
# Summary of the mismatch frame; "0 entries" means no ward mismatches.
df_wrong_wards.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
