In [1]:
import pandas as pd
from vietadminunits import parse_address, get_data
from vietadminunits.utils import to_key, to_alphanumeric
import pickle

In [2]:
# Load the administrative-unit records returned by vietadminunits' get_data()
# into a DataFrame. Later cells read its province / district / ward columns
# (plain, long_*, short_* and *_english variants) from this frame.
df = pd.DataFrame(get_data())

In [3]:
# Add a `to_alphanumeric`-normalised copy of each name column.
# Each spec is (new column, source column, fill missing values with '' first);
# only the ward columns are filled before normalising.
alphanumeric_specs = [
    ('long_province_alphanumeric', 'long_province', False),
    ('long_district_alphanumeric', 'long_district', False),
    ('long_ward_alphanumeric', 'long_ward', True),
    ('province_alphanumeric', 'province', False),
    ('short_district_alphanumeric', 'short_district', False),
    ('short_ward_alphanumeric', 'short_ward', True),
]
for target_col, source_col, fill_missing in alphanumeric_specs:
    names = df[source_col].fillna('') if fill_missing else df[source_col]
    df[target_col] = names.apply(to_alphanumeric)

In [4]:
# Build synthetic test addresses for every combination of long/plain ward,
# district and province names: once concatenated with no separator at all and
# once joined with commas. Column order matters because later cells iterate
# over these columns in the order they were created.
level_combos = [
    (ward_col, district_col, province_col)
    for ward_col in ('long_ward', 'ward')
    for district_col in ('long_district', 'district')
    for province_col in ('long_province', 'province')
]
for ward_col, district_col, province_col in level_combos:
    suffix = f"{ward_col}_{district_col}_{province_col}"
    ward_text = df[ward_col].fillna('')  # a missing ward contributes an empty string
    district_text = df[district_col]
    province_text = df[province_col]
    df[f"address_{suffix}"] = ward_text + district_text + province_text
    df[f"address_comma_{suffix}"] = ward_text + ',' + district_text + ',' + province_text

In [5]:

# Derive the expected lookup key for each unit with `to_key`, which is given
# 1 for provinces, 2 for districts and 3 for wards as its second argument
# (presumably the administrative level -- confirm against vietadminunits.utils).
for level_number, name_col in enumerate(('province', 'district', 'ward'), start=1):
    df[f'{name_col}_key'] = df[name_col].apply(to_key, args=(level_number,))

In [6]:
# Every column whose name contains 'address', in frame order; these are the
# address variants that the validation cells below feed to the parser.
address_cols = list(filter(lambda name: 'address' in name, df.columns))

## Find wards that share the same keyword as provinces

In [7]:
# Parse every generated address variant at province level (level=1) and record
# each row whose parsed province is missing or differs from the expected one.
# Each record keeps the address, the expected province and key, and the key
# the parser actually produced.
wrong_provinces = []

for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)
        unit = parse_address(address, level=1)

        # `is None` is the identity test for "nothing was parsed"; the
        # inequality covers a province that was parsed but is the wrong one.
        if unit.province is None or unit.province != row.province:
            print(address)
            # The record is only built for mismatches, which are the only rows kept.
            wrong_provinces.append({
                'address': address,
                'province': row.province,
                'province_key': row.province_key,
                'wrong_province_key': unit.province_key,
            })

In [8]:
# One row per province mismatch collected in `wrong_provinces`; the frame is
# empty (and has no columns) when every address parsed to the expected province.
df_wrong_provinces = pd.DataFrame(wrong_provinces)

In [9]:
# Summarise the province mismatches; "0 entries" means no address variant failed.
df_wrong_provinces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


In [10]:
# Display the recorded province mismatches for manual inspection.
df_wrong_provinces

In [11]:
# df_wrong_provinces[df_wrong_provinces.wrong_province_key.isna()]

In [12]:
# Map each wrongly parsed province key to the list of distinct expected
# province keys it was confused with, in order of first appearance.
# Built in a single pass over the mismatches instead of re-filtering the
# whole frame once per row.
DICT_double_check_provinces = {}
for row in df_wrong_provinces.itertuples():
    expected_keys = DICT_double_check_provinces.setdefault(row.wrong_province_key, [])
    # A missing parsed key (address not recognised at all) keeps an empty list:
    # an equality filter on the column never matches a missing value, so such
    # rows contribute the key but no expected provinces.
    if pd.notna(row.wrong_province_key) and row.province_key not in expected_keys:
        expected_keys.append(row.province_key)

In [13]:
# Show the mapping of wrongly parsed province key -> expected province keys;
# `{}` means there were no province mismatches to resolve.
DICT_double_check_provinces

{}

## Find wards that share the same keyword as districts

We have to add `double_check_provinces` to the module to resolve province-level conflicts before doing this step.

In [14]:
# Parse every generated address variant with level=3 and record each row where
# the parser found no province, found no district, or returned a district
# different from the expected one.
# NOTE(review): the parsed province is only checked for being present, not for
# matching row.province -- a same-named district in another province would not
# be flagged here. Confirm this is intended.
wrong_districts = []

for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)
        unit = parse_address(address, level=3)

        # Identity tests detect "nothing parsed"; the inequality detects a
        # district that was parsed but is the wrong one.
        district_mismatch = (
            unit.province is None
            or unit.district is None
            or unit.district != row.district
        )
        if district_mismatch:
            print(address)
            # Keep expected and parsed keys side by side for later inspection.
            wrong_districts.append({
                'address': address,
                'province_english': row.province_english,
                'province_key': row.province_key,
                'wrong_province_key': unit.province_key,
                'district_key': row.district_key,
                'wrong_district_key': unit.district_key,
            })



In [15]:
# One row per district mismatch collected in `wrong_districts`; empty when
# every address variant parsed to the expected district.
df_wrong_districts = pd.DataFrame(wrong_districts)

In [17]:
# Display the recorded district mismatches for manual inspection.
df_wrong_districts

In [18]:
# Summarise the district mismatches; "0 entries" means no address variant failed.
df_wrong_districts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame


## Double-check wards

In [21]:
# Parse every generated address variant with level=3 and record each row where
# the parser found no province, found no district, or (outside the exempt
# districts) found no ward or a ward different from the expected one.

# Parsed district keys for which the ward comparison is skipped.
# NOTE(review): presumably island districts without ward-level units -- confirm.
WARD_CHECK_EXEMPT_DISTRICT_KEYS = ('bachlongvi', 'conco', 'hoangsa', 'lyson', 'condao')

wrong_wards = []

for address_col in address_cols:
    for row in df.itertuples():
        address = getattr(row, address_col)

        try:
            unit = parse_address(address, level=3)

            # Province/district presence is checked first; the ward is only
            # compared when the parsed district is not exempt.
            ward_mismatch = (
                unit.province is None
                or unit.district is None
                or (
                    unit.district_key not in WARD_CHECK_EXEMPT_DISTRICT_KEYS
                    and (unit.ward is None or unit.ward != row.ward)
                )
            )
            if ward_mismatch:
                print(address)
                # Keep expected and parsed keys side by side for later inspection.
                wrong_wards.append({
                    'address': address,
                    'province_english': row.province_english,
                    'province_key': row.province_key,
                    'wrong_province_key': unit.province_key,
                    'district_english': row.district_english,
                    'district_key': row.district_key,
                    'wrong_district_key': unit.district_key,
                    'ward_key': row.ward_key,
                    'wrong_ward_key': unit.ward_key,
                })

        except Exception as e:
            # Best effort: one failing address must not abort the whole sweep.
            # Failures are only printed, not added to `wrong_wards`, so check
            # the cell output as well as the resulting frame.
            print('ERROR address:', address)
            print('ERROR info:', e)

In [22]:
# One row per ward mismatch collected in `wrong_wards`; addresses that raised
# during parsing were only printed and are not part of this frame.
df_wrong_wards = pd.DataFrame(wrong_wards)

In [24]:
# Display the recorded ward mismatches for manual inspection.
df_wrong_wards

In [25]:
# Summarise the ward mismatches; "0 entries" means no address variant was recorded as failing.
df_wrong_wards.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
