In [1]:
from vietadminunits import parse_address
import pandas as pd
import re
from unidecode import unidecode

In [2]:
def create_key(text):
    if not isinstance(text, str):
        return text
    u_text = unidecode(text) # Unidecode first help remove special characters
    c_text = u_text.lower().strip() # Case must be second step
    c_text = re.sub(r"\-|\'", ' ', c_text)
    c_text = c_text.replace(' ', '')
    return c_text

In [3]:
def find_new_district_english_module(text):
    au = parse_address(text, 2)
    return au.district_english

In [4]:
def prepare_address(province, change):
    address = province + ' ' + re.sub(f'.* thành ', '', change)
    return address

In [5]:
def create_province_english_module(text):
    au = parse_address(text, 1)
    return au.province_english

In [6]:
def create_district_key_module(text):
    au = parse_address(text)
    return au.district_key

In [7]:
def create_province_key_module(text):
    au = parse_address(text)
    return au.province_key

In [9]:
df = pd.read_csv('../../data/input/danhmuchanhchinhgso.gov.vn.csv')

In [10]:
df['ward_address'] = df['long_ward'].fillna('') + ', ' + df['long_district'] + ', ' + df['long_province']
ward_addresses = df['ward_address'].dropna().tolist()

In [11]:
df['district_address'] = df['long_district'] + ', ' + df['long_province']
district_addresses = df['district_address'].dropna().tolist()

## Find wards that are the same keyword with provinces

In [12]:
def create_district_key(text):
    if not isinstance(text, str):
        return text
    u_text = unidecode(text) # Unidecode first help remove special characters
    c_text = str(u_text).lower().strip() # Case must be second step
    if not re.search(r'^Quan \d{1,2}', c_text, flags=re.IGNORECASE):
        c_text = re.sub(r'\sDistrict$|^Quan\s|^Huyen\s|^Thanh\sPho\s|\sCity$|^Thi\sXa\s|\sTown$|\s0', '', c_text, flags=re.IGNORECASE)
    c_text = re.sub(r"\-|\'", ' ', c_text)
    c_text = re.sub(r'\bqui\b', 'quy', c_text)
    c_text = c_text.replace(' ', '')
    return c_text

In [13]:
def create_province_key(text):
    u_text = unidecode(text) # Unidecode first help remove special characters
    c_text = str(u_text).lower().strip() # Case must be second step
    c_text = re.sub(r'\sProvince$|^Tinh\s|^Thanh\sPho\s|\sCity$', '', c_text, flags=re.IGNORECASE)
    c_text = re.sub(r"\-|\'", ' ', c_text)
    c_text = re.sub(r'\bqui\b', 'quy', c_text)
    c_text = c_text.replace(' ', '')
    return c_text

In [14]:
wrong_addresses = []
for row in df.itertuples():
    unit = parse_address(row.ward_address, level=1)
    
    data = {
        'ward_address': row.ward_address,
        'province': row.long_province,
        'wrong_province_key': unit.province_key,
    }
    
    if unit.province == None:
        print(row.ward_address)
        wrong_addresses.append(data)
        
    # elif unit.district == None:
    #     print(row.ward_address)
    #     wrong_addresses.append(data)
    
    elif unit.province_key != create_province_key(row.long_province):
        print(row.ward_address)
        wrong_addresses.append(data)

In [15]:
wrong_addresses = []
for row in df.itertuples():
    unit = parse_address(row.district_address, level=1)
    
    data = {
        'ward_address': row.district_address,
        'province': row.long_province,
        'wrong_province_key': unit.province_key,
    }
    
    if unit.province == None:
        print(row.district_address)
        wrong_addresses.append(data)
        
    # elif unit.district == None:
    #     print(row.ward_address)
    #     wrong_addresses.append(data)
    
    elif unit.province_key != create_province_key(row.long_province):
        print(row.district_address)
        wrong_addresses.append(data)

In [16]:
df_wrong_provinces = pd.DataFrame(wrong_addresses)

In [17]:
df_wrong_provinces

In [18]:
double_check_provinces = {}

if df_wrong_provinces.shape[0]:
    df_wrong_provinces['true_province_key'] = df_wrong_provinces.province.apply(create_province_key)
    
    
    for row in df_wrong_provinces.itertuples():
        double_check_provinces[row.wrong_province_key] = df_wrong_provinces[df_wrong_provinces['wrong_province_key']==row.wrong_province_key]['true_province_key'].unique().tolist()

In [19]:
double_check_provinces

{}

## Find wards that are the same keyword with districts

We have to add `double_check_provinces` to module to solve province level before doing this step.

In [20]:
wrong_addresses = []
for row in df.itertuples():
    try:
        unit = parse_address(str(row.ward_address), level=2)
    except Exception as e:
        raise Exception(row.ward_address)
    
    # unit = parse_address(str(row.ward_address), level=3)
    
    data = {
        'ward_address': row.ward_address,
        'province': row.long_province,
        'district': row.long_district,
        'wrong_district_key': unit.district_key,
    }
    
    if unit.province == None:
        print(row.ward_address)
        wrong_addresses.append(data)
    elif unit.district == None:
        print(row.ward_address)
        wrong_addresses.append(data)
    # elif unit.ward == None and unit.district_key not in ['bachlongvi', 'conco', 'hoangsa', 'lyson', 'condao']:
    #     print(row.ward_address)
    #     wrong_addresses.append(data)
    
    elif unit.district_key != create_district_key(row.long_district):
        print(row.ward_address, unit.district_key, create_district_key(row.long_district))
        wrong_addresses.append(data)

In [21]:
df_wrong_districts = pd.DataFrame(wrong_addresses)

In [22]:
df_wrong_districts

In [23]:
double_check_districts = {}
if df_wrong_districts.shape[0]:
    # df_wrong_districts = df_wrong_districts.drop(columns='ward_address').drop_duplicates()
    
    df_wrong_districts['district_address'] = df_wrong_districts['district'].apply(create_district_key) + ', ' + df_wrong_districts['province']
    
    df_wrong_districts['true_district_key'] = df_wrong_districts['district_address'].apply(create_district_key_module)
    
    for row in df_wrong_districts.itertuples():
        double_check_districts[row.wrong_district_key] = df_wrong_districts[df_wrong_districts['wrong_district_key']==row.wrong_district_key]['true_district_key'].unique().tolist()

In [24]:
double_check_districts

{}

In [25]:
df_wrong_districts[df_wrong_districts.true_district_key==df_wrong_districts.wrong_district_key]

AttributeError: 'DataFrame' object has no attribute 'true_district_key'