In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
import re

from vietnamadminunits.parser.utils import key_normalize

import warnings
warnings.filterwarnings('ignore')

# Directory two levels above the current working directory; every data and
# package path below is built from it (presumably the repository root — confirm).
BASE_DIR = Path().resolve().parent.parent

In [None]:
# def create_sort(text, level=1):
#     if isinstance(text, str):
#         if level == 1:
#             text = re.sub(r'^Tỉnh\s|Thành phố\s', '', text, flags=re.IGNORECASE)
#         elif level == 2:
#             if re.search(r'^Quận\s\d{1,2}', text, flags=re.IGNORECASE):
#                 pass
#             else:
#                 text = re.sub(r'^Quận\s|Huyện\s|Thị xã\s|Thành phố\s', '', text, flags=re.IGNORECASE)
#         else:
#             if re.search(r'^Phường\s\d{1,2}', text, flags=re.IGNORECASE):
#                 pass
#             else:
#                 text = re.sub(r'^Phường\s|Thị trấn\s|Xã\s', '', text, flags=re.IGNORECASE)
#
#         return text.strip()
#     return text


def create_sort(text, level=1):
    """Strip the administrative-type prefix from a unit name.

    Args:
        text: Full unit name, e.g. ``'Tỉnh Hà Giang'``. Non-string values are
            returned untouched.
        level: 1 for province, 2 for district, 3 for ward. Any other value
            only trims surrounding whitespace.

    Returns:
        The name without its leading type word and without surrounding
        whitespace. Numbered districts/wards (``'Quận 1'``, ``'Phường 12'``)
        keep their prefix.
    """
    if isinstance(text, str):
        # Prefix to drop per level; the negative lookahead protects numbered units.
        prefix_patterns = {
            1: r'^(Tỉnh|Thành phố)\s',
            2: r'^(?!Quận\s\d{1,2})(Quận|Huyện|Thị xã|Thành phố)\s',
            3: r'^(?!Phường\s\d{1,2})(Phường|Thị trấn|Xã)\s',
        }
        prefix_pattern = prefix_patterns.get(level)
        if prefix_pattern:
            text = re.compile(prefix_pattern, flags=re.IGNORECASE).sub('', text)
        return text.strip()

    return text


def create_keywords(row, level=1):
    """Build the JSON-encoded keyword list for one administrative unit.

    Args:
        row: Mapping-like record (for example a DataFrame row) holding the
            key, short-key, type, alias and duplication-flag fields of the
            requested level.
        level: 1 for province, 2 for district; any other value is treated
            as ward.

    Returns:
        A JSON array string of unique keywords ordered longest first, with
        equal-length keywords in alphabetical order, or ``np.nan`` for a
        ward-level row whose ``wardKey`` is null.

    Raises:
        KeyError: If a duplicated short key needs a type acronym and the
            unit type is not in the acronym tables below.
    """
    district_type_acronym = {
        'Quận': 'q',
        'Thị xã': 'tx',
        'Thành phố': 'tp',
        'Huyện': 'h',
    }
    ward_type_acronym = {
        'Phường': 'p',
        'Thị trấn': 'tt',
        'Xã': 'x'
    }

    # Interchangeable spellings; each entry maps a fragment to the variant to add.
    typing_aliases = {
        'quy': 'qui',
        'qui': 'quy',
        'ngok': 'ngoc',
        'ngoc': 'ngok',
        'pak': 'pac',
        'pac': 'pak',
        'dak': 'dac',
        'dac': 'dak',
        'vi': 'vy',
        'vy': 'vi',
        'sy': 'si',
        'si': 'sy',
        'yang': 'jang',
        'jang': 'yang',
        'sa': 'xa',
        'xa': 'sa',
    }

    keywords = []
    if level == 1:
        keywords.append(row['provinceKey'])
        keywords.append(row['provinceShortKey'])
        if pd.notnull(row['provinceAlias']):
            aliases = json.loads(row['provinceAlias'])
            for a in aliases:
                keywords.append(key_normalize(a))

    elif level == 2:
        keywords.append(row['districtKey'])

        # A short key shared with another district is only usable together with its type.
        if not row['districtShortKeyDuplicated']:
            keywords.append(row['districtShortKey'])
        else:
            keywords.append(key_normalize(f"{row['districtShortKey']} {row['districtType']}"))
            keywords.append(key_normalize(f"{district_type_acronym[row['districtType']]} {row['districtShortKey']}"))


        if row['districtShortDuplicated']:
            district_type = row['districtType']
            district_type_key = key_normalize(district_type)
            district_type_key_acronym = district_type_acronym[row['districtType']]
            # Swap the leading type word of the full key for its acronym.
            acronym_keyword = re.sub(fr'^{district_type_key}', district_type_key_acronym, row['districtKey'])
            keywords.append(acronym_keyword)

            # Huyện Kỳ Anh, Thị xã Kỳ Anh, ...: add the type-less short key for the higher-ranked type
            # if row['districtType'] in ['Thị xã', 'Thành phố']:
            #     keywords.append(key_normalize(create_sort(text=row['district'], level=2)))
            # Replaced by the approach of looking up wardKeywords first and only then picking the default

        if pd.notnull(row['districtAlias']):
            aliases = json.loads(row['districtAlias'])
            for a in aliases:
                keywords.append(key_normalize(a))

        # Numbered districts: add "q"/"district" spellings, plain and zero-padded.
        if re.search(r'^quan\d{1,2}', row['districtKey'], flags=re.IGNORECASE):
            keywords.append(row['districtKey'].replace('quan', 'q'))
            keywords.append(row['districtKey'].replace('quan', 'district'))
            number = row['districtKey'].replace('quan', '').zfill(2)
            keywords.append(f"quan{number}")
            keywords.append(f"q{number}")
            keywords.append(f"district{number}")


        # NOTE(review): only the first typing-alias fragment found in the short key is
        # expanded here, whereas the ward branch expands every matching fragment —
        # confirm the asymmetry is intended.
        match = re.search(rf"({'|'.join(sorted(typing_aliases.keys(), key=len, reverse=True))})", row['districtShortKey'], flags=re.IGNORECASE)
        if match:
            typing_alias = match.group(0)
            keywords.append(re.sub(fr"{typing_alias}", typing_aliases[typing_alias], row['districtKey'], flags=re.IGNORECASE))
            keywords.append(re.sub(fr"{typing_alias}", typing_aliases[typing_alias], row['districtShortKey'], flags=re.IGNORECASE))


    else:
        if pd.notnull(row['wardKey']):
            keywords.append(row['wardKey'])


            # A short key shared with another ward is only usable together with its type.
            if not row['wardShortKeyDuplicated']:
                keywords.append(row['wardShortKey'])
            else:
                keywords.append(key_normalize(f"{row['wardShortKey']} {row['wardType']}"))
                keywords.append(key_normalize(f"{ward_type_acronym[row['wardType']]} {row['wardShortKey']}"))


            if row['wardShortDuplicated']:
                ward_type = row['wardType']
                ward_type_key = key_normalize(ward_type)
                ward_type_key_acronym = ward_type_acronym[row['wardType']]
                # Swap the leading type word of the full key for its acronym.
                acronym_keyword = re.sub(fr'^{ward_type_key}', ward_type_key_acronym, row['wardKey'])
                keywords.append(acronym_keyword)


            if pd.notnull(row['wardAlias']):
                aliases = json.loads(row['wardAlias'])
                for a in aliases:
                    # keywords.append(key_normalize(a)) # xãnhânthành and xãhợpthành must resolve to xãđôngthành
                    # Ward aliases are stored as written, without key_normalize.
                    keywords.append(a)

            # Numbered wards: add "p"/"f"/"ward" spellings, plain and zero-padded.
            if re.search(r'^phuong\d{1,2}', row['wardKey'], flags=re.IGNORECASE):
                keywords.append(row['wardKey'].replace('phuong', 'p'))
                keywords.append(row['wardKey'].replace('phuong', 'f'))
                keywords.append(row['wardKey'].replace('phuong', 'ward'))
                number = row['wardKey'].replace('phuong', '').zfill(2)
                keywords.append(f"phuong{number}")
                keywords.append(f"p{number}")
                keywords.append(f"f{number}")
                keywords.append(f"ward{number}")

            if 'thitrannongtruong' in row['wardKey']:
                keywords.append(row['wardKey'].replace('thitrannongtruong', 'thitrannt'))
                keywords.append(row['wardKey'].replace('thitrannongtruong', 'ttnt'))
                keywords.append(row['wardKey'].replace('thitrannongtruong', 'nt'))


            for key in typing_aliases.keys():
                if key in row['wardShortKey']:
                    keywords.append(row['wardKey'].replace(key, typing_aliases[key]))
                    keywords.append(row['wardShortKey'].replace(key, typing_aliases[key]))


        else:
            return np.nan

    # Deduplicate, then order longest first. The keyword itself is the tie-break so
    # the output does not depend on set iteration order, which differs between
    # interpreter runs because string hashing is randomized.
    keywords = sorted(set(keywords), key=lambda keyword: (-len(keyword), keyword))
    return json.dumps(keywords)



def zill_code(value, level=1):
    """Zero-pad an administrative code to the width used at ``level``.

    Args:
        value: Code as a number or numeric string; null values pass through.
        level: 1 pads to 2 characters, 2 to 3 characters, 3 to 5 characters.
            Any other level returns ``value`` unchanged.

    Returns:
        The padded code as a string, or ``value`` itself when it is null or
        the level is not recognised.
    """
    if pd.isnull(value):
        return value

    for unit_level, width in ((1, 2), (2, 3), (3, 5)):
        if level == unit_level:
            # int() first so that float-typed codes such as 1.0 lose the decimal part.
            return str(int(value)).zfill(width)

    return value

In [None]:
# Load the processed legacy dataset; all later cells enrich this frame in place.
df = pd.read_csv(BASE_DIR / 'data/processed/legacy_63-province-10040-ward_with_location.csv')

In [None]:
# Display the loaded frame for a quick visual check.
df

## Enriching data

### Adding basic columns

In [None]:
unit_cols = ['province', 'district', 'ward']
# Administrative level of each unit column: 1 = province, 2 = district, 3 = ward.
level_map = dict(zip(unit_cols, (1, 2, 3)))

for col in unit_cols:
    level = level_map[col]
    code_col = f"{col}Code"
    short_col = f"{col}Short"

    # Zero-pad the codes to the width used at this level.
    df[code_col] = df[code_col].apply(zill_code, args=(level,))

    # Normalized key of the full name.
    df[f"{col}Key"] = df[col].apply(key_normalize)

    # Normalized key of the short name. The short-name column is read from the
    # loaded data as-is; it is not regenerated with create_sort here.
    df[f"{col}ShortKey"] = df[short_col].apply(key_normalize)

### Checking duplication

#### District

In [None]:
# districtKey: one row per distinct district, also used by the short-key check.
district_cols = ['province', 'provinceKey', 'district', 'districtKey', 'districtShortKey']
df_district = df[district_cols].drop_duplicates()

# Author's observation: every districtKey is unique within its province (all counts are 1).
(
    df_district
    .groupby(['province', 'districtKey'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
    .head()
)

In [None]:
# districtShortKey: count how many districts of a province share each short key.
count_district_short_key = (
    df_district
    .groupby(['province', 'districtShortKey'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
)

# Keep only the shared short keys and flag them for the merge back into df.
# Author's observation: because same-named districts carry a "(Type)" suffix in
# their short name, e.g. Kỳ Anh (Huyện) and Kỳ Anh (Thị xã), no districtShortKey
# is duplicated.
duplicated_district_short_key = (
    count_district_short_key
    .loc[count_district_short_key['count'] > 1]
    .drop(columns=['count'])
    .assign(districtShortKeyDuplicated=True)
)
duplicated_district_short_key

In [None]:
# Add districtShortKeyDuplicated. This is the older flow, from when districtShortKey
# could still collide because same-named districts had no "(Type)" suffix.
# The flag is consumed by create_keywords.
df = pd.merge(df, duplicated_district_short_key, on=['province', 'districtShortKey'], how='left')

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selection: that form is chained assignment and leaves df unchanged under pandas
# Copy-on-Write, so unmatched rows would stay NaN. astype(bool) keeps the column
# usable as a boolean mask.
df['districtShortKeyDuplicated'] = df['districtShortKeyDuplicated'].fillna(False).astype(bool)

df[df['districtShortKeyDuplicated']][['province', 'district', 'districtShort']].drop_duplicates().sort_values(by='districtShort')

#### Ward

In [None]:
# wardKey: count the rows sharing each (province, district, wardKey).
count_ward_key = (
    df
    .groupby(['province', 'district', 'wardKey'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
)
count_ward_key['wardKeyDuplicated'] = count_ward_key['count'] > 1

# Only the colliding keys are merged back into df, so the count column is dropped.
duplicated_ward_key = (
    count_ward_key
    .loc[count_ward_key['wardKeyDuplicated']]
    .drop(columns=['count'])
)

# Author's observation: many wardKey values collide inside one district because
# the Vietnamese diacritics are removed.
print(duplicated_ward_key.shape[0])
duplicated_ward_key

In [None]:
# Add wardKeyDuplicated
df = pd.merge(df, duplicated_ward_key, on=['province', 'district', 'wardKey'], how='left')

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selection: that form is chained assignment and leaves df unchanged under pandas
# Copy-on-Write. A leftover NaN would be truthy in np.where below and would
# switch every ward to the accented key.
df['wardKeyDuplicated'] = df['wardKeyDuplicated'].fillna(False).astype(bool)

# For colliding wards, rebuild wardKey and wardShortKey with the Vietnamese
# diacritics kept (second positional argument of key_normalize set to False).
df['wardKey'] = np.where(df['wardKeyDuplicated'], df['ward'].apply(key_normalize, args=([], False)), df['wardKey'])
df['wardShortKey'] = np.where(df['wardKeyDuplicated'], df['wardShort'].apply(key_normalize, args=([], False)), df['wardShortKey'])

# Preview
df[df['wardKeyDuplicated']][['province', 'district', 'ward']].sort_values(by=['province', 'district', 'ward']).reset_index(drop=True)

In [None]:
# wardShortKey: count the rows sharing each (province, district, wardShortKey).
count_ward_short_key = (
    df
    .groupby(['province', 'district', 'wardShortKey'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
)

# Keep only the shared short keys and flag them for the merge back into df.
duplicated_ward_short_key = (
    count_ward_short_key
    .loc[count_ward_short_key['count'] > 1]
    .drop(columns=['count'])
    .assign(wardShortKeyDuplicated=True)
)
duplicated_ward_short_key

In [None]:
# Add wardShortKeyDuplicated
df = pd.merge(df, duplicated_ward_short_key, on=['province', 'district', 'wardShortKey'], how='left')

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selection: that form is chained assignment and leaves df unchanged under pandas
# Copy-on-Write, and a column still holding NaN cannot be used as a boolean mask.
df['wardShortKeyDuplicated'] = df['wardShortKeyDuplicated'].fillna(False).astype(bool)

# Preview
# These rows need an extra DICT whose wardKey is unaccented but whose short key is accented.
df[df['wardShortKeyDuplicated']][['province', 'district', 'ward']]

## Creating list of keywords

In [None]:
# -- CREATE ALIAS: must run after the duplicate keys are fixed (e.g. xãđôngthành),
# because ward aliases are looked up by the current wardKey.

def _alias_lookup(alias_frame, key_columns):
    """Map each key (or key tuple) to a JSON array string of its alias keywords."""
    aliases_per_key = alias_frame.groupby(key_columns)['alias_keyword'].apply(list)
    return aliases_per_key.apply(json.dumps).to_dict()


# Create the alias columns with NaN as placeholder.
for col in ['province', 'district', 'ward']:
    df[f"{col}Alias"] = np.nan

alias_dir = BASE_DIR / 'data/alias_keywords/legacy'
df_province_alias = pd.read_csv(alias_dir / 'alias_province.csv')
df_district_alias = pd.read_csv(alias_dir / 'alias_district.csv')
df_ward_alias = pd.read_csv(alias_dir / 'alias_ward.csv')

# Province aliases are keyed by provinceKey alone.
province_alias_map = _alias_lookup(df_province_alias, 'province_key')
df['provinceAlias'] = df['provinceKey'].map(province_alias_map)

# District aliases are keyed by (provinceKey, districtKey); units without aliases get None.
district_alias_map = _alias_lookup(df_district_alias, ['province_key', 'district_key'])
df['districtAlias'] = df.apply(
    lambda unit: district_alias_map.get((unit['provinceKey'], unit['districtKey'])),
    axis=1
)

# Ward aliases are keyed by (provinceKey, districtKey, wardKey).
ward_alias_map = _alias_lookup(df_ward_alias, ['province_key', 'district_key', 'ward_key'])
df['wardAlias'] = df.apply(
    lambda unit: ward_alias_map.get((unit['provinceKey'], unit['districtKey'], unit['wardKey'])),
    axis=1
)

In [None]:
# Create keywords: one JSON-encoded keyword column per administrative level.
for col in unit_cols:
    level = level_map[col]
    df[f"{col}Keywords"] = df.apply(create_keywords, axis=1, args=(level,))

## Creating dictionaries

In [None]:
# Province map: provinceKey -> province attributes.
df_province = df[['provinceKey', 'provinceKeywords', 'province', 'provinceShort', 'provinceLat', 'provinceLon', 'provinceCode']].drop_duplicates().reset_index(drop=True)
DICT_PROVINCE = {}
for _, row in df_province.iterrows():
    DICT_PROVINCE[row['provinceKey']] = {
        'provinceKeywords': json.loads(row['provinceKeywords']),
        'province': row['province'],
        'provinceShort': row['provinceShort'],
        'provinceLat': row['provinceLat'],
        'provinceLon': row['provinceLon'],
        'provinceCode': row['provinceCode'],
    }


# District map: provinceKey -> districtKey -> district attributes.
df_district = df[['provinceKey', 'provinceShortKey', 'districtKey', 'districtShortKey', 'districtKeywords', 'district', 'districtType', 'districtShort', 'districtLat', 'districtLon', 'districtCode']].drop_duplicates().reset_index(drop=True)
DICT_PROVINCE_DISTRICT = {}
for _, province_row in df_province.iterrows():
    province_key = province_row['provinceKey']
    DICT_PROVINCE_DISTRICT[province_key] = {}

    df_district_filtered = df_district[df_district['provinceKey'] == province_key]

    for _, district_row in df_district_filtered.iterrows():
        DICT_PROVINCE_DISTRICT[province_key][district_row['districtKey']] = {
            'districtKeywords': json.loads(district_row['districtKeywords']) if pd.notnull(district_row['districtKeywords']) else [],
            'district': district_row['district'],
            'districtType': district_row['districtType'],
            'districtShort': district_row['districtShort'],
            'districtLat': district_row['districtLat'],
            'districtLon': district_row['districtLon'],
            'districtCode': district_row['districtCode'],
        }


# Unique district to province map.
# A district is "unique" when its short key matches no province short key and no
# other district row's short key. Computed as one boolean column so that it always
# exists (even when no district qualifies) and needs no chained fillna(inplace=True),
# which does not update the frame under pandas Copy-on-Write.
province_short_keys = df['provinceShortKey'].unique().tolist()
df_district['districtUnique'] = (
    ~df_district['districtShortKey'].isin(province_short_keys)
    & ~df_district['districtShortKey'].duplicated(keep=False)
)
df_district_unique = df_district[df_district['districtUnique']]

DICT_UNIQUE_DISTRICT_PROVINCE = {}
for _, row in df_district_unique.iterrows():
    DICT_UNIQUE_DISTRICT_PROVINCE[row['districtKey']] = {
        'districtKeywords': json.loads(row['districtKeywords']),
        'provinceKey': row['provinceKey']
    }

In [None]:
# Ward map: one row per distinct ward, split by whether its key kept the diacritics.
ward_cols = [
    'provinceKey', 'districtKey', 'wardKey', 'wardKeywords', 'ward', 'wardShort',
    'wardType', 'wardKeyDuplicated', 'wardLat', 'wardLon', 'wardCode',
]
df_ward = df[ward_cols].drop_duplicates().reset_index(drop=True)

df_ward_no_accented = df_ward[df_ward['wardKeyDuplicated'].eq(False)]
df_ward_accented = df_ward[df_ward['wardKeyDuplicated'].eq(True)]

def build_province_district_ward_dict(df, short_name_key=False):
    """Nest ward records as ``{provinceKey: {districtKey: {wardKey: attributes}}}``.

    Args:
        df: Frame with the columns provinceKey, districtKey, wardKey,
            wardKeywords, ward, wardShort, wardType, wardLat, wardLon and
            wardCode.
        short_name_key: When True, each ward's keyword list is the single
            value ``key_normalize(wardShort, decode=False)`` instead of the
            decoded ``wardKeywords`` column.

    Returns:
        The nested dictionary. A ward whose ``wardKeywords`` is null gets an
        empty keyword list.
    """
    nested = {}

    for province_key, province_rows in df.groupby('provinceKey'):
        districts = nested[province_key] = {}

        for district_key, district_rows in province_rows.groupby('districtKey'):
            wards = districts[district_key] = {}

            for _, ward_row in district_rows.iterrows():
                if short_name_key:
                    ward_keywords = [key_normalize(ward_row['wardShort'], decode=False)]
                elif pd.notnull(ward_row['wardKeywords']):
                    ward_keywords = json.loads(ward_row['wardKeywords'])
                else:
                    ward_keywords = []

                record = {'wardKeywords': ward_keywords}
                for field in ('ward', 'wardShort', 'wardType', 'wardLat', 'wardLon', 'wardCode'):
                    record[field] = ward_row[field]
                wards[ward_row['wardKey']] = record

    return nested


# Wards whose wardKeyDuplicated flag is False vs. True, each with the full keyword list.
DICT_PROVINCE_DISTRICT_WARD_NO_ACCENTED = build_province_district_ward_dict(df_ward_no_accented)
DICT_PROVINCE_DISTRICT_WARD_ACCENTED = build_province_district_ward_dict(df_ward_accented)

# Wards flagged wardShortKeyDuplicated: the only keyword is the short name
# normalized with decode=False.
df_ward_short_accented = df[df['wardShortKeyDuplicated']]
DICT_PROVINCE_DISTRICT_WARD_SHORT_ACCENTED = build_province_district_ward_dict(df_ward_short_accented, short_name_key=True)

In [None]:
df_district_divided = pd.read_csv(BASE_DIR / 'data/alias_keywords/legacy/divided_district.csv')

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selection: that form leaves the frame unchanged under pandas Copy-on-Write, and a
# leftover NaN would make bool(row['districtDefault']) True for every unflagged row.
df_district_divided['districtDefault'] = df_district_divided['districtDefault'].fillna(False).astype(bool)

# provinceKey -> dividedDistrictKey -> {'dividedDistrictKeywords', 'districts'}
DICT_PROVINCE_DISTRICT_DIVIDED = {}

# Walk every row of the divided-district table.
for _, row in df_district_divided.iterrows():
    province_key = row['provinceKey']
    divided_key = row['dividedDistrictKey']
    divided_keywords = json.loads(row['dividedDistrictKeyWords']) if isinstance(row['dividedDistrictKeyWords'], str) else []
    district_key = row['districtKey']
    is_default = bool(row['districtDefault'])

    # Collect the keywords of every ward in this district as one flat list.
    mask = (df['provinceKey'] == province_key) & (df['districtKey'] == district_key)
    ward_keywords = df.loc[mask, 'wardKeywords'].dropna().tolist()
    ward_keywords_flat = [
        keyword
        for encoded in ward_keywords
        if isinstance(encoded, str)
        for keyword in json.loads(encoded)
    ]

    # Create the nesting levels on first use; the keywords of the first row seen
    # for a divided key are the ones kept.
    divided_entry = (
        DICT_PROVINCE_DISTRICT_DIVIDED
        .setdefault(province_key, {})
        .setdefault(divided_key, {
            'dividedDistrictKeywords': divided_keywords,
            'districts': {}
        })
    )

    # Attach this district's data.
    divided_entry['districts'][district_key] = {
        'wardKeywords': ward_keywords_flat,
        'districtDefault': is_default
    }

## Saving package data

In [None]:
# DICT: bundle every lookup dictionary into the parser's JSON data file.
parser_data = {
    'DICT_PROVINCE': DICT_PROVINCE,
    'DICT_PROVINCE_DISTRICT': DICT_PROVINCE_DISTRICT,
    'DICT_UNIQUE_DISTRICT_PROVINCE': DICT_UNIQUE_DISTRICT_PROVINCE,
    'DICT_PROVINCE_DISTRICT_WARD_NO_ACCENTED': DICT_PROVINCE_DISTRICT_WARD_NO_ACCENTED,
    'DICT_PROVINCE_DISTRICT_WARD_ACCENTED': DICT_PROVINCE_DISTRICT_WARD_ACCENTED,
    'DICT_PROVINCE_DISTRICT_WARD_SHORT_ACCENTED': DICT_PROVINCE_DISTRICT_WARD_SHORT_ACCENTED,
    'DICT_PROVINCE_DISTRICT_DIVIDED': DICT_PROVINCE_DISTRICT_DIVIDED
}

# Explicit encoding so the write does not depend on the platform default.
with open(BASE_DIR / 'vietnamadminunits/data/parser_legacy.json', 'w', encoding='utf-8') as f:
    json.dump(parser_data, f)

# SQLite
import sqlite3
from contextlib import closing

# A sqlite3 connection used as a context manager only commits or rolls back the
# transaction; it does not close the connection. closing() releases the file
# handle once the table has been written.
with closing(sqlite3.connect(BASE_DIR / 'vietnamadminunits/data/dataset.db')) as conn:
    with conn:
        df.to_sql('admin_units_legacy', conn, if_exists='replace', index=False)

## Saving interim data

In [None]:
# Persist the enriched frame (keys, duplication flags, aliases, keywords) without the index column.
df.to_csv(BASE_DIR / 'data/interim/legacy_63-province-10040-ward_with_location_and_key.csv', index=False)