In [26]:
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True

    def search(self, word):
        node = self.root
        prefix = ''
        for char in word:
            if char not in node.children:
                break
            prefix += char
            node = node.children[char]
            if node.is_end_of_word:
                return prefix
        return ''

class Solution:
    def __init__(self):
        self.province_path = './data/processed/list_province.txt'
        self.district_path = './data/processed/list_district.txt'
        self.ward_path = './data/processed/list_ward.txt'
        self.trie = Trie()
        self.build_trie()

    def build_trie(self):
        # Read and insert words from province, district, and ward files into the trie
        self.read_and_insert(self.province_path)
        self.read_and_insert(self.district_path)
        self.read_and_insert(self.ward_path)

    def read_and_insert(self, file_path):
        # Open the file and insert each line as a word into the trie
        with open(file_path, 'r') as file:
            for line in file:
                # Convert to lowercase and remove leading/trailing whitespaces
                word = line.strip().lower()
                self.trie.insert(word)

    def longest_prefix_match(self, s):
        # Find the longest prefix match for the input string
        s = s.lower()
        return self.trie.search(s)

    def process(self, s: str):
        # Find the longest prefix match for the input string
        prefix = self.longest_prefix_match(s)
        
        # Remove the prefix and any leading/trailing whitespaces and commas
        remaining = s[len(prefix):].strip(', ')
        
        # Split the remaining string by commas to extract province, district, and ward
        parts = remaining.split(', ')
        province = parts[-1] if parts else ""
        district = parts[-2] if len(parts) >= 2 else ""
        ward = parts[-3] if len(parts) >= 3 else ""
        
        return {
            "province": province,
            "district": district,
            "ward": ward,
        }

In [27]:
# Instantiate the solution object
solution = Solution()

# Input string
input_string = "Tuyên Quang, Yên Sơn, Tân Bình"

# Process the input string
result = solution.process(input_string)

# Print the result
print(result)

{'province': 'Tân Bình', 'district': 'Yên Sơn', 'ward': ''}


In [28]:
# NOTE: DO NOT change this cell
# This cell is for scoring

TEAM_NAME = 'Ths_AA_Group20'  # This should be your team name
EXCEL_FILE = f'{TEAM_NAME}.xlsx'

import json
import time
with open('public.json') as f:
    data = json.load(f)

summary_only = True
df = []
solution = Solution()
timer = []
correct = 0
for test_idx, data_point in enumerate(data):
    address = data_point["text"]

    ok = 0
    try:
        start = time.perf_counter_ns()
        result = solution.process(address)
        answer = data_point["result"]
        finish = time.perf_counter_ns()
        timer.append(finish - start)
        ok += int(answer["province"] == result["province"])
        ok += int(answer["district"] == result["district"])
        ok += int(answer["ward"] == result["ward"])
        df.append([
            test_idx,
            address,
            answer["province"],
            result["province"],
            int(answer["province"] == result["province"]),
            answer["district"],
            result["district"],
            int(answer["district"] == result["district"]),
            answer["ward"],
            result["ward"],
            int(answer["ward"] == result["ward"]),
            ok,
            timer[-1] / 1_000_000_000,
        ])
    except Exception as e:
        df.append([
            test_idx,
            address,
            answer["province"],
            "EXCEPTION",
            0,
            answer["district"],
            "EXCEPTION",
            0,
            answer["ward"],
            "EXCEPTION",
            0,
            0,
            0,
        ])
        # any failure count as a zero correct
        pass
    correct += ok


    if not summary_only:
        # responsive stuff
        print(f"Test {test_idx:5d}/{len(data):5d}")
        print(f"Correct: {ok}/3")
        print(f"Time Executed: {timer[-1] / 1_000_000_000:.4f}")


print(f"-"*30)
total = len(data) * 3
score_scale_10 = round(correct / total * 10, 2)
if len(timer) == 0:
    timer = [0]
max_time_sec = round(max(timer) / 1_000_000_000, 4)
avg_time_sec = round((sum(timer) / len(timer)) / 1_000_000_000, 4)

import pandas as pd

df2 = pd.DataFrame(
    [[correct, total, score_scale_10, max_time_sec, avg_time_sec]],
    columns=['correct', 'total', 'score / 10', 'max_time_sec', 'avg_time_sec',],
)

columns = [
    'ID',
    'text',
    'province',
    'province_student',
    'province_correct',
    'district',
    'district_student',
    'district_correct',
    'ward',
    'ward_student',
    'ward_correct',
    'total_correct',
    'time_sec',
]

df = pd.DataFrame(df)
df.columns = columns

print(f'{TEAM_NAME = }')
print(f'{EXCEL_FILE = }')
print(df2)

# writer = pd.ExcelWriter(EXCEL_FILE, engine='xlsxwriter')
# df2.to_excel(writer, index=False, sheet_name='summary')
# df.to_excel(writer, index=False, sheet_name='details')
# writer.close()


------------------------------
TEAM_NAME = 'Ths_AA_Group20'
EXCEL_FILE = 'Ths_AA_Group20.xlsx'
   correct  total  score / 10  max_time_sec  avg_time_sec
0      309   1350        2.29           0.0           0.0
