In [1]:
import unicodedata
import re
from difflib import SequenceMatcher

def normalize_vietnamese_string(text: str) -> str:
    text = text.lower().strip()
    text = unicodedata.normalize('NFD', text)
    text = re.sub(r'[\u0300-\u036f]', '', text)
    text = text.replace('đ', 'd')
    return " ".join(text.split())

def fuzzy_brand_match(text: str, threshold: float = 0.6) -> str | None:
    brand_variants = {
        'samsung': ['samsung', 'ss', 'sam sung', 'samsum', 'samsun'],
        'apple': ['apple', 'iphone', 'ipad', 'macbook', 'mac', 'ip'],
        'oppo': ['oppo', 'op', 'opo'],
        'vivo': ['vivo', 'vi vo', 'vv'],
        'xiaomi': ['xiaomi', 'mi', 'redmi', 'xiao mi', 'xiomi'],
        'realme': ['realme', 'real me', 'rm'],
        'huawei': ['huawei', 'hua wei', 'hw'],
        'dell': ['dell', 'de ll'],
        'hp': ['hp', 'hewlett packard'],
        'asus': ['asus', 'a sus'],
        'acer': ['acer', 'a cer'],
        'lenovo': ['lenovo', 'le no vo'],
        'lg': ['lg', 'l g'],
        'sony': ['sony', 'so ny']
    }
    
    text = normalize_vietnamese_string(text)
    
    # Kiểm tra exact match trước
    for brand, variants in brand_variants.items():
        for variant in variants:
            if variant in text:
                return brand
    
    # Fuzzy matching nếu không có exact match
    best_brand = None
    best_score = 0
    
    for brand, variants in brand_variants.items():
        for variant in variants:
            # Tính similarity score
            score = SequenceMatcher(None, variant, text).ratio()
            if score > threshold and score > best_score:
                best_score = score
                best_brand = brand
    
    return best_brand

def parse_price_string(price_str: str) -> int | None:
    if not price_str: 
        return None
    try:
        text = price_str.lower().strip().replace('.', '').replace(',', '')
        total_value = 0
        if 'tr' in text or 'trieu' in text:
            text = text.replace('trieu', 'tr')
            parts = text.split('tr')
            if parts[0]: 
                total_value += float(parts[0]) * 1_000_000
            if len(parts) > 1 and parts[1]:
                if len(parts[1]) < 3: 
                    total_value += float(parts[1]) * 100_000
                else: 
                    total_value += float(parts[1]) * 1_000
            return int(total_value)
        if 'k' in text:
            value_part = text.replace('k', '').strip()
            return int(float(value_part) * 1_000)
        return int(text)
    except (ValueError, IndexError):
        return None

def parse_master_query(query: str) -> dict:
    conditions = {}
    q = normalize_vietnamese_string(query)
    original_q = q
    
    # Xử lý loại sản phẩm
    type_pattern = r'(dien thoai|laptop|may tinh bang|cap sac|tai nghe|du phong|flycam|tablet)'
    type_match = re.search(type_pattern, q)
    if type_match:
        conditions['phanloai'] = type_match.group(1)
        q = re.sub(type_pattern, '', q).strip()

    # ✅ Xử lý brand với fuzzy matching
    brand = fuzzy_brand_match(original_q)
    if brand:
        conditions['brand'] = brand
        # Xóa tất cả brand variants khỏi query
        brand_variants = {
            'samsung': ['samsung', 'ss', 'sam sung', 'samsum', 'samsun'],
            'apple': ['apple', 'iphone', 'ipad', 'macbook', 'mac', 'ip'],
            'oppo': ['oppo', 'op', 'opo'],
            'vivo': ['vivo', 'vi vo', 'vv'],
            'xiaomi': ['xiaomi', 'mi', 'redmi', 'xiao mi', 'xiomi'],
            'realme': ['realme', 'real me', 'rm'],
            'huawei': ['huawei', 'hua wei', 'hw'],
        }
        
        if brand in brand_variants:
            for variant in brand_variants[brand]:
                q = re.sub(rf'\b{re.escape(variant)}\b', '', q, flags=re.IGNORECASE).strip()

    # Xử lý giá
    price_patterns = {
        'price_lte': r'(duoi|toi da|max)\s*([0-9,.\s]+(?:tr|trieu|k)?)',
        'price_gte': r'(tren|tu|min)\s*([0-9,.\s]+(?:tr|trieu|k)?)',
        'price_exact': r'(?:gia|)\s*([0-9,.\s]+)\s*(tr|trieu|k)\b(?!\s*(?:duoi|tren|tu|min|toi da|max))'
    }
    
    for key, pattern in price_patterns.items():
        match = re.search(pattern, q)
        if match:
            if key == 'price_exact':
                price_str = match.group(1) + match.group(2)
                price_val = parse_price_string(price_str)
            else:
                price_val = parse_price_string(match.group(2))
            
            if price_val is not None: 
                conditions[key] = price_val
                q = re.sub(pattern, '', q).strip()
                break
    
    # RAM pattern
    ram_pattern = r'\b(\d+)\s*(?:g|gb)?\s*ram\b|\bram\s*(\d+)\s*(?:g|gb)\b'
    ram_match = re.search(ram_pattern, q)
    if ram_match:
        for group in ram_match.groups():
            if group and group.isdigit():
                conditions['ram_gb'] = int(group)
                break
        q = re.sub(ram_pattern, '', q).strip()
    
    # Storage pattern  
    storage_patterns = [
        r'(\d+)\s*(?:g|gb)\s*(?:bo nho|luu tru)\b',
        r'\b(?:bo nho|luu tru)\s*(\d+)\s*(?:g|gb)\b'
    ]
    
    for pattern in storage_patterns:
        storage_match = re.search(pattern, q)
        if storage_match:
            conditions['storage_gb'] = int(storage_match.group(1))
            q = re.sub(pattern, '', q).strip()
            break
    
    # Xử lý số GB còn lại
    if 'storage_gb' not in conditions:
        remaining_gb = re.search(r'\b(\d+)\s*(?:g|gb)\b', q)
        if remaining_gb:
            gb_value = int(remaining_gb.group(1))
            if gb_value >= 32:
                conditions['storage_gb'] = gb_value
                q = re.sub(r'\b(\d+)\s*(?:g|gb)\b', '', q, count=1).strip()
    
    # Text search cho phần còn lại
    q_clean = re.sub(r'\s+', ' ', q).strip()
    if q_clean:
        conditions['text_search'] = q_clean
        
    return conditions

# ✅ Test cases nâng cao
test_cases = [
    "điện thoại",          # Samsung Galaxy S23 Ultra
    "ip 15 pro max 256gb",                   # iPhone 15 Pro Max
    "sam sung note 20 dưới 10tr",            # Samsung Note 20
    "macbook pro m2 16gb",                   # MacBook Pro M2
    "mi 13 lite oppo 8gb",                   # Xiaomi Mi 13 Lite (lỗi brand)
    "điện thoai ss giá 15 triệu",           # Samsung phone 15tr
    "laptop dell inspiron 5000 32gb",       # Dell Inspiron
    "galaxy tab s9 wifi 8gb 128gb",         # Samsung Galaxy Tab S9
    "redmi note 12 pro",                     # Xiaomi Redmi Note 12 Pro
    "ipad air 2022 64g"                     # iPad Air 2022
]



for test in test_cases:
    result = parse_master_query(test)
    print(result)

{'phanloai': 'dien thoai'}
{'brand': 'apple', 'price_lte': 256, 'text_search': '15 pro gb'}
{'brand': 'samsung', 'price_lte': 10000000, 'text_search': 'note 20'}
{'brand': 'apple', 'text_search': 'pro m2 16gb'}
{'brand': 'oppo', 'text_search': 'mi 13 lite 8gb'}
{'phanloai': 'dien thoai', 'brand': 'samsung', 'price_exact': 15000000}
{'phanloai': 'laptop', 'brand': 'oppo', 'storage_gb': 32, 'text_search': 'dell inspiron 5000'}
{'text_search': 'galaxy tab s9 wifi 8gb 128gb'}
{'brand': 'xiaomi', 'text_search': 'note 12 pro'}
{'brand': 'apple', 'storage_gb': 64, 'text_search': 'air 2022'}
