In [172]:
# %%shell
# git clone --branch TrggTin --single-branch https://github.com/vphuhan/21KHDL-TikTok-Analytics.git
# cd 21KHDL-TikTok-Analytics
# git sparse-checkout init --cone
# git sparse-checkout set data/interim
# git checkout

In [173]:
# pip install pandas nltk underthesea scikit-learn tqdm

# Imports and Initialization

In [174]:
import pandas as pd
import re
import unicodedata
import nltk
from underthesea import word_tokenize, pos_tag, ner
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import get_close_matches
import logging
import json
import os
from tqdm import tqdm
import string
import regex as re
import traceback
import jdc  
from spellchecker import SpellChecker
from datetime import datetime

In [175]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("extraction_log.log"),
        logging.StreamHandler()
    ]
)

In [176]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# VietnameseTextProcessor Class Definition

In [None]:
class VietnameseTextProcessor:
    def __init__(self, food_list_path=None, location_list_path=None):
        """
        Kh·ªüi t·∫°o B·ªô x·ª≠ l√Ω vƒÉn b·∫£n ti·∫øng Vi·ªát

        Tham s·ªë:
            food_list_path (str): ƒê∆∞·ªùng d·∫´n ƒë·∫øn t·ªáp JSON ch·ª©a danh s√°ch m√≥n ƒÉn Vi·ªát Nam
            location_list_path (str): ƒê∆∞·ªùng d·∫´n ƒë·∫øn t·ªáp JSON ch·ª©a danh s√°ch ƒë·ªãa ƒëi·ªÉm ·ªü Vi·ªát Nam
        """
        # T·∫£i ho·∫∑c kh·ªüi t·∫°o danh s√°ch m√≥n ƒÉn v√† ƒë·ªãa ƒëi·ªÉm
        self.foods = self._load_entity_list(food_list_path, "foods")
        self.locations = self._load_entity_list(location_list_path, "locations")

        # C√°c t·ª´ kh√≥a ph·ªï bi·∫øn li√™n quan ƒë·∫øn m√≥n ƒÉn v√† h∆∞∆°ng v·ªã trong ti·∫øng Vi·ªát ƒë·ªÉ h·ªó tr·ª£ nh·∫≠n di·ªán
        self.food_indicators = [
            "b√°nh m√¨", "ph·ªü", "b√∫n", "x√®o", "c∆°m", "g·ªèi", "ch·∫£", "x√¥i", "cao l·∫ßu", "ch√°o",
            "m√¨ g√≥i", "h·ªß ti·∫øu", "nem", "ch·∫£ ram", "b√°nh kh·ªçt",
            "l·∫©u", "c√°", "th·ªãt", "canh", "rau", "ƒë·∫≠u", "·ªëc", "s√∫p", "b·∫Øp", "l∆∞∆°n", "mƒÉng", "n·∫•m",
            "chu·ªëi", "n·ªôm", "tr√†", "c√† ph√™", "sinh t·ªë", "kem", "t√†u h·ªß", "ch√®", "yaourt", "n∆∞·ªõc m√≠a",
            "s·ªØa", "k·∫πo", "ƒëa", "nem chua", "g√†", "b√≤", "heo", "v·ªãt", "c√°", "t√¥m", "m·ª±c, ·ªëc", "s√≤", "h√†u",
            "b√∫n ri√™u", "b√∫n b√≤", "b√∫n m·∫Øm", "b√∫n m·ªçc", "b√∫n ch·∫£", "b√∫n ƒë·∫≠u", "b√∫n ·ªëc"
        ]

        self.taste_indicators = [
            "ngon", "ng·ªçt", "chua", "cay", "ƒë·∫Øng", "m·∫∑n", "b√πi", "b√©o", "gi√≤n", "m·ªÅm",
            "th∆°m", "n·ªìng", "ƒë·∫≠m ƒë√†", "nh·∫°t", "thanh", "t∆∞∆°i", "ch√°t", "cay n·ªìng", "cay nh·∫π", "cay v·ª´a",
            "s·∫ßn s·∫≠t", "m·ªçng n∆∞·ªõc", "ƒë·∫Øng ngh√©t", "ch√°t", "cay x√®", "t√™", "m·∫∑n ch√°t", "ng·ªçt l·ªãm", "b√©o ng·∫≠y", "th∆°m l·ª´ng",
            "n·ªìng n√†n", "ƒë·∫≠m v·ªã", "nh·∫°t nh·∫Ωo", "thanh m√°t", "t∆∞∆°i", "ƒë·∫≠m ƒë√† h∆∞∆°ng v·ªã", "v·ª´a ƒÉn", "h·ª£p kh·∫©u v·ªã"
        ]

        self.locations_indicators = [ 
            "qu·∫≠n 1", "qu·∫≠n 2", "qu·∫≠n 3", "qu·∫≠n 4", "qu·∫≠n 5", "qu·∫≠n 6", "qu·∫≠n 7", "qu·∫≠n 8", "qu·∫≠n 9", "qu·∫≠n 10",
            "qu·∫≠n 11", "qu·∫≠n 12", "b√¨nh th·∫°nh", "t√¢n b√¨nh", "t√¢n ph√∫", "ph√∫ nhu·∫≠n", "g√≤ v·∫•p", "b√¨nh t√¢n", "th·ªß ƒë·ª©c", "h√≥c m√¥n",
            "c·ªß chi", "nh√† b√®", "c·∫ßn gi·ªù", "b√¨nh ch√°nh", "tp th·ªß ƒë·ª©c",
            "h√† n·ªôi", "h·ªì ch√≠ minh", "ƒë√† n·∫µng", "h·∫£i ph√≤ng", "c·∫ßn th∆°", "hu·∫ø", "nha trang", "v≈©ng t√†u", "ƒë√† l·∫°t",
            "h·∫° long", "m·ªπ tho", "long xuy√™n", "r·∫°ch gi√°", "c√† mau", "bi√™n h√≤a", "bu√¥n ma thu·ªôt", "th√°i nguy√™n", "nam ƒë·ªãnh"
        ]


        # T·∫£i c√°c t√†i nguy√™n c·ªßa NLTK n·∫øu c·∫ßn
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

        # T·∫°o th∆∞ m·ª•c ƒë·ªÉ l∆∞u tr·ªØ c√°c t·ªáp d·ªØ li·ªáu ƒë∆∞·ª£c tr√≠ch xu·∫•t
        os.makedirs("extracted_data", exist_ok=True)


# Helper Methods

In [178]:
def _load_entity_list(self, file_path, entity_type):
    """T·∫£i danh s√°ch th·ª±c th·ªÉ t·ª´ t·ªáp ho·∫∑c tr·∫£ v·ªÅ t·∫≠p r·ªóng m·∫∑c ƒë·ªãnh"""
    if file_path and os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return set(json.load(f))
        except Exception as e:
            logging.warning(f"L·ªói khi t·∫£i danh s√°ch {entity_type}: {e}")

    logging.info(f"Kh√¥ng t√¨m th·∫•y danh s√°ch {entity_type} hi·ªán c√≥, b·∫Øt ƒë·∫ßu v·ªõi t·∫≠p r·ªóng")
    return set()

def save_entity_list(self, entity_list, entity_type):
    """L∆∞u danh s√°ch th·ª±c th·ªÉ ƒë√£ c·∫≠p nh·∫≠t v√†o t·ªáp"""
    file_path = f"extracted_data/{entity_type}_list.json"
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(list(entity_list), f, ensure_ascii=False, indent=2)
    logging.info(f"ƒê√£ l∆∞u {len(entity_list)} {entity_type} v√†o {file_path}")

def normalize_vietnamese_text(self, text):
    """Chu·∫©n h√≥a vƒÉn b·∫£n ti·∫øng Vi·ªát b·∫±ng c√°ch x·ª≠ l√Ω d·∫•u v√† ch·ªØ hoa/th∆∞·ªùng"""
    if not isinstance(text, str):
        return ""

    # Chu·∫©n h√≥a k√Ω t·ª± Unicode
    text = unicodedata.normalize('NFC', text)

    # Lo·∫°i b·ªè kho·∫£ng tr·∫Øng th·ª´a
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def clean_text(self, text):
    """L√†m s·∫°ch vƒÉn b·∫£n b·∫±ng c√°ch lo·∫°i b·ªè k√Ω t·ª± ƒë·∫∑c bi·ªát v√† chu·∫©n h√≥a"""
    if not isinstance(text, str):
        return ""

    # Chu·∫©n h√≥a vƒÉn b·∫£n
    text = self.normalize_vietnamese_text(text)
    text = text.lower()

    # Lo·∫°i b·ªè ƒë∆∞·ªùng d·∫´n URL
    text = re.sub(r'https?://\S+|www\.\S+|\S+@\S+\.\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)

    # Lo·∫°i b·ªè bi·ªÉu t∆∞·ª£ng c·∫£m x√∫c v√† k√Ω t·ª± ƒë·∫∑c bi·ªát trong khi gi·ªØ l·∫°i ch·ªØ ti·∫øng Vi·ªát
    emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"  # dingbats
            u"\U000024C2-\U0001F251" 
            "]+", flags=re.UNICODE)
    text = emoji_pattern.sub('', text)

    symbols_to_remove = [
            '!', '"', '#', '$', '%', '&', "'", '*', '+', ',', 
            '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', 
            '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~',
            '"', '"', ''', ''', '‚Ä¶', '‚Äì', '‚Äî', '‚Ä¢', '‚Ä≤', '‚Ä≥',
            '‚Äû', '¬´', '¬ª', '‚Äπ', '‚Ä∫', '‚ü®', '‚ü©', '„Äà', '„Äâ'
    ]
    
    # Create a pattern that excludes Vietnamese diacritics
    pattern = f'[{"".join(map(re.escape, symbols_to_remove))}]'
    text = re.sub(pattern, ' ', text)

    # Handle ellipsis and multiple dots
    text = re.sub(r'\.{2,}', ' ', text)

    # Handle multiple spaces and normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Handle parentheses and brackets
    text = re.sub(r'[\(\)\[\]\{\}‚ü®‚ü©„Äà„Äâ]', ' ', text)

    # Clean up extra spaces around Vietnamese words
    text = re.sub(r'\s+([^\w\s])|([^\w\s])\s+', r'\1\2', text)

    # Final whitespace cleanup
    text = text.strip()

    return text

def auto_correct_text(self, text):
    """T·ª± ƒë·ªông s·ª≠a l·ªói ch√≠nh t·∫£ b·∫±ng b·ªô ki·ªÉm tra ch√≠nh t·∫£"""
    spell = SpellChecker(language='vi')
    words = word_tokenize(text)
    corrected_words = [spell.correction(word) for word in words]
    return " ".join(corrected_words)

def load_stopwords(self, file_path):
    """T·∫£i danh s√°ch t·ª´ d·ª´ng t·ª´ t·ªáp"""
    if file_path and os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return set(f.read().splitlines())
        except Exception as e:
            logging.warning(f"L·ªói khi t·∫£i danh s√°ch t·ª´ d·ª´ng: {e}")
    logging.info("Kh√¥ng t√¨m th·∫•y t·ªáp t·ª´ d·ª´ng, b·∫Øt ƒë·∫ßu v·ªõi t·∫≠p r·ªóng")
    return set()

def remove_stopwords(self, text, stopwords):
    """Lo·∫°i b·ªè t·ª´ d·ª´ng kh·ªèi vƒÉn b·∫£n"""
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stopwords]
    return " ".join(filtered_words)

def preprocess_text(self, text):
    """√Åp d·ª•ng t·∫•t c·∫£ c√°c b∆∞·ªõc ti·ªÅn x·ª≠ l√Ω l√™n vƒÉn b·∫£n"""
    try:
        text = self.clean_text(text)
        text = self.auto_correct_text(text)  # ƒê√£ s·ª≠a l·ªói t·∫°i ƒë√¢y
        stopwords = self.load_stopwords('vietnamese-stopwords.txt')
        text = self.remove_stopwords(text, stopwords)
        return text
    except Exception as e:
        logging.error(f"L·ªói khi ti·ªÅn x·ª≠ l√Ω vƒÉn b·∫£n: {e}")
        return text if isinstance(text, str) else ""

# G√°n c√°c ph∆∞∆°ng th·ª©c v√†o l·ªõp VietnameseTextProcessor
VietnameseTextProcessor._load_entity_list = _load_entity_list
VietnameseTextProcessor.save_entity_list = save_entity_list 
VietnameseTextProcessor.normalize_vietnamese_text = normalize_vietnamese_text
VietnameseTextProcessor.clean_text = clean_text
VietnameseTextProcessor.auto_correct_text = auto_correct_text
VietnameseTextProcessor.load_stopwords = load_stopwords
VietnameseTextProcessor.preprocess_text = preprocess_text


In [179]:
def test_text_processing():
    """Test function to demonstrate all text preprocessing and cleaning steps"""
    
    # Initialize the processor
    processor = VietnameseTextProcessor()
    
    # Test text with various cases to check
    test_text = """
    üî• Qu√°n Ph·ªü ngon ·ªü Q.1 TPHCM! https://example.com
    M√≥n ph·ªü b√≤ t√°i n·∫°m g·∫ßu c·ª±c k·ª≥ ngon, n∆∞·ªõc d√πng ƒë·∫≠m ƒë√†...
    ƒê·ªãa ch·ªâ: 123 L√™ L·ª£i, P. B·∫øn Ngh√©, Qu·∫≠n 1, TP.HCM
    #pho #amthuc #reviewdoan @foodblogger
    """
    
    print("Original Text:")
    print("-" * 50)
    print(test_text)
    print("\n")

    # Test normalize_vietnamese_text
    print("1. After Vietnamese Text Normalization:")
    print("-" * 50)
    normalized = processor.normalize_vietnamese_text(test_text)
    print(normalized)
    print("\n")

    # Test clean_text
    print("2. After Text Cleaning:")
    print("-" * 50)
    cleaned = processor.clean_text(test_text)
    print(cleaned)
    print("\n")

    # Test stopwords removal
    print("3. After Stopwords Removal:")
    print("-" * 50)

In [180]:
test_text_processing()

2025-03-09 23:18:39,988 - INFO - Kh√¥ng t√¨m th·∫•y danh s√°ch foods hi·ªán c√≥, b·∫Øt ƒë·∫ßu v·ªõi t·∫≠p r·ªóng
2025-03-09 23:18:39,989 - INFO - Kh√¥ng t√¨m th·∫•y danh s√°ch locations hi·ªán c√≥, b·∫Øt ƒë·∫ßu v·ªõi t·∫≠p r·ªóng


Original Text:
--------------------------------------------------

    üî• Qu√°n Ph·ªü ngon ·ªü Q.1 TPHCM! https://example.com
    M√≥n ph·ªü b√≤ t√°i n·∫°m g·∫ßu c·ª±c k·ª≥ ngon, n∆∞·ªõc d√πng ƒë·∫≠m ƒë√†...
    ƒê·ªãa ch·ªâ: 123 L√™ L·ª£i, P. B·∫øn Ngh√©, Qu·∫≠n 1, TP.HCM
    #pho #amthuc #reviewdoan @foodblogger
    


1. After Vietnamese Text Normalization:
--------------------------------------------------
üî• Qu√°n Ph·ªü ngon ·ªü Q.1 TPHCM! https://example.com M√≥n ph·ªü b√≤ t√°i n·∫°m g·∫ßu c·ª±c k·ª≥ ngon, n∆∞·ªõc d√πng ƒë·∫≠m ƒë√†... ƒê·ªãa ch·ªâ: 123 L√™ L·ª£i, P. B·∫øn Ngh√©, Qu·∫≠n 1, TP.HCM #pho #amthuc #reviewdoan @foodblogger


2. After Text Cleaning:
--------------------------------------------------
qu√°n ph·ªü ngon ·ªü q 1 tphcm m√≥n ph·ªü b√≤ t√°i n·∫°m g·∫ßu c·ª±c k·ª≥ ngon n∆∞·ªõc d√πng ƒë·∫≠m ƒë√† ƒë·ªãa ch·ªâ 123 l√™ l·ª£i p b·∫øn ngh√© qu·∫≠n 1 tp hcm


3. After Stopwords Removal:
--------------------------------------------------


# Entity Extraction Methods

In [None]:
def extract_entities_from_ner(self, text):
    """Tr√≠ch xu·∫•t th·ª±c th·ªÉ t·ª´ vƒÉn b·∫£n b·∫±ng Named Entity Recognition (NER) c·ªßa underthesea."""
    locations = []

    try:
        ner_tags = ner(text)  # Th·ª±c hi·ªán nh·∫≠n d·∫°ng th·ª±c th·ªÉ c√≥ t√™n (NER)

        # Ki·ªÉm tra n·∫øu k·∫øt qu·∫£ t·ª´ NER c√≥ ƒë·ªãnh d·∫°ng mong ƒë·ª£i
        if not isinstance(ner_tags, list):
            return locations

        # Tr√≠ch xu·∫•t c√°c ƒë·ªãa ƒëi·ªÉm t·ª´ NER
        current_loc = []

        for item in ner_tags:
            # X·ª≠ l√Ω c√°c ƒë·ªãnh d·∫°ng ƒë·∫ßu ra kh√°c nhau t·ª´ NER
            if isinstance(item, (list, tuple)) and len(item) == 2:
                word, tag = item
            else:
                continue

            if tag.startswith('B-LOC'):
                if current_loc:
                    locations.append(' '.join(current_loc))
                    current_loc = []
                current_loc.append(word)
            elif tag.startswith('I-LOC') and current_loc:
                current_loc.append(word)
            elif current_loc:
                locations.append(' '.join(current_loc))
                current_loc = []

        # Th√™m th·ª±c th·ªÉ ƒë·ªãa ƒëi·ªÉm cu·ªëi c√πng n·∫øu c√≥
        if current_loc:
            locations.append(' '.join(current_loc))

    except Exception as e:
        logging.error(f"L·ªói khi tr√≠ch xu·∫•t th·ª±c th·ªÉ b·∫±ng NER: {e}")
        logging.error(traceback.format_exc())

    return locations

def extract_entities_from_patterns(self, text, sentences, pos_tags):
    """Tr√≠ch xu·∫•t th·ª±c th·ªÉ b·∫±ng c√°ch s·ª≠ d·ª•ng ph∆∞∆°ng ph√°p d·ª±a tr√™n m·∫´u (Pattern Matching)."""
    foods = []
    locations = []
    tastes = []

    # X·ª≠ l√Ω t·ª´ng c√¢u ƒë·ªÉ tr√≠ch xu·∫•t th·ª±c th·ªÉ
    for idx, sentence in enumerate(sentences):
        words = word_tokenize(sentence)
        sentence_pos_tags = pos_tags[idx] if idx < len(pos_tags) else []

        # T√¨m th·ª±c th·ªÉ v·ªÅ th·ª±c ph·∫©m
        self._extract_food_entities(sentence, sentence_pos_tags, foods)

        # T√¨m th·ª±c th·ªÉ v·ªÅ ƒë·ªãa ƒëi·ªÉm
        self._extract_location_entities(sentence, sentence_pos_tags, locations)

        # T√¨m m√¥ t·∫£ v·ªÅ h∆∞∆°ng v·ªã
        self._extract_taste_descriptions(sentence, words, tastes)

    return foods, locations, tastes

def _validate_entity(self, phrase, indicators):
    """Validate if a phrase contains at least one indicator"""
    phrase_lower = phrase.lower()
    return any(indicator.lower() in phrase_lower for indicator in indicators)

def _extract_food_entities(self, sentence, pos_tags, foods):
    """Extract food entities with improved indicator matching"""
    # Check existing food list
    for food in self.foods:
        if food.lower() in sentence.lower() and self._validate_entity(food, self.food_indicators):
            foods.append(food)

    # Find food indicators
    for idx, (word, tag) in enumerate(pos_tags):
        if word.lower() in self.food_indicators:
            noun_phrase = [word]
            max_look_ahead = 4
            
            # Look ahead for related words
            for i in range(1, max_look_ahead):
                if idx + i < len(pos_tags):
                    next_word, next_tag = pos_tags[idx + i]
                    # Accept nouns, adjectives, and numbers for quantities
                    if next_tag.startswith(('N', 'A', 'M')):
                        noun_phrase.append(next_word)
                    else:
                        # Check if we should continue based on common food patterns
                        if len(noun_phrase) < 2 or not self._validate_entity(" ".join(noun_phrase), self.food_indicators):
                            continue
                        break
            
            if noun_phrase and self._validate_entity(" ".join(noun_phrase), self.food_indicators):
                food_name = " ".join(noun_phrase)
                foods.append(food_name)
                self.foods.add(food_name)

def _extract_location_entities(self, sentence, pos_tags, locations):
    """Extract location entities with improved indicator matching"""
    # Check existing location list
    for location in self.locations:
        if location.lower() in sentence.lower() and self._validate_entity(location, self.locations_indicators):
            locations.append(location)

    # Find location indicators
    for idx, (word, tag) in enumerate(pos_tags):
        if any(indicator.lower() in word.lower() for indicator in self.locations_indicators):
            noun_phrase = [word]
            max_look_ahead = 4
            
            # Look ahead for related words
            for i in range(1, max_look_ahead):
                if idx + i < len(pos_tags):
                    next_word, next_tag = pos_tags[idx + i]
                    # Accept proper nouns, numbers, and regular nouns
                    if next_tag.startswith(('N', 'M', 'Np', 'Nu')):
                        noun_phrase.append(next_word)
                    else:
                        # Check if we should continue based on location patterns
                        if len(noun_phrase) < 2 or not self._validate_entity(" ".join(noun_phrase), self.locations_indicators):
                            continue
                        break
            
            if noun_phrase and self._validate_entity(" ".join(noun_phrase), self.locations_indicators):
                location_name = " ".join(noun_phrase)
                locations.append(location_name)
                self.locations.add(location_name)

def _extract_taste_descriptions(self, sentence, words, tastes):
    """Extract taste descriptions with improved matching"""
    for taste_word in self.taste_indicators:
        if taste_word in sentence.lower():
            taste_idx = -1
            for idx, word in enumerate(words):
                if taste_word in word.lower():
                    taste_idx = idx
                    break
            
            if taste_idx >= 0:
                # Look for a wider context
                start = max(0, taste_idx - 2)
                end = min(len(words), taste_idx + 3)
                taste_phrase = " ".join(words[start:end])
                
                # Validate the taste phrase
                if 2 <= len(taste_phrase.split()) <= 4 and self._validate_entity(taste_phrase, self.taste_indicators):
                    tastes.append(taste_phrase)

def extract_entities(self, text):
    """Extract entities with improved validation and matching"""
    if not text or not isinstance(text, str):
        return {"foods": [], "locations": [], "tastes": []}

    try:
        results = {"foods": [], "locations": [], "tastes": []}

        # Extract locations using NER
        ner_locations = self.extract_entities_from_ner(text)
        validated_locations = [loc for loc in ner_locations if self._validate_entity(loc, self.locations_indicators)]
        results["locations"].extend(validated_locations)
        self.locations.update(validated_locations)

        # Extract entities using pattern matching
        sentences = nltk.sent_tokenize(text)
        pos_tags = [pos_tag(sent) for sent in sentences]

        foods, locations, tastes = self.extract_entities_from_patterns(text, sentences, pos_tags)

        # Validate and extend results
        results["foods"].extend([f for f in foods if self._validate_entity(f, self.food_indicators)])
        results["locations"].extend([l for l in locations if self._validate_entity(l, self.locations_indicators)])
        results["tastes"].extend([t for t in tastes if self._validate_entity(t, self.taste_indicators)])

        # Update entity lists
        self.foods.update(results["foods"])
        self.locations.update(results["locations"])

        # Remove duplicates and empty strings
        for key in results:
            results[key] = list(set(filter(None, results[key])))

        return results

    except Exception as e:
        logging.error(f"Error extracting entities: {e}")
        logging.error(traceback.format_exc())
        return {"foods": [], "locations": [], "tastes": []}

# G√°n c√°c ph∆∞∆°ng th·ª©c v√†o l·ªõp VietnameseTextProcessor
VietnameseTextProcessor.validate_entity = _validate_entity
VietnameseTextProcessor.extract_entities_from_ner = extract_entities_from_ner
VietnameseTextProcessor.extract_entities_from_patterns = extract_entities_from_patterns
VietnameseTextProcessor._extract_food_entities = _extract_food_entities
VietnameseTextProcessor._extract_location_entities = _extract_location_entities
VietnameseTextProcessor._extract_taste_descriptions = _extract_taste_descriptions
VietnameseTextProcessor.extract_entities = extract_entities


# DataFrame Processing and Bootstrapping

In [182]:
def process_dataframe(self, df, text_column="video_transcription", batch_size=100):
    """
    X·ª≠ l√Ω to√†n b·ªô DataFrame v√† tr√≠ch xu·∫•t c√°c th·ª±c th·ªÉ.

    Tham s·ªë:
        df (pd.DataFrame): DataFrame ch·ª©a d·ªØ li·ªáu vƒÉn b·∫£n.
        text_column (str): T√™n c·ªôt ch·ª©a vƒÉn b·∫£n.
        batch_size (int): K√≠ch th∆∞·ªõc batch ƒë·ªÉ x·ª≠ l√Ω nh·∫±m ti·∫øt ki·ªám b·ªô nh·ªõ.

    Tr·∫£ v·ªÅ:
        pd.DataFrame: DataFrame g·ªëc v·ªõi c√°c c·ªôt ch·ª©a th·ª±c th·ªÉ ƒë∆∞·ª£c tr√≠ch xu·∫•t.
    """
    # Ki·ªÉm tra n·∫øu DataFrame tr·ªëng ho·∫∑c kh√¥ng c√≥ c·ªôt vƒÉn b·∫£n
    if df.empty or text_column not in df.columns:
        logging.error(f"DataFrame kh√¥ng h·ª£p l·ªá ho·∫∑c thi·∫øu c·ªôt '{text_column}'")
        return df

    # T·∫°o th∆∞ m·ª•c l∆∞u tr·ªØ n·∫øu ch∆∞a t·ªìn t·∫°i
    os.makedirs("extracted_data", exist_ok=True)

    # Kh·ªüi t·∫°o c√°c c·ªôt ƒë·ªÉ l∆∞u th·ª±c th·ªÉ tr√≠ch xu·∫•t
    df['preprocessed_text'] = ""
    df['extracted_foods'] = None
    df['extracted_locations'] = None
    df['extracted_tastes'] = None

    total_batches = (len(df) + batch_size - 1) // batch_size  # T√≠nh s·ªë batch c·∫ßn x·ª≠ l√Ω

    for i in tqdm(range(total_batches), desc="ƒêang x·ª≠ l√Ω batch"):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df))

        batch = df.iloc[start_idx:end_idx].copy()

        # Ti·ªÅn x·ª≠ l√Ω vƒÉn b·∫£n
        batch['preprocessed_text'] = batch[text_column].apply(self.preprocess_text)

        # Tr√≠ch xu·∫•t th·ª±c th·ªÉ
        entities_list = []
        for text in batch['preprocessed_text']:
            entities_list.append(self.extract_entities(text))

        # C·∫≠p nh·∫≠t DataFrame v·ªõi th·ª±c th·ªÉ tr√≠ch xu·∫•t
        batch['extracted_foods'] = [data['foods'] for data in entities_list]
        batch['extracted_locations'] = [data['locations'] for data in entities_list]
        batch['extracted_tastes'] = [data['tastes'] for data in entities_list]

        # C·∫≠p nh·∫≠t v√†o DataFrame g·ªëc
        df.iloc[start_idx:end_idx] = batch

        # L∆∞u k·∫øt qu·∫£ t·∫°m th·ªùi theo t·ª´ng batch
        if (i + 1) % 5 == 0 or (i + 1) == total_batches:
            self.save_entity_list(self.foods, "foods")
            self.save_entity_list(self.locations, "locations")

            # L∆∞u k·∫øt qu·∫£ trung gian
            checkpoint_file = f"extracted_data/processed_data_batch_{i+1}.csv"
            df.iloc[:end_idx].to_csv(checkpoint_file, index=False)
            logging.info(f"ƒê√£ l∆∞u k·∫øt qu·∫£ trung gian v√†o {checkpoint_file} sau batch {i+1}/{total_batches}")

    # Th·ªëng k√™ s·ªë l∆∞·ª£ng th·ª±c th·ªÉ ƒë√£ t√¨m th·∫•y
    food_count = len(self.foods)
    location_count = len(self.locations)

    logging.info(f"Tr√≠ch xu·∫•t ho√†n t·∫•t. T√¨m th·∫•y {food_count} th·ª±c th·ªÉ m√≥n ƒÉn v√† {location_count} th·ª±c th·ªÉ ƒë·ªãa ƒëi·ªÉm.")

    return df

def bootstrap_entity_lists(self, df, text_column="preprocessed_text", min_freq=3):
    """
    M·ªü r·ªông danh s√°ch th·ª±c th·ªÉ b·∫±ng TF-IDF ƒë·ªÉ t√¨m c√°c th·ª±c th·ªÉ ti·ªÅm nƒÉng.
    
    Tham s·ªë:
        df (pd.DataFrame): DataFrame ch·ª©a d·ªØ li·ªáu vƒÉn b·∫£n.
        text_column (str): T√™n c·ªôt ch·ª©a vƒÉn b·∫£n ƒë√£ ti·ªÅn x·ª≠ l√Ω.
        min_freq (int): S·ªë l·∫ßn xu·∫•t hi·ªán t·ªëi thi·ªÉu ƒë·ªÉ xem x√©t m·ªôt th·ª±c th·ªÉ.

    Tr·∫£ v·ªÅ:
        set: T·∫≠p h·ª£p c√°c th·ª±c th·ªÉ m√≥n ƒÉn m·ªõi ƒë∆∞·ª£c nh·∫≠n di·ªán.
    """
    if df.empty or text_column not in df.columns:
        logging.error(f"Kh√¥ng th·ªÉ m·ªü r·ªông th·ª±c th·ªÉ: DataFrame kh√¥ng h·ª£p l·ªá ho·∫∑c thi·∫øu c·ªôt '{text_column}'")
        return set()

    # L·ªçc ra c√°c vƒÉn b·∫£n h·ª£p l·ªá
    valid_texts = df[text_column].dropna().replace('', pd.NA).dropna().tolist()

    if not valid_texts:
        logging.warning("Kh√¥ng t√¨m th·∫•y vƒÉn b·∫£n h·ª£p l·ªá ƒë·ªÉ m·ªü r·ªông th·ª±c th·ªÉ")
        return set()

    try:
        min_df_val = max(1, min(min_freq, len(valid_texts) // 2))
        
        tfidf = TfidfVectorizer(
            ngram_range=(1, 3),  # X√©t c√°c n-gram t·ª´ 1 ƒë·∫øn 3 t·ª´
            min_df=min_df_val,  # ƒêi·ªÅu ch·ªânh min_df
            max_df=0.9  # Lo·∫°i b·ªè c√°c c·ª•m t·ª´ qu√° ph·ªï bi·∫øn
        )

        tfidf_matrix = tfidf.fit_transform(valid_texts)
        feature_names = tfidf.get_feature_names_out()

        # L·∫•y danh s√°ch n-gram c√≥ gi√° tr·ªã TF-IDF cao
        important_ngrams = []
        for i in range(min(tfidf_matrix.shape[0], 100)):
            feature_index = tfidf_matrix[i,:].nonzero()[1]
            tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
            # S·∫Øp x·∫øp theo ƒëi·ªÉm TF-IDF gi·∫£m d·∫ßn
            for idx, score in sorted(tfidf_scores, key=lambda x: x[1], reverse=True)[:20]:
                important_ngrams.append(feature_names[idx])

        # L·ªçc c√°c c·ª•m t·ª´ c√≥ th·ªÉ l√† t√™n m√≥n ƒÉn (d·ª±a v√†o t·ª´ ch·ªâ m√≥n ƒÉn)
        potential_foods = set()
        for text in valid_texts:
            for indicator in self.food_indicators:
                if indicator in text:
                    for ngram in important_ngrams:
                        # Ki·ªÉm tra n·∫øu ngram xu·∫•t hi·ªán g·∫ßn t·ª´ ch·ªâ m√≥n ƒÉn
                        if ngram in text and re.search(r'\b' + re.escape(indicator) + r'.{0,30}' + re.escape(ngram), text, re.IGNORECASE):
                            potential_foods.add(ngram)
                        if ngram in text and re.search(r'\b' + re.escape(ngram) + r'.{0,30}' + re.escape(indicator), text, re.IGNORECASE):
                            potential_foods.add(ngram)

        # L·ªçc b·ªè c√°c th·ª±c th·ªÉ kh√¥ng h·ª£p l·ªá (qu√° ng·∫Øn, ch·ªâ ch·ª©a s·ªë, v.v.)
        filtered_foods = {food for food in potential_foods if len(food) > 2 and not food.isdigit()}

        # C·∫≠p nh·∫≠t danh s√°ch m√≥n ƒÉn
        self.foods.update(filtered_foods)
        logging.info(f"ƒê√£ th√™m {len(filtered_foods)} th·ª±c th·ªÉ m√≥n ƒÉn ti·ªÅm nƒÉng t·ª´ m·ªü r·ªông th·ª±c th·ªÉ")

        return filtered_foods

    except Exception as e:
        logging.error(f"L·ªói khi m·ªü r·ªông th·ª±c th·ªÉ: {e}")
        logging.error(traceback.format_exc())
        return set()

VietnameseTextProcessor.process_dataframe = process_dataframe
VietnameseTextProcessor.bootstrap_entity_lists = bootstrap_entity_lists

In [183]:
def main():
    try:
        # T·∫°o m·ªôt th·ªÉ hi·ªán c·ªßa b·ªô x·ª≠ l√Ω vƒÉn b·∫£n
        processor = VietnameseTextProcessor()

        # T·∫£i t·∫≠p d·ªØ li·ªáu
        logging.info("ƒêang t·∫£i t·∫≠p d·ªØ li·ªáu...")
        try:
            # df = pd.read_csv("/content/21KHDL-TikTok-Analytics/data/interim/small_video_transcription.csv")
            df = pd.read_csv("C:/Users/nguye/OneDrive/TaÃÄi li√™Ã£u/GitHub/21KHDL-TikTok-Analytics/data/interim/small_video_transcription.csv")
            if df.empty:
                logging.error("T·∫≠p d·ªØ li·ªáu ƒë∆∞·ª£c t·∫£i v·ªÅ tr·ªëng")
                return
            logging.info(f"T·∫≠p d·ªØ li·ªáu ƒë√£ t·∫£i c√≥ {len(df)} d√≤ng")
        except Exception as e:
            logging.error(f"L·ªói khi t·∫£i t·∫≠p d·ªØ li·ªáu: {e}")
            logging.error(traceback.format_exc())
            return

        # X·ª≠ l√Ω m·ªôt m·∫´u nh·ªè ƒë·ªÉ ki·ªÉm th·ª≠ (s·ª≠ d·ª•ng .head(10) ƒë·ªÉ th·ª≠ nghi·ªám, x√≥a b·ªè ƒë·ªÉ x·ª≠ l√Ω to√†n b·ªô)
        sample_df = df.head(10)

        # X·ª≠ l√Ω d·ªØ li·ªáu vƒÉn b·∫£n
        logging.info("B·∫Øt ƒë·∫ßu x·ª≠ l√Ω vƒÉn b·∫£n v√† tr√≠ch xu·∫•t th·ª±c th·ªÉ...")
        processed_df = processor.process_dataframe(sample_df, text_column='video_transcription')

        # M·ªü r·ªông danh s√°ch th·ª±c th·ªÉ b·∫±ng ph∆∞∆°ng ph√°p bootstrapping
        logging.info("Th·ª±c hi·ªán bootstrapping ƒë·ªÉ m·ªü r·ªông danh s√°ch th·ª±c th·ªÉ...")
        processor.bootstrap_entity_lists(processed_df)

        # L∆∞u k·∫øt qu·∫£ cu·ªëi c√πng
        processed_df.to_csv("extracted_data/fully_processed_data.csv", index=False)
        processor.save_entity_list(processor.foods, "foods")
        processor.save_entity_list(processor.locations, "locations")

        # L∆∞u k·∫øt qu·∫£ c√≥ c·∫•u tr√∫c d∆∞·ªõi d·∫°ng JSON g·ªìm video_id, author_id v√† c√°c th·ª±c th·ªÉ tr√≠ch xu·∫•t
        structured_data = []
        for _, row in processed_df.iterrows():
            structured_data.append({
                'video_id': row.get('video_id', ''),
                'author_id': row.get('author_id', ''),
                'extracted_entities': {
                    'foods': row.get('extracted_foods', []),
                    'locations': row.get('extracted_locations', []),
                    'tastes': row.get('extracted_tastes', [])
                }
            })

        with open("extracted_data/structured_entities.json", 'w', encoding='utf-8') as f:
            json.dump(structured_data, f, ensure_ascii=False, indent=2)

        logging.info("Qu√° tr√¨nh x·ª≠ l√Ω ho√†n t·∫•t. K·∫øt qu·∫£ ƒë√£ ƒë∆∞·ª£c l∆∞u trong th∆∞ m·ª•c 'extracted_data'.")

    except Exception as e:
        logging.error(f"L·ªói nghi√™m tr·ªçng trong h√†m main: {e}")
        logging.error(traceback.format_exc())

In [184]:
if __name__ == "__main__":
    main()

2025-03-09 23:18:40,074 - INFO - Kh√¥ng t√¨m th·∫•y danh s√°ch foods hi·ªán c√≥, b·∫Øt ƒë·∫ßu v·ªõi t·∫≠p r·ªóng
2025-03-09 23:18:40,074 - INFO - Kh√¥ng t√¨m th·∫•y danh s√°ch locations hi·ªán c√≥, b·∫Øt ƒë·∫ßu v·ªõi t·∫≠p r·ªóng
2025-03-09 23:18:40,078 - INFO - ƒêang t·∫£i t·∫≠p d·ªØ li·ªáu...
2025-03-09 23:18:40,332 - INFO - T·∫≠p d·ªØ li·ªáu ƒë√£ t·∫£i c√≥ 10673 d√≤ng
2025-03-09 23:18:40,333 - INFO - B·∫Øt ƒë·∫ßu x·ª≠ l√Ω vƒÉn b·∫£n v√† tr√≠ch xu·∫•t th·ª±c th·ªÉ...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['preprocessed_text'] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#

In [185]:
def create_food_prompts():
    food_prompt_template = """
    H√£y ph√¢n t√≠ch th√¥ng tin v·ªÅ m√≥n ƒÉn sau ƒë√¢y:
    
    Danh s√°ch m√≥n ƒÉn: {foods}
    
    Y√™u c·∫ßu:
    1. Ph√¢n lo·∫°i c√°c m√≥n ƒÉn th√†nh c√°c nh√≥m (v√≠ d·ª•: m√≥n n∆∞·ªõc, m√≥n n∆∞·ªõng, ƒë·ªì u·ªëng, etc.)
    2. X√°c ƒë·ªãnh c√°c m√≥n ƒë·∫∑c tr∆∞ng nh·∫•t
    3. ƒê·ªÅ xu·∫•t m√≥n ƒÉn ph·ªï bi·∫øn nh·∫•t d·ª±a tr√™n t·∫ßn su·∫•t xu·∫•t hi·ªán
    4. Li√™n k·∫øt m√≥n ƒÉn v·ªõi vƒÉn h√≥a ·∫©m th·ª±c Vi·ªát Nam
    5. ƒê·ªÅ xu·∫•t c√°c k·∫øt h·ª£p m√≥n ƒÉn ph√π h·ª£p
    
    Vui l√≤ng tr√¨nh b√†y k·∫øt qu·∫£ m·ªôt c√°ch chi ti·∫øt v√† c√≥ c·∫•u tr√∫c.
    """
    return food_prompt_template

def create_location_prompts():
    location_prompt_template = """
    H√£y ph√¢n t√≠ch th√¥ng tin v·ªÅ ƒë·ªãa ƒëi·ªÉm ·∫©m th·ª±c sau ƒë√¢y:
    
    Danh s√°ch ƒë·ªãa ƒëi·ªÉm: {locations}
    
    Y√™u c·∫ßu:
    1. Nh√≥m c√°c ƒë·ªãa ƒëi·ªÉm theo khu v·ª±c (qu·∫≠n/huy·ªán)
    2. X√°c ƒë·ªãnh c√°c khu v·ª±c ·∫©m th·ª±c n·ªïi ti·∫øng
    3. ƒê·ªÅ xu·∫•t tuy·∫øn ƒë∆∞·ªùng kh√°m ph√° ·∫©m th·ª±c
    4. Li√™n k·∫øt ƒë·ªãa ƒëi·ªÉm v·ªõi ƒë·∫∑c tr∆∞ng ·∫©m th·ª±c
    5. X√°c ƒë·ªãnh c√°c ƒëi·ªÉm ·∫©m th·ª±c c√≥ m·∫≠t ƒë·ªô cao
    
    Vui l√≤ng ph√¢n t√≠ch v√† ƒë∆∞a ra c√°c g·ª£i √Ω chi ti·∫øt cho ng∆∞·ªùi d√πng.
    """
    return location_prompt_template

def analyze_food_locations(structured_data):
    """Analyze food and location data using Gemini API"""
    
    import google.generativeai as genai
    from collections import Counter
    
    # Configure API
    genai.configure(api_key='AIzaSyD1WFlkEtQnFVDJCPbnitmaHQVdw2pXRK4')
    model = genai.GenerativeModel('models/gemini-2.0-flash-thinking-exp-1219')
    
    # Extract unique foods and locations
    all_foods = []
    all_locations = []
    
    for item in structured_data:
        all_foods.extend(item['extracted_entities']['foods'])
        all_locations.extend(item['extracted_entities']['locations'])
    
    # Count frequencies
    food_counts = Counter(all_foods)
    location_counts = Counter(all_locations)
    
    # Create prompts
    food_prompt = create_food_prompts().format(
        foods="\n".join(f"- {food} (xu·∫•t hi·ªán {count} l·∫ßn)" 
                       for food, count in food_counts.most_common())
    )
    
    location_prompt = create_location_prompts().format(
        locations="\n".join(f"- {loc} (xu·∫•t hi·ªán {count} l·∫ßn)"
                           for loc, count in location_counts.most_common())
    )
    
    # Get responses from Gemini
    food_analysis = model.generate_content(food_prompt)
    location_analysis = model.generate_content(location_prompt)
    
    return {
        'food_analysis': food_analysis.text,
        'location_analysis': location_analysis.text,
        'statistics': {
            'total_unique_foods': len(set(all_foods)),
            'total_unique_locations': len(set(all_locations)),
            'most_common_foods': dict(food_counts.most_common(10)),
            'most_common_locations': dict(location_counts.most_common(10))
        }
    }

In [None]:
# Example usage
import json

# Load structured data
with open('C:/Users/nguye/OneDrive/TaÃÄi li√™Ã£u/GitHub/21KHDL-TikTok-Analytics/notebooks/extracted_data/structured_entities.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Run analysis
results = analyze_food_locations(data)

# Save results
with open('food_location_analysis.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

# Print summary
print("Ph√¢n t√≠ch m√≥n ƒÉn v√† ƒë·ªãa ƒëi·ªÉm:")
print("\nTop 5 m√≥n ƒÉn ph·ªï bi·∫øn nh·∫•t:")
for food, count in list(results['statistics']['most_common_foods'].items())[:10]:
    print(f"- {food}: {count} l·∫ßn")

print("\nTop 5 ƒë·ªãa ƒëi·ªÉm ph·ªï bi·∫øn nh·∫•t:")
for loc, count in list(results['statistics']['most_common_locations'].items())[:10]:
    print(f"- {loc}: {count} l·∫ßn")

Ph√¢n t√≠ch m√≥n ƒÉn v√† ƒë·ªãa ƒëi·ªÉm:

Top 5 m√≥n ƒÉn ph·ªï bi·∫øn nh·∫•t:
- g√†: 9 l·∫ßn
- c√°: 8 l·∫ßn
- th·ªãt: 7 l·∫ßn
- ·ªëc: 7 l·∫ßn
- n∆∞·ªõng: 6 l·∫ßn

Top 5 ƒë·ªãa ƒëi·ªÉm ph·ªï bi·∫øn nh·∫•t:
- b√¨nh th·∫°nh: 2 l·∫ßn
- th·ªß ƒë·ª©c: 1 l·∫ßn
- c·ªß chi: 1 l·∫ßn
- t√¢n b√¨nh: 1 l·∫ßn
- ph√∫ nhu·∫≠n: 1 l·∫ßn
