In [11]:
# %%shell
# git clone --branch TrggTin --single-branch https://github.com/vphuhan/21KHDL-TikTok-Analytics.git
# cd 21KHDL-TikTok-Analytics
# git sparse-checkout init --cone
# git sparse-checkout set data/interim
# git checkout

In [1]:
# pip install pandas nltk underthesea scikit-learn tqdm

# Imports and Initialization

In [2]:
import pandas as pd
import re
import unicodedata
import nltk
from underthesea import word_tokenize, pos_tag, ner
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import get_close_matches
import logging
import json
import os
from tqdm import tqdm
import string
import regex as re
import traceback
import jdc  

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("extraction_log.log"),
        logging.StreamHandler()
    ]
)

In [4]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# VietnameseTextProcessor Class Definition

In [5]:
class VietnameseTextProcessor:
    def __init__(self, food_list_path=None, location_list_path=None):
        """
        Initialize the Vietnamese Text Processor

        Args:
            food_list_path (str): Path to JSON file with known Vietnamese food items
            location_list_path (str): Path to JSON file with known Vietnamese locations
        """
        # Load or initialize food and location lists
        self.foods = self._load_entity_list(food_list_path, "foods")
        self.locations = self._load_entity_list(location_list_path, "locations")

        # Common Vietnamese food and taste related words for bootstrapping
        self.food_indicators = [
            "bánh", "phở", "bún", "xèo", "cơm", "gỏi", "chả", "xôi", "cao lầu", "cháo",
            "mì", "hủ tiếu", "nem", "ram", "khọt",
            "lẩu", "cá", "thịt", "canh", "rau", "đậu", "nướng", "ốc", "súp", "bắp",
            "chuối", "nộm", "trà", "cà phê", "sinh tố", "kem", "tàu hủ", "chè", "yaourt", "nước mía",
            "sữa", "kẹo", "đa", "nem chua", "gà", "món", "ăn"
        ]

        self.taste_indicators = [
            "ngon", "ngọt", "chua", "cay", "đắng", "mặn", "bùi", "béo", "giòn", "mềm",
            "thơm", "nồng", "đậm đà", "nhạt", "thanh", "tươi", "ướp", "rim", "kho", "xào",
            "nướng", "luộc", "hấp", "chiên", "xốt", "tẩm", "ướt", "khô", "giòn tan", "dai",
            "sần sật", "mọng nước", "đắng nghét", "chát", "cay xè", "tê", "mặn chát", "ngọt lịm", "béo ngậy", "thơm lừng",
            "nồng nàn", "đậm vị", "nhạt nhẽo", "thanh mát", "tươi rói", "tươi ngon", "đậm đà hương vị", "vừa ăn", "hợp khẩu vị"
        ]

        self.locations_indicators = [
            "Quận", "Huyện", "Phường", "Xã", "Thành phố", "TP", "Tỉnh", "đường", "phố", "chợ", "địa chỉ", "nằm ở", "tại",
            "Quận 1", "Quận 2", "Quận 3", "Quận 4", "Quận 5", "Quận 6", "Quận 7", "Quận 8", "Quận 9", "Quận 10",
            "Quận 11", "Quận 12", "Bình Thạnh", "Tân Bình", "Tân Phú", "Phú Nhuận", "Gò Vấp", "Bình Tân", "Thủ Đức", "Hóc Môn",
            "Củ Chi", "Nhà Bè", "Cần Giờ", "Bình Chánh", "TP Thủ Đức",
            "Hà Nội", "Hồ Chí Minh", "Đà Nẵng", "Hải Phòng", "Cần Thơ", "Huế", "Nha Trang", "Vũng Tàu", "Đà Lạt",
            "Hạ Long", "Mỹ Tho", "Long Xuyên", "Rạch Giá", "Cà Mau", "Biên Hòa", "Buôn Ma Thuột", "Thái Nguyên", "Nam Định"
        ]

        # Load NLTK resources if needed
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

        # Create directory for output files
        os.makedirs("extracted_data", exist_ok=True)

# Helper Methods

In [6]:
def _load_entity_list(self, file_path, entity_type):
    """Load entity list from file or return default empty set"""
    if file_path and os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return set(json.load(f))
        except Exception as e:
            logging.warning(f"Error loading {entity_type} list: {e}")

    logging.info(f"No existing {entity_type} list found, starting with empty set")
    return set()

def save_entity_list(self, entity_list, entity_type):
    """Save updated entity list to file"""
    file_path = f"extracted_data/{entity_type}_list.json"
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(list(entity_list), f, ensure_ascii=False, indent=2)
    logging.info(f"Saved {len(entity_list)} {entity_type} to {file_path}")

def normalize_vietnamese_text(self, text):
    """Normalize Vietnamese text by handling diacritics and case"""
    if not isinstance(text, str):
        return ""

    # Normalize Unicode characters
    text = unicodedata.normalize('NFC', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def clean_text(self, text):
    """Clean the text by removing special characters and normalizing"""
    if not isinstance(text, str):
        return ""

    # Normalize text
    text = self.normalize_vietnamese_text(text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove emojis and special characters while preserving Vietnamese characters
    # This regex keeps Vietnamese letters, numbers, punctuation and whitespace
    vietnamese_pattern = r'[^\p{L}\p{N}\p{P}\s]+'
    text = re.sub(vietnamese_pattern, '', text, flags=re.UNICODE)

    # Fix spacing around punctuation
    text = re.sub(r'\s+([.,;:?!])', r'\1', text)

    return text

def correct_common_misspellings(self, text):
    """Attempt to correct common misspellings in Vietnamese text"""
    words = word_tokenize(text)
    corrected_words = []

    # Dictionary of common misspellings
    common_corrections = {
        "pho": "phở",
        "bun": "bún",
        "banh": "bánh",
        "com": "cơm",
        "hu tieu": "hủ tiếu",
        "goi": "gỏi",
        "ca phe": "cà phê",
        "nuoc mia": "nước mía",
        "che": "chè",
    }

    for word in words:
        # Check if word is in common corrections
        if word.lower() in common_corrections:
            corrected_words.append(common_corrections[word.lower()])
        else:
            corrected_words.append(word)

    return " ".join(corrected_words)

def preprocess_text(self, text):
    """Apply all preprocessing steps to the text"""
    if not isinstance(text, str) or not text.strip():
        return ""

    try:
        text = self.clean_text(text)
        text = self.correct_common_misspellings(text)
        return text
    except Exception as e:
        logging.error(f"Error preprocessing text: {e}")
        logging.error(traceback.format_exc())
        return text if isinstance(text, str) else ""
    
VietnameseTextProcessor._load_entity_list = _load_entity_list
VietnameseTextProcessor.save_entity_list = save_entity_list 
VietnameseTextProcessor.normalize_vietnamese_text = normalize_vietnamese_text
VietnameseTextProcessor.clean_text = clean_text
VietnameseTextProcessor.correct_common_misspellings = correct_common_misspellings
VietnameseTextProcessor.preprocess_text = preprocess_text

# Entity Extraction Methods

In [7]:
def extract_entities_from_ner(self, text):
    """Extract entities using underthesea NER"""
    locations = []

    try:
        ner_tags = ner(text)

        # Check if ner_tags has the expected format
        if not ner_tags:
            return locations

        # Extract locations from NER
        current_loc = []

        for item in ner_tags:
            # Handle different output formats from NER
            if isinstance(item, (list, tuple)) and len(item) == 2:
                word, tag = item
            else:
                continue

            if tag.startswith('B-LOC'):
                if current_loc:
                    locations.append(' '.join(current_loc))
                    current_loc = []
                current_loc.append(word)
            elif tag.startswith('I-LOC') and current_loc:
                current_loc.append(word)
            elif current_loc:
                locations.append(' '.join(current_loc))
                current_loc = []

        # Add the last location if it exists
        if current_loc:
            locations.append(' '.join(current_loc))

    except Exception as e:
        logging.error(f"Error in NER extraction: {e}")
        logging.error(traceback.format_exc())

    return locations

def extract_entities_from_patterns(self, text, sentences, pos_tags):
    """Extract entities using pattern matching"""
    foods = []
    locations = []
    tastes = []

    # Process each sentence for entity extraction
    for idx, sentence in enumerate(sentences):
        words = word_tokenize(sentence)
        sentence_pos_tags = pos_tags[idx] if idx < len(pos_tags) else []

        # Find food entities
        self._extract_food_entities(sentence, sentence_pos_tags, foods)

        # Find location entities
        self._extract_location_entities(sentence, sentence_pos_tags, locations)

        # Find taste descriptions
        self._extract_taste_descriptions(sentence, words, tastes)

    return foods, locations, tastes

def _extract_food_entities(self, sentence, pos_tags, foods):
    """Extract food entities from a sentence"""
    # Look for direct matches from existing food list
    for food in self.foods:
        if food.lower() in sentence.lower():
            foods.append(food)

    # Look for food indicator words
    for idx, (word, tag) in enumerate(pos_tags):
        if word.lower() in self.food_indicators:
            # Look ahead for potential food name (noun phrases)
            noun_phrase = []
            for i in range(1, 4):
                if idx + i < len(pos_tags):
                    next_word, next_tag = pos_tags[idx + i]
                    if next_tag.startswith(('N', 'A')):  # Noun or Adjective
                        noun_phrase.append(next_word)
                    else:
                        break

            if noun_phrase:
                food_name = " ".join(noun_phrase)
                foods.append(food_name)
                self.foods.add(food_name)

def _extract_location_entities(self, sentence, pos_tags, locations):
    """Extract location entities from a sentence"""
    # Look for direct matches from existing location list
    for location in self.locations:
        if location.lower() in sentence.lower():
            locations.append(location)

    # Look for location indicator words
    for idx, (word, tag) in enumerate(pos_tags):
        if any(indicator.lower() in word.lower() for indicator in self.locations_indicators):
            # Look ahead for potential location name (noun phrases)
            noun_phrase = []
            for i in range(1, 4):
                if idx + i < len(pos_tags):
                    next_word, next_tag = pos_tags[idx + i]
                    if next_tag.startswith(('N', 'M', 'Np')):  # Noun, Number, Proper noun
                        noun_phrase.append(next_word)
                    else:
                        break

            if noun_phrase:
                location_name = " ".join(noun_phrase)
                locations.append(location_name)
                self.locations.add(location_name)

def _extract_taste_descriptions(self, sentence, words, tastes):
    """Extract taste descriptions from a sentence"""
    for taste_word in self.taste_indicators:
        if taste_word in sentence.lower():
            # Find the position of the taste word
            taste_idx = -1
            for idx, word in enumerate(words):
                if taste_word in word.lower():
                    taste_idx = idx
                    break

            if taste_idx >= 0:
                # Extract surrounding context
                start = max(0, taste_idx - 3)
                end = min(len(words), taste_idx + 4)  # Increased range
                taste_phrase = " ".join(words[start:end])
                tastes.append(taste_phrase)

def extract_entities(self, text):
    """Extract food, location, and taste entities from text"""
    if not text or not isinstance(text, str):
        return {"foods": [], "locations": [], "tastes": []}

    try:
        results = {
            "foods": [],
            "locations": [],
            "tastes": []
        }

        # Extract locations using NER
        ner_locations = self.extract_entities_from_ner(text)
        results["locations"].extend(ner_locations)
        self.locations.update(ner_locations)

        # Extract entities using pattern matching
        sentences = nltk.sent_tokenize(text)
        pos_tags = [pos_tag(sent) for sent in sentences]

        foods, locations, tastes = self.extract_entities_from_patterns(text, sentences, pos_tags)

        results["foods"].extend(foods)
        results["locations"].extend(locations)
        results["tastes"].extend(tastes)

        # Update entity sets
        self.foods.update(foods)
        self.locations.update(locations)

        # Remove duplicates and filter out empty strings
        for key in results:
            results[key] = list(set(filter(None, results[key])))

        return results

    except Exception as e:
        logging.error(f"Error extracting entities: {e}")
        logging.error(traceback.format_exc())
        return {"foods": [], "locations": [], "tastes": []}
    
VietnameseTextProcessor.extract_entities_from_ner = extract_entities_from_ner
VietnameseTextProcessor.extract_entities_from_patterns = extract_entities_from_patterns
VietnameseTextProcessor._extract_food_entities = _extract_food_entities
VietnameseTextProcessor._extract_location_entities = _extract_location_entities
VietnameseTextProcessor._extract_taste_descriptions = _extract_taste_descriptions
VietnameseTextProcessor.extract_entities = extract_entities

# DataFrame Processing and Bootstrapping

In [8]:
def process_dataframe(self, df, text_column="video_transcription", batch_size=100):
    """Process the entire dataframe and extract entities

    Args:
        df (pd.DataFrame): Dataframe with text data
        text_column (str): Name of the column containing text
        batch_size (int): Process in batches to save memory

    Returns:
        pd.DataFrame: Original dataframe with extracted entity columns
    """
    # Check if dataframe is empty or text column doesn't exist
    if df.empty or text_column not in df.columns:
        logging.error(f"Invalid dataframe or missing column '{text_column}'")
        return df

    # Create output directory if it doesn't exist
    os.makedirs("extracted_data", exist_ok=True)

    # Initialize columns for extracted entities
    df['preprocessed_text'] = ""
    df['extracted_foods'] = None
    df['extracted_locations'] = None
    df['extracted_tastes'] = None

    total_batches = (len(df) + batch_size - 1) // batch_size

    for i in tqdm(range(total_batches), desc="Processing batches"):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df))

        batch = df.iloc[start_idx:end_idx].copy()

        # Preprocess texts
        batch['preprocessed_text'] = batch[text_column].apply(self.preprocess_text)

        # Extract entities
        entities_list = []
        for text in batch['preprocessed_text']:
            entities_list.append(self.extract_entities(text))

        # Update dataframe with extracted entities
        batch['extracted_foods'] = [data['foods'] for data in entities_list]
        batch['extracted_locations'] = [data['locations'] for data in entities_list]
        batch['extracted_tastes'] = [data['tastes'] for data in entities_list]

        # Update original dataframe
        df.iloc[start_idx:end_idx] = batch

        # Save intermediate results periodically
        if (i + 1) % 5 == 0 or (i + 1) == total_batches:
            self.save_entity_list(self.foods, "foods")
            self.save_entity_list(self.locations, "locations")

            # Save intermediate results
            checkpoint_file = f"extracted_data/processed_data_batch_{i+1}.csv"
            df.iloc[:end_idx].to_csv(checkpoint_file, index=False)
            logging.info(f"Saved intermediate results to {checkpoint_file} after batch {i+1}/{total_batches}")

    # Generate statistics
    food_count = len(self.foods)
    location_count = len(self.locations)

    logging.info(f"Extraction complete. Found {food_count} unique foods and {location_count} unique locations.")

    return df

def bootstrap_entity_lists(self, df, text_column="preprocessed_text", min_freq=3):
    """Bootstrap entity lists using TF-IDF to find potential new entities"""
    if df.empty or text_column not in df.columns:
        logging.error(f"Cannot bootstrap: Invalid dataframe or missing column '{text_column}'")
        return set()

    # Filter out empty texts
    valid_texts = df[text_column].dropna().replace('', pd.NA).dropna().tolist()

    if not valid_texts:
        logging.warning("No valid texts found for bootstrapping")
        return set()

    try:
        # Use TF-IDF to find important n-grams
        tfidf = TfidfVectorizer(
            ngram_range=(1, 3),
            min_df=min_freq,
            max_df=0.9
        )

        tfidf_matrix = tfidf.fit_transform(valid_texts)
        feature_names = tfidf.get_feature_names_out()

        # Get high TF-IDF n-grams
        important_ngrams = []
        for i in range(min(tfidf_matrix.shape[0], 100)):  # Limit to first 100 docs for efficiency
            feature_index = tfidf_matrix[i,:].nonzero()[1]
            tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
            # Sort by descending score
            for idx, score in sorted(tfidf_scores, key=lambda x: x[1], reverse=True)[:20]:
                important_ngrams.append(feature_names[idx])

        # Filter to likely food names (those appearing near food indicators)
        potential_foods = set()
        for text in valid_texts:
            for indicator in self.food_indicators:
                if indicator in text:
                    for ngram in important_ngrams:
                        # Check if ngram is near a food indicator
                        if ngram in text and re.search(r'\b' + re.escape(indicator) + r'.{0,30}' + re.escape(ngram), text, re.IGNORECASE):
                            potential_foods.add(ngram)

                        # Also check for ngrams that appear before the indicator
                        if ngram in text and re.search(r'\b' + re.escape(ngram) + r'.{0,30}' + re.escape(indicator), text, re.IGNORECASE):
                            potential_foods.add(ngram)

        # Filter out unlikely food candidates (too short, numbers only, etc.)
        filtered_foods = {food for food in potential_foods if len(food) > 2 and not food.isdigit()}

        # Update food list
        self.foods.update(filtered_foods)
        logging.info(f"Added {len(filtered_foods)} potential foods from bootstrapping")

        return filtered_foods

    except Exception as e:
        logging.error(f"Error in bootstrapping: {e}")
        logging.error(traceback.format_exc())
        return set()

VietnameseTextProcessor.process_dataframe = process_dataframe
VietnameseTextProcessor.bootstrap_entity_lists = bootstrap_entity_lists

In [9]:
def main():
    try:
        # Create processor instance
        processor = VietnameseTextProcessor()

        # Load dataset
        logging.info("Loading dataset...")
        try:
            # df = pd.read_csv("/content/21KHDL-TikTok-Analytics/data/interim/small_video_transcription.csv")
            df = pd.read_csv("C:/Users/nguye/OneDrive/Tài liệu/GitHub/21KHDL-TikTok-Analytics/data/interim/small_video_transcription.csv")
            if df.empty:
                logging.error("Loaded dataset is empty")
                return
            logging.info(f"Loaded dataset with {len(df)} rows")
        except Exception as e:
            logging.error(f"Error loading dataset: {e}")
            logging.error(traceback.format_exc())
            return

        # Process a sample for testing (use .head(10) for testing, remove for full processing)
        sample_df = df.head(2)

        # Process the dataframe
        logging.info("Starting text processing and entity extraction...")
        processed_df = processor.process_dataframe(sample_df, text_column='video_transcription')

        # Bootstrap to find more potential entities
        logging.info("Bootstrapping to expand entity lists...")
        processor.bootstrap_entity_lists(processed_df)

        # Save final results
        processed_df.to_csv("extracted_data/fully_processed_data.csv", index=False)
        processor.save_entity_list(processor.foods, "foods")
        processor.save_entity_list(processor.locations, "locations")

        # Save a structured JSON with video_id, author_id and extracted entities
        structured_data = []
        for _, row in processed_df.iterrows():
            structured_data.append({
                'video_id': row.get('video_id', ''),
                'author_id': row.get('author_id', ''),
                'extracted_entities': {
                    'foods': row.get('extracted_foods', []),
                    'locations': row.get('extracted_locations', []),
                    'tastes': row.get('extracted_tastes', [])
                }
            })

        with open("extracted_data/structured_entities.json", 'w', encoding='utf-8') as f:
            json.dump(structured_data, f, ensure_ascii=False, indent=2)

        logging.info("Processing complete. Results saved to 'extracted_data' directory.")

    except Exception as e:
        logging.error(f"Fatal error in main function: {e}")
        logging.error(traceback.format_exc())

In [10]:
if __name__ == "__main__":
    main()

2025-03-08 16:34:10,367 - INFO - No existing foods list found, starting with empty set
2025-03-08 16:34:10,367 - INFO - No existing locations list found, starting with empty set
2025-03-08 16:34:10,370 - INFO - Loading dataset...
2025-03-08 16:34:10,648 - INFO - Loaded dataset with 10673 rows
2025-03-08 16:34:10,649 - INFO - Starting text processing and entity extraction...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['preprocessed_text'] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['extracted_foods'] = None
A value is trying to be set on a co