In [1]:
%%shell
git clone --branch vmphat --single-branch https://github.com/vphuhan/21KHDL-TikTok-Analytics.git
cd 21KHDL-TikTok-Analytics
git sparse-checkout init --cone
git sparse-checkout set data/interim
git checkout

Cloning into '21KHDL-TikTok-Analytics'...
remote: Enumerating objects: 906, done.[K
remote: Counting objects: 100% (275/275), done.[K
remote: Compressing objects: 100% (133/133), done.[K
remote: Total 906 (delta 201), reused 142 (delta 142), pack-reused 631 (from 1)[K
Receiving objects: 100% (906/906), 53.50 MiB | 19.40 MiB/s, done.
Resolving deltas: 100% (457/457), done.
Your branch is up to date with 'origin/vmphat'.




In [4]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/content/21KHDL-TikTok-Analytics/data/interim/audio_text.csv")
# Drop rows with empty text
df = df.dropna(subset=['text']).head(20)

In [None]:
import google.generativeai as genai
import pandas as pd
import os
from tqdm import tqdm

# Setup API key
# Replace with your actual Gemini API key
os.environ["GOOGLE_API_KEY"] = ""
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Initialize Gemini model
model = genai.GenerativeModel('gemini-pro')



def extract_entities_with_gemini(text, custom_categories=None):
    """
    Extract named entities using Gemini API with support for custom categories.

    Args:
        text (str): The text to extract entities from
        custom_categories (dict, optional): Dictionary with category names as keys and lists of terms as values
                                           Example: {"COLOR": ["red", "blue"], "FOOD": ["pizza", "sushi"]}

    Returns:
        list: List of dictionaries with entity and type information
    """
    custom_categories = {
    "COLOR": ["red", "blue", "green", "vàng"],
    "FOOD": ["pizza", "pasta", "sushi", "rice", "bread"],
    "Media": ["YouTube"]
    }

    try:
        # Build prompt based on whether we have custom categories
        if custom_categories:
            # Create a description of custom categories for the prompt
            categories_desc = "\n".join([
                f"- {category}: Find items that match any of these terms: {', '.join(terms)}"
                for category, terms in custom_categories.items()
            ])

            prompt = f"""Extract entities from the following text and classify them by type.
            Use standard NER categories (PER for person, ORG for organization, LOC for location, MISC for miscellaneous).

            Additionally, use these custom categories:
            {categories_desc}

            Return the result as a JSON list of objects with 'entity' and 'type' keys.

            Text: {text}

            Entities:"""
        else:
            # Standard NER prompt without custom categories
            prompt = f"""Extract named entities from the following text and classify them by type (PER for person, ORG for organization, LOC for location, MISC for miscellaneous).
            Return the result as a JSON list of objects with 'entity' and 'type' keys.

            Text: {text}

            Entities:"""

        response = model.generate_content(prompt)

        # Process the response to create a list of entity dictionaries
        import json
        try:
            # Try to parse JSON directly from response
            entities = json.loads(response.text)
        except json.JSONDecodeError:
            # If direct parsing fails, extract JSON part from text
            try:
                # Look for JSON-like content in the response
                start_idx = response.text.find('[')
                end_idx = response.text.rfind(']') + 1
                if start_idx >= 0 and end_idx > start_idx:
                    json_str = response.text[start_idx:end_idx]
                    entities = json.loads(json_str)
                else:
                    # Fallback if no JSON structure found
                    entities = []
            except:
                entities = []

        # If we have custom categories, also do a direct matching pass
        if custom_categories and entities:
            # Convert text to lowercase for case-insensitive matching
            lower_text = text.lower()

            # For each custom category and its terms
            for category, terms in custom_categories.items():
                for term in terms:
                    term_lower = term.lower()
                    if term_lower in lower_text:
                        # Find all occurrences of this term
                        start = 0
                        while True:
                            start = lower_text.find(term_lower, start)
                            if start == -1:
                                break

                            # Add as a found entity
                            # Use the original casing from the text
                            original_term = text[start:start+len(term)]
                            entities.append({
                                "entity": original_term,
                                "type": category
                            })

                            start += len(term_lower)

        return entities
    except Exception as e:
        print(f"Error extracting entities: {e}")
        return []

def classify_text_with_gemini(text, candidate_labels=["đồ ăn", "nấu nướng", "công nghệ", "đất nước"]):
    """Classify text into one of the candidate labels using Gemini API"""
    try:
        labels_str = ", ".join(candidate_labels)
        prompt = f"""Classify the following Vietnamese text into exactly one of these categories: {labels_str}

        Text: {text}

        Category:"""

        response = model.generate_content(prompt)

        # Check if the response contains one of our labels
        result = response.text.strip()
        for label in candidate_labels:
            if label.lower() in result.lower():
                return label

        # If no exact match found, return the first candidate as fallback
        return candidate_labels[0]
    except Exception as e:
        print(f"Error classifying text: {e}")
        return candidate_labels[0]

# Function to process the dataframe
def process_dataframe(df):
    # Make a copy to avoid modifying the original
    processed_df = df.copy()

    # Extract entities with progress bar
    print("Extracting entities...")
    tqdm.pandas(desc="Extracting entities")
    processed_df["entities"] = processed_df["text"].progress_apply(extract_entities_with_gemini)

    # Classify text with progress bar
    print("Classifying text...")
    tqdm.pandas(desc="Classifying")
    processed_df["topic"] = processed_df["text"].progress_apply(classify_text_with_gemini)

    return processed_df

# Example usage:
# Assuming df is your DataFrame with a "text" column containing Vietnamese text
# df = pd.read_csv("your_data.csv")
result_df = process_dataframe(df)
result_df.to_csv("processed_data.csv", index=False)

Extracting entities...


Extracting entities:  10%|█         | 2/20 [00:00<00:04,  3.84it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  15%|█▌        | 3/20 [00:01<00:09,  1.87it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  20%|██        | 4/20 [00:01<00:07,  2.11it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.




Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.




Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  35%|███▌      | 7/20 [00:03<00:08,  1.54it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  40%|████      | 8/20 [00:04<00:06,  1.76it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  45%|████▌     | 9/20 [00:05<00:07,  1.51it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  50%|█████     | 10/20 [00:05<00:05,  1.76it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  55%|█████▌    | 11/20 [00:05<00:04,  1.94it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  60%|██████    | 12/20 [00:06<00:03,  2.07it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  65%|██████▌   | 13/20 [00:06<00:03,  2.24it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  70%|███████   | 14/20 [00:07<00:02,  2.37it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  75%|███████▌  | 15/20 [00:07<00:02,  2.34it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  80%|████████  | 16/20 [00:07<00:01,  2.45it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  85%|████████▌ | 17/20 [00:08<00:01,  2.54it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  90%|█████████ | 18/20 [00:08<00:00,  2.55it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities:  95%|█████████▌| 19/20 [00:09<00:00,  2.38it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities: 100%|██████████| 20/20 [00:09<00:00,  2.34it/s]

Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Extracting entities: 100%|██████████| 20/20 [00:12<00:00,  1.62it/s]


Error extracting entities: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.
Classifying text...


Classifying:  10%|█         | 2/20 [00:00<00:03,  5.54it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  15%|█▌        | 3/20 [00:00<00:04,  4.07it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  20%|██        | 4/20 [00:01<00:05,  3.06it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  25%|██▌       | 5/20 [00:01<00:04,  3.03it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  30%|███       | 6/20 [00:01<00:04,  2.93it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  35%|███▌      | 7/20 [00:02<00:04,  2.88it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  40%|████      | 8/20 [00:02<00:04,  2.90it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  45%|████▌     | 9/20 [00:02<00:03,  2.92it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  50%|█████     | 10/20 [00:03<00:03,  2.87it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  55%|█████▌    | 11/20 [00:03<00:03,  2.81it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  60%|██████    | 12/20 [00:04<00:02,  2.73it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  65%|██████▌   | 13/20 [00:04<00:02,  2.74it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  70%|███████   | 14/20 [00:04<00:02,  2.64it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  75%|███████▌  | 15/20 [00:05<00:01,  2.73it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  80%|████████  | 16/20 [00:05<00:01,  2.80it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  85%|████████▌ | 17/20 [00:05<00:01,  2.85it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  90%|█████████ | 18/20 [00:06<00:00,  2.70it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying:  95%|█████████▌| 19/20 [00:06<00:00,  2.34it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying: 100%|██████████| 20/20 [00:07<00:00,  2.45it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.


Classifying: 100%|██████████| 20/20 [00:07<00:00,  2.66it/s]

Error classifying text: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.



