In [1]:
%%shell
git clone --branch vmphat --single-branch https://github.com/vphuhan/21KHDL-TikTok-Analytics.git
cd 21KHDL-TikTok-Analytics
git sparse-checkout init --cone
git sparse-checkout set data/interim
git checkout

fatal: destination path '21KHDL-TikTok-Analytics' already exists and is not an empty directory.
Your branch is up to date with 'origin/vmphat'.




In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/content/21KHDL-TikTok-Analytics/data/interim/audio_text.csv")
# Drop rows with empty text
df = df.dropna(subset=['text'])

In [3]:
# Function to check if text is too short
def is_text_short(text, min_words=10):
    if not isinstance(text, str):  # Handle non-string values (e.g., NaN)
        return True
    words = text.split()  # Split text into words
    return len(words) < min_words

# Drop rows where 'text' is too short (e.g., fewer than 10 words)
min_words = 10  # Set your threshold here
df = df[~df['text'].apply(is_text_short, min_words=min_words)]

# Reset the index (optional)
df = df.reset_index(drop=True).head(20)

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-large-vietnews-summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-large-vietnews-summarization")

def summarize(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Add a summary column to the DataFrame
df["summary"] = df["text"].apply(summarize)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
from transformers import pipeline

# Load NER pipeline
ner_pipeline = pipeline("ner", model="xlm-roberta-large-finetuned-conll03-english", aggregation_strategy="simple")

def extract_entities(text):
    entities = ner_pipeline(text)
    return [{"entity": entity["word"], "type": entity["entity_group"]} for entity in entities]

# Add an entities column to the DataFrame
df["entities"] = df["text"].apply(extract_entities)

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [6]:
from transformers import pipeline

# Define your candidate labels (customize these)
candidate_labels = ["đồ ăn", "nấu nướng", "công nghệ", "đất nước"]

# Load zero-shot pipeline
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")

def classify_text(text):
    result = classifier(text, candidate_labels)
    return result["labels"][0]  # Return the top predicted label

# Add a topic column to the DataFrame
df["topic"] = df["text"].apply(classify_text)

Device set to use cpu


In [7]:
df.to_csv("audio_text_processed.csv", index=False)