## Data Collection

In [None]:
import tensorflow as tf
import requests
import re
import pandas as pd
import os
import tqdm
import json
%pip install youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi
from openai import OpenAI
import csv
import os.path
import re
import sys
from io import StringIO
import pandas as pd
import argparse
import ast

# OpenAI API key used by the GPT cell. Read it from the OPENAI_API_KEY
# environment variable so the real key never has to be pasted into (and
# committed with) the notebook; the original placeholder is kept as fallback.
api_key = os.environ.get("OPENAI_API_KEY") or "Enter api key"


# Define the feature description to decode the features
# Schema handed to tf.io.parse_single_example by _parse_function:
#   "id"     - a single (scalar) string feature
#   "labels" - a variable-length list of int64 values (parsed as a sparse tensor)
feature_description = {
    "id": tf.io.FixedLenFeature([], tf.string),
    "labels": tf.io.VarLenFeature(tf.int64),
}

# Function to parse a single example
def _parse_function(example_proto):
    """Decode one serialized example according to ``feature_description``.

    Returns whatever ``tf.io.parse_single_example`` produces for that schema
    (a mapping with the "id" and "labels" entries).
    """
    parsed_example = tf.io.parse_single_example(example_proto, feature_description)
    return parsed_example

# Function to extract YouTube ID from the 4-character ID
def get_youtube_id(four_char_id, timeout=10, verify=False):
    """Look up the YouTube video ID behind a 4-character dataset ID.

    Fetches ``https://data.yt8m.org/2/j/i/<first two chars>/<id>.js`` and
    pulls the second quoted argument out of the ``i("...","...");`` payload.

    Args:
        four_char_id: The 4-character ID read from the TFRecord "id" field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.
        verify: Passed to ``requests.get``. Defaults to ``False`` to keep the
            previous behaviour (TLS certificate checks disabled).
            NOTE(review): pass ``verify=True`` if the host's certificate
            validates in your environment.

    Returns:
        The YouTube ID string, or ``None`` when the request fails (network
        error, timeout, HTTP error status) or the payload does not match.
    """
    url = f"https://data.yt8m.org/2/j/i/{four_char_id[:2]}/{four_char_id}.js"

    # Only the network call can raise RequestException, so keep the try small.
    try:
        response = requests.get(url, verify=verify, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses
    except requests.exceptions.RequestException:
        return None

    # Extract the YouTube ID using a regular expression
    match = re.search(r'i\(".*?","(.*?)"\);', response.text)
    return match.group(1) if match else None

# Function to get transcript for a YouTube video
def get_transcript(video_id):
    """Return the transcript of ``video_id`` as one space-joined string.

    Any exception raised while fetching or joining the transcript entries
    results in ``None`` (best-effort lookup).
    """
    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id)
        joined_text = " ".join(entry['text'] for entry in entries)
    except Exception:
        return None
    else:
        return joined_text

# Directory containing the TFRecord files
tfrecord_dir = "/content/files"  # Update with your directory path
output_dir = "./output_json_files"  # Directory to save JSON files
os.makedirs(output_dir, exist_ok=True)

# Load the vocabulary once, before the loop: it is the same for every
# TFRecord file. Maps column 0 of Vocabulary.csv to column 3
# (presumably label index -> label name; confirm against the CSV header).
with open('Vocabulary.csv', mode='r', newline='', encoding='utf-8') as file:
    vocabulary = {rows[0]: rows[3] for rows in csv.reader(file)}

# Process each TFRecord file
for tfrecord_file in os.listdir(tfrecord_dir):
    if not tfrecord_file.endswith(".tfrecord"):
        continue

    tfrecord_path = os.path.join(tfrecord_dir, tfrecord_file)
    output_path = os.path.join(output_dir, f"{os.path.splitext(tfrecord_file)[0]}.json")
    print(output_path)

    # Create a dataset from the TFRecord file
    dataset = tf.data.TFRecordDataset(tfrecord_path).map(_parse_function)

    # Extract the 4-character codes from the 'id' field and keep each code
    # together with its label values as a tuple
    codes = [(example['id'].numpy().decode('utf-8'), example['labels'].values.numpy()) for example in dataset]

    # Print the extracted codes
    print("Extracted Codes:", codes)

    # Resolve every 4-character code to a real YouTube ID
    youtube_ids = []
    for four_char_id, labels in tqdm.tqdm(codes):
        print(f"Getting YouTube ID for {four_char_id}")
        youtube_id = get_youtube_id(four_char_id)
        # get_youtube_id reports failure with None, so that is what must be
        # filtered out here (comparing against "Error" never matched).
        if youtube_id is not None:
            youtube_ids.append([youtube_id, labels])

    # Replace label values by their vocabulary entries
    video_data = []
    for youtube_id, labels in youtube_ids:
        label_names = [vocabulary[str(label)] for label in labels]
        print(f"  YouTube ID: {youtube_id}")
        for label, label_name in zip(labels, label_names):
            print(f"  Label {label}: {label_name}")
        print(f"  Labels: {label_names}")
        video_data.append((youtube_id, label_names))

    # Fetch transcripts; videos without one are skipped, but visibly so
    data_to_write = []
    for video_id, labels in video_data:
        transcript_text = get_transcript(video_id)
        if transcript_text is None:
            print("No transcript for video ID", video_id, "- skipped")
            continue
        data_to_write.append({
            'video_id': video_id,
            'labels': labels,
            'transcript': transcript_text
        })
        print("Data for video ID", video_id, "added to JSON")

    # Write data to JSON file
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(data_to_write, json_file, indent=4)

    print(f"Finished processing {tfrecord_file}. Data saved to {output_path}")

## Preprocessing

In [None]:
import os
import json
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk

# Initialize tools
# Download the NLTK stopword corpus (quiet=True suppresses progress output)
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords

# Initialize the Porter Stemmer
wstem = PorterStemmer()

# Define punctuations and stopwords
# Raw string: the set contains a literal backslash, and "\," in a normal
# string literal is an invalid escape sequence. The characters are unchanged.
punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
stop_words = set(stopwords.words('english'))

# Define the preprocess_text function
def preprocess_text(text):
    """
    Clean a raw transcript and return it as one space-separated string.

    Steps, in order:
    - strip HTML tags
    - convert to lowercase
    - delete every character listed in `punctuations`
    - delete runs of digits
    - split on whitespace
    - drop tokens found in `stop_words`
    - stem the remaining tokens with `wstem`
    """
    without_tags = re.sub(r'<[^>]+>', '', text)
    lowered = without_tags.lower()

    # One pass that deletes all punctuation characters
    without_punct = lowered.translate(str.maketrans('', '', punctuations))
    without_digits = re.sub(r'\d+', '', without_punct)

    stemmed_tokens = []
    for token in without_digits.split():
        if token in stop_words:
            continue
        stemmed_tokens.append(wstem.stem(token))

    return ' '.join(stemmed_tokens)

# Folder containing all JSON files
# NOTE(review): the collection cell writes to "./output_json_files"; this
# absolute path only matches when the working directory is /content.
all_files_path = "/content/output_json_files"

# Words whose TF-IDF score in a transcript is not above this are dropped
TFIDF_THRESHOLD = 0.1

# Initialize an empty list to hold all data
video_data = []

# Read all JSON files from the directory (sorted so row order is reproducible)
for file_name in sorted(os.listdir(all_files_path)):
    if file_name.endswith(".json"):  # Only process JSON files
        file_path = os.path.join(all_files_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            video_data.extend(data)  # Assuming each file is a list of video records

# Preprocess transcripts and collect data
preprocessed_data = []
corpus = []

for item in video_data:
    preprocessed_transcript = preprocess_text(item['transcript'])
    corpus.append(preprocessed_transcript)
    preprocessed_data.append({
        "video_id": item["video_id"],
        "cleaned_transcript": preprocessed_transcript,
        "labels": item["labels"],
        "generated_labels": None  # Placeholder for future labels
    })

# Apply TF-IDF to remove common words across all transcripts
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# The feature-name array is the same for every row: fetch it once instead of
# rebuilding it inside the loop.
feature_names = vectorizer.get_feature_names_out()

# Remove words with low TF-IDF scores
filtered_data = []
for i, item in enumerate(preprocessed_data):
    row_scores = X[i].toarray()[0]
    # Boolean mask keeps feature order, so the words come out in the same
    # order as a zip over (feature_names, row_scores) would give.
    significant_words = feature_names[row_scores > TFIDF_THRESHOLD]
    filtered_text = ' '.join(significant_words)
    filtered_data.append({
        "video_id": item["video_id"],
        "cleaned_transcript": filtered_text,
        "labels": item["labels"],
        "generated_labels": item["generated_labels"]
    })

# Convert to DataFrame for structured representation
df = pd.DataFrame(filtered_data)

# Save the processed DataFrame to a CSV file
output_path = "cleaned_dataset.csv"
df.to_csv(output_path, index=False)

# Print a sample to verify
print(df.head())

## Unigram

In [None]:
def split_into_segments(text):
    """Split ``text`` into exactly four space-joined word segments.

    The first three segments each get ``max(1, word_count // 4)`` words and
    the fourth receives everything that is left. Texts with fewer than four
    words are padded with empty strings; NaN/None or word-less input yields
    four empty strings.
    """
    if pd.isna(text):
        return ["", "", "", ""]

    tokens = text.split()
    if not tokens:
        return ["", "", "", ""]

    step = max(1, len(tokens) // 4)

    # Slices past the end are simply empty, which gives the "" padding.
    pieces = [" ".join(tokens[k * step:(k + 1) * step]) for k in range(3)]
    pieces.append(" ".join(tokens[3 * step:]))
    return pieces

# Load cleaned_dataset.csv and add a 'unigrams' column that holds, for each
# row, the four-segment split of its 'cleaned_transcript' text.
df = pd.read_csv("cleaned_dataset.csv")
df['unigrams'] = df['cleaned_transcript'].apply(split_into_segments)

In [None]:
from collections import defaultdict, Counter
import math


def calculate_unigram_language_model(transcript):
  """Return ``{word: relative frequency}`` for a whitespace-split transcript.

  NaN/None input or a transcript without any words gives an empty dict.
  """
  if pd.isna(transcript):
    return {}

  tokens = transcript.split()
  if not tokens:
    return {}

  token_count = len(tokens)
  model = {}
  for token, occurrences in Counter(tokens).items():
    model[token] = occurrences / token_count
  return model


def calculate_scores(full_lm, segment_lm):
  """Score each segment word as its segment probability minus its full-text one.

  Words missing from ``full_lm`` are treated as having probability 0 there.
  Only words present in ``segment_lm`` appear in the result.
  """
  return {
    word: -full_lm.get(word, 0) + segment_probability
    for word, segment_probability in segment_lm.items()
  }


def extract_top_n_words(scores, n=5):
  """Return up to ``n`` words with the highest scores, best first.

  Words with equal scores keep the order they have in ``scores``.
  """
  ranked_words = sorted(scores, key=scores.get, reverse=True)
  return ranked_words[:n]


def process_video_segments(full_transcript, segment_transcripts, n=5):
  """Pick the top ``n`` representative words for every segment of a video.

  A word's score is its probability in the segment minus its probability in
  the full transcript (see ``calculate_scores``).

  Args:
    full_transcript: The whole cleaned transcript text.
    segment_transcripts: List of segment texts; anything that is not a list
      makes the function return an empty dict.
    n: Maximum number of words kept per segment.

  Returns:
    ``{"Segment-1": [...], "Segment-2": [...], ...}`` with one entry per
    segment (an empty list for empty/NaN segments), or ``{}`` when the full
    transcript is NaN/None or the segments are not a list.
  """
  if pd.isna(full_transcript) or not isinstance(segment_transcripts, list):
    return {}

  full_lm = calculate_unigram_language_model(full_transcript)
  segment_word_map = {}

  for position, segment in enumerate(segment_transcripts, start=1):
    key = f"Segment-{position}"
    if pd.isna(segment) or segment == "":
      segment_word_map[key] = []
      continue

    segment_lm = calculate_unigram_language_model(segment)
    # The debug dump of every score dict was dropped here: it flooded the
    # output with one line per segment of every video.
    scores = calculate_scores(full_lm, segment_lm)
    segment_word_map[key] = extract_top_n_words(scores, n)

  return segment_word_map

# Build the per-video segment word maps row by row, then attach them to the
# DataFrame in one assignment.
word_maps = []
for full_transcript, segment_transcripts in zip(df['cleaned_transcript'], df['unigrams']):
  print(segment_transcripts)
  word_maps.append(process_video_segments(full_transcript, segment_transcripts))
df['unigram_word_map'] = word_maps

# 'segments' keeps only the word lists, in segment order
df['segments'] = df['unigram_word_map'].apply(lambda word_map: list(word_map.values()))
df.to_csv('unigram_word_map.csv', index=False)

## GPT

In [None]:
def query_gpt(querygive, api_key):
    """Send ``querygive`` as a single user message to gpt-4o-mini.

    A new OpenAI client is created with ``api_key`` on every call. Returns the
    text content of the first choice in the (non-streamed) response.
    """
    #NOTE: update API key to run this part
    messages = [{"role": "user", "content": querygive}]
    response = OpenAI(api_key=api_key).chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        stream=False,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [None]:
df = pd.read_csv("unigram_word_map.csv")
# The CSV stores each list as its string repr; turn it back into a list
df['segments'] = df['segments'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

df['GPT_answer'] = None
batch_size = 1000  # not used in this cell; kept so the name stays defined

df_small = df

# Instruction sent ahead of each segment's words (wording unchanged)
TOPIC_PROMPT = """I am giving you some words that represent a topic. Make a coherent topic out of those words. Do not add additional meaning or inferences.
                Try to restrict to those words alone as much as possible. Keep the topic as short as possible. Return the output in the following format:
                Topic: <topic>. Do not include any other text."""

for index, row in df_small.iterrows():
    segments = row['segments']
    # Anything that is not a non-empty list (None, a NaN cell, []) gets an
    # empty answer. Calling pd.isna(...).all() on a scalar NaN would raise.
    if not isinstance(segments, list) or len(segments) == 0:
        df_small.at[index, 'GPT_answer'] = ""
        continue

    coherent_segments = []
    for segment in segments:
        print(segment)
        # Separate the word list from the instructions; previously the first
        # word was glued directly onto "...any other text."
        post_text = TOPIC_PROMPT + "\nWords: " + " ".join(segment)
        coherent_segments.append(query_gpt(post_text, api_key))
    df_small.at[index, 'GPT_answer'] = coherent_segments

df_small.to_csv("processed_batch_gpt_410.csv", index=False)

## Evaluation

In [None]:
import pandas as pd
import ast  # To safely evaluate string representations of lists

# Load the results CSV, discard any existing 'generated_labels' column and
# let the 'GPT_answer' column take over that name
df_results = (
    pd.read_csv("/content/results.csv")
    .drop(columns=['generated_labels'], errors='ignore')
    .rename(columns={'GPT_answer': 'generated_labels'})
)

import csv

# Build the vocabulary dictionary: column 0 of each CSV row -> column 3
with open('/content/Vocabulary.csv', mode='r') as file:
    vocabulary = dict((rows[0], rows[3]) for rows in csv.reader(file))

# Function to map vocabulary to labels
def map_labels(labels, vocab):
    """Translate a row's labels into vocabulary names.

    Args:
        labels: A list of labels or its string repr (as read back from CSV).
            ``None`` or a float (e.g. a NaN cell) is treated as "no labels".
        vocab: Mapping from label key (as a string) to label name.

    Returns:
        One name per label. A label that is a key of ``vocab`` is replaced by
        its name; a label that already *is* one of the vocabulary names is
        kept as-is (the data-collection cell stores names, not indices, so
        looking those up as keys would turn every label into "Unknown");
        anything else becomes "Unknown".
    """
    if labels is None or isinstance(labels, float):
        return []
    if isinstance(labels, str):  # Convert string to list if necessary
        labels = ast.literal_eval(labels)

    known_names = set(vocab.values())
    mapped = []
    for label in labels:
        key = str(label)
        if key in vocab:
            mapped.append(vocab[key])
        elif key in known_names:
            mapped.append(key)
        else:
            mapped.append("Unknown")
    return mapped


# Translate every row of the 'labels' column through the vocabulary and keep
# the result in a new 'annotated_labels' column
df_results['annotated_labels'] = df_results['labels'].apply(map_labels, args=(vocabulary,))

# Display the updated DataFrame
df_results.head()


# # Function to parse topics from a single string
def parse_topics(row):
    """Extract the topic texts from one 'generated_labels' cell.

    Args:
        row: A list of "Topic: ..." strings, or the string repr of such a
            list as read back from CSV, or NaN/None/blank.

    Returns:
        The text after "Topic: " for every item that contains that marker,
        stripped of surrounding whitespace. Items without the marker are
        skipped; NaN/None/blank input gives an empty list.
    """
    if isinstance(row, (list, tuple)):
        items = list(row)
    else:
        # Check for null or empty values
        if pd.isna(row):
            return []
        text = str(row).strip()
        if not text:
            return []

        # Parse the list repr properly. Splitting it on ", " left stray
        # quotes on every topic and broke on topics that contain a comma.
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            parsed = None

        if isinstance(parsed, (list, tuple)):
            items = list(parsed)
        elif isinstance(parsed, str):
            items = [parsed]
        else:
            # Not a Python literal (e.g. "[Topic: A, Topic: B]"): fall back
            # to the plain bracket-strip and comma split.
            items = text.strip('[]').split(', ')

    topics = []
    for item in items:
        _, marker, topic = str(item).partition("Topic: ")
        if marker:
            topics.append(topic.strip())
    return topics


# Parse every 'generated_labels' cell into a list of topic strings
df_results['parsed_topics'] = df_results['generated_labels'].map(parse_topics)

# Display the updated DataFrame
df_results.columns

In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
!pip install rouge_score
from rouge_score import rouge_scorer
!pip install Levenshtein
from Levenshtein import ratio as levenshtein_ratio
import matplotlib.pyplot as plt
import nltk
import seaborn as sns


plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# nltk.word_tokenize needs the punkt tokenizer data; download only if missing
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)


rouge_scorer_instance = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
smoother = SmoothingFunction()

# A similarity score strictly above this value counts as a match
THRESHOLD = 0.4


df = df_results


def _as_text(value):
    """Flatten a list of labels/topics into plain space-separated text.

    Comparing ``str(list)`` would make the brackets, quotes and commas of the
    Python repr count as shared content in every similarity metric.
    """
    if isinstance(value, (list, tuple)):
        return " ".join(str(item) for item in value)
    return str(value)


levenshtein_scores, bleu_scores, rouge_scores = [], [], []

for annotated, generated in zip(df["annotated_labels"], df["parsed_topics"]):
    ann = _as_text(annotated)
    gen = _as_text(generated)

    # Character-level similarity
    levenshtein_scores.append(levenshtein_ratio(gen, ann))

    # Token-level BLEU with smoothing (short texts rarely share long n-grams)
    reference = nltk.word_tokenize(ann.lower())
    candidate = nltk.word_tokenize(gen.lower())
    bleu_scores.append(
        sentence_bleu([reference], candidate, smoothing_function=smoother.method1)
    )

    # ROUGE-L F-measure
    rouge_scores.append(rouge_scorer_instance.score(ann, gen)['rougeL'].fmeasure)

levenshtein_bool = [score > THRESHOLD for score in levenshtein_scores]
bleu_bool = [score > THRESHOLD for score in bleu_scores]
rouge_bool = [score > THRESHOLD for score in rouge_scores]


similarity_df = pd.DataFrame({
    "Levenshtein": levenshtein_scores,
    "BLEU": bleu_scores,
    "ROUGE": rouge_scores,
    "Levenshtein_Bool": levenshtein_bool,
    "BLEU_Bool": bleu_bool,
    "ROUGE_Bool": rouge_bool
})


mean_scores = {
    name: similarity_df[name].mean() for name in ("Levenshtein", "BLEU", "ROUGE")
}

_match_flags = {
    "Levenshtein": levenshtein_bool,
    "BLEU": bleu_bool,
    "ROUGE": rouge_bool,
}


def _score_against_all_positive(metric):
    """Apply a sklearn metric to each flag list with every row as a true positive.

    NOTE: because the ground truth is all ones there can be no false
    positives, so precision is 1 whenever any row matches, and recall is
    simply the share of rows above THRESHOLD.
    """
    return {
        name: metric([1] * len(flags), flags, zero_division=0)
        for name, flags in _match_flags.items()
    }


f1_scores = _score_against_all_positive(f1_score)
precision_scores = _score_against_all_positive(precision_score)
recall_scores = _score_against_all_positive(recall_score)


# Bar chart of the mean similarity per metric, annotated with the values
plt.figure(figsize=(10, 6))
bars = plt.bar(mean_scores.keys(), mean_scores.values())
plt.title("Mean Similarity Scores", fontsize=14, pad=20)
plt.ylabel("Score", fontsize=12)
plt.grid(True, alpha=0.3)
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.2f}',
             ha='center', va='bottom')
plt.show()


metrics_df = pd.DataFrame(
    {
        name: [f1_scores[name], precision_scores[name], recall_scores[name]]
        for name in _match_flags
    },
    index=["F1-Score", "Precision", "Recall"],
)


# Grouped bar chart: F1 / precision / recall for each similarity metric
ax = metrics_df.plot(kind="bar", figsize=(12, 7))
plt.title("Model Performance Metrics", fontsize=14, pad=20)
plt.ylabel("Score", fontsize=12)
plt.xlabel("Metrics", fontsize=12)
plt.xticks(rotation=0)
plt.legend(title="Metric", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)


for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', padding=3)
plt.tight_layout()
plt.show()