In [59]:
import pandas as pd

In [60]:
def split_into_segments(text):
    if pd.isna(text):
        return ["", "", "", ""]
        
    words = text.split()
    total_words = len(words)
    
    if total_words == 0:
        return ["", "", "", ""]
    
    segment_size = max(1, total_words // 4) 
    
    segments = []
    for i in range(0, total_words, segment_size):
        segment = words[i:i + segment_size]
        segments.append(" ".join(segment))
    
    if len(segments) > 4:
        # Join the remaining segments with the 4th segment
        segments[3] = segments[3] + " " + " ".join(segments[4:])
        segments = segments[:4]
    elif len(segments) < 4:
        while len(segments) < 4:
            segments.append("")
            
    return segments

df = pd.read_csv("/Users/yoshe/Desktop/CS410-topic-summarization-and-labeling/cleaned_dataset.csv")
df['unigrams'] = df['cleaned_transcript'].apply(split_into_segments)

In [61]:
from collections import defaultdict, Counter
import math


def calculate_unigram_language_model(transcript):
  # """
  #     Calculate the unigram language model for a given transcript.

  #     Args:
  #         transcript (str): The transcript of the text.

  #     Returns:
  #         dict: A dictionary with words as keys and their probabilities as values.
  #     """
  if pd.isna(transcript):
    return {}
    
  words = transcript.split()
  total_words = len(words)
  if total_words == 0:
    return {}
    
  word_frequencies = Counter(words)
  return {word: freq / total_words for word, freq in word_frequencies.items()}


def calculate_scores(full_lm, segment_lm):
  # """
  #     Calculate the score for each word based on Full-LM and Segment-LM.

  #     Args:
  #         full_lm (dict): Unigram language model for the full transcript.
  #         segment_lm (dict): Unigram language model for a segment.

  #     Returns:
  #         dict: A dictionary with words as keys and their scores as values.
  #     """
  scores = {}
  for word in segment_lm:
    probability_full_lm = full_lm.get(word, 0)
    probability_segment_lm = segment_lm[word]
    scores[word] = -probability_full_lm + probability_segment_lm
  return scores


def extract_top_n_words(scores, n=5):
  # """
  #     Extract the top N words based on scores.

  #     Args:
  #         scores (dict): A dictionary with words as keys and their scores as values.
  #         n (int): The number of top words to extract.

  #     Returns:
  #         list: A list of the top N words.
  #     """
  return [word for word, _ in sorted(scores.items(), key=lambda item: item[1], reverse=True)[:n]]


def process_video_segments(full_transcript, segment_transcripts, n=5):
  # """
  #     Process the video and its segments to extract top N representative words for each segment.

  #     Args:
  #         full_transcript (str): The full transcript of the video.
  #         segment_transcripts (list): A list of transcripts for each segment.
  #         n (int): The number of top words to extract for each segment.

  #     Returns:
  #         dict: A dictionary mapping each segment to its top N descriptive words.
  #     """
  if pd.isna(full_transcript) or not isinstance(segment_transcripts, list):
    return {}
    
  full_lm = calculate_unigram_language_model(full_transcript)
  segment_word_map = {}

  for i, segment in enumerate(segment_transcripts):
    if pd.isna(segment) or segment == "":
      segment_word_map[f"Segment-{i+1}"] = []
      continue
      
    segment_lm = calculate_unigram_language_model(segment)
    scores = calculate_scores(full_lm, segment_lm)
    print(scores)
    top_words = extract_top_n_words(scores, n)
    segment_word_map[f"Segment-{i+1}"] = top_words

  return segment_word_map

df['unigram_word_map'] = None
for index, row in df.iterrows():
  full_transcript = row['cleaned_transcript']
  segment_transcripts = row['unigrams']
  print(segment_transcripts)
  segment_word_map = process_video_segments(full_transcript, segment_transcripts)
  df.at[index, 'unigram_word_map'] = segment_word_map


df['segments'] = df['unigram_word_map'].apply(lambda x: [v for v in x.values()])
df.to_csv('unigram_word_map.csv', index=False)

['back bj game', 'jordan know michael', 'music number pippen', 'practic reggi return scotti time today']
{'back': 0.26666666666666666, 'bj': 0.26666666666666666, 'game': 0.26666666666666666}
{'jordan': 0.26666666666666666, 'know': 0.26666666666666666, 'michael': 0.26666666666666666}
{'music': 0.26666666666666666, 'number': 0.26666666666666666, 'pippen': 0.26666666666666666}
{'practic': 0.09999999999999999, 'reggi': 0.09999999999999999, 'return': 0.09999999999999999, 'scotti': 0.09999999999999999, 'time': 0.09999999999999999, 'today': 0.09999999999999999}
['applaus follow', 'fore incred', 'lot music', 'paradox relax sound speech']
{'applaus': 0.4, 'follow': 0.4}
{'fore': 0.4, 'incred': 0.4}
{'lot': 0.4, 'music': 0.4}
{'paradox': 0.15, 'relax': 0.15, 'sound': 0.15, 'speech': 0.15}
['aint bru cage cate celebr', 'chick im jame kid kj', 'leav lie life music progress', 'save sl slain stay stupid']
{'aint': 0.15000000000000002, 'bru': 0.15000000000000002, 'cage': 0.15000000000000002, 'cate': 