# Install Dependencies

In [58]:
"""'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
    🗿 READ ME 🗿
    - This Only Needs To Run Once.
    - This Needs to Restart the Kernel after Installing the Dependencies.  
    - To Avoid Unintended Restart: This adds an init.flag file in root folder after successful installation.
    - To Rerun: Delete init.flag file in root folder.
    
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''"""
# Import OS Dependency
import os, subprocess, sys

# Check If This Already Runned Once
if os.path.exists("init.flag"):
    print("Installation already completed.")
else:
    print("Starting Installation...")

    # Ensure pip is Installed
    subprocess.check_call([sys.executable, "-m", "ensurepip"], shell=True)

    # Install Main Dependencies
    subprocess.check_call(f'python -m pip install nltk "ffmpeg-python" "openai-whisper" pandas pytubefix bertopic "scikit-learn"', shell=True)
    
    # Install Other Dependencies
    subprocess.check_call(f'python -m pip install torch tqdm', shell=True)
    
    # Install BERTopic Dependencies
    subprocess.check_call(f'python -m pip install "numpy<2" "tf-keras"', shell=True)

    # Add Flag File to Set that this Already Runned Once
    open("init.flag", "w").close()

    # Restart the Kernel to Load Installed Dependences
    os._exit(00)

Installation already completed.


# Import Dependencies

In [59]:
# Import Main Dependencies
import os, re, nltk, ffmpeg, whisper
import pandas as pd
from pandas import DataFrame
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from pytubefix import YouTube, Stream
from pytubefix.cli import on_progress
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Import Other Dependencies
import torch
from tqdm.auto import tqdm

# Additional Downloads
nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("stopwords")

[nltk_data] Downloading package punkt_tab to C:\Users\MSI
[nltk_data]     Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\MSI Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to C:\Users\MSI
[nltk_data]     Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Define Utilities

In [60]:
def sanitize_filename(filename: str) -> str:
    # Escape Double Quotes
    filename = filename.replace('"', '\\"')

    # Replace Invalid Characters with "_"
    invalid_chars = re.compile(r'[<>:"/\\|?*]')
    sanitized_filename = invalid_chars.sub("_", filename)

    return sanitized_filename
    
def read_unique_items_from_file(file: str) -> list:
    with open(file, "r") as f:
        return list(set(url.strip() for url in f.readlines() if url.strip()))

# Set Configurations

In [61]:
# File Names
yt_video_links_filename = "YouTube Video Links.txt"
transcript_sentences_filename = "transcript_sentences.csv"
related_transcript_sentences_filename = "related_transcript_sentences.csv"

# Folder Names
video_output_path = "Video"
audio_output_path = "Audio"
transcription_output_path = "Transcription"
cities_path = "State Cities"

# Boolean Flags
remove_video = True
remove_audio = True

# Numeric Constants 
max_consecutive_words_for_topic = 3 # e.g. Unigram: "Donald" | Bigram: "Donald Trump" | Trigram: "President Donald Trump"

# Sentence Categories
presidential_candidates = {
    "Donald Trump": [
        "Donald", "Trump"
    ],
    "Kamala Harris": [
        "Kamala", "Harris"
    ]
}
state_cities = {
    "Michigan": read_unique_items_from_file(os.path.join(cities_path, "michigan-cities.txt")),
    "Arizona": read_unique_items_from_file(os.path.join(cities_path, "arizona-cities.txt")),
    "Pennsylvania": read_unique_items_from_file(os.path.join(cities_path, "pennsylvania-cities.txt"))
}

# Words for Sentence Filtering
stop_words = set(stopwords.words("english"))
generic_abstract_nouns = {
    "thing", "stuff", "event",
    "aspect", "issue", "place",
    "person"
}

# Additional Preprocessing of Configurations
presidential_candidates = {presidential_candidate: list(set(names)) for presidential_candidate, names in presidential_candidates.items()}

# Collect Data (YouTube Videos)

In [62]:
def download_youtube_video(video_filename: str, stream: Stream) -> tuple[str, str]:
    # Create Video Directory
    os.makedirs(video_output_path, exist_ok=True)
    
    # Set Path for Video File
    video_file = os.path.join(video_output_path, video_filename)
    
    # Delete Old Existing Video File (note: to clean any corrupted file)
    if os.path.exists(video_file):
        os.remove(video_file)
        
    # Download Video File
    print("") # Just New Line for Better Output
    print(f'Downloading (Video): {video_filename}')
    print("") # Just New Line for Better Output
    stream.download(output_path=video_output_path, filename=video_filename)
    print("") # Just New Line for Better Output
    print("") # Just New Line for Better Output
    
    # Return Video File and Name
    return video_file, video_filename

# Audio Extraction (Video to Audio)

In [63]:
def extract_audio_from_video(video_file: str, video_filename: str) -> tuple[str, str]:
    # Create the Audio Directory
    os.makedirs(audio_output_path, exist_ok=True)

    # Set Audio File Name ("[YouTube Video ID] [title].mp3")
    audio_filename = f'{os.path.splitext(video_filename)[0]}.mp3'

    # Set Path for Audio File
    audio_file = os.path.join(audio_output_path, audio_filename)
    
    # Delete Old Existing Audio File (note: to clean any corrupted file)
    if os.path.exists(audio_file):
        os.remove(audio_file)
    
    # Extract Audio File
    print(f'Extracting (Audio): {audio_filename}')
    print("") # Just New Line for Better Output
    (
        ffmpeg
        .input(video_file)
        .output(audio_file, format="mp3", acodec="libmp3lame", loglevel="info")
        .run(overwrite_output=True)
    )
    
    # Return Audio File and Name
    return audio_file, audio_filename

# Transcription (Audio to Text)

In [64]:
def transcribe_audio_to_text(audio_file: str, audio_filename: str):
    # Create the Transcription Directory
    os.makedirs(transcription_output_path, exist_ok=True)
    
    # Set Transcription File Name ("[YouTube Video ID] [title].txt")
    transcription_filename = f'{os.path.splitext(audio_filename)[0]}.txt'
    
    # Set Path for Transcription File
    transcription_file = os.path.join(transcription_output_path, transcription_filename)
            
    # Get/Download OpenAI Whisper Model
    """ 
    Models: 
        tiny, base, small, medium, large, turbo
    English-Only:
        tiny.en, base.en, small.en, medium.en
    
    Required VRAM:              Speed:
        1) 1GB - tiny, base         1) 10x - tiny
        2) 2GB - small              2) 8x - turbo
        3) 5GB - medium             3) 7x - base
        4) 6GB - turbo              4) 4x - small
        5) 10GB - large             5) 2x - medium
                                    6) 1x - large
    
    Quote from OpenAI: 
        - The .en models for English-only applications tend to perform better, especially for the tiny.en and base.en models.
        We observed that the difference becomes less significant for the small.en and medium.en models.
    
    Note: 4GB lang VRAM ko kaya small.en ginamit
    """  
    print(f'Transcribing (Text): {transcription_filename}')
    print("") # Just New Line for Better Output
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
    model = whisper.load_model("small.en", device=device)

    # Transcribe Audio File (Saves Whole Text in Memory Before Disk to Avoid Corruption)
    result = model.transcribe(audio_file, fp16=False, verbose=False)
    try:
        with open(transcription_file, "w", encoding="utf-8") as f:
            f.write(result["text"])
    except:
        if os.path.exists(transcription_file):
            os.remove(transcription_file)

# Execute Data Gathering

In [65]:
yt_urls = read_unique_items_from_file(yt_video_links_filename)

with tqdm(total=len(yt_urls), desc="Getting YouTube URLs") as pbar:
    for index, url in enumerate(yt_urls):        
        current = f'{index+1}/{len(yt_urls)}'
                
        # Get Video Information
        yt = YouTube(url, on_progress_callback=on_progress)
        stream = yt.streams.get_audio_only()
        
        # Sanitize Video File Name and Add YouTube Video ID
        video_filename = f'[{yt.video_id}] {sanitize_filename(stream.default_filename)}'
        
        # Get File Name Without Extension (e.g., ".mp4")
        filename = os.path.splitext(video_filename)[0]
        
        # Skip If Transcription Already Exists
        transcription_exists = False
        pbar.set_description(f'Checking Existing Transcriptions (Video {current})')
        if os.path.exists(transcription_output_path):
            transcription_filename = f'{filename}.txt'
            for existing_transcription_filename in os.listdir(transcription_output_path):
                if existing_transcription_filename == transcription_filename:
                    existing_transcription_path = os.path.join(transcription_output_path, existing_transcription_filename)
                    if os.path.exists(existing_transcription_path):
                        transcription_exists = True
        if transcription_exists:
            # Delete/Keep Video File
            if remove_video:
                video_file = os.path.join(video_output_path, video_filename)
                if os.path.exists(video_file):
                    os.remove(video_file)
                    
            # Delete/Keep Audio File
            if remove_audio:
                audio_filename = f'{filename}.mp3'
                audio_file = os.path.join(audio_output_path, audio_filename)
                if os.path.exists(audio_file):
                    os.remove(audio_file)
                    
            pbar.update(1)
            continue

        # Log YouTube URL being Processed
        print("") # Just New Line for Better Output
        print(f'Found YouTube Video (URL): {url}')
        
        # Download YouTube Video
        pbar.set_description(f'Downloading (Video {current}) ')
        video_file, video_filename = download_youtube_video(video_filename, stream)
        
        # Extract Audio from Video -> Delete/Keep Video File
        pbar.set_description(f'Extracting (Audio {current})')
        audio_file, audio_filename = extract_audio_from_video(video_file, video_filename)
        if remove_video: os.remove(video_file)
        
        # Transcribe Audio to Text -> Delete/Keep Audio File
        pbar.set_description(f'Transcribing (Text {current})')
        transcribe_audio_to_text(audio_file, audio_filename)
        if remove_audio: os.remove(audio_file)
        
        pbar.update(1)
    pbar.set_description("Finished Data Gathering")

Getting YouTube URLs:   0%|          | 0/15 [00:00<?, ?it/s]

# Data Preprocessing

In [66]:
def process_transcripts_into_csv_of_sentences() -> DataFrame:
    # Initialize List of Sentences
    list_of_sentences = []
    
    def is_sentence_complete(sentence: str) -> bool:
        """
        Its a Proper Sentence If:
            1) It has Atleast 1 Noun or Pronoun
            2) It has Atleast 1 Verb
        """
        pos_tags = pos_tag(word_tokenize(sentence)) # POS Tagging
        has_subject = any(tag in ["NN", "NNS", "NNP", "NNPS"] for _, tag in pos_tags) # Exclude Sentence w/out Noun and Pronoun
        has_verb = any(tag.startswith("VB") for _, tag in pos_tags) # Exclude Sentence w/out Verb
    
        return has_subject and has_verb
    
    # Collect List of All Sentences from Transcripts
    transcription_files = os.listdir(transcription_output_path)
    with tqdm(total=len(transcription_files), desc="Collecting Sentences from Transcripts") as pbar:
        for filename in transcription_files:
            file_path = os.path.join(transcription_output_path, filename)
            
            with open(file_path, "r") as file:
                text = file.read()
                
                # Split into Sentences
                sentences = list(set(sent_tokenize(text)))
    
                # Filter Proper Sentences (With Noun/Proper-Noun and Verb)
                sentences = [sentence for sentence in sentences if is_sentence_complete(sentence)]
                
                # Add Sentence to the List
                list_of_sentences.extend(sentences)
                
            pbar.update(1)

    # Save List of All Sentences into CSV file
    df = pd.DataFrame(list_of_sentences, columns=["Sentence"])
    df.to_csv(transcript_sentences_filename, index=False)
    return df

list_of_sentences = process_transcripts_into_csv_of_sentences()
list_of_sentences.head()

Collecting Sentences from Transcripts:   0%|          | 0/15 [00:00<?, ?it/s]

Unnamed: 0,Sentence
0,"And I have to say, I've noticed this, especial..."
1,Are you following up?
2,"Somebody who was raised in the middle class, b..."
3,And I will make sure that your abortion rights...
4,He's a Marine.


# Text Cleaning: Topic Modeling and Sentence Filtering

In [67]:
def filter_related_sentences() -> tuple[DataFrame, BERTopic]:
    # Get All Sentences from Transcript
    df = pd.read_csv(transcript_sentences_filename)
    sentences = df["Sentence"].tolist()
    
    # Set Filter for Words as Possible Topics
    def filter_possible_topics(text: str) -> list:
        """
        Filter Words If its a Possible Topic:
            1) Only Nouns and Proper Nouns (e.g. Dollars, Currency)
            2) No Stop Words (e.g. in, to)
            3) No Generic Abstract Nouns (e.g. thing, stuff)
            4) Minumum of Three Letter Words (e.g. USA)
            5) Exclude Numbers
        """
        
        pos_tags = pos_tag(word_tokenize(text)) # POS Tagging
        possible_topics = [
            token.lower() for token, pos in pos_tags
            if pos in ["NN", "NNS", "NNP", "NNPS"] # Nouns / Proper Nouns
            and token.lower() not in stop_words # Exclude Stop Words
            and token.lower() not in generic_abstract_nouns # Exclude Generic Abstract Nouns
            and len(token) > 2 # Exclude One/Two Letter Words
            and not token.isnumeric() # Exclude Numbers
        ]
        
        return possible_topics
    vectorizer_model = CountVectorizer(
        ngram_range=(1, max_consecutive_words_for_topic),
        tokenizer=filter_possible_topics
    )

    # Train BERTopic model
    topic_model = BERTopic(
        embedding_model="all-MiniLM-L6-v2",
        n_gram_range=(1, max_consecutive_words_for_topic),
        vectorizer_model=vectorizer_model,        
        verbose=True
    )
    topic_model.fit_transform(sentences)
    
    # Get BERTopic Results
    topic_info = topic_model.get_topic_info()
    
    # Initialize Lists for our filtered results
    list_of_related_sentences = []
    
    # Analyze each topic row in topic_info
    for _, row in topic_info.iterrows():
        if row["Topic"] == -1: continue # Skip Outlier

        # Get List of Topics and its Related Sentences
        topic_keywords = row["Representation"]
        related_sentences = row["Representative_Docs"]
        
        # Check Candidate Mentions in Topics
        presidential_candidate_mentions = set() # Avoid Duplicates
        for presidential_candidate, names in presidential_candidates.items():
            if (
                any(name.lower() in keyword.lower() for name in names for keyword in topic_keywords) 
                or any(presidential_candidate.lower() in keyword.lower() for keyword in topic_keywords)
            ): 
                presidential_candidate_mentions.add(presidential_candidate)
        
        # Make Sure Only 1 Candidate is Mentioned
        if len(presidential_candidate_mentions) != 1: continue

        # Check State Mentions in Topics (Including Cities)
        state_mentions = set() # Avoid Duplicates
        for state, cities in state_cities.items():
            if (
                any(city.lower() in keyword.lower() for city in cities for keyword in topic_keywords) 
                or any(state.lower() in keyword.lower() for keyword in topic_keywords)
            ): 
                state_mentions.add(state)

        # Make Sure Only 1 State is Mentioned
        if len(presidential_candidate_mentions) != 1: continue
        
        """
        Add Related Sentences Only If:
            1) Only 1 Candidate is Mentioned
            2) Only 1 State is Mentioned
        """
        if len(presidential_candidate_mentions) == 1 and len(state_mentions) == 1:
            presidential_candidate = presidential_candidate_mentions.pop()
            state = state_mentions.pop()

            # Add All Related Sentences with Corresponding Presidential Candidate, State, and Topic Keywords
            for sentence in related_sentences:
                list_of_related_sentences.append({
                    "Sentence": sentence,
                    "Presidential_Candidate": presidential_candidate,
                    "State": state,
                    "Topic_Keywords": topic_keywords
                })
    
    # Save List of All Related Sentences into CSV file
    df = pd.DataFrame(list_of_related_sentences)
    df.to_csv(related_transcript_sentences_filename, index=False)
    return df, topic_model

list_of_related_sentences, bertopic_model = filter_related_sentences()
print(f'Number of Related Sentences: {len(list_of_related_sentences)}')
print(f'Number of Topic Clusters: {len(bertopic_model.get_topic_info())}')
bertopic_model.get_topic_info()

2024-10-19 20:06:01,176 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/103 [00:00<?, ?it/s]

2024-10-19 20:06:18,919 - BERTopic - Embedding - Completed ✓
2024-10-19 20:06:18,919 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-19 20:06:30,446 - BERTopic - Dimensionality - Completed ✓
2024-10-19 20:06:30,446 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-19 20:06:30,588 - BERTopic - Cluster - Completed ✓
2024-10-19 20:06:30,604 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-19 20:06:32,046 - BERTopic - Representation - Completed ✓


Number of Related Sentences: 9
Number of Topic Clusters: 72


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1288,-1_people_president_country_time,"[people, president, country, time, years, trum...","[People are dying., These are the greatest peo..."
1,0,121,0_bit_mcmahon_michelle_interview,"[bit, mcmahon, michelle, interview, business, ...","[Somebody who was raised in the middle class, ..."
2,1,111,1_couple things_conversation_bit_okay,"[couple things, conversation, bit, okay, coupl...","[Well, I'll say a couple of things., Sorry, I ..."
3,2,111,2_care_health_health care_seniors,"[care, health, health care, seniors, medicare,...",[If you look at her health care plan by allowi...
4,3,76,3_criminals_gang_murder_immigrants,"[criminals, gang, murder, immigrants, people, ...",[Armed Venezuelan gang members storming an apa...
...,...,...,...,...,...
67,66,12,66_governor_governors_whitmer_welcome,"[governor, governors, whitmer, welcome, left d...","[By the way, this is a really great governor, ..."
68,67,12,67_latino_latino voters_voters_sort,"[latino, latino voters, voters, sort, message,...","[You know, but I think that I think it's somet..."
69,68,10,68_reuben_ruben_gallego_ruben gallegos states,"[reuben, ruben, gallego, ruben gallegos states...",[It is my distinct honor to welcome our next S...
70,69,10,69_women_women rights_rights_idea,"[women, women rights, rights, idea, idea women...",[And it is my pledge to you when Congress pass...


In [68]:
list_of_related_sentences.head()

Unnamed: 0,Sentence,Presidential_Candidate,State,Topic_Keywords
0,We win Pennsylvania.,Donald Trump,Pennsylvania,"[pennsylvania, pennsylvania pennsylvania, dona..."
1,It's good to be back in Pennsylvania.,Donald Trump,Pennsylvania,"[pennsylvania, pennsylvania pennsylvania, dona..."
2,"You know, we win Pennsylvania.",Donald Trump,Pennsylvania,"[pennsylvania, pennsylvania pennsylvania, dona..."
3,We then fixed the border.,Donald Trump,Pennsylvania,"[border, border border, country border, way do..."
4,And the third is our border.,Donald Trump,Pennsylvania,"[border, border border, country border, way do..."


In [84]:
"""
Sa tingin ko need natin 5k sentences minimum for Related di lang for gathered.
Kasi mamaya 5k Unrelated Sentences nakuha natin tas 100 lang dun Related with candidate & state.

Ang naiisip ko since 6 Combination = 3 candidate * 2 state
Gawin natin 5000/6 = 834 Related Sentences required set natin as minimum per Combination

Trump  - Arizona      = 834 Related Sentences
Harris - Arizona      = 834 Related Sentences
Trump  - Michigan     = 834 Related Sentences
Harris - Michigan     = 834 Related Sentences
Trump  - Pennsylvania = 834 Related Sentences
Harris - Pennsylvania = 834 Related Sentences
                     --------------------------
                      ~5000 Related Sentences
"""
(
    pd
    .read_csv(related_transcript_sentences_filename)
    .groupby(["Presidential_Candidate", "State"])
    .size()
    .reset_index(name="count")
    .style.hide(axis="index")
)

Presidential_Candidate,State,count
Donald Trump,Michigan,3
Donald Trump,Pennsylvania,6


# Data Sentiment Annotation

# Train-Validation-Test Split

# Training: Sentiment Analysis with BERT

# Validation: Hyperparameter Tuning and Model Optimization

# Testing: Model Evaluation

# Comparative Analysis 

# Proof of Concept