# Import Dependencies

In [1]:
# Import Main Dependencies
import os, re, nltk
import pandas as pd
from pandas import DataFrame
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Import Other Dependencies
from tqdm.auto import tqdm

# Additional Downloads
nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("stopwords")




[nltk_data] Downloading package punkt_tab to C:\Users\MSI
[nltk_data]     Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\MSI Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to C:\Users\MSI
[nltk_data]     Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Define Utilities

In [2]:
def sanitize_filename(filename: str) -> str:
    # Escape Double Quotes
    filename = filename.replace('"', '\\"')

    # Replace Invalid Characters with "_"
    invalid_chars = re.compile(r'[<>:"/\\|?*]')
    sanitized_filename = invalid_chars.sub("_", filename)

    return sanitized_filename
    
def read_unique_items_from_file(file: str) -> list:
    with open(file, "r") as f:
        return list(set(url.strip() for url in f.readlines() if url.strip()))

# Set Configurations

In [3]:
# File Names
transcript_sentences_filename = "transcript_sentences.csv"
related_transcript_sentences_filename = "related_transcript_sentences.csv"

# Folder Names
transcription_output_path = "Transcription"
cities_path = "State Cities"

# Boolean Flags
remove_video = True
remove_audio = True

# Numeric Constants 
max_consecutive_words_for_topic = 3 # e.g. Unigram: "Donald" | Bigram: "Donald Trump" | Trigram: "President Donald Trump"

# Sentence Categories
presidential_candidates = {
    "Donald Trump": [
        "Donald", "Trump"
    ],
    "Kamala Harris": [
        "Kamala", "Harris"
    ]
}
state_cities = {
    "Michigan": read_unique_items_from_file(os.path.join(cities_path, "michigan-cities.txt")),
    "Arizona": read_unique_items_from_file(os.path.join(cities_path, "arizona-cities.txt")),
    "Pennsylvania": read_unique_items_from_file(os.path.join(cities_path, "pennsylvania-cities.txt"))
}

# Words for Sentence Filtering
stop_words = set(stopwords.words("english"))
generic_abstract_nouns = {
    "thing", "stuff", "event",
    "aspect", "issue", "place",
    "person"
}

# Additional Preprocessing of Configurations
presidential_candidates = {presidential_candidate: list(set(names)) for presidential_candidate, names in presidential_candidates.items()}

# Sentence Extraction (Transcripts to CSV)

In [8]:
def process_transcripts_into_csv_of_sentences() -> DataFrame:
    # Initialize List of Sentences
    list_of_sentences = []
    
    def is_sentence_complete(sentence: str) -> bool:
        """
        Its a Proper Sentence If:
            1) It has Atleast 1 Noun or Pronoun
            2) It has Atleast 1 Verb
        """
        pos_tags = pos_tag(word_tokenize(sentence)) # POS Tagging
        has_subject = any(tag in ["NN", "NNS", "NNP", "NNPS"] for _, tag in pos_tags) # Exclude Sentence w/out Noun and Pronoun
        has_verb = any(tag.startswith("VB") for _, tag in pos_tags) # Exclude Sentence w/out Verb
    
        return has_subject and has_verb
    
    # Collect List of All Sentences from Transcripts
    transcription_files = os.listdir(transcription_output_path)
    with tqdm(total=len(transcription_files), desc="Collecting Sentences from Transcripts") as pbar:
        for filename in transcription_files:
            file_path = os.path.join(transcription_output_path, filename)
            
            with open(file_path, "r") as file:
                text = file.read()
                
                # Split into Sentences
                sentences = list(set(sent_tokenize(text)))
    
                # Filter Proper Sentences (With Noun/Proper-Noun and Verb)
                sentences = [sentence for sentence in sentences if is_sentence_complete(sentence)]
                
                # Add Sentence to the List
                list_of_sentences.extend(sentences)
                
            pbar.update(1)

    # Save List of All Sentences into CSV file
    df = pd.DataFrame(list(set(list_of_sentences)), columns=["Sentence"])
    df.to_csv(transcript_sentences_filename, index=False)
    return df

list_of_sentences = process_transcripts_into_csv_of_sentences()
print(f'Number of Sentences: {len(list_of_sentences)}')
list_of_sentences.head()

Collecting Sentences from Transcripts:   0%|          | 0/69 [00:00<?, ?it/s]

Number of Sentences: 5242


Unnamed: 0,Sentence
0,Harris' policies on expanding healthcare acces...
1,And so I think that's what you're hearing from...
2,"Every time since 2016, Donald Trump, or one of..."
3,"I think it's dead, but you can never say it be..."
4,And then he comes over with the hundred dollar...


# BERTopic: Relevant Sentence Filtering (CSV)

In [9]:
def filter_related_sentences() -> tuple[DataFrame, BERTopic]:
    # Get All Sentences from Transcript
    df = pd.read_csv(transcript_sentences_filename)
    sentences = df["Sentence"].tolist()
    
    # Set Filter for Words as Possible Topics
    def filter_possible_topics(text: str) -> list:
        """
        Filter Words If its a Possible Topic:
            1) Only Nouns and Proper Nouns (e.g. Dollars, Currency)
            2) No Stop Words (e.g. in, to)
            3) No Generic Abstract Nouns (e.g. thing, stuff)
            4) Minumum of Three Letter Words (e.g. USA)
            5) Exclude Numbers
        """
        
        pos_tags = pos_tag(word_tokenize(text)) # POS Tagging
        possible_topics = [
            token.lower() for token, pos in pos_tags
            if pos in ["NN", "NNS", "NNP", "NNPS"] # Nouns / Proper Nouns
            and token.lower() not in stop_words # Exclude Stop Words
            and token.lower() not in generic_abstract_nouns # Exclude Generic Abstract Nouns
            and len(token) > 2 # Exclude One/Two Letter Words
            and not token.isnumeric() # Exclude Numbers
        ]
        
        return possible_topics
    vectorizer_model = CountVectorizer(
        ngram_range=(1, max_consecutive_words_for_topic),
        tokenizer=filter_possible_topics
    )

    # Train BERTopic model
    topic_model = BERTopic(
        embedding_model="all-MiniLM-L6-v2",
        n_gram_range=(1, max_consecutive_words_for_topic),
        vectorizer_model=vectorizer_model,        
        verbose=True
    )
    topic_model.fit_transform(sentences)
    
    # Get BERTopic Results
    topic_info = topic_model.get_topic_info()
    
    # Initialize Lists for our filtered results
    list_of_related_sentences = []
    
    # Analyze each topic row in topic_info
    for _, row in topic_info.iterrows():
        if row["Topic"] == -1: continue # Skip Outlier

        # Get List of Topics and its Related Sentences
        topic_keywords = row["Representation"]
        related_sentences = row["Representative_Docs"]
        
        # Check Candidate Mentions in Topics
        presidential_candidate_mentions = set() # Avoid Duplicates
        for presidential_candidate, names in presidential_candidates.items():
            if (
                any(name.lower() in keyword.lower() for name in names for keyword in topic_keywords) 
                or any(presidential_candidate.lower() in keyword.lower() for keyword in topic_keywords)
            ): 
                presidential_candidate_mentions.add(presidential_candidate)
        
        # Make Sure Only 1 Candidate is Mentioned
        if len(presidential_candidate_mentions) != 1: continue

        # Check State Mentions in Topics (Including Cities)
        state_mentions = set() # Avoid Duplicates
        for state, cities in state_cities.items():
            if (
                any(city.lower() in keyword.lower() for city in cities for keyword in topic_keywords) 
                or any(state.lower() in keyword.lower() for keyword in topic_keywords)
            ): 
                state_mentions.add(state)

        # Make Sure Only 1 State is Mentioned
        if len(presidential_candidate_mentions) != 1: continue
        
        """
        Add Related Sentences Only If:
            1) Only 1 Candidate is Mentioned
            2) Only 1 State is Mentioned
        """
        if len(presidential_candidate_mentions) == 1 and len(state_mentions) == 1:
            presidential_candidate = presidential_candidate_mentions.pop()
            state = state_mentions.pop()

            # Add All Related Sentences with Corresponding Presidential Candidate, State, and Topic Keywords
            for sentence in related_sentences:
                list_of_related_sentences.append({
                    "Sentence": sentence,
                    "Presidential_Candidate": presidential_candidate,
                    "State": state,
                    "Topic_Keywords": topic_keywords
                })
    
    # Save List of All Related Sentences into CSV file
    df = pd.DataFrame(list_of_related_sentences)
    df.to_csv(related_transcript_sentences_filename, index=False)
    return df, topic_model

list_of_related_sentences, bertopic_model = filter_related_sentences()
print(f'Number of Related Sentences: {len(list_of_related_sentences)}')
print(f'Number of Topic Clusters: {len(bertopic_model.get_topic_info())}')
bertopic_model.get_topic_info()

2024-10-20 16:37:25,446 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/164 [00:00<?, ?it/s]

2024-10-20 16:37:55,997 - BERTopic - Embedding - Completed ✓
2024-10-20 16:37:55,997 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-20 16:38:07,188 - BERTopic - Dimensionality - Completed ✓
2024-10-20 16:38:07,202 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-20 16:38:07,623 - BERTopic - Cluster - Completed ✓
2024-10-20 16:38:07,670 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-20 16:38:13,939 - BERTopic - Representation - Completed ✓


Number of Related Sentences: 6
Number of Topic Clusters: 92


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2086,-1_trump_election_people_state,"[trump, election, people, state, voters, penns...","[You have you have some terrific people., We'v..."
1,0,233,0_questions_loomer_part_morelos,"[questions, loomer, part, morelos, debate, tru...",[And that's why they're expecting to see lots ...
2,1,141,1_trump_president_grievances_president trump,"[trump, president, grievances, president trump...","[ We want Trump!, And while most Trump support..."
3,2,131,2_economy_money_store_inflation,"[economy, money, store, inflation, grocery, me...","[Whether it's the economy., What do you think ..."
4,3,127,3_harris_campaign_harris campaign_favor,"[harris, campaign, harris campaign, favor, vic...",[Harris campaign is trying to address that as ...
...,...,...,...,...,...
87,86,11,86_conspiracy_conspiracy theories_theories_con...,"[conspiracy, conspiracy theories, theories, co...","[If you talk about it, oh, it's a conspiracy.,..."
88,87,11,87_report_reports_reporting oops report_end re...,"[report, reports, reporting oops report, end r...","[So we we call it the end of the report., Oops..."
89,88,11,88_ryan_tim ryan_tim_democrat,"[ryan, tim ryan, tim, democrat, j.d, congressm...","[Tim Ryan, the Democrat nominee, congressman, ..."
90,89,11,89_election day_day_election_days election,"[election day, day, election, days election, o...",[What happens on Election Day is the most impo...


In [10]:
list_of_related_sentences.head()

Unnamed: 0,Sentence,Presidential_Candidate,State,Topic_Keywords
0,"At the same time, Trump is such a unique figur...",Donald Trump,Pennsylvania,"[pennsylvania, trump pennsylvania, town, every..."
1,"Every other pollster, Emerson, Insider Advanta...",Donald Trump,Pennsylvania,"[pennsylvania, trump pennsylvania, town, every..."
2,This is part of a swing that Donald Trump did ...,Donald Trump,Pennsylvania,"[pennsylvania, trump pennsylvania, town, every..."
3,The pushing with Kamala with more current thin...,Kamala Harris,Pennsylvania,"[kamala, bit bit, bit bit bit, kamala face, bi..."
4,So some are even pointing fingers at her campa...,Kamala Harris,Pennsylvania,"[kamala, bit bit, bit bit bit, kamala face, bi..."


In [11]:
"""
Sa tingin ko need natin 5k sentences minimum for Related di lang for gathered.
Kasi mamaya 5k Unrelated Sentences nakuha natin tas 100 lang dun Related with candidate & state.

Ang naiisip ko since 6 Combination = 3 candidate * 2 state
Gawin natin 5000/6 = 834 Related Sentences required set natin as minimum per Combination

Trump  - Arizona      = 834 Related Sentences
Harris - Arizona      = 834 Related Sentences
Trump  - Michigan     = 834 Related Sentences
Harris - Michigan     = 834 Related Sentences
Trump  - Pennsylvania = 834 Related Sentences
Harris - Pennsylvania = 834 Related Sentences
                     --------------------------
                      ~5000 Related Sentences
"""
(
    pd
    .read_csv(related_transcript_sentences_filename)
    .groupby(["Presidential_Candidate", "State"])
    .size()
    .reset_index(name="count")
    .style.hide(axis="index")
)

Presidential_Candidate,State,count
Donald Trump,Pennsylvania,3
Kamala Harris,Pennsylvania,3
