# Import Dependencies

In [1]:
# Import Dependencies
import os, re, torch, nltk
import pandas as pd
from pandas import DataFrame
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

# Additional Downloads
nltk.download("punkt_tab", quiet=True)
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
nltk.download("stopwords", quiet=True)




True

# Define Utilities

In [2]:
def sanitize_filename(filename: str) -> str:
    # Escape Double Quotes
    filename = filename.replace('"', '\\"')

    # Replace Invalid Characters with "_"
    invalid_chars = re.compile(r'[<>:"/\\|?*]')
    sanitized_filename = invalid_chars.sub("_", filename)

    return sanitized_filename
    
def read_unique_items_from_file(file: str) -> list:
    if os.path.exists(file):
        with open(file, "r") as f:
            return list(set(e.strip() for e in f.readlines() if e.strip()))
    return []

# Set Configurations

In [3]:
# File Names
transcript_sentences_filename = "transcript_sentences.csv"
relevant_transcript_sentences_filename = "relevant_transcript_sentences.csv"

# Folder Names
transcription_output_path = "Transcription"
cities_path = "State Cities"

# Numeric Constants 
max_pair_of_words_for_topic = 2 # e.g. Unigram: "Donald" | Bigram: "Donald Trump"
min_number_of_word_in_relevant_sentence = 5 # Only Accepts 5-Word Sentence as Relevant | e.g. "This is a nice place"
min_similarity_of_topic_modeling = 0.1 # Range:[0.1, 1] | Minimum Similarity for Topic Assignment | Higher Value Means Stricter Match

# Sentence Categories
presidential_candidates = {
    "Donald Trump": [
        "Donald", "Trump"
    ],
    "Kamala Harris": [
        "Kamala", "Harris"
    ]
}
state_cities = {
    "Michigan": read_unique_items_from_file(os.path.join(cities_path, "michigan-cities.txt")),
    "Arizona": read_unique_items_from_file(os.path.join(cities_path, "arizona-cities.txt")),
    "Pennsylvania": read_unique_items_from_file(os.path.join(cities_path, "pennsylvania-cities.txt"))
}

# Words for Sentence Filtering
stop_words = set(stopwords.words("english"))

# Additional Preprocessing of Configurations
presidential_candidates = {presidential_candidate: list(set(names)) for presidential_candidate, names in presidential_candidates.items()}
presidential_candidates_and_states_combinations = [
    f"{location}_{name}".lower()
    for full_name, names in presidential_candidates.items() 
    for location in [state for state in state_cities] + [city for cities in state_cities.values() for city in cities]
    for name in [full_name] + names
]
presidential_candidates_and_states_combinations_in_2d = [
    [location.lower(), full_name.lower()] + [name.lower() for name in names]
    for full_name, names in presidential_candidates.items() 
    for location in [state for state in state_cities] + [city for cities in state_cities.values() for city in cities]
]

# Sentence Extraction (Transcripts to CSV)

In [4]:
def process_transcripts_into_csv_of_sentences() -> DataFrame:    
    # Initialize List of Sentences
    list_of_sentences = []
        
    # Collect List of Sentences from Transcription Files
    transcription_files = os.listdir(transcription_output_path)
    total_transcription_file = len(transcription_files)
    with tqdm(total=total_transcription_file, desc=f'Collecting Sentences [0/{total_transcription_file} Transcript]') as pbar:
        for index, filename in enumerate(transcription_files):
            current = f'{index+1}/{total_transcription_file}'
            if filename == ".ipynb_checkpoints":
                pbar.update(1)
                continue
            
            pbar.set_description(f'Collecting Sentences [{current} Transcript]')

            # Open Transcription File
            file_path = os.path.join(transcription_output_path, filename)
            with open(file_path, "r") as file:
                transcription = file.read()
                
                # Split Transcript into Sentences
                sentences = sent_tokenize(transcription)

                # Remove Consecutive Duplicates (Caused by Whisper)
                sentences = [sentence for i, sentence in enumerate(sentences) if i == 0 or sentence != sentences[i-1]]
                
                # Add the Sentences
                list_of_sentences.extend(sentences)
            
            pbar.update(1)

    # Save List of All Sentences into CSV file
    df = pd.DataFrame(list(set(list_of_sentences)), columns=["Sentence"])
    df.to_csv(transcript_sentences_filename, index=False, errors="ignore")
    return df

list_of_sentences = process_transcripts_into_csv_of_sentences()
print(f'Number of Sentences: {len(list_of_sentences)}')
list_of_sentences.head()

Collecting Sentences [0/608 Transcript]:   0%|          | 0/608 [00:00<?, ?it/s]

Number of Sentences: 50221


Unnamed: 0,Sentence
0,He could win by 21 but still like to say that ...
1,Clinton did manage to hold Minnesota by a poin...
2,Do you think it changed the race?
3,"And on Zoom, we are joined by Republican consu..."
4,"How you doing, Danielle?"


# BERTopic: Relevant Sentence Filtering (CSV)

In [5]:
def filter_relevant_sentences() -> tuple[DataFrame, BERTopic]:
    # Get All Collected Sentences from Transcript
    df = pd.read_csv(transcript_sentences_filename, encoding_errors="ignore")
    sentences = df["Sentence"].tolist()
    
    # Set Filter for Words as Possible Topics
    def filter_possible_topics(text: str) -> list:
        """
        Filter Words If its a Possible Topic:
            1) Only Nouns and Proper Nouns (e.g. Dollars, Currency)
            2) No Stop Words (e.g. in, to)
            3) Minumum of Two Letter Words (e.g. Ox)
            4) Exclude Numbers
        """
        
        pos_tags = pos_tag(word_tokenize(text)) # POS Tagging
        possible_topics = [
            token.lower() for token, pos in pos_tags
            if pos in ["NN", "NNS", "NNP", "NNPS"] # Nouns / Proper Nouns
            and token.lower() not in stop_words # Exclude Stop Words
            and len(token) > 1 # Exclude One Letter Words (e.g. Included: Ox)
            and not token.isnumeric() # Exclude Numbers
        ]
        
        return possible_topics
    vectorizer_model = CountVectorizer(
        ngram_range=(1, max_pair_of_words_for_topic),
        tokenizer=filter_possible_topics
    )

    # Train BERTopic model
    topic_model = BERTopic(
        embedding_model="all-MiniLM-L6-v2",
        n_gram_range=(1, max_pair_of_words_for_topic),
        vectorizer_model=vectorizer_model,
        seed_topic_list=presidential_candidates_and_states_combinations_in_2d,
        zeroshot_topic_list=presidential_candidates_and_states_combinations,
        zeroshot_min_similarity=min_similarity_of_topic_modeling,
        nr_topics=len(presidential_candidates_and_states_combinations),
        verbose=True
    )
    topic_ids, _ = topic_model.fit_transform(sentences)
    
    # Get BERTopic Results
    topic_info = topic_model.get_topic_info()
    topics_and_documents = pd.DataFrame({"Topic": topic_ids, "Representative_Docs": sentences})
    
    # Initialize Lists for Relevant Sentences
    list_of_relevant_sentences = []

    def is_sentence_complete(sentence: str) -> bool:
        # Exclude Sentence with Less than 5 or N Words
        return len(word_tokenize(sentence)) >= min_number_of_word_in_relevant_sentence
        
    # Get Relevant Sentences
    for _, row in topic_info.iterrows():
        topic_id = row["Topic"]
        if topic_id == -1: continue # Skip Outlier
    
        # Get List of Relevant Topics and Sentences
        topic_ngramed_keywords = [
            ngramed_keyword 
            for ngramed_keyword in row["Representation"]
            if ngramed_keyword
        ]
        relevant_sentences = topics_and_documents[topics_and_documents["Topic"] == topic_id]["Representative_Docs"].tolist()
        
        # Check Candidate Mentions in Topics
        presidential_candidate_mentions = set() # Avoid Duplicates
        for presidential_candidate, names in presidential_candidates.items():
            if (
                any(
                    word and presidential_candidate
                    and presidential_candidate.strip().lower() == word.strip().lower()
                    for ngramed_keyword in topic_ngramed_keywords
                    for word in ngramed_keyword.split(" ")
                )
                or any(
                    word and name
                    and name.strip().lower() == word.strip().lower()
                    for name in names
                    for ngramed_keyword in topic_ngramed_keywords
                    for word in ngramed_keyword.split(" ")
                )
            ): 
                presidential_candidate_mentions.add(presidential_candidate)
        
        # Make Sure Only 1 Candidate is Mentioned
        if len(presidential_candidate_mentions) != 1: continue

        # Check State Mentions in Topics (Including Cities)
        state_mentions = set() # Avoid Duplicates
        for state, cities in state_cities.items():
            if (
                any(
                    word and state
                    and state.strip().lower() == word.strip().lower()
                    for ngramed_keyword in topic_ngramed_keywords
                    for word in ngramed_keyword.split(" ")
                )
                or any(
                    word and city
                    and city.strip().lower() == word.strip().lower()
                    for city in cities
                    for ngramed_keyword in topic_ngramed_keywords
                    for word in ngramed_keyword.split(" ")
                )
            ): 
                state_mentions.add(state)

        # Make Sure Only 1 State is Mentioned
        if len(state_mentions) != 1: continue
        """
        Add Relevant Sentences Only If:
            1) Only 1 Candidate is Mentioned in the Topic
            2) Only 1 State is Mentioned in the Topic
        """
        if (
            len(presidential_candidate_mentions) == 1
            and len(state_mentions) == 1
        ):
            presidential_candidate = presidential_candidate_mentions.pop()
            state = state_mentions.pop()
            
            # Add All Relevant Sentences with their Corresponding Presidential Candidate, State, and Topic Keywords
            for sentence in relevant_sentences:
                # Filter Complete Sentence with Word Count >= 5 or N
                if is_sentence_complete(sentence):
                    list_of_relevant_sentences.append({
                        "Sentence": sentence,
                        "Presidential_Candidate": presidential_candidate,
                        "State": state,
                        "Topic_Keywords": topic_ngramed_keywords
                    })
    
    # Save List of All Relevant Sentences into CSV file
    df = pd.DataFrame(list_of_relevant_sentences)
    df.to_csv(relevant_transcript_sentences_filename, index=False, errors="ignore")
    return df, topic_model

list_of_relevant_sentences, bertopic_model = filter_relevant_sentences()

2024-10-30 00:35:00,385 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1570 [00:00<?, ?it/s]

2024-10-30 00:37:15,782 - BERTopic - Embedding - Completed ✓
2024-10-30 00:37:15,782 - BERTopic - Guided - Find embeddings highly related to seeded topics.


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

2024-10-30 00:37:20,040 - BERTopic - Guided - Completed ✓
2024-10-30 00:37:20,050 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-30 00:38:19,619 - BERTopic - Dimensionality - Completed ✓
2024-10-30 00:38:19,620 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2024-10-30 00:38:23,500 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-10-30 00:38:48,641 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-30 00:38:48,654 - BERTopic - Cluster - Completed ✓
2024-10-30 00:38:48,660 - BERTopic - Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...
2024-10-30 00:38:48,782 - BERTopic - Zeroshot Step 2 - Completed ✓
2024-10-30 00:38:48,785 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-30 00:39:22,030 - BERTopic - Representation - Completed ✓
2024-10-30 00:39:22,031 - BERTopic - Topic redu

In [6]:
bertopic_model.get_topic_info().sort_values(by='Count', ascending=False).head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
8,7,905,pennsylvania_trump,"[pennsylvania, trump pennsylvania, pennsylvani...","[Trump is taking Pennsylvania., So technically..."
164,163,856,ishpeming_trump,"[trump democrats, problem trump, democracy, po...",[We use his own words a lot of the time to rem...
293,292,819,belding_trump,"[argument, reality, bullshit, evidence, truth,...",[Sources say Trump meanwhile has had less form...
1729,1728,800,coolidge_kamala harris,"[clinton, joe biden, joe, democrat, clinton ba...","[Another formerly reliably Republican state, B..."
2,1,741,michigan_trump,"[michigan, state michigan, michigan michigan, ...",[This is probably more probable than winning M...


In [7]:
list_of_relevant_sentences.head()

Unnamed: 0,Sentence,Presidential_Candidate,State,Topic_Keywords
0,"Now Trump won Michigan back in 2016, but lost ...",Donald Trump,Michigan,"[michigan trump, trump michigan, state michiga..."
1,With the state's primary election just a few ...,Donald Trump,Michigan,"[michigan trump, trump michigan, state michiga..."
2,"Michigan had it tied between Trump and Harris,...",Donald Trump,Michigan,"[michigan trump, trump michigan, state michiga..."
3,"Additionally, Sky News reports that Arab Ameri...",Donald Trump,Michigan,"[michigan trump, trump michigan, state michiga..."
4,Trump has taken an unusual tack in Michigan la...,Donald Trump,Michigan,"[michigan trump, trump michigan, state michiga..."


In [8]:
"""
Sa tingin ko need natin 5k sentences minimum for Relevant Sentences di lang for gathered.
Kasi mamaya 5k Random Sentences nakuha natin tas 100 lang dun Relevant with candidate & state.

Ang naiisip ko since meron 6 Combinations = 3 candidate * 2 state
Gawin natin 5000/6 = 834 Relevant Sentences required set natin as minimum per Combination

Trump  - Arizona      = 834 Relevant Sentences
Harris - Arizona      = 834 Relevant Sentences
Trump  - Michigan     = 834 Relevant Sentences
Harris - Michigan     = 834 Relevant Sentences
Trump  - Pennsylvania = 834 Relevant Sentences
Harris - Pennsylvania = 834 Relevant Sentences
                     --------------------------
                      ~5000 Relevant Sentences
"""
def print_statistics():
    try:
        grouped_df = (
            list_of_relevant_sentences
            .groupby(["Presidential_Candidate", "State"])
            .size()
            .reset_index(name="count")
        )
        total_count = grouped_df["count"].sum()
        total_row = pd.DataFrame({"Presidential_Candidate": [""], "State": ["Total"], "count": [total_count]})
        grouped_df = pd.concat([grouped_df, total_row], ignore_index=True)
        return grouped_df.style.hide(axis="index")
    except: return "No Relevant Sentences"

print_statistics()

Presidential_Candidate,State,count
Donald Trump,Arizona,678
Donald Trump,Michigan,414
Donald Trump,Pennsylvania,1831
Kamala Harris,Arizona,325
Kamala Harris,Michigan,64
Kamala Harris,Pennsylvania,1009
,Total,4321
