# Import Dependencies

In [1]:
# Import Dependencies
import os, re, nltk
import pandas as pd
from pandas import DataFrame
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

# Additional Downloads
nltk.download("punkt_tab", quiet=True)
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
nltk.download("stopwords", quiet=True)




True

# Define Utilities

In [2]:
def sanitize_filename(filename: str) -> str:
    # Escape Double Quotes
    filename = filename.replace('"', '\\"')

    # Replace Invalid Characters with "_"
    invalid_chars = re.compile(r'[<>:"/\\|?*]')
    sanitized_filename = invalid_chars.sub("_", filename)

    return sanitized_filename
    
def read_unique_items_from_file(file: str) -> list:
    if os.path.exists(file):
        with open(file, "r", errors="ignore") as f:
            return list(set(e.strip() for e in f.readlines() if e.strip()))
    return []

# Set Configurations

In [3]:
# File Names
transcript_sentences_filename = "transcript_sentences.csv"
relevant_transcript_sentences_filename = "relevant_transcript_sentences.csv"

# Folder Names
transcription_path = "Transcription"
cities_transcription_paths = {
    "Michigan": os.path.join(transcription_path, "Michigan"),
    "Arizona": os.path.join(transcription_path, "Arizona"),
    "Pennsylvania": os.path.join(transcription_path, "Pennsylvania"),
}
cities_path = "State Cities"

# Numeric Constants 
max_pair_of_words_for_topic = 2
"""
    > Maximum words to consider for topic extraction
        1: Unigram (e.g., "Donald")
        2: Bigram (e.g., "Donald Trump")
"""

min_number_of_word_in_relevant_sentence = 5
"""
    > Minimum word count required for a sentence to be considered relevant
    Example: "This is a nice place" = 5 words
"""

min_similarity_of_topic_modeling = 0.7
"""
    > Minimum similarity threshold for topic matching
        Range: [0.1, 1.0]
    Note: Higher values require closer matches
    Example: 0.7 = 70% similarity required
"""

max_topic_count = None
"""
    Topic count limiter for dimensionality reduction
        None: No reduction, keep all discovered topics
        "auto": Automatically Reduces Topic Count
        Number: Force reduce to specified number of topics
    Note: (1) Using Number for Numeric reduction may merge unrelated topics together
          (2) Lower Number may increase precision but risk missing relevant topics
"""

# Sentence Categories
presidential_candidates = {
    "Donald Trump": [
        "Donald", "Trump",
        "Trump Donald", "Donald John", "John Trump",
        "Donald J", "J. Donald", "J. Trump", "Trump J",
        "Trump D", "D. Trump", "John D", "D. John",
        "Donald T", "T. Donald", "John T", "T. John",
        "Donald John Trump", "Donald J Trump", "D. J. Trump", 
        "President Donald", "President Trump",
        "President Donald Trump"
    ],
    "Kamala Harris": [
        "Kamala", "Harris",
        "Harris Kamala", "Kamala Devi", "Devi Harris",
        "Kamala D", "D. Kamala", "D. Harris", "Harris D",
        "Harris K", "K. Harris", "Devi K", "K. Devi",
        "Kamala H", "H. Kamala", "Devi H", "H. Devi",
        "Kamala Devi Harris", "Kamala D Harris", "K. D. Harris",  
        "President Kamala", "President Harris",
        "President Kamala Harris"
    ]
}
original_state_cities = ["Arizona", "Michigan", "Pennsylvania"]
state_cities = {
    "Arizona": read_unique_items_from_file(os.path.join(cities_path, "arizona-cities.txt")),
    "Michigan": read_unique_items_from_file(os.path.join(cities_path, "michigan-cities.txt")),
    "Pennsylvania": read_unique_items_from_file(os.path.join(cities_path, "pennsylvania-cities.txt")),
    "Alabama": ["AL", "A.L"],
    "Alaska": ["AK", "A.K"],
    "Arkansas": ["AR", "A.R"],
    "California": ["CA", "C.A"],
    "Colorado": ["CO", "C.O"],
    "Connecticut": ["CT", "C.T"],
    "Delaware": ["DE", "D.E"],
    "Florida": ["FL", "F.L"],
    "Georgia": ["GA", "G.A"],
    "Hawaii": ["HI", "H.I"],
    "Idaho": ["ID", "I.D"],
    "Illinois": ["IL", "I.L"],
    "Indiana": ["IN", "I.N"],
    "Iowa": ["IA", "I.A"],
    "Kansas": ["KS", "K.S"],
    "Kentucky": ["KY", "K.Y"],
    "Louisiana": ["LA", "L.A"],
    "Maine": ["ME", "M.E"],
    "Maryland": ["MD", "M.D"],
    "Massachusetts": ["MA", "M.A"],
    "Minnesota": ["MN", "M.N"],
    "Mississippi": ["MS", "M.S"],
    "Missouri": ["MO", "M.O"],
    "Montana": ["MT", "M.T"],
    "Nebraska": ["NE", "N.E"],
    "Nevada": ["NV", "N.V"],
    "New Hampshire": ["NH", "N.H"],
    "New Jersey": ["NJ", "N.J"],
    "New Mexico": ["NM", "N.M"],
    "New York": ["NY", "N.Y"],
    "North Carolina": ["NC", "N.C"],
    "North Dakota": ["ND", "N.D"],
    "Ohio": ["OH", "O.H"],
    "Oklahoma": ["OK", "O.K"],
    "Oregon": ["OR", "O.R"],
    "Rhode Island": ["RI", "R.I"],
    "South Carolina": ["SC", "S.C"],
    "South Dakota": ["SD", "S.D"],
    "Tennessee": ["TN", "T.N"],
    "Texas": ["TX", "T.X"],
    "Utah": ["UT", "U.T"],
    "Vermont": ["VT", "V.T"],
    "Virginia": ["VA", "V.A"],
    "Washington": ["WA", "W.A"],
    "West Virginia": ["WV", "W.V"],
    "Wisconsin": ["WI", "W.I"],
    "Wyoming": ["WY", "W.Y"],
}

# Words for Sentence Filtering
stop_words = set(stopwords.words("english"))

# Additional Preprocessing of Configurations
presidential_candidates = {presidential_candidate: list(set(names)) for presidential_candidate, names in presidential_candidates.items()}
presidential_candidates_combinations = [
    name.lower()
    for full_name, names in presidential_candidates.items()
    for name in ([full_name] if len(full_name.split()) <= max_pair_of_words_for_topic else []) + [
        name for name in names
        if len(name.split()) <= max_pair_of_words_for_topic
    ]
]
presidential_candidates_combinations_in_2d = [
    ([full_name.lower()] if len(full_name.split()) <= max_pair_of_words_for_topic else []) + [
        name.lower()
        for name in names
        if len(name.split()) <= max_pair_of_words_for_topic
    ]
    for full_name, names in presidential_candidates.items()
]

# Sentence Extraction (Transcripts to CSV)

In [4]:
def process_transcripts_into_csv_of_sentences() -> pd.DataFrame:
    # Initialize list of sentences and possible states
    list_of_sentences = []

    # Collect sentences from each state's transcription files
    for state, path in cities_transcription_paths.items():
        transcription_files = os.listdir(path)
        total_transcription_files = len(transcription_files)

        with tqdm(total=total_transcription_files, desc=f'Collecting Sentences for {state} [0/{total_transcription_files} Transcript]') as pbar:
            for index, filename in enumerate(transcription_files):
                current = f'{index + 1}/{total_transcription_files}'
                if filename == ".ipynb_checkpoints":
                    pbar.update(1)
                    continue

                pbar.set_description(f'Collecting Sentences for {state} [{current} Transcript]')

                # Open transcription file
                file_path = os.path.join(path, filename)
                with open(file_path, "r", errors="ignore") as file:
                    transcription = file.read()

                    # Split transcript into sentences
                    sentences = sent_tokenize(transcription)

                    # Remove consecutive duplicates
                    sentences = [sentence for i, sentence in enumerate(sentences) if i == 0 or sentence != sentences[i - 1]]

                    # Append each sentence with the state name
                    list_of_sentences.extend([(sentence, state) for sentence in sentences])

                pbar.update(1)

    # Convert the list of sentences and states into a DataFrame
    df = pd.DataFrame(list(set(list_of_sentences)), columns=["Sentence", "Possible_State"])
    df.to_csv(transcript_sentences_filename, index=False, errors="ignore")
    return df

# Run the function and print summary
list_of_sentences = process_transcripts_into_csv_of_sentences()
print(f'Number of Sentences: {len(list_of_sentences)}')
list_of_sentences

Collecting Sentences for Michigan [0/260 Transcript]:   0%|          | 0/260 [00:00<?, ?it/s]

Collecting Sentences for Arizona [0/182 Transcript]:   0%|          | 0/182 [00:00<?, ?it/s]

Collecting Sentences for Pennsylvania [0/268 Transcript]:   0%|          | 0/268 [00:00<?, ?it/s]

Number of Sentences: 57876


Unnamed: 0,Sentence,Possible_State
0,"These guys come home from guys, females, I'm s...",Michigan
1,Ain't nobody here voting for Kamala Harris.,Pennsylvania
2,Cheap Chinese imports.,Michigan
3,What happened to Mike Pence wasn't an isolated...,Arizona
4,I'm asking you to be excited about the future ...,Michigan
...,...,...
57871,All right Garrett joining us now Garrett what ...,Michigan
57872,Democrats talk about challenging in Florida.,Pennsylvania
57873,"From the night of the election, the stop the s...",Arizona
57874,I was right about the crime stats going way up.,Pennsylvania


# BERTopic: Relevant Sentence Filtering (CSV)

In [5]:
def filter_relevant_sentences() -> tuple[DataFrame, BERTopic]:
    # Get All Collected Sentences from Transcript and a Map with their Respective Possible State
    df = pd.read_csv(transcript_sentences_filename, encoding_errors="ignore")
    sentences_possible_state = pd.Series(df['Possible_State'].values, index=df['Sentence']).to_dict()
    sentences = df["Sentence"].tolist()
    
    # Define Filter for Words as Possible Topics
    def filter_possible_topics(text: str) -> list:
        """
            Filter Words If it's a Possible Topic:
                1) Only Nouns and Proper Nouns (e.g. Dollars, Currency)
                2) No Stop Words (e.g. in, to)
                3) Minimum of Two-Letter Words (e.g. Ox)
                4) Exclude Numbers
        """
        pos_tags = pos_tag(word_tokenize(text)) # POS Tagging
        # Return Possible Topics
        return [
            token.lower() for token, pos in pos_tags
            if pos in ["NN", "NNS", "NNP", "NNPS"] # Nouns / Proper Nouns
            and token.lower() not in stop_words # Exclude Stop Words
            and len(token) > 1 # Exclude One-Letter Words (e.g. Included: Ox)
            and not token.isnumeric() # Exclude Numbers
        ]
    vectorizer_model = CountVectorizer(
        ngram_range=(1, max_pair_of_words_for_topic),
        tokenizer=filter_possible_topics
    )

    # Train BERTopic model
    topic_model = BERTopic(
        embedding_model="all-MiniLM-L6-v2",
        n_gram_range=(1, max_pair_of_words_for_topic),
        vectorizer_model=vectorizer_model,
        seed_topic_list=presidential_candidates_combinations_in_2d,
        zeroshot_topic_list=presidential_candidates_combinations,
        zeroshot_min_similarity=min_similarity_of_topic_modeling,
        nr_topics=None if max_topic_count is None else "auto" if max_topic_count == "auto" else max(len(presidential_candidates_combinations), max_topic_count),
        verbose=True
    )
    topic_ids, _ = topic_model.fit_transform(sentences)
    
    # Get BERTopic Results
    topic_info = topic_model.get_topic_info()
    topics_and_documents = pd.DataFrame({"Topic": topic_ids, "Representative_Docs": sentences})

    # Initialize Lists for Relevant Sentences
    list_of_relevant_sentences = []
    
    # Define Filters for Relevant Sentences
    """
        Add Relevant Sentences Only If:
            1) Only 1 Candidate is Mentioned in the Topic
            2) No Other State is Mentioned in the Topic Different from Possible State
            3) Sentence has Word Count Greater than N or 5
    """
    def get_only_if_1_candidate_mentioned_in_the_topic(topic_ngramed_keywords: list[str]) ->  str | None:
        # Collect Candidate Mentions in Topics
        presidential_candidate_mentions = set() # Avoid Duplicates
        for presidential_candidate, names in presidential_candidates.items():
            if (
                # Any Candidate is Mentioned in Topic
                any(
                    (
                        presidential_candidate and ngramed_keyword
                        and f' {presidential_candidate.strip().lower()} ' in f' {ngramed_keyword.strip().lower()} '
                    ) or (
                        presidential_candidate and word
                        and presidential_candidate.strip().lower() == word.strip().lower()
                    )
                    for ngramed_keyword in topic_ngramed_keywords
                    for word in ngramed_keyword.split(" ")
                )
                # Any Other Candidate Names is Mentioned in Topic
                or any(
                    (
                        name and ngramed_keyword
                        and f' {name.strip().lower()} ' in f' {ngramed_keyword.strip().lower()} '
                    ) or (
                        name and word
                        and name.strip().lower() == word.strip().lower()
                    )
                    for name in names
                    for ngramed_keyword in topic_ngramed_keywords
                    for word in ngramed_keyword.split(" ")
                )
            ):
                # Add The Candidate Mentioned
                presidential_candidate_mentions.add(presidential_candidate)
        # Return the Candidate If It's the Only 1 Mentioned
        if len(presidential_candidate_mentions) == 1:
            return presidential_candidate_mentions.pop()
        else:
            return None
    def get_if_no_other_state_mentioned_in_topic_different_from_possible_state(topic_ngramed_keywords: list[str], sentence: str) ->  str | None:
        # Get Possible State for the Sentence
        possible_state = sentences_possible_state[sentence]
        if possible_state not in state_cities: raise ValueError(f'This Sentence has Invalid Possible State ({possible_state}): "{sentence}"')
        # Filter Sentence with Topic of [Other State] Not in [Arizona, Michigan, Pennsylvania]
        if possible_state not in original_state_cities: return None
        # Filter Sentence with Topic of [Other State] Different from its [Possible State]
        other_states = [state for state in state_cities if state is not possible_state]
        if any(
            f' {other_state.strip().lower()} ' in f' {ngramed_keyword.strip().lower()} '
            or (
                word
                and other_state.strip().lower() == word.strip().lower()
            )
            for other_state in other_states
            for ngramed_keyword in topic_ngramed_keywords
            for word in ngramed_keyword.split(" ")
        ): return None
        # Filter Sentence with Topics of [Other States' Cities] Different from its [Possible State Cities]
        other_state_cities = [
            other_city
            for other_cities in {
                state: state_cities[state]
                for state in state_cities
                if state is not possible_state
            }.values()
            for other_city in other_cities
            if other_city
        ]
        if any(
            f' {other_city.strip().lower()} ' in f' {ngramed_keyword.strip().lower()} '
            or (
                word
                and other_city.strip().lower() == word.strip().lower()
            )
            for other_city in other_state_cities
            for ngramed_keyword in topic_ngramed_keywords
            for word in ngramed_keyword.split(" ")
        ): return None
        # Return the Possible State
        return possible_state
    def sentence_has_word_count_greater_than_n(sentence: str, min_number_of_word_in_relevant_sentence: int = min_number_of_word_in_relevant_sentence) -> bool:
        # Only include word tags
        word_tags = {
            "CC",  # conjunctions (and, or, but)
            "CD",  # cardinal numbers
            "DT",  # determiners (the, a, this)
            "EX",  # existential there
            "FW",  # foreign words
            "IN",  # prepositions
            "JJ", "JJR", "JJS",  # adjectives
            "LS",  # List markers (First, Second, One, Two, A, B, etc.)
            "MD",  # modals (can, should)
            "NN", "NNP", "NNPS", "NNS",  # nouns
            "PDT",  # pre-determiners
            "PRP", "PRP$",  # pronouns
            "RB", "RBR", "RBS",  # adverbs
            "RP",  # particles
            "TO",  # to
            "UH",  # interjections
            "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",  # verbs
            "WDT", "WP", "WP$", "WRB"  # wh-words
        }
        pos_tags = pos_tag(word_tokenize(sentence)) # POS Tagging
        word_count = sum(1 for word, pos in pos_tags if pos in word_tags)
        return word_count >= min_number_of_word_in_relevant_sentence

    # Get Relevant Sentences
    for _, row in topic_info.iterrows():
        topic_id = row["Topic"]
        if topic_id == -1: continue # Skip Outlier
    
        # Get List of Topics and their Sentences
        topic_ngramed_keywords = [
            ngramed_keyword 
            for ngramed_keyword in row["Representation"]
            if ngramed_keyword
        ]
        topic_sentences = topics_and_documents[topics_and_documents["Topic"] == topic_id]["Representative_Docs"].tolist()
        
        for sentence in topic_sentences:
            # Check and Get 1 Candidate from Topics
            presidential_candidate = get_only_if_1_candidate_mentioned_in_the_topic(topic_ngramed_keywords)
            if presidential_candidate is None: continue
            
            # Check and Get 1 State from Topics and [Possible State assigned in Sentence] 
            state = get_if_no_other_state_mentioned_in_topic_different_from_possible_state(topic_ngramed_keywords, sentence)
            if state is None: continue
            
            # Check if sentence has word count greater than N (default: 5)
            if not sentence_has_word_count_greater_than_n(sentence): continue
            
            # Add Relevant Sentence with their Respective Candidate and State
            list_of_relevant_sentences.append({
                "Sentence": sentence,
                "Presidential_Candidate": presidential_candidate,
                "State": state,
                "Topic_Keywords": topic_ngramed_keywords
            })
    
    # Save List of All Relevant Sentences into CSV file
    df = pd.DataFrame(list_of_relevant_sentences)
    df.to_csv(relevant_transcript_sentences_filename, index=False, errors="ignore")
    return df, topic_model

list_of_relevant_sentences, bertopic_model = filter_relevant_sentences()

2024-11-21 01:13:34,031 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1809 [00:00<?, ?it/s]

2024-11-21 01:15:31,936 - BERTopic - Embedding - Completed ✓
2024-11-21 01:15:31,938 - BERTopic - Guided - Find embeddings highly related to seeded topics.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-11-21 01:15:32,170 - BERTopic - Guided - Completed ✓
2024-11-21 01:15:32,171 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-21 01:16:23,173 - BERTopic - Dimensionality - Completed ✓
2024-11-21 01:16:23,175 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2024-11-21 01:16:23,482 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-11-21 01:17:02,510 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-21 01:17:06,932 - BERTopic - Cluster - Completed ✓
2024-11-21 01:17:06,932 - BERTopic - Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...
2024-11-21 01:17:07,021 - BERTopic - Zeroshot Step 2 - Completed ✓
2024-11-21 01:17:07,025 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-21 01:17:38,129 - BERTopic - Representation - Completed ✓


In [6]:
bertopic_model.get_topic_info().sort_values(by="Count", ascending=False)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,25609,-1_trump_harris_people_election,"[trump, harris, people, election, states, stat...",[The most important battleground state is Penn...
308,307,1144,307_biden_joe biden_joe_president biden,"[biden, joe biden, joe, president biden, biden...","[Why are you voting for Joe Biden?, I was unco..."
393,392,882,392_character_guy_man_blah,"[character, guy, man, blah, ego, blah blah, bl...","[He's brain damaged, blah, blah, blah, blah, b..."
525,524,619,524_michigan_state michigan_michigan michigan_...,"[michigan, state michigan, michigan michigan, ...","[Michigan is very odd as well., As far as Mich..."
275,274,542,274_thank_name_thanks_john,"[thank, name, thanks, john, ruben ruben, j.d, ...","[Thank you so much for joining us., Thank you,..."
...,...,...,...,...,...
12,11,3,president harris,"[today vice, harris today, president harris, h...","[Harris or Trump., What about vice president H..."
8,7,3,j. trump,"[j. trump, donald j., j., trump donald, donald...","[That is Donald J. Trump., Absolutely 100% Don..."
9,8,1,trump donald,"[donald trump, donald, trump, , , , , , , ]",[Donald Trump?]
14,13,1,d. kamala,"[kamala kamala, kamala, , , , , , , , ]","[Kamala, kamala, kamala, kamala.]"


In [7]:
list_of_relevant_sentences

Unnamed: 0,Sentence,Presidential_Candidate,State,Topic_Keywords
0,"And I think Donald Trump, I'm going to call it...",Donald Trump,Michigan,"[trump america, america trump, trump trump, tr..."
1,"Donald Trump all the way, 100%.",Donald Trump,Arizona,"[trump america, america trump, trump trump, tr..."
2,That's not what Harris needed.,Kamala Harris,Pennsylvania,"[harris harris, problem harris, harris problem..."
3,Let's say Harris holds onto that.,Kamala Harris,Michigan,"[harris harris, problem harris, harris problem..."
4,I'm trying to keep an open mind about Harris.,Kamala Harris,Pennsylvania,"[harris harris, problem harris, harris problem..."
...,...,...,...,...
6014,So he's out his polling is outperforming his l...,Donald Trump,Michigan,"[trump polls, polls trump, polls, vote polls, ..."
6015,We're going to look at the election polling in...,Donald Trump,Michigan,"[trump polls, polls trump, polls, vote polls, ..."
6016,They are up significantly in most polling from...,Donald Trump,Arizona,"[trump polls, polls trump, polls, vote polls, ..."
6017,The larger polls I'm seeing has Trump up by tw...,Donald Trump,Pennsylvania,"[trump polls, polls trump, polls, vote polls, ..."


In [8]:
"""
Sa tingin ko need natin 5k sentences minimum for Relevant Sentences di lang for gathered.
Kasi mamaya 5k Random Sentences nakuha natin tas 100 lang dun Relevant with candidate & state.

Ang naiisip ko since meron 6 Combinations = 3 candidate * 2 state
Gawin natin 5000/6 = 834 Relevant Sentences required set natin as minimum per Combination

Trump  - Arizona      = 834 Relevant Sentences
Harris - Arizona      = 834 Relevant Sentences
Trump  - Michigan     = 834 Relevant Sentences
Harris - Michigan     = 834 Relevant Sentences
Trump  - Pennsylvania = 834 Relevant Sentences
Harris - Pennsylvania = 834 Relevant Sentences
               -------------------------------
               Total: ~5000 Relevant Sentences
"""
def print_statistics():
    try:
        grouped_df = (
            list_of_relevant_sentences
            .groupby(["Presidential_Candidate", "State"])
            .size()
            .reset_index(name="count")
        )
        total_count = grouped_df["count"].sum()
        total_row = pd.DataFrame({"Presidential_Candidate": [""], "State": ["Total"], "count": [total_count]})
        grouped_df = pd.concat([grouped_df, total_row], ignore_index=True)
        return grouped_df.style.hide(axis="index")
    except: return "No Relevant Sentences"
print_statistics()

Presidential_Candidate,State,count
Donald Trump,Arizona,1059
Donald Trump,Michigan,1613
Donald Trump,Pennsylvania,2063
Kamala Harris,Arizona,242
Kamala Harris,Michigan,372
Kamala Harris,Pennsylvania,670
,Total,6019
