# Import Dependencies

In [36]:
# Import Dependencies
import os, nltk, spacy, neuralcoref
import pandas as pd
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from tqdm.auto import tqdm

# Additional Downloads
nltk.download("punkt", quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)
nltk.download("stopwords", quiet=True)

# Load Spacy Language Model with Sentencizer and NeuralCoref 
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe(nlp.create_pipe("sentencizer"))
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x1f3b3dc2dd8>

# Define Utilities

In [37]:
def read_unique_items_from_file(file: str) -> list:
    if os.path.exists(file):
        with open(file, "r", errors="ignore") as f:
            return list(set(e.strip() for e in f.readlines() if e.strip()))
    return []

# Set Configurations

In [38]:
# File Names
transcript_documents_filename = "transcript_documents.csv"
relevant_transcript_sentences_filename = "relevant_transcript_sentences.csv"

# Folder Names
transcription_path = "Transcription"
cities_transcription_paths = {
    "Michigan": os.path.join(transcription_path, "Michigan"),
    "Arizona": os.path.join(transcription_path, "Arizona"),
    "Pennsylvania": os.path.join(transcription_path, "Pennsylvania"),
}
cities_path = "State Cities"

# Numeric Constants 
max_pair_of_words_for_main_subject_mention = 3
"""
    > Maximum words to consider for main subject mentions
        1: Unigram (e.g., "Donald")
        2: Bigram (e.g., "Donald Trump")
"""

min_similarity_of_topic_modeling = 0.7
"""
    > Minimum similarity threshold for topic matching
        Range: [0.1, 1.0]
    Note: Higher values require closer matches
    Example: 0.7 = 70% similarity required
"""

# Sentence Categories
presidential_candidates = {
    "Donald Trump": [
        "Donald", "Trump",
        "Trump Donald", "Donald John", "John Trump",
        "Donald J", "J. Donald", "J. Trump", "Trump J",
        "Trump D", "D. Trump", "John D", "D. John",
        "Donald T", "T. Donald", "John T", "T. John",
        "Donald John Trump", "Donald J Trump", "D. J. Trump", 
        "President Donald", "President Trump",
        "President Donald Trump"
    ],
    "Kamala Harris": [
        "Kamala", "Harris",
        "Harris Kamala", "Kamala Devi", "Devi Harris",
        "Kamala D", "D. Kamala", "D. Harris", "Harris D",
        "Harris K", "K. Harris", "Devi K", "K. Devi",
        "Kamala H", "H. Kamala", "Devi H", "H. Devi",
        "Kamala Devi Harris", "Kamala D Harris", "K. D. Harris",  
        "President Kamala", "President Harris",
        "President Kamala Harris"
    ]
}
original_state_cities = ["Arizona", "Michigan", "Pennsylvania"]
state_cities = {
    "Arizona": read_unique_items_from_file(os.path.join(cities_path, "arizona-cities.txt")),
    "Michigan": read_unique_items_from_file(os.path.join(cities_path, "michigan-cities.txt")),
    "Pennsylvania": read_unique_items_from_file(os.path.join(cities_path, "pennsylvania-cities.txt")),
    "Alabama": ["AL", "A.L"],
    "Alaska": ["AK", "A.K"],
    "Arkansas": ["AR", "A.R"],
    "California": ["CA", "C.A"],
    "Colorado": ["CO", "C.O"],
    "Connecticut": ["CT", "C.T"],
    "Delaware": ["DE", "D.E"],
    "Florida": ["FL", "F.L"],
    "Georgia": ["GA", "G.A"],
    "Hawaii": ["HI", "H.I"],
    "Idaho": ["ID", "I.D"],
    "Illinois": ["IL", "I.L"],
    "Indiana": ["IN", "I.N"],
    "Iowa": ["IA", "I.A"],
    "Kansas": ["KS", "K.S"],
    "Kentucky": ["KY", "K.Y"],
    "Louisiana": ["LA", "L.A"],
    "Maine": ["ME", "M.E"],
    "Maryland": ["MD", "M.D"],
    "Massachusetts": ["MA", "M.A"],
    "Minnesota": ["MN", "M.N"],
    "Mississippi": ["MS", "M.S"],
    "Missouri": ["MO", "M.O"],
    "Montana": ["MT", "M.T"],
    "Nebraska": ["NE", "N.E"],
    "Nevada": ["NV", "N.V"],
    "New Hampshire": ["NH", "N.H"],
    "New Jersey": ["NJ", "N.J"],
    "New Mexico": ["NM", "N.M"],
    "New York": ["NY", "N.Y"],
    "North Carolina": ["NC", "N.C"],
    "North Dakota": ["ND", "N.D"],
    "Ohio": ["OH", "O.H"],
    "Oklahoma": ["OK", "O.K"],
    "Oregon": ["OR", "O.R"],
    "Rhode Island": ["RI", "R.I"],
    "South Carolina": ["SC", "S.C"],
    "South Dakota": ["SD", "S.D"],
    "Tennessee": ["TN", "T.N"],
    "Texas": ["TX", "T.X"],
    "Utah": ["UT", "U.T"],
    "Vermont": ["VT", "V.T"],
    "Virginia": ["VA", "V.A"],
    "Washington": ["WA", "W.A"],
    "West Virginia": ["WV", "W.V"],
    "Wisconsin": ["WI", "W.I"],
    "Wyoming": ["WY", "W.Y"],
}

# Words for Sentence Filtering
stop_words = set(stopwords.words("english"))

# Additional Preprocessing of Configurations
presidential_candidates = {presidential_candidate: list(set(names)) for presidential_candidate, names in presidential_candidates.items()}

# Sentence Extraction (Transcripts to CSV)

In [39]:
def preprocess_transcripts_into_csv_of_documents() -> pd.DataFrame:
    # Initialize list of sentences and possible states
    list_of_documents = []
    
    # Collect documents from each state's transcription files
    for state, path in cities_transcription_paths.items():
        transcription_files = os.listdir(path)
        total_transcription_files = len(transcription_files)

        with tqdm(total=total_transcription_files, desc=f'Preprocessing Documents for {state} [0/{total_transcription_files} Transcript]') as pbar:
            for index, filename in enumerate(transcription_files):
                current = f'{index + 1}/{total_transcription_files}'
                if filename == ".ipynb_checkpoints":
                    pbar.update(1)
                    continue

                pbar.set_description(f'Preprocessing Documents for {state} [{current} Transcript]')

                # Open transcription file
                file_path = os.path.join(path, filename)
                with open(file_path, "r", errors="ignore") as file:
                    transcription = file.read()

                    # Split transcript into tokenized sentences
                    tokenized_sentences = sent_tokenize(transcription)

                    # Merge related sentence tokens to complete sentence
                    sentences = []
                    previous_sentence_token = ""
                    
                    for i, tokenized_sentence in enumerate(tokenized_sentences):
                        # Strip Unnecessary White Spaces
                        tokenized_sentence = tokenized_sentence.strip()
                        
                        # Remove if Tokenized Sentence without Punctuation is Empty
                        if tokenized_sentence[:-1].strip() == "": continue
                        
                        # Remove consecutive duplicates (Whisper Hallucination)
                        if i != 0 and tokenized_sentence == tokenized_sentences[i - 1]: continue
                        
                        """
                            Add Current Sentence Token to Previous If Either:
                                1) Merged Sentences Is Still Incomplete 
                                2) Previous Sentence is a question
                                3) Previous Sentence ends with ellipsis
                        """
                        def is_sentence_not_complete(sentence_token: str):
                            pos_tags = pos_tag(word_tokenize(sentence_token))
                        
                            subject_count = 0
                            predicate_count = 0
                        
                            for word, tag in pos_tags:
                                if tag in {"NN", "NNS", "NNP", "NNPS", "PRP"}: # At least 2 Subject
                                    subject_count += 1
                                elif tag.startswith("V"):  # At least 1 Verb
                                    predicate_count += 1
                                # Early Check and Return
                                if subject_count >= 2 and predicate_count >= 1:
                                    return False
                        
                            return True
                        
                        current_merged_sentence_tokens = sentences[-1] if i > 0 else None
                        if (
                            current_merged_sentence_tokens is not None
                            and (
                                # Incomplete Merged Sentence Tokens
                                is_sentence_not_complete(current_merged_sentence_tokens) 
                                # Previous Sentence Token is a Question
                                or previous_sentence_token.endswith("?")
                                # Previous Sentence Token Ends with Ellipsis
                                or previous_sentence_token.endswith("...")
                                or previous_sentence_token.endswith("..") 
                            )
                        ):
                            # If conditions are met, connect with the previous sentence
                            if previous_sentence_token.endswith("..."):
                                sentences[-1] = f'{current_merged_sentence_tokens[:-3]}, {tokenized_sentence[:1].lower()}{tokenized_sentence[1:]}'
                            elif previous_sentence_token.endswith(".."):
                                sentences[-1] = f'{current_merged_sentence_tokens[:-2]}, {tokenized_sentence[:1].lower()}{tokenized_sentence[1:]}'
                            elif previous_sentence_token.endswith("."):
                                sentences[-1] = f'{current_merged_sentence_tokens[:-1]}, {tokenized_sentence[:1].lower()}{tokenized_sentence[1:]}'
                            elif previous_sentence_token.endswith("?"):
                                if tokenized_sentence.endswith("?"):
                                    sentences[-1] = f'{current_merged_sentence_tokens[:-1]}, {tokenized_sentence[:1].lower()}{tokenized_sentence[1:]}'
                                else:
                                 sentences[-1] = f'{current_merged_sentence_tokens[:-1]}: {tokenized_sentence[:1].lower()}{tokenized_sentence[1:]}'
                            else:
                                sentences[-1] = f'{current_merged_sentence_tokens} {tokenized_sentence}'
                        else:
                            # Otherwise, treat as a new sentence
                            sentences.append(tokenized_sentence)
                
                        # Update the previous sentence
                        previous_sentence_token = tokenized_sentence
                        
                    # Append each transcription documents with the possible-state
                    list_of_documents.append((" ".join(sentences).strip(), state))

                pbar.update(1)

    # Convert the list of sentences and states into a DataFrame
    df = pd.DataFrame(list_of_documents, columns=["Document", "Possible_State"])
    df.to_csv(transcript_documents_filename, index=False, errors="ignore")
    return df

list_of_documents = preprocess_transcripts_into_csv_of_documents()
print(f'Number of Documents: {len(list_of_documents)}')
list_of_documents

Preprocessing Documents for Michigan [0/260 Transcript]:   0%|          | 0/260 [00:00<?, ?it/s]

Preprocessing Documents for Arizona [0/168 Transcript]:   0%|          | 0/168 [00:00<?, ?it/s]

Preprocessing Documents for Pennsylvania [0/268 Transcript]:   0%|          | 0/268 [00:00<?, ?it/s]

Number of Documents: 696


Unnamed: 0,Document,Possible_State
0,This morning we are continuing our post debate...,Michigan
1,Kamala Harris leads in key battleground states...,Michigan
2,The Biden administration said on Tuesday it wo...,Michigan
3,Election night on Sky News is going to be very...,Michigan
4,"Hey guys, welcome back to today's video. Today...",Michigan
...,...,...
691,"We're here live in Butler, PA at another merch...",Pennsylvania
692,Pennsylvania's latest poll results are just un...,Pennsylvania
693,"When you watch this clip, I want you to ask yo...",Pennsylvania
694,And so we have this Quinnipiac poll and this w...,Pennsylvania


# BERTopic: Relevant Sentence Filtering (CSV)

In [40]:
def filter_relevant_sentences() -> pd.DataFrame:
    # Get All Collected Sentences from Transcript and a Map with their Respective Possible State
    df = pd.read_csv(transcript_documents_filename)
    documents = pd.Series(df['Possible_State'].values, index=df['Document']).to_dict()

    # Initialize Lists for Relevant Sentences
    list_of_relevant_sentences = []
    seen_relevant_sentences = set()

    document_items = documents.items()
    total_document_items = len(document_items)
    processed_document = 0
    with tqdm(total=total_document_items, desc=f'Extracting Relevant Sentences [0/{total_document_items} Documents]') as pbar:
        for document, possible_state in document_items:
            # Get Document Object from Language Model
            document_obj = nlp(document)

            # Get Relevant Sentences for Candidates
            for sentence_obj in document_obj.sents:
                presidential_candidate_mentions = set() # Avoid Duplicates
                no_other_state_mentioned_different_from_possible_state = True
                possible_relevant_sentence = None

                # Define Methods for Relevant Sentences
                def get_mentions(sentence_object: str, max_ngrams: int = max_pair_of_words_for_main_subject_mention) -> dict:
                    mentions = {}
                    for n in range(1, max_ngrams + 1): # Try n-gram mentions sizes from 1 to max_n
                        for idx in range(n, len(sentence_object) + 1): # Form n-gram length mentions from sentences
                            span = sentence_object[idx - n:idx] # n-gram span
                            if span._.is_coref: # Add only if span has coreference info
                                main_mention = span._.coref_cluster.main.text # Get string from span object
                                mentions[idx-n] = main_mention # Use the index of the first word in mentioned n-gram
                                
                    return mentions

                def add_and_get_presidential_candidate_mentions(ngramed_mention: str) -> set:
                    for presidential_candidate, names in presidential_candidates.items():
                        if (
                            any(
                                (
                                    presidential_candidate and ngramed_mention
                                    and f' {presidential_candidate.strip().lower()} ' in f' {ngramed_mention.strip().lower()} '
                                ) or (
                                    presidential_candidate and word
                                    and presidential_candidate.strip().lower() == word.strip().lower()
                                )
                                for word in ngramed_mention.split(" ")
                            )
                            # Any Other Candidate Names is Mentioned in Topic
                            or any(
                                (
                                    name and ngramed_mention
                                    and f' {name.strip().lower()} ' in f' {ngramed_mention.strip().lower()} '
                                ) or (
                                    name and word
                                    and name.strip().lower() == word.strip().lower()
                                )
                                for name in names
                                for word in ngramed_mention.split(" ")
                            )
                        ):
                            presidential_candidate_mentions.add(presidential_candidate)

                    return presidential_candidate_mentions

                def check_mentioned_state_different_from_possible_state(ngramed_mention: str) -> bool:
                    if possible_state not in state_cities: raise ValueError(f'This Sentence has Invalid Possible State ({possible_state}): "{sentence}"')
                    # Filter Sentence with Topic of [Other State] Not in [Arizona, Michigan, Pennsylvania]
                    if possible_state not in original_state_cities: return False
                    # Filter Sentence with Topic of [Other State] Different from its [Possible State]
                    other_states = [state for state in state_cities if state is not possible_state]
                    if any(
                        f' {other_state.strip().lower()} ' in f' {ngramed_mention.strip().lower()} '
                        or (
                            word
                            and other_state.strip().lower() == word.strip().lower()
                        )
                        for other_state in other_states
                        for word in ngramed_mention.split(" ")
                    ): return False
                    # Filter Sentence with Topics of [Other States' Cities] Different from its [Possible State Cities]
                    other_state_cities = [
                        other_city
                        for other_cities in {
                            state: state_cities[state]
                            for state in state_cities
                            if state is not possible_state
                        }.values()
                        for other_city in other_cities
                        if other_city
                    ]
                    if any(
                        f' {other_city.strip().lower()} ' in f' {ngramed_mention.strip().lower()} '
                        or (
                            word
                            and other_city.strip().lower() == word.strip().lower()
                        )
                        for other_city in other_state_cities
                        for word in ngramed_mention.split(" ")
                    ): return False
                    return True

                mentions = get_mentions(sentence_obj)
                for mention_idx, ngramed_mention in mentions.items():
                    # Ensure No Other State are Mentioned Different from Possible State
                    if no_other_state_mentioned_different_from_possible_state:
                        no_other_state_mentioned_different_from_possible_state = check_mentioned_state_different_from_possible_state(ngramed_mention)
                    else: break

                    # Add Mentioned Candidate and Ensure Only 1 Candidate is Mentioned
                    if len(add_and_get_presidential_candidate_mentions(ngramed_mention)) > 1: break

                    sentence = sentence_obj.text.strip()
                    word = nlp(sentence)
                    for idx, token in enumerate(word):
                        if mention_idx == idx and token.dep_ in {
                            "nsubj",
                            "nsubjpass",
                            "compound",
                            "dobj",
                            "poss"
                        }:
                            possible_relevant_sentence = sentence
                            break

                    if possible_relevant_sentence is not None: break

                if (
                    # Relevant Sentence was Found and Unique 
                    possible_relevant_sentence is not None
                    and possible_relevant_sentence not in seen_relevant_sentences
                    # Re-ensure No Other State is Mentioned Aside from Possible State
                    and no_other_state_mentioned_different_from_possible_state
                    # Re-ensure Only 1 Candidate is Mentioned
                    and len(presidential_candidate_mentions) == 1
                ):
                    # Additional Cleaning
                    if possible_relevant_sentence.startswith(", "):
                        possible_relevant_sentence = possible_relevant_sentence[2:]
                    # Get Presidential Candidate Mentioned
                    presidential_candidate = presidential_candidate_mentions.pop()
                    # Add Relevant Sentence
                    list_of_relevant_sentences.append({
                        "Sentence": possible_relevant_sentence,
                        "Presidential_Candidate": presidential_candidate,
                        "State": possible_state
                    })
                    seen_relevant_sentences.add(possible_relevant_sentence)

            processed_document += 1
            pbar.set_description(f'Extracting Relevant Sentences [{processed_document}/{total_document_items} Documents]')
            pbar.update(1)

    # Save List of All Relevant Sentences into CSV file
    df = pd.DataFrame(list_of_relevant_sentences)
    df.to_csv(relevant_transcript_sentences_filename, index=False, errors="ignore")
    return df

list_of_relevant_sentences = filter_relevant_sentences()
list_of_relevant_sentences

Extracting Relevant Sentences [0/694 Documents]:   0%|          | 0/694 [00:00<?, ?it/s]

Unnamed: 0,Sentence,Presidential_Candidate,State
0,Are you a previous Trump supporter: the first ...,Donald Trump,Michigan
1,"Okay, then what happened: and then I just felt...",Donald Trump,Michigan
2,I like the fact that he was not a politician.,Donald Trump,Michigan
3,"Oh absolutely, small business owner Andrew Cin...",Donald Trump,Michigan
4,Before Harris stepped in I thought it was goin...,Kamala Harris,Michigan
...,...,...,...
8186,"So 48, 14, 18, 22, 26 minus three is 23, 12345...",Kamala Harris,Pennsylvania
8187,"Harris 48, Trump 47, so that one essentially a...",Kamala Harris,Pennsylvania
8188,It just kept going up after Trump.,Kamala Harris,Pennsylvania
8189,"Ace, I really hope the nonsense of trying to h...",Donald Trump,Pennsylvania


In [41]:
"""
Sa tingin ko need natin 5k sentences minimum for Relevant Sentences di lang for gathered.
Kasi mamaya 5k Random Sentences nakuha natin tas 100 lang dun Relevant with candidate & state.

Ang naiisip ko since meron 6 Combinations = 3 candidate * 2 state
Gawin natin 5000/6 = 834 Relevant Sentences required set natin as minimum per Combination

Trump  - Arizona      = 834 Relevant Sentences
Harris - Arizona      = 834 Relevant Sentences
Trump  - Michigan     = 834 Relevant Sentences
Harris - Michigan     = 834 Relevant Sentences
Trump  - Pennsylvania = 834 Relevant Sentences
Harris - Pennsylvania = 834 Relevant Sentences
               -------------------------------
               Total: ~5000 Relevant Sentences
"""
def print_statistics():
    try:
        grouped_df = (
            list_of_relevant_sentences
            .groupby(["Presidential_Candidate", "State"])
            .size()
            .reset_index(name="count")
        )
        total_count = grouped_df["count"].sum()
        total_row = pd.DataFrame({"Presidential_Candidate": [""], "State": ["Total"], "count": [total_count]})
        grouped_df = pd.concat([grouped_df, total_row], ignore_index=True)
        return grouped_df
    except Exception as e: 
        print(e)
        return "No Relevant Sentences"
print_statistics()

Unnamed: 0,Presidential_Candidate,State,count
0,Donald Trump,Arizona,1092
1,Donald Trump,Michigan,1644
2,Donald Trump,Pennsylvania,2165
3,Kamala Harris,Arizona,528
4,Kamala Harris,Michigan,1117
5,Kamala Harris,Pennsylvania,1645
6,,Total,8191
