# Import Dependencies

In [15]:
# Import Dependencies
import os, re, torch, nltk
import pandas as pd
from pandas import DataFrame
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

# Additional Downloads
# NLTK data needed by the calls used in this notebook:
#   - "punkt_tab": tokenizer models for sent_tokenize / word_tokenize
#   - "averaged_perceptron_tagger_eng": English tagger model for pos_tag
#   - "stopwords": corpus read by stopwords.words("english")
# quiet=True suppresses the download progress messages.
nltk.download("punkt_tab", quiet=True)
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
nltk.download("stopwords", quiet=True)

True

# Define Utilities

In [16]:
def sanitize_filename(filename: str) -> str:
    """
    Return `filename` with every character from the invalid set replaced by "_".

    The invalid set is: < > : " / backslash | ? *
    Double quotes are backslash-escaped first; because both the quote and the
    backslash belong to the invalid set, each double quote in the input ends up
    as two underscores in the result.
    """
    # Escape Double Quotes (each one becomes backslash + quote)
    escaped = filename.replace('"', '\\"')

    # Replace Invalid Characters with "_"
    return re.sub(r'[<>:"/\\|?*]', "_", escaped)
    
def read_unique_items_from_file(file: str) -> list:
    """
    Read the distinct non-blank lines of a text file.

    Each line is stripped of surrounding whitespace; blank lines and duplicates
    are dropped.

    Args:
        file: Path of the text file to read.

    Returns:
        The unique stripped lines in sorted order, or an empty list when the
        path does not exist. Sorting keeps the result identical between runs
        (iterating a set of strings has no stable order across processes).
    """
    if not os.path.exists(file):
        return []

    with open(file, "r") as f:
        unique_items = {line.strip() for line in f}

    # Blank lines all collapse into the empty string
    unique_items.discard("")
    return sorted(unique_items)

# Set Configurations

In [17]:
# File Names
transcript_sentences_filename = "transcript_sentences.csv"
relevant_transcript_sentences_filename = "relevant_transcript_sentences.csv"

# Folder Names
transcription_output_path = "Transcription"
cities_path = "State Cities"

# Numeric Constants
max_consecutive_words_for_topic = 2 # e.g. Unigram: "Donald" | Bigram: "Donald Trump"
min_number_of_word_in_relevant_sentence = 5 # e.g. 5-words: "This is a nice place"
min_similarity_of_topic_modeling = 0.7 # Allow Sentences 70% Similar to Candidate-State Combination

# Sentence Categories
# Maps each candidate's full name to the individual names that also count as a mention
presidential_candidates = {
    "Donald Trump": [
        "Donald", "Trump"
    ],
    "Kamala Harris": [
        "Kamala", "Harris"
    ]
}
# Maps each state to the city names read from its text file (empty list if the file is missing)
state_cities = {
    "Michigan": read_unique_items_from_file(os.path.join(cities_path, "michigan-cities.txt")),
    "Arizona": read_unique_items_from_file(os.path.join(cities_path, "arizona-cities.txt")),
    "Pennsylvania": read_unique_items_from_file(os.path.join(cities_path, "pennsylvania-cities.txt"))
}

# Words for Sentence Filtering
stop_words = set(stopwords.words("english"))

# Additional Preprocessing of Configurations
# Remove duplicate names while keeping their written order, so the topic lists
# built below come out in the same order on every run
presidential_candidates = {
    full_name: list(dict.fromkeys(names))
    for full_name, names in presidential_candidates.items()
}

# Every location that may be paired with a candidate: the states first, then all of their cities
all_locations = list(state_cities) + [city for cities in state_cities.values() for city in cities]

# Flat labels such as "michigan_donald trump", "michigan_donald", "michigan_trump"
presidential_candidates_and_states_combinations = [
    f"{location}_{name}".lower()
    for full_name, names in presidential_candidates.items()
    for location in all_locations
    for name in [full_name] + names
]

# One keyword list per (candidate, location) pair: [location, full name, *individual names]
presidential_candidates_and_states_combinations_in_2d = [
    [location.lower(), full_name.lower()] + [name.lower() for name in names]
    for full_name, names in presidential_candidates.items()
    for location in all_locations
]

# Sentence Extraction (Transcripts to CSV)

In [18]:
def process_transcripts_into_csv_of_sentences() -> DataFrame:
    """
    Split every transcript in `transcription_output_path` into sentences and save the unique ones.

    Each file is read as text and split with `sent_tokenize`; a sentence that
    merely repeats the one right before it is dropped. All remaining sentences
    are de-duplicated across files and written to `transcript_sentences_filename`.

    Returns:
        DataFrame with a single "Sentence" column holding the unique sentences
        in first-seen order (files are visited in sorted name order).
    """
    # os.listdir returns entries in arbitrary order; sort so repeated runs produce the same CSV
    transcription_files = sorted(os.listdir(transcription_output_path))
    total_transcription_file = len(transcription_files)

    # Initialize List of Sentences
    list_of_sentences = []

    # Collect List of Sentences from Transcription Files
    with tqdm(total=total_transcription_file, desc=f'Collecting Sentences [0/{total_transcription_file} Transcript]') as pbar:
        for index, filename in enumerate(transcription_files, start=1):
            file_path = os.path.join(transcription_output_path, filename)

            # Skip Jupyter's checkpoint folder and any other directory (open() would fail on them)
            if filename == ".ipynb_checkpoints" or os.path.isdir(file_path):
                pbar.update(1)
                continue

            pbar.set_description(f'Collecting Sentences [{index}/{total_transcription_file} Transcript]')

            # Open Transcription File
            # NOTE(review): relies on the platform default text encoding - confirm the transcripts' encoding
            with open(file_path, "r") as file:
                transcription = file.read()

            # Split Transcript into Sentences
            sentences = sent_tokenize(transcription)

            # Add the Sentences, Removing Consecutive Duplicates (Caused by Whisper)
            list_of_sentences.extend(
                sentence
                for i, sentence in enumerate(sentences)
                if i == 0 or sentence != sentences[i - 1]
            )

            pbar.update(1)

    # Drop duplicates across all files while keeping first-seen order (a set would shuffle the rows)
    unique_sentences = list(dict.fromkeys(list_of_sentences))

    # Save List of All Sentences into CSV file
    df = pd.DataFrame(unique_sentences, columns=["Sentence"])
    df.to_csv(transcript_sentences_filename, index=False, errors="ignore")
    return df

# Run the extraction. Despite its name, `list_of_sentences` holds the returned
# DataFrame (single "Sentence" column), so len() is the number of unique sentences.
list_of_sentences = process_transcripts_into_csv_of_sentences()
print(f'Number of Sentences: {len(list_of_sentences)}')
# Last expression of the cell: preview the first rows
list_of_sentences.head()

Collecting Sentences [0/295 Transcript]:   0%|          | 0/295 [00:00<?, ?it/s]

Number of Sentences: 26452


Unnamed: 0,Sentence
0,Our daughter Sophia was born into this world w...
1,"She says these are the words of an autocrat, s..."
2,"15 states, plus the 2nd congressional distric..."
3,I will sell my car right now.
4,I'm going to zoom in here on the Philadelphia ...


# BERTopic: Relevant Sentence Filtering (CSV)

In [19]:
def filter_relevant_sentences() -> tuple[DataFrame, BERTopic]:
    """
    Fit BERTopic on the collected sentences and keep those about exactly one candidate and one state.

    Steps:
        1) Load the sentences from `transcript_sentences_filename`.
        2) Fit a BERTopic model that is seeded and zero-shot-guided with the
           candidate/location combinations from the configuration.
        3) For every non-outlier topic, look for candidate and state (or city)
           mentions in the topic's keywords; keep the topic only when exactly
           one candidate and exactly one state are mentioned.
        4) Keep the topic's sentences that have at least
           `min_number_of_word_in_relevant_sentence` tokens.
        5) Save the kept sentences to `relevant_transcript_sentences_filename`.

    Returns:
        (DataFrame, BERTopic): the relevant sentences with columns "Sentence",
        "Presidential_Candidate", "State" and "Topic_Keywords" (no columns at
        all when nothing was kept), and the fitted topic model.
    """
    # Get All Collected Sentences from Transcript
    df = pd.read_csv(transcript_sentences_filename, encoding_errors="ignore")
    # read_csv turns cells such as "NA" or "null" into NaN; drop them so only text reaches the model
    sentences = df["Sentence"].dropna().astype(str).tolist()

    noun_tags = {"NN", "NNS", "NNP", "NNPS"} # Nouns / Proper Nouns

    # Set Filter for Words as Possible Topics
    def filter_possible_topics(text: str) -> list:
        """
        Tokenizer for CountVectorizer: return the lowercased tokens of `text` that may act as topic words.

        A token is kept only if it is:
            1) A Noun or Proper Noun (POS tag NN, NNS, NNP or NNPS)
            2) Not an English Stop Word (e.g. in, to)
            3) Longer than One Character (e.g. Included: Ox)
            4) Not Purely Numeric
        """
        pos_tags = pos_tag(word_tokenize(text)) # POS Tagging
        return [
            token.lower() for token, pos in pos_tags
            if pos in noun_tags
            and token.lower() not in stop_words
            and len(token) > 1
            and not token.isnumeric()
        ]

    vectorizer_model = CountVectorizer(
        ngram_range=(1, max_consecutive_words_for_topic),
        tokenizer=filter_possible_topics
    )

    # Train BERTopic model
    topic_model = BERTopic(
        embedding_model="all-MiniLM-L6-v2",
        n_gram_range=(1, max_consecutive_words_for_topic),
        vectorizer_model=vectorizer_model,
        seed_topic_list=presidential_candidates_and_states_combinations_in_2d,
        zeroshot_topic_list=presidential_candidates_and_states_combinations,
        zeroshot_min_similarity=min_similarity_of_topic_modeling,
        verbose=True
    )
    topics, _ = topic_model.fit_transform(sentences)

    # Get BERTopic Results
    topic_info = topic_model.get_topic_info()
    # One row per input sentence, paired with the topic id it was assigned
    topics_and_documents = pd.DataFrame({"Topic": topics, "Representative_Docs": sentences})

    def is_sentence_complete(sentence: str) -> bool:
        """Return True when the sentence has at least the configured minimum number of tokens."""
        return len(word_tokenize(sentence)) >= min_number_of_word_in_relevant_sentence

    def find_mentions(groups: dict[str, list[str]], keywords: list[str]) -> set[str]:
        """
        Return the keys of `groups` that are mentioned in any of the `keywords`.

        A key counts as mentioned when the key itself or one of its aliases
        occurs (case-insensitively, as a substring) in at least one keyword.
        """
        lowered_keywords = [keyword.lower() for keyword in keywords]
        return {
            label
            for label, aliases in groups.items()
            if any(
                term.lower() in keyword
                for term in [label, *aliases]
                for keyword in lowered_keywords
            )
        }

    # Initialize Lists for Relevant Sentences
    list_of_relevant_sentences = []

    # Get Relevant Sentences
    for _, row in topic_info.iterrows():
        topic = row["Topic"]
        if topic == -1:
            continue # Skip Outlier

        topic_keywords = row["Representation"]

        # Make Sure Only 1 Candidate is Mentioned in the Topic Keywords
        presidential_candidate_mentions = find_mentions(presidential_candidates, topic_keywords)
        if len(presidential_candidate_mentions) != 1:
            continue

        # Make Sure Only 1 State is Mentioned in the Topic Keywords (Including Cities)
        state_mentions = find_mentions(state_cities, topic_keywords)
        if len(state_mentions) != 1:
            continue

        # Both sets hold exactly one element at this point
        (presidential_candidate,) = presidential_candidate_mentions
        (state,) = state_mentions

        # Sentences assigned to this topic (looked up only for topics that passed both checks)
        relevant_sentences = topics_and_documents.loc[
            topics_and_documents["Topic"] == topic, "Representative_Docs"
        ].tolist()

        # Add All Complete Sentences with their Corresponding Presidential Candidate, State, and Topic Keywords
        list_of_relevant_sentences.extend(
            {
                "Sentence": sentence,
                "Presidential_Candidate": presidential_candidate,
                "State": state,
                "Topic_Keywords": topic_keywords
            }
            for sentence in relevant_sentences
            if is_sentence_complete(sentence)
        )

    # Save List of All Relevant Sentences into CSV file
    df = pd.DataFrame(list_of_relevant_sentences)
    df.to_csv(relevant_transcript_sentences_filename, index=False, errors="ignore")
    return df, topic_model

# Run the filtering. `list_of_relevant_sentences` is the returned DataFrame (not a list)
# and `bertopic_model` is the fitted BERTopic model.
list_of_relevant_sentences, bertopic_model = filter_relevant_sentences()
print(f'Number of Relevant Sentences: {len(list_of_relevant_sentences)}')
# Last expression of the cell: preview the first rows
list_of_relevant_sentences.head()

2024-10-22 19:14:56,985 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/827 [00:00<?, ?it/s]

2024-10-22 19:16:26,571 - BERTopic - Embedding - Completed ✓
2024-10-22 19:16:26,574 - BERTopic - Guided - Find embeddings highly related to seeded topics.


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

2024-10-22 19:16:30,492 - BERTopic - Guided - Completed ✓
2024-10-22 19:16:30,505 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-22 19:17:10,220 - BERTopic - Dimensionality - Completed ✓
2024-10-22 19:17:10,235 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2024-10-22 19:17:15,367 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-10-22 19:17:32,757 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-22 19:17:40,518 - BERTopic - Cluster - Completed ✓
2024-10-22 19:17:40,520 - BERTopic - Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...
2024-10-22 19:17:40,696 - BERTopic - Zeroshot Step 2 - Completed ✓
2024-10-22 19:17:40,696 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-22 19:18:03,389 - BERTopic - Representation - Completed ✓


Number of Relevant Sentences: 1816


Unnamed: 0,Sentence,Presidential_Candidate,State,Topic_Keywords
0,"On the counter, women under 40 and women acros...",Kamala Harris,Pennsylvania,"[counter women, areas past, past areas, harris..."
1,"In our Commonwealth, Kamala Harris just seems ...",Kamala Harris,Pennsylvania,"[counter women, areas past, past areas, harris..."
2,Arizona is incredibly close.,Donald Trump,Arizona,"[arizona, arizona arizona, arizona lead, trump..."
3,"Well, hello Arizona.",Donald Trump,Arizona,"[arizona, arizona arizona, arizona lead, trump..."
4,"Trump's winning Arizona, according to the Real...",Donald Trump,Arizona,"[arizona, arizona arizona, arizona lead, trump..."


In [20]:
# Show the fitted model's topic table (Topic, Count, Name, Representation, Representative_Docs); topic -1 is skipped as the outlier topic by filter_relevant_sentences
bertopic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,10936,-1_trump_candidates_people_day,"[trump, candidates, people, day, election, dem...","[But do you care more about the election now?,..."
1,0,3,michigan_donald trump,"[ways party, dominance election, polls leads, ...","[And if we recall the 2016 election, it wasn't..."
2,1,4,michigan_donald,"[michigan state, today michigan, look michigan...","[And if you look at, let's just look at Michig..."
3,2,13,michigan_trump,"[michigan, votes michigan, michigan michigan, ...",[A one point swing in Wisconsin and Michigan m...
4,3,2,elizabethtown_kamala harris,"[counter women, areas past, past areas, harris...","[In our Commonwealth, Kamala Harris just seems..."
...,...,...,...,...,...
522,521,14,521_person beliefs_mates person_brothers shop_...,"[person beliefs, mates person, brothers shop, ...","[You vote for a leader who sees you, who has a..."
523,522,14,522_people something_november vote_election tr...,"[people something, november vote, election try...","[And then once you've got a plan, then you've ..."
524,523,41,523_vote vote_vote_vote people_people vote,"[vote vote, vote, vote people, people vote, no...","[Put down your vote and do what?, Vote for vot..."
525,524,33,524_voting_voting mail_states voting_vote elec...,"[voting, voting mail, states voting, vote elec...","[Early voting., The early voting looks very go..."


In [21]:
"""
Sa tingin ko need natin 5k sentences minimum for Relevant Sentences di lang for gathered.
Kasi mamaya 5k Random Sentences nakuha natin tas 100 lang dun Relevant with candidate & state.

Ang naiisip ko since meron 6 Combinations = 3 candidate * 2 state
Gawin natin 5000/6 = 834 Relevant Sentences required set natin as minimum per Combination

Trump  - Arizona      = 834 Relevant Sentences
Harris - Arizona      = 834 Relevant Sentences
Trump  - Michigan     = 834 Relevant Sentences
Harris - Michigan     = 834 Relevant Sentences
Trump  - Pennsylvania = 834 Relevant Sentences
Harris - Pennsylvania = 834 Relevant Sentences
                     --------------------------
                      ~5000 Relevant Sentences
"""
def print_statistics():
    """
    Count the saved relevant sentences per (Presidential_Candidate, State) pair.

    Despite the name, nothing is printed: the result is returned so that the
    notebook can display it.

    Returns:
        A pandas Styler (index hidden) with the columns "Presidential_Candidate",
        "State" and "count", or the string "No Relevant Sentences" when the CSV
        is missing, has no parsable content, or lacks the grouping columns.
    """
    try:
        relevant_sentences = pd.read_csv(relevant_transcript_sentences_filename, encoding_errors="ignore")
        counts = (
            relevant_sentences
            .groupby(["Presidential_Candidate", "State"])
            .size()
            .reset_index(name="count")
        )
    except (FileNotFoundError, pd.errors.EmptyDataError, KeyError):
        # Only "there is no data" conditions fall back to the message; any other error should surface
        return "No Relevant Sentences"

    return counts.style.hide(axis="index")
        
# Last expression of the cell: the returned table (or the fallback message) becomes the cell output
print_statistics()

Presidential_Candidate,State,count
Donald Trump,Arizona,19
Donald Trump,Michigan,17
Donald Trump,Pennsylvania,1098
Kamala Harris,Arizona,1
Kamala Harris,Michigan,16
Kamala Harris,Pennsylvania,665
