# Import Dependencies

In [1]:
# Import Main Dependencies
import os, re, nltk
import pandas as pd
from pandas import DataFrame
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Import Other Dependencies
from tqdm.auto import tqdm

# Additional Downloads
nltk.download("punkt_tab", quiet=True)
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
nltk.download("stopwords", quiet=True)




True

# Define Utilities

In [2]:
def sanitize_filename(filename: str) -> str:
    # Escape Double Quotes
    filename = filename.replace('"', '\\"')

    # Replace Invalid Characters with "_"
    invalid_chars = re.compile(r'[<>:"/\\|?*]')
    sanitized_filename = invalid_chars.sub("_", filename)

    return sanitized_filename
    
def read_unique_items_from_file(file: str) -> list:
    with open(file, "r") as f:
        return list(set(url.strip() for url in f.readlines() if url.strip()))

# Set Configurations

In [8]:
# File Names
transcript_sentences_filename = "transcript_sentences.csv"
related_transcript_sentences_filename = "related_transcript_sentences.csv"

# Folder Names
transcription_output_path = "Transcription"
cities_path = "State Cities"

# Boolean Flags
remove_video = True
remove_audio = True

# Numeric Constants 
max_consecutive_words_for_topic = 2 # e.g. Unigram: "Donald" | Bigram: "Donald Trump" | Trigram: "President Donald Trump"

# Sentence Categories
presidential_candidates = {
    "Donald Trump": [
        "Donald", "Trump"
    ],
    "Kamala Harris": [
        "Kamala", "Harris"
    ]
}
state_cities = {
    "Michigan": read_unique_items_from_file(os.path.join(cities_path, "michigan-cities.txt")),
    "Arizona": read_unique_items_from_file(os.path.join(cities_path, "arizona-cities.txt")),
    "Pennsylvania": read_unique_items_from_file(os.path.join(cities_path, "pennsylvania-cities.txt"))
}

# Words for Sentence Filtering
stop_words = set(stopwords.words("english"))
generic_abstract_nouns = {
    "thing", "stuff", "event",
    "aspect", "issue", "place",
    "person"
}

# Additional Preprocessing of Configurations
presidential_candidates = {presidential_candidate: list(set(names)) for presidential_candidate, names in presidential_candidates.items()}
presidential_candidates_and_states_combinations = [
    f"{pattern}_{loc}".lower() for name, parts in presidential_candidates.items() 
    for loc in [state for state in state_cities] + [city for cities in state_cities.values() for city in cities]
    for pattern in [name, '_'.join(parts), parts[0], parts[1]]
]

# Sentence Extraction (Transcripts to CSV)

In [10]:
def process_transcripts_into_csv_of_sentences() -> DataFrame:
    # Initialize List of Sentences
    list_of_sentences = []
    
    def is_sentence_complete(sentence: str) -> bool:
        """
        Its a Proper Sentence If:
            1) It has Atleast 1 Noun or Pronoun
            2) It has Atleast 1 Verb
        """
        pos_tags = pos_tag(word_tokenize(sentence)) # POS Tagging
        has_subject = any(tag in ["NN", "NNS", "NNP", "NNPS"] for _, tag in pos_tags) # Exclude Sentence w/out Noun and Pronoun
        has_verb = any(tag.startswith("VB") for _, tag in pos_tags) # Exclude Sentence w/out Verb
    
        return has_subject and has_verb
    
    # Collect List of All Sentences from Transcripts
    transcription_files = os.listdir(transcription_output_path)
    with tqdm(total=len(transcription_files), desc="Collecting Sentences from Transcripts") as pbar:
        for filename in transcription_files:
            if filename == ".ipynb_checkpoints":
                pbar.update(1)
                continue
                
            file_path = os.path.join(transcription_output_path, filename)
            
            with open(file_path, "r") as file:
                text = file.read()
                
                # Split into Sentences
                sentences = list(set(sent_tokenize(text)))
    
                # Filter Proper Sentences (With Noun/Proper-Noun and Verb)
                sentences = [sentence for sentence in sentences if is_sentence_complete(sentence)]
                
                list_of_sentences.extend(sentences)
                
            pbar.update(1)

    # Save List of All Sentences into CSV file
    df = pd.DataFrame(list(set(list_of_sentences)), columns=["Sentence"])
    df.to_csv(transcript_sentences_filename, index=False)
    return df

list_of_sentences = process_transcripts_into_csv_of_sentences()
print(f'Number of Sentences: {len(list_of_sentences)}')
list_of_sentences.head()

Collecting Sentences from Transcripts:   0%|          | 0/269 [00:00<?, ?it/s]

Number of Sentences: 20478


Unnamed: 0,Sentence
0,And the people of Pennsylvania were better off...
1,"She doesn't even live in the district, but tha..."
2,"Well, the polls are going back and forth like..."
3,"Okay, I'm not going to go forever on this, but..."
4,Don't try to sanitize it and color it up for t...


# BERTopic: Relevant Sentence Filtering (CSV)

In [11]:
def filter_related_sentences() -> tuple[DataFrame, BERTopic]:
    # Get All Sentences from Transcript
    df = pd.read_csv(transcript_sentences_filename)
    sentences = df["Sentence"].tolist()
    
    # Set Filter for Words as Possible Topics
    def filter_possible_topics(text: str) -> list:
        """
        Filter Words If its a Possible Topic:
            1) Only Nouns and Proper Nouns (e.g. Dollars, Currency)
            2) No Stop Words (e.g. in, to)
            3) No Generic Abstract Nouns (e.g. thing, stuff)
            4) Minumum of Three Letter Words (e.g. USA)
            5) Exclude Numbers
        """
        
        pos_tags = pos_tag(word_tokenize(text)) # POS Tagging
        possible_topics = [
            token.lower() for token, pos in pos_tags
            if pos in ["NN", "NNS", "NNP", "NNPS"] # Nouns / Proper Nouns
            and token.lower() not in stop_words # Exclude Stop Words
            and token.lower() not in generic_abstract_nouns # Exclude Generic Abstract Nouns
            and len(token) > 2 # Exclude One/Two Letter Words
            and not token.isnumeric() # Exclude Numbers
        ]
        
        return possible_topics
    vectorizer_model = CountVectorizer(
        ngram_range=(1, max_consecutive_words_for_topic),
        tokenizer=filter_possible_topics
    )

    # Train BERTopic model
    topic_model = BERTopic(
        embedding_model="all-MiniLM-L6-v2",
        n_gram_range=(1, max_consecutive_words_for_topic),
        vectorizer_model=vectorizer_model,
        zeroshot_topic_list=presidential_candidates_and_states_combinations,
        verbose=True
    )
    topics, probs = topic_model.fit_transform(sentences)
    
    # Get BERTopic Results
    topic_info = topic_model.get_topic_info()
    topics_and_documents = pd.DataFrame({"Topic": topics, "Representative_Docs": sentences})
    
    # Initialize Lists for our filtered results
    list_of_related_sentences = []

    # Analyze each topic row in topic_info
    for _, row in topic_info.iterrows():
        topic = row["Topic"]
        if topic == -1: continue # Skip Outlier

        # Get List of Topics and its Related Sentences
        topic_keywords = row["Representation"]
        related_sentences = topics_and_documents[topics_and_documents["Topic"] == topic]["Representative_Docs"].tolist()
        
        # Check Candidate Mentions in Topics
        presidential_candidate_mentions = set() # Avoid Duplicates
        for presidential_candidate, names in presidential_candidates.items():
            if (
                any(name.lower() in keyword.lower() for name in names for keyword in topic_keywords) 
                or any(presidential_candidate.lower() in keyword.lower() for keyword in topic_keywords)
            ): 
                presidential_candidate_mentions.add(presidential_candidate)
        
        # Make Sure Only 1 Candidate is Mentioned
        if len(presidential_candidate_mentions) != 1: continue

        # Check State Mentions in Topics (Including Cities)
        state_mentions = set() # Avoid Duplicates
        for state, cities in state_cities.items():
            if (
                any(city.lower() in keyword.lower() for city in cities for keyword in topic_keywords) 
                or any(state.lower() in keyword.lower() for keyword in topic_keywords)
            ): 
                state_mentions.add(state)

        # # Make Sure Only 1 State is Mentioned
        if len(state_mentions) != 1: continue
        """
        Add Related Sentences Only If:
            1) Only 1 Candidate is Mentioned
            2) Only 1 State is Mentioned
        """
        if (
            len(presidential_candidate_mentions) == 1
            and len(state_mentions) == 1
        ):
            presidential_candidate = presidential_candidate_mentions.pop()
            state = state_mentions.pop()

            # Add All Related Sentences with Corresponding Presidential Candidate, State, and Topic Keywords
            for sentence in related_sentences:
                list_of_related_sentences.append({
                    "Sentence": sentence,
                    "Presidential_Candidate": presidential_candidate,
                    "State": state,
                    "Topic_Keywords": topic_keywords
                })
    
    # Save List of All Related Sentences into CSV file
    df = pd.DataFrame(list_of_related_sentences)
    df.to_csv(related_transcript_sentences_filename, index=False)
    return df, topic_model

list_of_related_sentences, bertopic_model = filter_related_sentences()
print(f'Number of Related Sentences: {len(list_of_related_sentences)}')
print(f'Number of Topic Clusters: {len(bertopic_model.get_topic_info())}')
bertopic_model.get_topic_info()

2024-10-21 02:38:38,045 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/640 [00:00<?, ?it/s]

2024-10-21 02:39:42,103 - BERTopic - Embedding - Completed ✓
2024-10-21 02:39:42,103 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-21 02:40:14,139 - BERTopic - Dimensionality - Completed ✓
2024-10-21 02:40:14,169 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2024-10-21 02:40:19,813 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-10-21 02:40:37,479 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-21 02:40:41,905 - BERTopic - Cluster - Completed ✓
2024-10-21 02:40:41,905 - BERTopic - Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...
2024-10-21 02:40:41,975 - BERTopic - Zeroshot Step 2 - Completed ✓
2024-10-21 02:40:41,979 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-21 02:40:53,101 - BERTopic - Representation - Completed ✓


Number of Related Sentences: 591
Number of Topic Clusters: 269


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,9486,-1_trump_state_harris_states,"[trump, state, harris, states, election, penns...","[You just need one more state and you've won, ..."
1,0,1,trump_michigan,"[trump way, president trump, way, president, t...",[President Trump is on his way to Michigan sho...
2,1,4,kamala harris_washington,"[life children, trump politicians, children li...","[Now you have Kamala Harris versus Trump., Pol..."
3,2,1,kamala harris_ypsilanti,"[people harris, people, harris, , , , , , , ]",[Do people really believe that Kamala Harris of.]
4,3,1,donald trump_pennsylvania,"[lead pennsylvania, trump lead, lead, donald t...",[Donald Trump is in the lead in Pennsylvania.]
...,...,...,...,...,...
264,263,41,263_question question_question_question answer...,"[question question, question, question answer,...","[And just repeat my question., I have answered..."
265,264,33,264_reason_reasons_hell opposite_ingraham cases,"[reason, reasons, hell opposite, ingraham case...","[There's a reason for that., Give me one reaso..."
266,265,27,265_problem problem_problem_problem problems_p...,"[problem problem, problem, problem problems, p...","[That's the problem., So it is a problem., You..."
267,266,17,266_case_case case_point case_use course,"[case, case case, point case, use course, long...","[That's always the case., That's no longer the..."


In [12]:
list_of_related_sentences.head()

Unnamed: 0,Sentence,Presidential_Candidate,State,Topic_Keywords
0,Donald Trump is in the lead in Pennsylvania.,Donald Trump,Pennsylvania,"[lead pennsylvania, trump lead, lead, donald t..."
1,Donald Trump is winning Pennsylvania.,Donald Trump,Pennsylvania,"[trump work, leads pennsylvania, trump leads, ..."
2,But now Donald Trump leads in Pennsylvania.,Donald Trump,Pennsylvania,"[trump work, leads pennsylvania, trump leads, ..."
3,Donald Trump putting in the work also in Penns...,Donald Trump,Pennsylvania,"[trump work, leads pennsylvania, trump leads, ..."
4,Trump is taking Pennsylvania.,Donald Trump,Pennsylvania,"[pennsylvania significance, part pennsylvania,..."


In [13]:
"""
Sa tingin ko need natin 5k sentences minimum for Related di lang for gathered.
Kasi mamaya 5k Unrelated Sentences nakuha natin tas 100 lang dun Related with candidate & state.

Ang naiisip ko since 6 Combination = 3 candidate * 2 state
Gawin natin 5000/6 = 834 Related Sentences required set natin as minimum per Combination

Trump  - Arizona      = 834 Related Sentences
Harris - Arizona      = 834 Related Sentences
Trump  - Michigan     = 834 Related Sentences
Harris - Michigan     = 834 Related Sentences
Trump  - Pennsylvania = 834 Related Sentences
Harris - Pennsylvania = 834 Related Sentences
                     --------------------------
                      ~5000 Related Sentences
"""

(
    pd
    .read_csv(related_transcript_sentences_filename)
    .groupby(["Presidential_Candidate", "State"])
    .size()
    .reset_index(name="count")
    .style.hide(axis="index")
) if os.path.exists(related_transcript_sentences_filename) else "No Related Sentences"

Presidential_Candidate,State,count
Donald Trump,Arizona,42
Donald Trump,Pennsylvania,457
Kamala Harris,Pennsylvania,92
