# Import Dependencies

In [76]:
# Import Main Dependencies
import os, re, nltk
import pandas as pd
from pandas import DataFrame
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from transformers import BertTokenizer, BertForNextSentencePrediction
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Import Other Dependencies
import torch
from tqdm.auto import tqdm

# Additional Downloads
nltk.download("punkt_tab", quiet=True)
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
nltk.download("stopwords", quiet=True)

True

# Define Utilities

In [77]:
def sanitize_filename(filename: str) -> str:
    # Escape Double Quotes
    filename = filename.replace('"', '\\"')

    # Replace Invalid Characters with "_"
    invalid_chars = re.compile(r'[<>:"/\\|?*]')
    sanitized_filename = invalid_chars.sub("_", filename)

    return sanitized_filename
    
def read_unique_items_from_file(file: str) -> list:
    with open(file, "r") as f:
        return list(set(url.strip() for url in f.readlines() if url.strip()))

# Set Configurations

In [93]:
# File Names
transcript_sentences_filename = "transcript_paragraphs.csv"
related_transcript_sentences_filename = "related_transcript_sentences.csv"

# Folder Names
transcription_output_path = "Transcription"
cities_path = "State Cities"

# Boolean Flags
remove_video = True
remove_audio = True

# Numeric Constants 
max_consecutive_words_for_topic = 2 # e.g. Unigram: "Donald" | Bigram: "Donald Trump" | Trigram: "President Donald Trump"
minimum_number_of_word_in_related_sentence = 5

# Sentence Categories
presidential_candidates = {
    "Donald Trump": [
        "Donald", "Trump"
    ],
    "Kamala Harris": [
        "Kamala", "Harris"
    ]
}
state_cities = {
    "Michigan": read_unique_items_from_file(os.path.join(cities_path, "michigan-cities.txt")),
    "Arizona": read_unique_items_from_file(os.path.join(cities_path, "arizona-cities.txt")),
    "Pennsylvania": read_unique_items_from_file(os.path.join(cities_path, "pennsylvania-cities.txt"))
}

# Words for Sentence Filtering
stop_words = set(stopwords.words("english"))
generic_abstract_nouns = {
    "thing", "stuff", "event",
    "aspect", "issue", "place",
    "person"
}

# Additional Preprocessing of Configurations
presidential_candidates = {presidential_candidate: list(set(names)) for presidential_candidate, names in presidential_candidates.items()}
presidential_candidates_and_states_combinations = [
    f"{pattern}_{loc}".lower() for name, parts in presidential_candidates.items() 
    for loc in [state for state in state_cities] + [city for cities in state_cities.values() for city in cities]
    for pattern in [name, '_'.join(parts)] + parts
]

# Sentence Extraction (Transcripts to CSV)

In [None]:
def process_transcripts_into_csv_of_paragraphs() -> DataFrame:
    # Initialize BERT models
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    model.eval()
    
    # Initialize List of Paragraphs
    list_of_paragraphs = []
    
    def para_tokenize(sentences: list, current: str, threshold: float = 0.5) -> list:        
        # Handle empty input
        if not sentences: return []
        
        # Initialize Paragraphs w/ First Sentence
        paragraphs = [[sentences[0]]]
        
        # Add Next Sentence if its Related to Current Sentence
        last_sentences_idx = len(sentences) - 1
        with tqdm(total=last_sentences_idx - 1, desc=f'Identifying Paragraphs [{current} Transcript]') as pbar:
            for i in range(1, last_sentences_idx):
                current_sentence = sentences[i]
                last_sentence = paragraphs[-1][-1]
                
                # Encode Next and Current Sentence
                encoding = tokenizer(
                    last_sentence,
                    current_sentence, 
                    return_tensors="pt", 
                    padding=True, 
                    truncation=True
                )
                
                # Check Relation Between Next and Current Sentences
                with torch.no_grad():
                    outputs = model(**encoding)
                    probs = torch.softmax(outputs.logits, dim=1)
                    is_related = bool(probs[0][0] > threshold)
                    
                # Add Next Sentence w/ Current Paragraph List or Create New Paragraph List
                if is_related:
                    paragraphs[-1].append(current_sentence)
                else:
                    paragraphs.append([current_sentence])
                pbar.update(1)

            # Return List of Each Paragraph List as Paragraph Text
            return [" ".join(para) for para in paragraphs]

    # Collect List of Paragraphs from Transcription Files
    transcription_files = os.listdir(transcription_output_path)
    total_transcription_file = len(transcription_files)
    with tqdm(total=total_transcription_file, desc="Segmenting Transcripts") as pbar:
        for index, filename in enumerate(transcription_files):
            current = f'{index+1}/{total_transcription_file}'
            if filename == ".ipynb_checkpoints":
                pbar.update(1)
                continue
            
            pbar.set_description(f'Splitting Transcripts [{current} Transcript] ')

            # Open Transcription File
            file_path = os.path.join(transcription_output_path, filename)
            with open(file_path, "r") as file:
                text = file.read()
                
                # Split into Sentences
                sentences = list(set(sent_tokenize(text)))
    
                # Segment into Paragraphs
                list_of_paragraphs.extend(para_tokenize(sentences, current))
            
            pbar.update(1)

    # Save List of All Paragraphs into CSV file
    df = pd.DataFrame(list(set(list_of_paragraphs)), columns=["Paragraph"])
    df.to_csv(transcript_sentences_filename, index=False)
    return df

list_of_paragraphs = process_transcripts_into_csv_of_paragraphs()
print(f'Number of Paragraph: {len(list_of_paragraphs)}')
list_of_paragraphs.head()

Segmenting Transcripts:   0%|          | 0/269 [00:00<?, ?it/s]

Identifying Paragraphs [2/269 Transcript]:   0%|          | 0/75 [00:00<?, ?it/s]

Identifying Paragraphs [3/269 Transcript]:   0%|          | 0/95 [00:00<?, ?it/s]

Identifying Paragraphs [4/269 Transcript]:   0%|          | 0/91 [00:00<?, ?it/s]

Identifying Paragraphs [5/269 Transcript]:   0%|          | 0/133 [00:00<?, ?it/s]

Identifying Paragraphs [6/269 Transcript]:   0%|          | 0/99 [00:00<?, ?it/s]

Identifying Paragraphs [7/269 Transcript]:   0%|          | 0/86 [00:00<?, ?it/s]

Identifying Paragraphs [8/269 Transcript]:   0%|          | 0/56 [00:00<?, ?it/s]

Identifying Paragraphs [9/269 Transcript]:   0%|          | 0/40 [00:00<?, ?it/s]

Identifying Paragraphs [10/269 Transcript]:   0%|          | 0/40 [00:00<?, ?it/s]

Identifying Paragraphs [11/269 Transcript]:   0%|          | 0/227 [00:00<?, ?it/s]

Identifying Paragraphs [12/269 Transcript]:   0%|          | 0/42 [00:00<?, ?it/s]

Identifying Paragraphs [13/269 Transcript]:   0%|          | 0/87 [00:00<?, ?it/s]

Identifying Paragraphs [14/269 Transcript]:   0%|          | 0/237 [00:00<?, ?it/s]

Identifying Paragraphs [15/269 Transcript]:   0%|          | 0/14 [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Identifying Paragraphs [16/269 Transcript]:   0%|          | 0/90 [00:00<?, ?it/s]

Identifying Paragraphs [17/269 Transcript]:   0%|          | 0/46 [00:00<?, ?it/s]

Identifying Paragraphs [18/269 Transcript]:   0%|          | 0/54 [00:00<?, ?it/s]

Identifying Paragraphs [19/269 Transcript]:   0%|          | 0/34 [00:00<?, ?it/s]

Identifying Paragraphs [20/269 Transcript]:   0%|          | 0/95 [00:00<?, ?it/s]

Identifying Paragraphs [21/269 Transcript]:   0%|          | 0/42 [00:00<?, ?it/s]

Identifying Paragraphs [22/269 Transcript]:   0%|          | 0/90 [00:00<?, ?it/s]

Identifying Paragraphs [23/269 Transcript]:   0%|          | 0/80 [00:00<?, ?it/s]

Identifying Paragraphs [24/269 Transcript]:   0%|          | 0/145 [00:00<?, ?it/s]

Identifying Paragraphs [25/269 Transcript]:   0%|          | 0/57 [00:00<?, ?it/s]

Identifying Paragraphs [26/269 Transcript]:   0%|          | 0/197 [00:00<?, ?it/s]

Identifying Paragraphs [27/269 Transcript]:   0%|          | 0/89 [00:00<?, ?it/s]

Identifying Paragraphs [28/269 Transcript]:   0%|          | 0/57 [00:00<?, ?it/s]

Identifying Paragraphs [29/269 Transcript]:   0%|          | 0/26 [00:00<?, ?it/s]

Identifying Paragraphs [30/269 Transcript]:   0%|          | 0/39 [00:00<?, ?it/s]

Identifying Paragraphs [31/269 Transcript]:   0%|          | 0/189 [00:00<?, ?it/s]

Identifying Paragraphs [32/269 Transcript]:   0%|          | 0/81 [00:00<?, ?it/s]

Identifying Paragraphs [33/269 Transcript]:   0%|          | 0/84 [00:00<?, ?it/s]

# BERTopic: Relevant Sentence Filtering (CSV)

In [95]:
def get_related_sentences_from_related_paragraphs() -> tuple[DataFrame, BERTopic]:
    # Get All Collected Paragraphs from Transcript
    df = pd.read_csv(transcript_sentences_filename)
    paragraphs = df["Paragraph"].tolist()
    
    # Set Filter for Words as Possible Topics
    def filter_possible_topics(text: str) -> list:
        """
        Filter Words If its a Possible Topic:
            1) Only Nouns and Proper Nouns (e.g. Dollars, Currency)
            2) No Stop Words (e.g. in, to)
            3) No Generic Abstract Nouns (e.g. thing, stuff)
            4) Minumum of Three Letter Words (e.g. USA)
            5) Exclude Numbers
        """
        
        pos_tags = pos_tag(word_tokenize(text)) # POS Tagging
        possible_topics = [
            token.lower() for token, pos in pos_tags
            if pos in ["NN", "NNS", "NNP", "NNPS"] # Nouns / Proper Nouns
            and token.lower() not in stop_words # Exclude Stop Words
            and token.lower() not in generic_abstract_nouns # Exclude Generic Abstract Nouns
            and len(token) > 2 # Exclude One/Two Letter Words
            and not token.isnumeric() # Exclude Numbers
        ]
        
        return possible_topics
    vectorizer_model = CountVectorizer(
        ngram_range=(1, max_consecutive_words_for_topic),
        tokenizer=filter_possible_topics
    )

    # Train BERTopic model
    topic_model = BERTopic(
        embedding_model="all-MiniLM-L6-v2",
        n_gram_range=(1, max_consecutive_words_for_topic),
        vectorizer_model=vectorizer_model,
        zeroshot_topic_list=presidential_candidates_and_states_combinations,
        verbose=True
    )
    topics, probs = topic_model.fit_transform(paragraphs)
    
    # Get BERTopic Results
    topic_info = topic_model.get_topic_info()
    topics_and_documents = pd.DataFrame({"Topic": topics, "Representative_Docs": paragraphs})
    
    # Initialize Lists for Related Sentences
    list_of_related_sentences = []

    def is_sentence_complete(sentence: str) -> bool:
        """
        Its a Complete Sentence If:
            1) It has Atleast 1 Noun or Pronoun
            2) It has Atleast 1 Verb
            3) Minimum of 5 or N Words
        """
        word_tokens = word_tokenize(sentence) # Tokenize Sentence into Words

        if len(word_tokens) < minimum_number_of_word_in_related_sentence: return False # Exclude Sentence with Less than 5 or N Words

        pos_tags = pos_tag(word_tokens) # POS Tagging
        has_subject = any(tag in ["NN", "NNS", "NNP", "NNPS"] for _, tag in pos_tags) # Exclude Sentence w/out Noun and Pronoun
        has_verb = any(tag.startswith("VB") for _, tag in pos_tags) # Exclude Sentence w/out Verb*
    
        return has_subject and has_verb
        
    # Get Related Sentences from Related Paragraphs
    for _, row in topic_info.iterrows():
        topic = row["Topic"]
        if topic == -1: continue # Skip Outlier

        # Get List of Topics and its Corresponding Paragraphs
        topic_keywords = row["Representation"]
        related_sentences = topics_and_documents[topics_and_documents["Topic"] == topic]["Representative_Docs"].tolist()
        
        # Check Candidate Mentions in Topics
        presidential_candidate_mentions = set() # Avoid Duplicates
        for presidential_candidate, names in presidential_candidates.items():
            if (
                any(name.lower() in keyword.lower() for name in names for keyword in topic_keywords) 
                or any(presidential_candidate.lower() in keyword.lower() for keyword in topic_keywords)
            ): 
                presidential_candidate_mentions.add(presidential_candidate)
        
        # Make Sure Only 1 Candidate is Mentioned
        if len(presidential_candidate_mentions) != 1: continue

        # Check State Mentions in Topics (Including Cities)
        state_mentions = set() # Avoid Duplicates
        for state, cities in state_cities.items():
            if (
                any(city.lower() in keyword.lower() for city in cities for keyword in topic_keywords) 
                or any(state.lower() in keyword.lower() for keyword in topic_keywords)
            ): 
                state_mentions.add(state)

        # Make Sure Only 1 State is Mentioned
        if len(state_mentions) != 1: continue
        """
        Add Related Sentences Only If:
            1) Only 1 Candidate is Mentioned
            2) Only 1 State is Mentioned
        """
        if (
            len(presidential_candidate_mentions) == 1
            and len(state_mentions) == 1
        ):
            presidential_candidate = presidential_candidate_mentions.pop()
            state = state_mentions.pop()

            # Get and Filter Sentences in Related Paragraphs
            related_sentences = [
                sentence for paragraph in related_sentences
                for sentence in sent_tokenize(paragraph)
                if is_sentence_complete(sentence)
            ]
            
            # Add All Related Sentences with their Corresponding Presidential Candidate, State, and Topic Keywords
            for sentence in related_sentences:
                list_of_related_sentences.append({
                    "Sentence": sentence,
                    "Presidential_Candidate": presidential_candidate,
                    "State": state,
                    "Topic_Keywords": topic_keywords
                })
    
    # Save List of All Related Sentences into CSV file
    df = pd.DataFrame(list_of_related_sentences)
    df.to_csv(related_transcript_sentences_filename, index=False)
    return df, topic_model

list_of_related_sentences, bertopic_model = get_related_sentences_from_related_paragraphs()
print(f'Number of Related Sentences: {len(list_of_related_sentences)}')
list_of_related_sentences.head()

2024-10-21 06:55:25,049 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/211 [00:00<?, ?it/s]

2024-10-21 06:56:40,390 - BERTopic - Embedding - Completed ✓
2024-10-21 06:56:40,390 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-21 06:56:52,225 - BERTopic - Dimensionality - Completed ✓
2024-10-21 06:56:52,233 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2024-10-21 06:57:00,142 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-10-21 06:57:21,582 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-21 06:57:21,954 - BERTopic - Cluster - Completed ✓
2024-10-21 06:57:21,954 - BERTopic - Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...
2024-10-21 06:57:21,979 - BERTopic - Zeroshot Step 2 - Completed ✓
2024-10-21 06:57:21,979 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-10-21 06:57:36,277 - BERTopic - Representation - Completed ✓


Number of Related Sentences: 1304


Unnamed: 0,Sentence,Presidential_Candidate,State,Topic_Keywords
0,Donald Trump is winning Pennsylvania.,Donald Trump,Pennsylvania,"[trump pennsylvania, donald trump, donald, pen..."
1,Trump is taking Pennsylvania.,Donald Trump,Pennsylvania,"[trump pennsylvania, pennsylvania, trump, , , ..."
2,But they would say we're going to stop Magga.,Donald Trump,Pennsylvania,"[magga, maga, magga maga, donald trump, butler..."
3,"As you can see, I'm not just MAGA, I'm Dark MAGA.",Donald Trump,Pennsylvania,"[magga, maga, magga maga, donald trump, butler..."
4,So months after that assassination bid on form...,Donald Trump,Pennsylvania,"[magga, maga, magga maga, donald trump, butler..."


In [96]:
bertopic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3634,-1_trump_pennsylvania_people_state,"[trump, pennsylvania, people, state, president...","[Let's look at hurricane relief, which has bee..."
1,0,1,kamala harris_nogales,"[kamala harris, kamala, harris, , , , , , , ]",[Kamala Harris will take you all. First one.]
2,1,1,kamala_nogales,"[kamala, , , , , , , , , ]",[Kamala.]
3,2,1,kamala harris_york,"[answer harris, answer, harris, , , , , , , ]","[So yes, is the answer. Kamala Harris.]"
4,3,1,kamala harris_ypsilanti,"[people harris, people, harris, , , , , , , ]",[Do people really believe that Kamala Harris of.]
...,...,...,...,...,...
89,88,99,88_poll_points_trump poll_vote,"[poll, points, trump poll, vote, polls, vote t...",[Let's find out together. Pennsylvania has a v...
90,89,144,89_county_pennsylvania_counties_state,"[county, pennsylvania, counties, state, philad...",[We're going to actually look at voter registr...
91,90,11,90_pennsylvania winner_favorite trump_firewall...,"[pennsylvania winner, favorite trump, firewall...","[Cumberland, Donald Trump won that by 18 and 1..."
92,91,55,91_biden_haley_county_primary,"[biden, haley, county, primary, pennsylvania, ...","[So that's not, though, where the whole story ..."


In [103]:
"""
Sa tingin ko need natin 5k sentences minimum for Related di lang for gathered.
Kasi mamaya 5k Unrelated Sentences nakuha natin tas 100 lang dun Related with candidate & state.

Ang naiisip ko since 6 Combination = 3 candidate * 2 state
Gawin natin 5000/6 = 834 Related Sentences required set natin as minimum per Combination

Trump  - Arizona      = 834 Related Sentences
Harris - Arizona      = 834 Related Sentences
Trump  - Michigan     = 834 Related Sentences
Harris - Michigan     = 834 Related Sentences
Trump  - Pennsylvania = 834 Related Sentences
Harris - Pennsylvania = 834 Related Sentences
                     --------------------------
                      ~5000 Related Sentences
"""
def print_statistics():
    try:
        return (
            pd
            .read_csv(related_transcript_sentences_filename)
            .groupby(["Presidential_Candidate", "State"])
            .size()
            .reset_index(name="count")
            .style.hide(axis="index")
        )
    except: return "No Related Sentences"
        
print_statistics()

Presidential_Candidate,State,count
Donald Trump,Michigan,71
Donald Trump,Pennsylvania,409
Kamala Harris,Pennsylvania,824
