# Library import

In [11]:
import re
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer
import numpy as np

# Data Preprocessing

In [4]:
df = pd.read_csv('../../documents_data.csv')

In [5]:
df['Date'] = pd.to_datetime(df['Date'].str.split().str[0], format='%Y-%m-%d')
df = df[df['Content'].notna()]

In [6]:
df.sort_values(by='Date', inplace=True)

# Generate embeddings for phrases

In [7]:
miu_phrases = [
    # Core "Made in USA" and Manufacturing Phrases
    r"\bmade in usa\b", r"\bmade in america\b", r"\bmade in u\.s\.\b", r"\bmade in us\b",
    r"\bamerican made\b", r"\busa made\b", r"\bu\.s\. made\b", r"\bus made\b",
    r"\busa produced\b", r"\bamerica produced\b", r"\bamerican produced\b", r"\bus produced\b", r"\bu\.s\. produced\b",
    r"\busa manufactured\b", r"\bamerica manufactured\b", r"\bamerican manufactured\b", r"\bus manufactured\b", r"\bu\.s\. manufactured\b",
    r"\bamerican worker\w*\b", r"\bamerican job\w*\b",
    r"\bveteran\w* owned\b", r"\bveteran\w* founded\b", r"\bfounded by veteran\w*\b",
    r"\bhandcrafted in america\b", r"\bhandcrafted in usa\b", r"\bhandcrafted in u\.s\.\b", r"\bhandcrafted in us\b",
    r"\bcrafted in america\b", r"\bcrafted in usa\b", r"\bcrafted in u\.s\.\b", r"\bcrafted in us\b",
    
    # Patriotism/Nationalism Phrases
    r"\bbuy american\b", r"\bbuy usa\b", r"\bbuy america\b",  # Can suggest nationalism in economic decisions
    r"\bsupport america\b", r"\bsupport usa\b", r"\bsupport u\.s\.\b",
    r"\bpatriot\w*\b",  # Matches "patriot", "patriotic", etc. – could signify nationalism/patriotism
    r"\bnational pride\b", r"\bnational heritage\b", r"\bnational identity\b",  # Phrases that explicitly convey nationalism
    r"\bchoose american\b", r"\bchoose usa\b", r"\bchoose u\.s\.\b", r"\bchoose america\b",  # Nationalistic consumer choices
    r"\bamerica\w* heritage\b", r"\bamerica\w* pride\b", r"\bamerican tradition\b",  # Nationalism tied to American values
    r"\bicon of america\w*\b", r"\bicon of usa\b", r"\bicon of u\.s\.\b"  # American symbols tied to pride
]


In [8]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [20]:
# Generate embeddings for MIU phrases
miu_embeddings = model.encode(miu_phrases, convert_to_tensor=True)  # Use convert_to_tensor to leverage GPU tensors

In [21]:
def split_content(content):
    # Simple sentence splitting by punctuation (you can improve this based on your needs)
    return re.split(r'[.!?]', content)

# Optimized function to calculate similarity and track occurrences
def track_similar_phrases(content, miu_phrases, miu_embeddings, threshold=0.7):
    # Split content into sentences/chunks
    chunks = split_content(content)

    if not chunks:  # In case there's no chunk to process
        return {}, 0

    # Generate embeddings for content chunks in batches (GPU-optimized)
    chunk_embeddings = model.encode(chunks, convert_to_tensor=True)

    # Compute cosine similarities in batch (between all chunks and MIU phrases at once)
    similarities = cosine_similarity(chunk_embeddings.cpu().numpy(), miu_embeddings.cpu().numpy())  # Convert tensors back to numpy arrays for sklearn cosine similarity

    # Initialize match tracking
    match_count = 0
    phrase_matches = {}

    # Loop through all chunks and their similarities
    for i, sim_row in enumerate(similarities):  # Iterate over each chunk's similarity vector
        for j, similarity in enumerate(sim_row):  # Iterate over the similarity to each MIU phrase
            if similarity >= threshold:
                phrase = miu_phrases[j]
                match_count += 1
                phrase_matches[phrase] = phrase_matches.get(phrase, 0) + 1

    return phrase_matches, match_count

In [22]:
df_test = df.head(20)

# Apply the function to each row in the test DataFrame
df_test['Matched Phrases'], df_test['Total Occurrences'] = zip(*df_test['Content'].apply(
    lambda x: track_similar_phrases(x, miu_phrases, miu_embeddings)))

# Display the results for the first 1000 rows
print(df_test.head(10))  # Display the first 10 rows as a quick check

                                                   Title       Date  \
37278  Ambassador Bush's Informal Remarks to the New ... 1980-07-17   
37279  Statement by the Vice President About the Atte... 1981-03-30   
37280  Exchange Between the Vice President and Report... 1981-03-31   
37281  Remarks of the Vice President and Prime Minist... 1981-03-31   
37282  Remarks of the Vice President and Deputy Prime... 1981-04-02   
37283  Remarks of the Vice President on Presenting th... 1981-04-03   
37284  Remarks of the Vice President on Senate Passag... 1981-04-03   
37285  Remarks of the Vice President Concerning Law D... 1981-04-06   
37286  Remarks of the Vice President at the Annual Re... 1981-04-07   
37287  Remarks of the Vice President Announcing the W... 1985-07-19   

                                                 Content  \
37278  Just a few minutes before he appeared at the c...   
37279  Well, I have a very brief statement that I wou...   
37280  The Vice President. The medical

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['Matched Phrases'], df_test['Total Occurrences'] = zip(*df_test['Content'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['Matched Phrases'], df_test['Total Occurrences'] = zip(*df_test['Content'].apply(


In [23]:
df_test

Unnamed: 0,Title,Date,Content,Citation,President,Categories,Attributes,Location,Link,Matched Phrases,Total Occurrences
37278,Ambassador Bush's Informal Remarks to the New ...,1980-07-17,Just a few minutes before he appeared at the c...,"George Bush, Ambassador Bush's Informal Remark...",George Bush,"Vice Presidential Candidates, Elections and Tr...","Campaign Remarks & Rallies, , LocationMichigan",Michigan,https://www.presidency.ucsb.edu/documents/amba...,{},0
37279,Statement by the Vice President About the Atte...,1981-03-30,"Well, I have a very brief statement that I wou...","George Bush, Statement by the Vice President A...",George Bush,"Vice Presidential, Remarks by the Vice Preside...","Vice Presidents, Watch Video, LocationWashingt...","Washington, DC",https://www.presidency.ucsb.edu/documents/stat...,{},0
37280,Exchange Between the Vice President and Report...,1981-03-31,The Vice President. The medical reports were v...,"George Bush, Exchange Between the Vice Preside...",George Bush,"Vice Presidential, Remarks by the Vice Preside...","Vice Presidents, , LocationWashington, DC","Washington, DC",https://www.presidency.ucsb.edu/documents/exch...,{},0
37281,Remarks of the Vice President and Prime Minist...,1981-03-31,The Vice President. We've just had a delightfu...,"George Bush, Remarks of the Vice President and...",George Bush,"Vice Presidential, Remarks by the Vice Preside...","Vice Presidents, , LocationWashington, DC","Washington, DC",https://www.presidency.ucsb.edu/documents/rema...,{},0
37282,Remarks of the Vice President and Deputy Prime...,1981-04-02,"The Vice President. Well, let me say that we'v...","George Bush, Remarks of the Vice President and...",George Bush,"Vice Presidential, Remarks by the Vice Preside...","Vice Presidents, , LocationWashington, DC","Washington, DC",https://www.presidency.ucsb.edu/documents/rema...,{},0
37283,Remarks of the Vice President on Presenting th...,1981-04-03,I know I speak for everybody here that we wish...,"George Bush, Remarks of the Vice President on ...",George Bush,"Vice Presidential, Remarks by the Vice Preside...","Vice Presidents, , LocationWashington, DC","Washington, DC",https://www.presidency.ucsb.edu/documents/rema...,{},0
37284,Remarks of the Vice President on Senate Passag...,1981-04-03,"I just have a brief statement. First, an expre...","George Bush, Remarks of the Vice President on ...",George Bush,"Vice Presidential, Remarks by the Vice Preside...","Vice Presidents, , LocationWashington, DC","Washington, DC",https://www.presidency.ucsb.edu/documents/rema...,{},0
37285,Remarks of the Vice President Concerning Law D...,1981-04-06,Let me just read a brief statement.\nPresident...,"George Bush, Remarks of the Vice President Con...",George Bush,"Vice Presidential, Remarks by the Vice Preside...","Vice Presidents, , LocationWashington, DC","Washington, DC",https://www.presidency.ucsb.edu/documents/rema...,{},0
37286,Remarks of the Vice President at the Annual Re...,1981-04-07,"Thank you, Senator Packwood. And first let me ...","George Bush, Remarks of the Vice President at ...",George Bush,"Vice Presidential, Remarks by the Vice Preside...","Vice Presidents, , LocationWashington, DC","Washington, DC",https://www.presidency.ucsb.edu/documents/rema...,{},0
37287,Remarks of the Vice President Announcing the W...,1985-07-19,The Vice President. We're here today to announ...,"George Bush, Remarks of the Vice President Ann...",George Bush,"Vice Presidential, Remarks by the Vice Preside...","Vice Presidents, , LocationWashington, DC","Washington, DC",https://www.presidency.ucsb.edu/documents/rema...,{},0


In [24]:
df_test.iloc[11]['Content']

"I have a brief opening statement and then I'll ask Senator Quayle to make a comment or two and then I'll be glad to take questions and so will he.\nThe first thing I want to do is just say how horrible I feel about that tragedy in Pakistan this morning. As most of you know, the Government of Pakistan announced the death of President Zia. He was a friend of mine and Barbara's and been extraordinarily hospitable to us on more than one occasion.\nWhat it was was a transport plane of the Pakistan Air Force carrying him and our Ambassador, Arnie Raphel, and it exploded this morning, 7:30 A.M., our time, Eastern daylight time.\nA second American - not - may not confirm, also was lost in the incident.\nThe Government of Pakistan has formed an advisory council to oversee the transition, following President Zia's death. An Acting President has been appointed.\nPakistan and the United States have a very special relationship and the loss of General Zia is a great tragedy.\nOur Ambassador, Arnie 