In [56]:
import re
import requests
import string
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sympy as sp
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
#nltk.download('punkt')
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

### Methodology:

In [21]:
# Download the first article. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from being
# parsed as though it were the article.
response = requests.get(
    'https://thepreamble.com/p/trump-promised-the-biggest-deportation',
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Keep every <p> that is not nested inside an element carrying the
# "sideBySideWrap-vGXrwP" class.
p_tags_article1 = soup.find_all(
    lambda tag: tag.name == 'p' and not tag.find_parent(class_="sideBySideWrap-vGXrwP")
)
# Only the first 32 paragraphs are kept -- presumably everything after that is
# non-article page content; TODO confirm the cutoff if the page changes.
clean_text_article1 = [el.text.strip() for el in p_tags_article1[:32]]

# Split each paragraph into sentences; every sentence becomes one row below.
sentences_clean_text_article1 = [
    sentence
    for paragraph in clean_text_article1
    for sentence in sent_tokenize(paragraph)
]

trump1_df = pd.DataFrame(sentences_clean_text_article1, columns=['Sentence'])
trump1_df['Title'] = 'Substack post: Trump Promised the Biggest Deportation Operation in History. It’s Not Happening.'

In [22]:
# Download the second article. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from being
# parsed as though it were the article.
response = requests.get(
    'https://www.nbcnews.com/news/us-news/trump-administration-takes-aim-immigrant-students-rcna198346',
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Select the elements tagged with the "body-graf" class.
p_tags_article2 = soup.find_all(class_=['body-graf'])
clean_text_article2 = [el.text.strip() for el in p_tags_article2]

# Split each paragraph into sentences; every sentence becomes one row below.
sentences_clean_text_article2 = [
    sentence
    for paragraph in clean_text_article2
    for sentence in sent_tokenize(paragraph)
]

trump2_df = pd.DataFrame(sentences_clean_text_article2, columns=['Sentence'])
# NOTE(review): the label says "CNBC" but the URL above is nbcnews.com -- confirm the
# intended source name (left unchanged here in case later cells match on this string).
trump2_df['Title'] = 'CNBC Article: Rubio says State Department has revoked at least 300 student visas'
assert len(trump2_df) == len(sentences_clean_text_article2)

In [25]:
# Download the third article. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops an HTTP error page from being
# parsed as though it were the article.
response = requests.get(
    'https://austinkocher.substack.com/p/trump-declares-war-on-immigration',
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# New learning - how to exclude certain content from scrape:
# a predicate passed to find_all keeps only <p> tags with no "pullquote" ancestor.
p_tags_article3 = soup.find_all(
    lambda tag: tag.name == 'p' and not tag.find_parent(class_="pullquote")
)

# Only the first 36 paragraphs are kept -- presumably everything after that is
# non-article page content; TODO confirm the cutoff if the page changes.
clean_text_article3 = [el.text.strip() for el in p_tags_article3[:36]]

# Split each paragraph into sentences; every sentence becomes one row below.
sentences_clean_text_article3 = [
    sentence
    for paragraph in clean_text_article3
    for sentence in sent_tokenize(paragraph)
]

trump3_df = pd.DataFrame(sentences_clean_text_article3, columns=['Sentence'])
trump3_df['Title'] = 'Substack post: Trump declares war on immigration lawyers'

In [38]:
# Download the unrelated "control" article. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops an HTTP error page
# from being parsed as though it were the article.
response = requests.get(
    'https://escapely.com/st-patricks-day-team-building-activities/?srsltid=AfmBOoqYl-BulUCtAlAlJHOetMUythVillecnx4eMScnHTGg4VdPBPHf',
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

p_tags_article4 = soup.find_all('p')

# Only the first 104 paragraphs are kept -- presumably everything after that is
# non-article page content; TODO confirm the cutoff if the page changes.
clean_text_article4 = [el.text.strip() for el in p_tags_article4[:104]]

# Split each paragraph into sentences; every sentence becomes one row below.
sentences_clean_text_article4 = [
    sentence
    for paragraph in clean_text_article4
    for sentence in sent_tokenize(paragraph)
]

stpatricks_df = pd.DataFrame(sentences_clean_text_article4, columns=['Sentence'])
stpatricks_df['Title'] = '20 Best St. Patrick’s Day Team Building Activities for Work'

In [39]:
# Stack the per-article sentence frames into one corpus with a fresh 0..n-1 index,
# then check that no rows were lost or duplicated along the way.
article_frames = [trump1_df, trump2_df, trump3_df, stpatricks_df]
concat_df = pd.concat(article_frames, ignore_index=True)
assert len(concat_df) == sum(len(frame) for frame in article_frames)

### Include information about TfidfVectorizer/this matrix composition

- Explain the decision to keep the documents as sentences
- Discuss the bag_of_words 
- Share that the matrix rank is computed by counting the non-zero singular values (and explain why a singular value of zero means you are in the null space)


### Normalize using L2 norm to account for document length 


**Motivation:** Singular vectors of the encoding matrix don't tell us as *much* as we would like because the St. Patrick's Day article is twice as long as the others. So we repeat the process, this time normalizing at the TfidfVectorizer() stage, to see what happens.

[Stanford tf-idf notes, page 32](https://web.stanford.edu/class/cs276/19handouts/lecture6-tfidf-1per.pdf) state that after normalizing the documents we get comparable weights for long and short documents. This ensures that the St. Patrick's Day document won't weigh more than the three separate Trump articles.

In [48]:
# Build the sentence-by-term TF-IDF matrix: English stop words are removed and
# L2 normalisation is requested explicitly.
vectorizer = TfidfVectorizer(norm='l2', stop_words='english')
tf_idf_matrix = vectorizer.fit_transform(concat_df['Sentence'])

In [75]:
# Densify the sparse TF-IDF matrix for the two rank routines below. toarray() gives a
# plain ndarray, whereas todense() returns the legacy np.matrix type.
temp_tf_idf_matrix = tf_idf_matrix.toarray()
assert temp_tf_idf_matrix.shape == (346, 1983), (
    f"unexpected TF-IDF shape {temp_tf_idf_matrix.shape}; the scraped pages may have changed"
)
# Rank estimate to relative precision eps (SciPy documents this routine as randomized).
# NOTE(review): `import scipy` alone may not load the scipy.linalg.interpolative
# submodule -- confirm it is imported explicitly somewhere in the notebook.
rank = scipy.linalg.interpolative.estimate_rank(temp_tf_idf_matrix, eps=1e-12)
# NumPy's rank, computed with its default tolerance, for comparison.
nprank = np.linalg.matrix_rank(temp_tf_idf_matrix)

### Note the difference in scipy vs. numpy matrix rank

### Discuss math of low rank approximation and TruncatedSVD

- Eckart–Young theorem
- TruncatedSVD documentation (what is actually happening here?)
- Of course, the choice of k in the low-rank approximation changes the singular values and singular vectors -- why?
- **Potential SSA question:** Why does this say "Singular values are equal to the 2-norms of the n_components in lower dimensional space"?
- Answer the audience question of why we want to consider this in lower dimensional space.

In [82]:
# Choose 100 for n_components
# TruncatedSVD's default solver is randomized, so random_state is pinned to make the
# decomposition reproducible across kernel restarts.
hundred_svd = TruncatedSVD(n_components=100, random_state=42)
hundred_lsa = hundred_svd.fit_transform(tf_idf_matrix)

In [84]:
# Two-component decomposition used for the topic tables below.
# TruncatedSVD's default solver is randomized, so random_state is pinned to make the
# decomposition reproducible across kernel restarts.
two_svd = TruncatedSVD(n_components=2, random_state=42)
two_lsa = two_svd.fit_transform(tf_idf_matrix)

In [87]:
# Singular values of the two-component fit, for comparison with the 100-component fit.
two_svd.singular_values_

array([2.76273472, 2.29441927])

In [88]:
# First two singular values of the 100-component fit.
hundred_svd.singular_values_[:2]

array([2.76274018, 2.29455651])

In [89]:
# Two LSA coordinates per sentence, with the sentence text attached in the same
# expression via assign().
topic_columns = ['Topic 1', 'Topic 2']
topic_encoded_df = pd.DataFrame(two_lsa, columns=topic_columns).assign(
    Sentence=concat_df['Sentence']
)
display(topic_encoded_df)

Unnamed: 0,Topic 1,Topic 2,Sentence
0,0.003762,0.068112,Donald Trump may be a path-breaking politician...
1,0.011548,0.014963,He often promises to revive the trade agenda o...
2,0.012741,0.227706,"And, during his 2024 campaign, Trump repeatedl..."
3,0.005265,0.097703,Even Trump has acknowledged that last one is a...
4,0.006846,0.117584,"“A very moderate man,” Trump said last weekend..."
...,...,...,...
341,0.461283,-0.009576,These 20 St. Patrick’s Day team-building activ...
342,0.238240,-0.030523,From festive competitions to cultural explorat...
343,0.298890,0.033130,Celebrating St. Patrick’s Day with coworkers i...
344,0.239408,-0.002416,As we reflect on these team-building tips and ...


In [91]:
# Feature names from the fitted vectorizer; a later cell attaches them to
# encoding_matrix as its `terms` column.
dictionary = vectorizer.get_feature_names_out()
display(dictionary)

array(['000', '100', '11', ..., 'york', 'yunseo', 'zoe'], dtype=object)

In [95]:
# Term loadings of the two-component fit: components_ is transposed so that each
# topic becomes a column, and the vocabulary is attached as `terms`.
topic_labels = ['topic_1', 'topic_2']
encoding_matrix = pd.DataFrame(two_svd.components_.T, columns=topic_labels)
encoding_matrix = encoding_matrix.assign(terms=dictionary)
display(encoding_matrix)

Unnamed: 0,topic_1,topic_2,terms
0,0.001568,0.020963,000
1,0.000240,0.002632,100
2,0.003237,0.008371,11
3,0.004171,0.008075,15
4,0.000568,0.004653,166
...,...,...,...
1978,0.019904,0.014723,year
1979,0.007041,0.025091,years
1980,0.000488,0.009342,york
1981,0.000177,0.005039,yunseo


In [98]:
# Add absolute loadings so terms can be ranked by magnitude regardless of sign,
# then show the 25 terms with the largest |topic_1| loading.
for topic_col in ('topic_1', 'topic_2'):
    encoding_matrix[f'abs_{topic_col}'] = encoding_matrix[topic_col].abs()
top_terms_topic_1 = encoding_matrix.sort_values(by='abs_topic_1', ascending=False).head(25)
display(top_terms_topic_1)

Unnamed: 0,topic_1,topic_2,terms,abs_topic_1,abs_topic_2
487,0.423628,0.057562,day,0.423628,0.057562
1312,0.367161,0.024132,patrick,0.367161,0.024132
1687,0.366362,0.023454,st,0.366362,0.023454
1765,0.34288,-0.074424,team,0.34288,0.074424
250,0.308099,-0.078978,building,0.308099,0.078978
67,0.178453,-0.011818,activities,0.178453,0.011818
68,0.149056,0.013517,activity,0.149056,0.013517
232,0.140179,-0.183046,book,0.140179,0.183046
512,0.140179,-0.183046,demo,0.140179,0.183046
969,0.132602,-0.008314,interested,0.132602,0.008314


In [99]:
# Recompute the absolute loadings (so this cell does not rely on another cell having
# added them), then show the 25 terms with the largest |topic_2| loading.
for topic_col in ('topic_1', 'topic_2'):
    encoding_matrix[f'abs_{topic_col}'] = encoding_matrix[topic_col].abs()
top_terms_topic_2 = encoding_matrix.sort_values(by='abs_topic_2', ascending=False).head(25)
display(top_terms_topic_2)

Unnamed: 0,topic_1,topic_2,terms,abs_topic_1,abs_topic_2
919,0.016796,0.414464,immigration,0.016796,0.414464
1843,0.017965,0.310792,trump,0.017965,0.310792
174,0.009202,0.241252,attorneys,0.009202,0.241252
1060,0.009315,0.217585,legal,0.009315,0.217585
77,0.012723,0.197043,administration,0.012723,0.197043
512,0.140179,-0.183046,demo,0.140179,0.183046
232,0.140179,-0.183046,book,0.140179,0.183046
729,0.004726,0.116719,federal,0.004726,0.116719
1576,0.009178,0.113164,said,0.009178,0.113164
1076,0.010704,0.110537,like,0.010704,0.110537
