In [3]:
import re
import requests
import string
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sympy as sp
from bs4 import BeautifulSoup
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

Steps in the smaller document-term example:

1. Make the words and sentence lists
2. Initialize the original document term matrix to be zero with the dimensions of our documents and words
3. Loop through the documents and update the document term matrix with the counts of the words in the documents
4. Apply the tf-idf methods and take the element-wise product of the tf matrix and the idf matrix
5. Confirm that our tf-idf splitting is the same as the TfIdfVectorizer in sci-kit learn 
6. Now that we've confirmed our matrix, A, take the eigenthings of AAT and ATA, write down what these matrices mean
7. Plot the singular vectors of U and V, plot the singular values ∑
8. Interpret this and do the same thing again for another extreme example

In [4]:
# Make a dtm from three sentences and 5 words (should be 3x5)

words = ["elephant", "horse", "zebra", "donkey", "monkey"]

# Intention: Make one sentence random bunch of words, another that repeates, and another with some repeat of words 
documents = ["elephant donkey zebra horse zebra monkey",
            "elephant elephant elephant elephant elephant",
            "horse horse horse monkey monkey horse"]

In [5]:
vectorizer = TfidfVectorizer(stop_words = 'english')
tf_idf_matrix = vectorizer.fit_transform(documents) # Learn vocab and perform idf

In [6]:
svd = TruncatedSVD(n_components = 2) # This is our k value. For now I am using k = 2 so I can plot the data in 2D.
lsa = svd.fit_transform(tf_idf_matrix) # Apply low-rank approximation to our tf_idf_matrix

topic_encoded_tfidf_df = pd.DataFrame(lsa, columns = ["topic_1", "topic_2"])
topic_encoded_tfidf_df['documents'] = documents
topic_encoded_tfidf_df # We can see that most variance is happening in the first sentence for topic 1 but it doesn't explain much in topic 2

Unnamed: 0,topic_1,topic_2,documents
0,0.863239,0.0,elephant donkey zebra horse zebra monkey
1,0.515884,0.801784,elephant elephant elephant elephant elephant
2,0.692131,-0.597614,horse horse horse monkey monkey horse


In [7]:
dictionary = vectorizer.get_feature_names_out()
dictionary

array(['donkey', 'elephant', 'horse', 'monkey', 'zebra'], dtype=object)

In [8]:
svd.components_ # Only returns right singular vectors

array([[ 2.23184364e-01,  5.15883963e-01,  5.85113263e-01,
         3.77425362e-01,  4.46368729e-01],
       [-1.15639006e-16,  8.01783726e-01, -5.34522484e-01,
        -2.67261242e-01, -1.15237627e-16]])

In [9]:
svd.singular_values_

array([1.22080427, 1.        ])

In [10]:
encoding_matrix = pd.DataFrame(svd.components_, index = ['topic_1', 'topic_2']).T
encoding_matrix["terms"] = dictionary
display(encoding_matrix)

Unnamed: 0,topic_1,topic_2,terms
0,0.223184,-1.15639e-16,donkey
1,0.515884,0.8017837,elephant
2,0.585113,-0.5345225,horse
3,0.377425,-0.2672612,monkey
4,0.446369,-1.152376e-16,zebra


In [11]:
# Top concepts? Dimensions in term-space explain most of variance?

In [12]:
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
display(encoding_matrix.sort_values('abs_topic_2', ascending=False))

Unnamed: 0,topic_1,topic_2,terms,abs_topic_1,abs_topic_2
1,0.515884,0.8017837,elephant,0.515884,0.8017837
2,0.585113,-0.5345225,horse,0.585113,0.5345225
3,0.377425,-0.2672612,monkey,0.377425,0.2672612
0,0.223184,-1.15639e-16,donkey,0.223184,1.15639e-16
4,0.446369,-1.152376e-16,zebra,0.446369,1.152376e-16


### New, maybe more informative example

In [13]:
with open('trump1.txt', 'r') as f:
    trump1 = f.readlines()
    trump1 = [line.strip() for line in trump1] # Removes all \n 
    trump1 = [s for s in trump1 if s.strip()] # Removes all empty strings

sentences_df = pd.DataFrame(trump1, columns = ['Sentence'])
sentences_df['Title'] = 'CNBC Article: Trump Admin Stops Green Card to do more vetting'
sentences_df

Unnamed: 0,Sentence,Title
0,Finalizing applications filed by certain immig...,CNBC Article: Trump Admin Stops Green Card to ...
1,"U.S. Citizenship and Immigration Services, the...",CNBC Article: Trump Admin Stops Green Card to ...
2,"Trump's executive order, signed Jan. 20, title...",CNBC Article: Trump Admin Stops Green Card to ...
3,CBS News reported Tuesday that USCIS has direc...,CNBC Article: Trump Admin Stops Green Card to ...
4,The agency said in a statement attributed to a...,CNBC Article: Trump Admin Stops Green Card to ...
5,The statement did not address which applicatio...,CNBC Article: Trump Admin Stops Green Card to ...
6,Vetting on top of vetting,CNBC Article: Trump Admin Stops Green Card to ...
7,For refugees and those who have been granted a...,CNBC Article: Trump Admin Stops Green Card to ...
8,“There’s a certain amount of documentation you...,CNBC Article: Trump Admin Stops Green Card to ...
9,People who are granted asylum or admitted to t...,CNBC Article: Trump Admin Stops Green Card to ...


In [14]:
with open('trump2.txt', 'r') as f:
    trump2 = f.readlines()
    trump2 = [line.strip() for line in trump2 if line.strip()]

trump2_df = pd.DataFrame(trump2, columns = ['Sentence'])
assert len(trump2) == trump2_df.shape[0]

trump2_df['Title'] = 'Guardian Article: Trump Officials Pause Greencard in Crackdown'
display(trump2_df)

Unnamed: 0,Sentence,Title
0,The Trump administration has paused the proces...,Guardian Article: Trump Officials Pause Greenc...
1,CBS News reported that approved refugees are p...,Guardian Article: Trump Officials Pause Greenc...
2,The move is likely to leave some immigrants gr...,Guardian Article: Trump Officials Pause Greenc...
3,“USCIS [United States Citizenship and Immigrat...,Guardian Article: Trump Officials Pause Greenc...
4,Adjustment of status is the process by which i...,Guardian Article: Trump Officials Pause Greenc...
5,The DHS cited a presidential action issued by ...,Guardian Article: Trump Officials Pause Greenc...
6,It comes as a federal judge in Manhattan on Tu...,Guardian Article: Trump Officials Pause Greenc...
7,"Chung, 21, has lived in the US since she was s...",Guardian Article: Trump Officials Pause Greenc...
8,Chung’s case has echoes of the ongoing detenti...,Guardian Article: Trump Officials Pause Greenc...
9,At least five students and academics of color ...,Guardian Article: Trump Officials Pause Greenc...


In [15]:
concat_df = pd.concat([sentences_df, trump2_df], ignore_index=True) # Using ignore_index = True since I want one big dataframe
display(concat_df)

Unnamed: 0,Sentence,Title
0,Finalizing applications filed by certain immig...,CNBC Article: Trump Admin Stops Green Card to ...
1,"U.S. Citizenship and Immigration Services, the...",CNBC Article: Trump Admin Stops Green Card to ...
2,"Trump's executive order, signed Jan. 20, title...",CNBC Article: Trump Admin Stops Green Card to ...
3,CBS News reported Tuesday that USCIS has direc...,CNBC Article: Trump Admin Stops Green Card to ...
4,The agency said in a statement attributed to a...,CNBC Article: Trump Admin Stops Green Card to ...
5,The statement did not address which applicatio...,CNBC Article: Trump Admin Stops Green Card to ...
6,Vetting on top of vetting,CNBC Article: Trump Admin Stops Green Card to ...
7,For refugees and those who have been granted a...,CNBC Article: Trump Admin Stops Green Card to ...
8,“There’s a certain amount of documentation you...,CNBC Article: Trump Admin Stops Green Card to ...
9,People who are granted asylum or admitted to t...,CNBC Article: Trump Admin Stops Green Card to ...


In [20]:
with open('stpatricks.txt', 'r') as f:
    stpatricks = f.readlines()
    stp_text = [line.strip() for line in stpatricks if line.strip()]
    
    
stp_df = pd.DataFrame(stp_text, columns = ['Sentence'])
stp_df['Title'] = 'Blog: St. Patrick\'s Day Ideas'
display(stp_df)

Unnamed: 0,Sentence,Title
0,"St. Patrick’s Day, celebrated annually on Marc...",Blog: St. Patrick's Day Ideas
1,Amidst the sea of green attire and shamrock-ad...,Blog: St. Patrick's Day Ideas
2,Let us uncover the top 20 St. Patrick’s Day ac...,Blog: St. Patrick's Day Ideas
3,Book a demo with us to learn how to try out th...,Blog: St. Patrick's Day Ideas
4,"But first, dust off your leprechaun hat, grab ...",Blog: St. Patrick's Day Ideas
5,1. Pot of Gold Scavenger Hunt,Blog: St. Patrick's Day Ideas
6,The “Pot of Gold Scavenger Hunt” is a fun St. ...,Blog: St. Patrick's Day Ideas
7,Here is a list of St. Patrick’s Day-themed ite...,Blog: St. Patrick's Day Ideas
8,Four-leaf clover – 10 points,Blog: St. Patrick's Day Ideas
9,Leprechaun hat – 15 points,Blog: St. Patrick's Day Ideas


In [21]:
# Really simple concat

sentence_df = pd.concat([concat_df, stp_df], ignore_index=True) # Using ignore_index = True since I want one big dataframe
display(sentence_df)

Unnamed: 0,Sentence,Title
0,Finalizing applications filed by certain immig...,CNBC Article: Trump Admin Stops Green Card to ...
1,"U.S. Citizenship and Immigration Services, the...",CNBC Article: Trump Admin Stops Green Card to ...
2,"Trump's executive order, signed Jan. 20, title...",CNBC Article: Trump Admin Stops Green Card to ...
3,CBS News reported Tuesday that USCIS has direc...,CNBC Article: Trump Admin Stops Green Card to ...
4,The agency said in a statement attributed to a...,CNBC Article: Trump Admin Stops Green Card to ...
...,...,...
80,9. Lucky Leprechaun,Blog: St. Patrick's Day Ideas
81,“Lucky Leprechaun” is a heartwarming game desi...,Blog: St. Patrick's Day Ideas
82,"In this game, participants take turns revealin...",Blog: St. Patrick's Day Ideas
83,"As players share their stories, they engage in...",Blog: St. Patrick's Day Ideas


In [23]:
# Count the words
vectorizer = TfidfVectorizer(stop_words='english')

# Vectorize just the sentence, we don't care about vectorizing the Title
bag_of_words = vectorizer.fit_transform(sentence_df.Sentence)

In [24]:
svd = TruncatedSVD(n_components = 2)
lsa = svd.fit_transform(bag_of_words)

In [32]:
topic_encoded_df = pd.DataFrame(lsa, columns = ['Topic 1', 'Topic 2'])
topic_encoded_df['sentence'] = sentence_df.Sentence
# Is there a way to combine this?
topic_encoded_df['Is_Trump'] = (sentence_df.Title == "Guardian Article: Trump Officials Pause Greencard in Crackdown")
topic_encoded_df['Is_Trump'] = (sentence_df.Title == "CNBC Article: Trump Admin Stops Green Card to do more vetting")
display(topic_encoded_df)

Unnamed: 0,Topic 1,Topic 2,sentence,Is_Trump
0,0.009682,0.001339,Finalizing applications filed by certain immig...,True
1,0.023507,-0.004325,"U.S. Citizenship and Immigration Services, the...",True
2,0.018334,0.003281,"Trump's executive order, signed Jan. 20, title...",True
3,0.033366,-0.002017,CBS News reported Tuesday that USCIS has direc...,True
4,0.010883,0.001513,The agency said in a statement attributed to a...,True
...,...,...,...,...
80,0.189128,0.513166,9. Lucky Leprechaun,False
81,0.155096,0.213760,“Lucky Leprechaun” is a heartwarming game desi...,False
82,0.085555,0.124853,"In this game, participants take turns revealin...",False
83,0.019370,0.046266,"As players share their stories, they engage in...",False


In [None]:
ts = topic_encoded_df.Is_Trump.sum()

fal = len(topic_encoded_df) - ts
fal

# Ok - maybe try to see what happens when you add the norm argument to the tf-idf vectorizer?

# And also see medium https://medium.com/analytics-vidhya/understand-tf-idf-by-building-it-from-scratch-adc11eba7142#:~:text=Term%20frequency%20adjusted%20for%20document,f(t%2Cd)) implentation

#^ Confirm what happens actually happens

In [34]:
dictionary = vectorizer.get_feature_names_out()
display(dictionary)

array(['10', '11', '149', '15', '17th', '20', '2016', '2020', '2023',
       '21', '30', '53', 'academics', 'accessories', 'accompanied',
       'according', 'action', 'actions', 'activist', 'activities',
       'activity', 'add', 'adding', 'additional', 'address', 'adds',
       'adjusting', 'adjustment', 'administration', 'admission',
       'admitted', 'adorned', 'adventure', 'affect', 'affected',
       'agencies', 'agency', 'agenda', 'aggressively', 'aiming', 'aliens',
       'alignment', 'amidst', 'ancestry', 'annually', 'anon', 'anonymous',
       'anonymously', 'anticipation', 'appealing', 'applicants',
       'application', 'applications', 'applied', 'apply', 'appreciation',
       'appropriate', 'approved', 'area', 'arguing', 'arrange', 'asked',
       'asking', 'assigned', 'asylee', 'asylum', 'atmosphere',
       'attempting', 'attire', 'attorney', 'attributed', 'authorities',
       'backlog', 'banners', 'based', 'benefits', 'billion', 'blank',
       'blocked', 'bludgeon',

In [35]:
svd.components_

array([[ 2.65509255e-02,  1.63831246e-03,  3.79986123e-04, ...,
         1.25439472e-02,  2.64045903e-04,  4.59057151e-04],
       [ 3.17171656e-02,  2.97997768e-03, -1.74434527e-04, ...,
        -8.09814537e-04, -2.81317667e-04,  8.89482078e-05]])

In [37]:
svd.singular_values_

array([2.0130516 , 1.76176558])

In [38]:
encoding_matrix = pd.DataFrame(svd.components_, index = ['topic_1', 'topic_2']).T

encoding_matrix["terms"] = dictionary
display(encoding_matrix)

Unnamed: 0,topic_1,topic_2,terms
0,0.026551,0.031717,10
1,0.001638,0.002980,11
2,0.000380,-0.000174,149
3,0.065181,0.180662,15
4,0.011966,-0.006636,17th
...,...,...,...
632,0.015577,-0.005143,write
633,0.011419,-0.000008,writing
634,0.012544,-0.000810,year
635,0.000264,-0.000281,years


In [44]:
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
display(encoding_matrix.sort_values('abs_topic_1', ascending=False).head(15))


# Get the .iloc of words

# One of the reasons both topics seem to do with the st patricks day vs. others is because of document length?



Unnamed: 0,topic_1,topic_2,terms,abs_topic_1,abs_topic_2
162,0.326052,-0.166657,day,0.326052,0.166657
409,0.318218,-0.161884,patrick,0.318218,0.161884
536,0.317562,-0.160745,st,0.317562,0.160745
563,0.252651,-0.023463,team,0.252651,0.023463
20,0.199928,-0.087508,activity,0.199928,0.087508
295,0.189461,-0.106635,hunt,0.189461,0.106635
87,0.187932,-0.043372,building,0.187932,0.043372
251,0.185718,-0.052744,fun,0.185718,0.052744
427,0.173148,0.214797,points,0.173148,0.214797
489,0.170761,-0.092588,scavenger,0.170761,0.092588


In [45]:
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
display(encoding_matrix.sort_values('abs_topic_2', ascending=False).head(15))

Unnamed: 0,topic_1,topic_2,terms,abs_topic_1,abs_topic_2
336,0.146009,0.529509,leprechaun,0.146009,0.529509
277,0.111853,0.41646,hat,0.111853,0.41646
583,0.121456,0.355875,toss,0.121456,0.355875
427,0.173148,0.214797,points,0.173148,0.214797
354,0.122345,0.206839,lucky,0.122345,0.206839
3,0.065181,0.180662,15,0.065181,0.180662
162,0.326052,-0.166657,day,0.326052,0.166657
409,0.318218,-0.161884,patrick,0.318218,0.161884
536,0.317562,-0.160745,st,0.317562,0.160745
122,0.093289,0.126896,coin,0.093289,0.126896


### Decide what to do about the bottom later.

In [16]:
word_lookup = { v : k for k, v in enumerate(words)}
word_lookup

{'elephant': 0, 'horse': 1, 'zebra': 2, 'donkey': 3, 'monkey': 4}

In [17]:
for d_idx, document in enumerate(documents):
    for word in document.split():
        word_idx = word_lookup[word]
        dtm[d_idx][word_idx] += 1
dtm

NameError: name 'dtm' is not defined

In [None]:
# Divide the raw term frequencies by the total number of words in each document
tf_matrix = dtm / dtm.sum(axis = 1, keepdims=True)
tf_matrix

In [None]:
col_sums = np.count_nonzero(dtm, axis = 0) # We want the number of documents a term appears in. E.g. elephant appears twice
idf_matrix = np.log(np.divide(1 + len(documents), (1 + col_sums))) + 1
idf_matrix

In [None]:
tf_idf_matrix = tf_matrix * idf_matrix

# Unsure
norms = np.linalg.norm(tf_idf_matrix, axis = 1, keepdims=True)
tf_idf_matrix = tf_idf_matrix / norms
tf_idf_matrix

In [None]:
# Step 5: Confirm that this is correct with the library implementation

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True)
result = tfidf.fit_transform(documents)
tfidf_matrix = result.toarray()
tfidf_matrix

In [None]:
# I can confirm that my implementation and TfidfVectorizer are the same in different orders
# The vectorizer uses alphabetical order while mine is not in alphabetical order
tfidf.vocabulary_ 

### Pause and interpret tf_idf values. Check: Do they make sense?

IDF: Used to penalize common words since words that appear often across many documents don't tell us much.

In [None]:
# Library implementation


U, S, VT = np.linalg.svd(tf_idf_matrix)
print(f"Left Singular Vectors: {U}")
print(f"Singular Values: {S}")
print(f"Right Singular Vectors: {VT}")

In [None]:
df_U = pd.DataFrame(U, index = [sentence for sentence in documents], 
                   columns = [f"Concept {i+1}" for i in range(U.shape[1])])
df_U

In [None]:
df_V = pd.DataFrame(np.transpose(VT), columns = [f"Latent Topic {i+1}" for i in range(VT.shape[0])],
                    index = [word for word in words])                    
df_V

In [None]:
df_V[:2]