In [59]:
import re
import requests
import string
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sympy as sp
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /Users/zaynpatel/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Steps in the smaller document-term example:

1. Make the words and sentence lists
2. Initialize the original document term matrix to be zero with the dimensions of our documents and words
3. Loop through the documents and update the document term matrix with the counts of the words in the documents
4. Apply the tf-idf methods and take the element-wise product of the tf matrix and the idf matrix
5. Confirm that our tf-idf splitting is the same as the TfIdfVectorizer in sci-kit learn 
6. Now that we've confirmed our matrix, A, take the eigenthings of AAT and ATA, write down what these matrices mean
7. Plot the singular vectors of U and V, plot the singular values ∑
8. Interpret this and do the same thing again for another extreme example

In [3]:
# Make a dtm from three sentences and 5 words (should be 3x5)

words = ["elephant", "horse", "zebra", "donkey", "monkey"]

# Intention: Make one sentence random bunch of words, another that repeates, and another with some repeat of words 
documents = ["elephant donkey zebra horse zebra monkey",
            "elephant elephant elephant elephant elephant",
            "horse horse horse monkey monkey horse"]

In [4]:
vectorizer = TfidfVectorizer(stop_words = 'english')
tf_idf_matrix = vectorizer.fit_transform(documents) # Learn vocab and perform idf

In [5]:
svd = TruncatedSVD(n_components = 2) # This is our k value. For now I am using k = 2 so I can plot the data in 2D.
lsa = svd.fit_transform(tf_idf_matrix) # Apply low-rank approximation to our tf_idf_matrix

topic_encoded_tfidf_df = pd.DataFrame(lsa, columns = ["topic_1", "topic_2"])
topic_encoded_tfidf_df['documents'] = documents
topic_encoded_tfidf_df # We can see that most variance is happening in the first sentence for topic 1 but it doesn't explain much in topic 2

Unnamed: 0,topic_1,topic_2,documents
0,0.863239,1.387779e-16,elephant donkey zebra horse zebra monkey
1,0.515884,0.8017837,elephant elephant elephant elephant elephant
2,0.692131,-0.5976143,horse horse horse monkey monkey horse


In [6]:
dictionary = vectorizer.get_feature_names_out()
dictionary

array(['donkey', 'elephant', 'horse', 'monkey', 'zebra'], dtype=object)

In [7]:
svd.components_ # Only returns right singular vectors

array([[ 2.23184364e-01,  5.15883963e-01,  5.85113263e-01,
         3.77425362e-01,  4.46368729e-01],
       [ 5.29378437e-17,  8.01783726e-01, -5.34522484e-01,
        -2.67261242e-01,  1.92173424e-16]])

In [8]:
svd.singular_values_

array([1.22080427, 1.        ])

In [9]:
encoding_matrix = pd.DataFrame(svd.components_, index = ['topic_1', 'topic_2']).T
encoding_matrix["terms"] = dictionary
display(encoding_matrix)

Unnamed: 0,topic_1,topic_2,terms
0,0.223184,5.2937840000000004e-17,donkey
1,0.515884,0.8017837,elephant
2,0.585113,-0.5345225,horse
3,0.377425,-0.2672612,monkey
4,0.446369,1.921734e-16,zebra


In [10]:
# Top concepts? Dimensions in term-space explain most of variance?

In [11]:
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
display(encoding_matrix.sort_values('abs_topic_2', ascending=False))

Unnamed: 0,topic_1,topic_2,terms,abs_topic_1,abs_topic_2
1,0.515884,0.8017837,elephant,0.515884,0.8017837
2,0.585113,-0.5345225,horse,0.585113,0.5345225
3,0.377425,-0.2672612,monkey,0.377425,0.2672612
4,0.446369,1.921734e-16,zebra,0.446369,1.921734e-16
0,0.223184,5.2937840000000004e-17,donkey,0.223184,5.2937840000000004e-17


### New, maybe more informative example

In [12]:
with open('trump1.txt', 'r') as f:
    trump1 = f.readlines()
    trump1 = [line.strip() for line in trump1] # Removes all \n 
    trump1 = [s for s in trump1 if s.strip()] # Removes all empty strings

sentences_df = pd.DataFrame(trump1, columns = ['Sentence'])
sentences_df['Title'] = 'CNBC Article: Trump Admin Stops Green Card to do more vetting'
sentences_df

Unnamed: 0,Sentence,Title
0,Finalizing applications filed by certain immig...,CNBC Article: Trump Admin Stops Green Card to ...
1,"U.S. Citizenship and Immigration Services, the...",CNBC Article: Trump Admin Stops Green Card to ...
2,"Trump's executive order, signed Jan. 20, title...",CNBC Article: Trump Admin Stops Green Card to ...
3,CBS News reported Tuesday that USCIS has direc...,CNBC Article: Trump Admin Stops Green Card to ...
4,The agency said in a statement attributed to a...,CNBC Article: Trump Admin Stops Green Card to ...
5,The statement did not address which applicatio...,CNBC Article: Trump Admin Stops Green Card to ...
6,Vetting on top of vetting,CNBC Article: Trump Admin Stops Green Card to ...
7,For refugees and those who have been granted a...,CNBC Article: Trump Admin Stops Green Card to ...
8,“There’s a certain amount of documentation you...,CNBC Article: Trump Admin Stops Green Card to ...
9,People who are granted asylum or admitted to t...,CNBC Article: Trump Admin Stops Green Card to ...


In [13]:
with open('trump2.txt', 'r') as f:
    trump2 = f.readlines()
    trump2 = [line.strip() for line in trump2 if line.strip()]

trump2_df = pd.DataFrame(trump2, columns = ['Sentence'])
assert len(trump2) == trump2_df.shape[0]

trump2_df['Title'] = 'Guardian Article: Trump Officials Pause Greencard in Crackdown'
display(trump2_df)

Unnamed: 0,Sentence,Title
0,The Trump administration has paused the proces...,Guardian Article: Trump Officials Pause Greenc...
1,CBS News reported that approved refugees are p...,Guardian Article: Trump Officials Pause Greenc...
2,The move is likely to leave some immigrants gr...,Guardian Article: Trump Officials Pause Greenc...
3,“USCIS [United States Citizenship and Immigrat...,Guardian Article: Trump Officials Pause Greenc...
4,Adjustment of status is the process by which i...,Guardian Article: Trump Officials Pause Greenc...
5,The DHS cited a presidential action issued by ...,Guardian Article: Trump Officials Pause Greenc...
6,It comes as a federal judge in Manhattan on Tu...,Guardian Article: Trump Officials Pause Greenc...
7,"Chung, 21, has lived in the US since she was s...",Guardian Article: Trump Officials Pause Greenc...
8,Chung’s case has echoes of the ongoing detenti...,Guardian Article: Trump Officials Pause Greenc...
9,At least five students and academics of color ...,Guardian Article: Trump Officials Pause Greenc...


In [14]:
concat_df = pd.concat([sentences_df, trump2_df], ignore_index=True) # Using ignore_index = True since I want one big dataframe
display(concat_df)

Unnamed: 0,Sentence,Title
0,Finalizing applications filed by certain immig...,CNBC Article: Trump Admin Stops Green Card to ...
1,"U.S. Citizenship and Immigration Services, the...",CNBC Article: Trump Admin Stops Green Card to ...
2,"Trump's executive order, signed Jan. 20, title...",CNBC Article: Trump Admin Stops Green Card to ...
3,CBS News reported Tuesday that USCIS has direc...,CNBC Article: Trump Admin Stops Green Card to ...
4,The agency said in a statement attributed to a...,CNBC Article: Trump Admin Stops Green Card to ...
5,The statement did not address which applicatio...,CNBC Article: Trump Admin Stops Green Card to ...
6,Vetting on top of vetting,CNBC Article: Trump Admin Stops Green Card to ...
7,For refugees and those who have been granted a...,CNBC Article: Trump Admin Stops Green Card to ...
8,“There’s a certain amount of documentation you...,CNBC Article: Trump Admin Stops Green Card to ...
9,People who are granted asylum or admitted to t...,CNBC Article: Trump Admin Stops Green Card to ...


In [82]:
with open('stpatricks.txt', 'r') as f:
    stpatricks = f.readlines()
    stp_text = [line.strip() for line in stpatricks if line.strip()]
    print(stp_text[2])
    
    
stp_df = pd.DataFrame(stp_text, columns = ['Sentence'])
stp_df['Title'] = 'Blog: St. Patrick\'s Day Ideas'
#display(stp_df)

Let us uncover the top 20 St. Patrick’s Day activities for work. These live team-building activities will surely bring fun and purpose to your workplace before you go out to join the festivities.


In [16]:
# Really simple concat

sentence_df = pd.concat([concat_df, stp_df], ignore_index=True) # Using ignore_index = True since I want one big dataframe
display(sentence_df)

Unnamed: 0,Sentence,Title
0,Finalizing applications filed by certain immig...,CNBC Article: Trump Admin Stops Green Card to ...
1,"U.S. Citizenship and Immigration Services, the...",CNBC Article: Trump Admin Stops Green Card to ...
2,"Trump's executive order, signed Jan. 20, title...",CNBC Article: Trump Admin Stops Green Card to ...
3,CBS News reported Tuesday that USCIS has direc...,CNBC Article: Trump Admin Stops Green Card to ...
4,The agency said in a statement attributed to a...,CNBC Article: Trump Admin Stops Green Card to ...
...,...,...
80,9. Lucky Leprechaun,Blog: St. Patrick's Day Ideas
81,“Lucky Leprechaun” is a heartwarming game desi...,Blog: St. Patrick's Day Ideas
82,"In this game, participants take turns revealin...",Blog: St. Patrick's Day Ideas
83,"As players share their stories, they engage in...",Blog: St. Patrick's Day Ideas


In [17]:
# Count the words
vectorizer = TfidfVectorizer(stop_words='english')

# Vectorize just the sentence, we don't care about vectorizing the Title
bag_of_words = vectorizer.fit_transform(sentence_df.Sentence)

In [18]:
svd = TruncatedSVD(n_components = 2)
lsa = svd.fit_transform(bag_of_words)

In [19]:
topic_encoded_df = pd.DataFrame(lsa, columns = ['Topic 1', 'Topic 2'])
topic_encoded_df['sentence'] = sentence_df.Sentence
# Is there a way to combine this?
topic_encoded_df['Is_Trump'] = (sentence_df.Title == "Guardian Article: Trump Officials Pause Greencard in Crackdown")
topic_encoded_df['Is_Trump'] = (sentence_df.Title == "CNBC Article: Trump Admin Stops Green Card to do more vetting")
display(topic_encoded_df)

Unnamed: 0,Topic 1,Topic 2,sentence,Is_Trump
0,0.010268,-0.002250,Finalizing applications filed by certain immig...,True
1,0.024641,-0.003686,"U.S. Citizenship and Immigration Services, the...",True
2,0.018393,0.004421,"Trump's executive order, signed Jan. 20, title...",True
3,0.033126,-0.006623,CBS News reported Tuesday that USCIS has direc...,True
4,0.011679,0.001345,The agency said in a statement attributed to a...,True
...,...,...,...,...
80,0.191675,0.508132,9. Lucky Leprechaun,False
81,0.156631,0.207960,“Lucky Leprechaun” is a heartwarming game desi...,False
82,0.085967,0.124744,"In this game, participants take turns revealin...",False
83,0.020080,0.048248,"As players share their stories, they engage in...",False


In [20]:
dictionary = vectorizer.get_feature_names_out()
display(dictionary)

array(['10', '11', '149', '15', '17th', '20', '2016', '2020', '2023',
       '21', '30', '53', 'academics', 'accessories', 'accompanied',
       'according', 'action', 'actions', 'activist', 'activities',
       'activity', 'add', 'adding', 'additional', 'address', 'adds',
       'adjusting', 'adjustment', 'administration', 'admission',
       'admitted', 'adorned', 'adventure', 'affect', 'affected',
       'agencies', 'agency', 'agenda', 'aggressively', 'aiming', 'aliens',
       'alignment', 'amidst', 'ancestry', 'annually', 'anon', 'anonymous',
       'anonymously', 'anticipation', 'appealing', 'applicants',
       'application', 'applications', 'applied', 'apply', 'appreciation',
       'appropriate', 'approved', 'area', 'arguing', 'arrange', 'asked',
       'asking', 'assigned', 'asylee', 'asylum', 'atmosphere',
       'attempting', 'attire', 'attorney', 'attributed', 'authorities',
       'backlog', 'banners', 'based', 'benefits', 'billion', 'blank',
       'blocked', 'bludgeon',

In [21]:
svd.components_

array([[ 2.74957245e-02,  1.47277319e-03,  4.29120609e-04, ...,
         1.19304647e-02,  3.41858844e-05,  5.20389045e-04],
       [ 3.51303446e-02,  3.09957316e-03, -1.43509882e-04, ...,
        -3.58134745e-04,  2.74174104e-04, -1.33761232e-04]])

In [22]:
svd.singular_values_

array([2.01300013, 1.76183109])

In [23]:
encoding_matrix = pd.DataFrame(svd.components_, index = ['topic_1', 'topic_2']).T

encoding_matrix["terms"] = dictionary
display(encoding_matrix)

Unnamed: 0,topic_1,topic_2,terms
0,0.027496,0.035130,10
1,0.001473,0.003100,11
2,0.000429,-0.000144,149
3,0.063895,0.180223,15
4,0.011957,-0.006786,17th
...,...,...,...
632,0.015422,-0.005235,write
633,0.010983,-0.000090,writing
634,0.011930,-0.000358,year
635,0.000034,0.000274,years


In [24]:
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
display(encoding_matrix.sort_values('abs_topic_1', ascending=False).head(15))


# Get the .iloc of words

# One of the reasons both topics seem to do with the st patricks day vs. others is because of document length?



Unnamed: 0,topic_1,topic_2,terms,abs_topic_1,abs_topic_2
162,0.326537,-0.167403,day,0.326537,0.167403
409,0.318702,-0.162785,patrick,0.318702,0.162785
536,0.318057,-0.161826,st,0.318057,0.161826
563,0.248891,-0.020185,team,0.248891,0.020185
20,0.200386,-0.088134,activity,0.200386,0.088134
295,0.189415,-0.111806,hunt,0.189415,0.111806
251,0.186644,-0.051309,fun,0.186644,0.051309
87,0.185516,-0.040989,building,0.185516,0.040989
427,0.172954,0.218063,points,0.172954,0.218063
489,0.170764,-0.097378,scavenger,0.170764,0.097378


In [25]:
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
display(encoding_matrix.sort_values('abs_topic_2', ascending=False).head(15))

Unnamed: 0,topic_1,topic_2,terms,abs_topic_1,abs_topic_2
336,0.146837,0.527336,leprechaun,0.146837,0.527336
277,0.111244,0.414781,hat,0.111244,0.414781
583,0.121455,0.354061,toss,0.121455,0.354061
427,0.172954,0.218063,points,0.172954,0.218063
354,0.12506,0.201975,lucky,0.12506,0.201975
3,0.063895,0.180223,15,0.063895,0.180223
162,0.326537,-0.167403,day,0.326537,0.167403
409,0.318702,-0.162785,patrick,0.318702,0.162785
536,0.318057,-0.161826,st,0.318057,0.161826
122,0.093693,0.127308,coin,0.093693,0.127308


### Normalize using L2 norm to account for document length 


**Motivation:** Singular vectors of the encoding matrix don't tell us as *much* as we would like because St. Patrick's day article is twice as long as the others. So do this process again but normalize at the TfidfVectorizer() stage to see what happens. 

[Stanford tf-idf notes, page 32](https://web.stanford.edu/class/cs276/19handouts/lecture6-tfidf-1per.pdf) state that after normalizing the documents we get comparable weights for long/short documents. This ensures that the St. Patrick's Day document won't weigh more than the two separate Trump articles.

In [26]:
tfvectorizer = TfidfVectorizer(stop_words='english', norm='l2') # Add l2 norm to adjust for doc lengths
sentences = tfvectorizer.fit_transform(sentence_df.Sentence)

In [27]:
norm_svd = TruncatedSVD(n_components=2)
norm_lsa = norm_svd.fit_transform(sentences)

In [28]:
norm_topic_encoded_df = pd.DataFrame(norm_lsa, columns = ['Topic 1', 'Topic 2'])
topic_encoded_df['sentence'] = sentence_df.Sentence
# Is there a way to combine this?
norm_topic_encoded_df['Is_Trump'] = (sentence_df.Title == "Guardian Article: Trump Officials Pause Greencard in Crackdown")
norm_topic_encoded_df['Is_Trump'] = (sentence_df.Title == "CNBC Article: Trump Admin Stops Green Card to do more vetting")
display(norm_topic_encoded_df)

Unnamed: 0,Topic 1,Topic 2,Is_Trump
0,0.008471,-0.003606,True
1,0.022368,-0.004589,True
2,0.018316,0.000468,True
3,0.033124,-0.006547,True
4,0.011538,0.002639,True
...,...,...,...
80,0.189587,0.510712,False
81,0.155845,0.209995,False
82,0.085393,0.123390,False
83,0.019755,0.051285,False


In [29]:
norm_svd.singular_values_

array([2.01304076, 1.76172142])

In [30]:
norm_svd.components_

array([[ 0.02641077,  0.00182001,  0.00036773, ...,  0.01271989,
         0.00034801,  0.00033374],
       [ 0.03669071,  0.0033582 , -0.00050152, ..., -0.00167759,
        -0.0002879 , -0.00024332]])

In [31]:
encoding_matrix = pd.DataFrame(norm_svd.components_, index = ['topic_1', 'topic_2']).T
encoding_matrix["terms"] = dictionary
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
display(encoding_matrix.sort_values('abs_topic_1', ascending=False).head(15))

Unnamed: 0,topic_1,topic_2,terms,abs_topic_1,abs_topic_2
162,0.32635,-0.16674,day,0.32635,0.16674
409,0.318521,-0.161702,patrick,0.318521,0.161702
536,0.317879,-0.160288,st,0.317879,0.160288
563,0.252337,-0.026012,team,0.252337,0.026012
20,0.199678,-0.086428,activity,0.199678,0.086428
295,0.190126,-0.108743,hunt,0.190126,0.108743
87,0.187672,-0.045514,building,0.187672,0.045514
251,0.184678,-0.053116,fun,0.184678,0.053116
427,0.172987,0.217954,points,0.172987,0.217954
489,0.171195,-0.09572,scavenger,0.171195,0.09572


In [32]:
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
display(encoding_matrix.sort_values('abs_topic_2', ascending=False).head(15))

Unnamed: 0,topic_1,topic_2,terms,abs_topic_1,abs_topic_2
336,0.146941,0.528053,leprechaun,0.146941,0.528053
277,0.111748,0.41361,hat,0.111748,0.41361
583,0.120501,0.354988,toss,0.120501,0.354988
427,0.172987,0.217954,points,0.172987,0.217954
354,0.1221,0.20484,lucky,0.1221,0.20484
3,0.065723,0.179309,15,0.065723,0.179309
162,0.32635,-0.16674,day,0.32635,0.16674
409,0.318521,-0.161702,patrick,0.318521,0.161702
536,0.317879,-0.160288,st,0.317879,0.160288
122,0.092983,0.129247,coin,0.092983,0.129247


### Now add another Trump article - longer one from online

In [73]:
response = requests.get('https://austinkocher.substack.com/p/trump-declares-war-on-immigration')
soup = BeautifulSoup(response.content, 'html.parser')

# New learning - how to exclude certain content from scrape
p = soup.find_all(lambda tag: tag.name == 'p' and not tag.find_parent(class_="pullquote"))

clean_text = [el.text.strip() for el in p]
clean_text = clean_text[:36] 

# Convert list into string so we can tokenize sentences
text_str = ""
for paragraph in clean_text:
    text_str += paragraph

sentences = sent_tokenize(text_str)

trump3_df = pd.DataFrame(sentences, columns = ['Sentence'])
trump3_df['Title'] = 'Substack post: Trump Declares War with Immigration Lawyers'
display(trump3_df)

Unnamed: 0,Sentence,Title
0,"When authoritarian regimes come to power, one ...",Substack post: Trump Declares War with Immigra...
1,"This week, Trump escalated his campaign agains...",Substack post: Trump Declares War with Immigra...
2,The message is clear: lawyers who oppose this ...,Substack post: Trump Declares War with Immigra...
3,Framed as an effort to enforce professional et...,Substack post: Trump Declares War with Immigra...
4,This focus makes clear that the immigrant righ...,Substack post: Trump Declares War with Immigra...
...,...,...
75,No one has filed more frivolous and unsubstant...,Substack post: Trump Declares War with Immigra...
76,"After he lost the 2020 election, Trump sent at...",Substack post: Trump Declares War with Immigra...
77,"Those lawsuits were entirely rhetorical, not b...",Substack post: Trump Declares War with Immigra...
78,If anyone should be professionally disciplined...,Substack post: Trump Declares War with Immigra...


In [74]:
three_trump_one_stp_df = pd.concat([sentence_df, trump3_df], ignore_index=True)

In [75]:
three_trump_one_stp_df.Title

0      CNBC Article: Trump Admin Stops Green Card to ...
1      CNBC Article: Trump Admin Stops Green Card to ...
2      CNBC Article: Trump Admin Stops Green Card to ...
3      CNBC Article: Trump Admin Stops Green Card to ...
4      CNBC Article: Trump Admin Stops Green Card to ...
                             ...                        
160    Substack post: Trump Declares War with Immigra...
161    Substack post: Trump Declares War with Immigra...
162    Substack post: Trump Declares War with Immigra...
163    Substack post: Trump Declares War with Immigra...
164    Substack post: Trump Declares War with Immigra...
Name: Title, Length: 165, dtype: object

In [79]:
trump2_df[0]

KeyError: 0

In [38]:
vectorizer = TfidfVectorizer(stop_words='english', norm='l2')
sentences = vectorizer.fit_transform(three_trump_one_stp_df.Sentence)

In [39]:
svd = TruncatedSVD(n_components = 2)
lsa = svd.fit_transform(sentences)

In [40]:
topic_encoded_df = pd.DataFrame(lsa, columns = ['Topic 1', 'Topic 2'])
topic_encoded_df['sentence'] = three_trump_one_stp_df.Sentence
# Is there a way to combine this?
topic_encoded_df['Not_Trump'] = (three_trump_one_stp_df.Title == "Blog: St. Patrick\'s Day Ideas")
display(topic_encoded_df)

Unnamed: 0,Topic 1,Topic 2,sentence,Not_Trump
0,0.048003,0.267070,Finalizing applications filed by certain immig...,False
1,0.074011,0.336283,"U.S. Citizenship and Immigration Services, the...",False
2,0.046137,0.192462,"Trump's executive order, signed Jan. 20, title...",False
3,0.066791,0.233819,CBS News reported Tuesday that USCIS has direc...,False
4,0.033302,0.179525,The agency said in a statement attributed to a...,False
...,...,...,...,...
119,0.001089,0.007243,Living in Dangerous Times,False
120,0.046396,0.244918,"At the risk of understatement, I am growing in...",False
121,0.068745,0.293307,I was following the reactions of various well-...,False
122,0.033494,0.156344,Let’s not overlook the obvious here. No one ha...,False


### Decide what to do about the bottom later.

In [41]:
word_lookup = { v : k for k, v in enumerate(words)}
word_lookup

{'elephant': 0, 'horse': 1, 'zebra': 2, 'donkey': 3, 'monkey': 4}

In [42]:
for d_idx, document in enumerate(documents):
    for word in document.split():
        word_idx = word_lookup[word]
        dtm[d_idx][word_idx] += 1
dtm

NameError: name 'dtm' is not defined

In [None]:
# Divide the raw term frequencies by the total number of words in each document
tf_matrix = dtm / dtm.sum(axis = 1, keepdims=True)
tf_matrix

In [None]:
col_sums = np.count_nonzero(dtm, axis = 0) # We want the number of documents a term appears in. E.g. elephant appears twice
idf_matrix = np.log(np.divide(1 + len(documents), (1 + col_sums))) + 1
idf_matrix

In [None]:
tf_idf_matrix = tf_matrix * idf_matrix

# Unsure
norms = np.linalg.norm(tf_idf_matrix, axis = 1, keepdims=True)
tf_idf_matrix = tf_idf_matrix / norms
tf_idf_matrix

In [None]:
# Step 5: Confirm that this is correct with the library implementation

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True)
result = tfidf.fit_transform(documents)
tfidf_matrix = result.toarray()
tfidf_matrix

In [None]:
# I can confirm that my implementation and TfidfVectorizer are the same in different orders
# The vectorizer uses alphabetical order while mine is not in alphabetical order
tfidf.vocabulary_ 

### Pause and interpret tf_idf values. Check: Do they make sense?

IDF: Used to penalize common words since words that appear often across many documents don't tell us much.

In [None]:
# Library implementation


U, S, VT = np.linalg.svd(tf_idf_matrix)
print(f"Left Singular Vectors: {U}")
print(f"Singular Values: {S}")
print(f"Right Singular Vectors: {VT}")

In [None]:
df_U = pd.DataFrame(U, index = [sentence for sentence in documents], 
                   columns = [f"Concept {i+1}" for i in range(U.shape[1])])
df_U

In [None]:
df_V = pd.DataFrame(np.transpose(VT), columns = [f"Latent Topic {i+1}" for i in range(VT.shape[0])],
                    index = [word for word in words])                    
df_V

In [None]:
df_V[:2]