TF-IDF practice. Method 1: relying mostly on TfidfVectorizer without pandas

This practice file uses the "EEBOphase2_1590-1639_body_texts" folder from DCC.

Helpful tutorial: https://www.youtube.com/watch?v=i74DVqMsRWY

Helpful note: "the function get_feature_namest() for the vectorizer is now deprecated; when version 1.2 of sklearn is released the function will be completely removed (thus breaking the code in this video). The new standard function to use is to change the line to: vectorizer.get_feature_names_out()."

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import string
from nltk.corpus import stopwords
import json
import glob
import re
import os

In [29]:
# Put the folder pathname here:
folderPath = "/Users/Jerry/Desktop/Submission Diary entries"

# The following dictionary stores the content of all files. The document is in this format - key: (type: string. file name without ".txt") | value: (type: string. the text of the document combined into one paragraph):

titleContentDictionary = {}

# The loadFolder function below reads the "folderPath" variable and loads content into the dictionary above.
def loadFolder(folder):
    for filename in os.listdir(folder):
        if filename.endswith(".txt"):
            filePath = os.path.join(folder, filename)
            with open(filePath, "r") as fileToOpen:
                content = fileToOpen.read()
                content = content.replace("\n", "")
                content = " ".join(content.split())
                titleContentDictionary[filename[:-4]] = content
loadFolder(folderPath)

#FOR DEBUG: print(titleContentDictionary["A00002"])

'''
USE THIS CODE IF YOU WISH TO STORE THE CONTENT OF FILES IN A JSON:

with open("file.json", "w", encoding="utf-8") as jsonDestination:
    json.dump(titleContentDictionary, jsonDestination, indent=4)
'''

'\nUSE THIS CODE IF YOU WISH TO STORE THE CONTENT OF FILES IN A JSON:\n\nwith open("file.json", "w", encoding="utf-8") as jsonDestination:\n    json.dump(titleContentDictionary, jsonDestination, indent=4)\n'

In [15]:
# Fucntion to clean stopwords and punctuation marks. Call this function if needed
def cleanStopwordsPunctuation(content):
    splittedContent = content.split()
    withStopwordsRemovedList = []
    for word in splittedContent:
        if word not in stopwords.words("english"):
            withStopwordsRemovedList.append(word)
    withStopwordsRemovedString = " ".join(withStopwordsRemovedList)
    punctuationRemoved = withStopwordsRemovedString.translate(str.maketrans("", "", string.punctuation))

    return(punctuationRemoved)

In [16]:
# Define TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    lowercase=True,
    max_features = 100, # an integer. This decides how many output terms from most frequent to least frequent you wish the program to provide. For example, max_features = 100 means you wish to see the 100 most frequent terms (ngrams) in the corpus.
    max_df = 0.8, # a float. Words appearing in over 80% of documents will be ignored.
    min_df = 5, # an integer. Words appearing in less than 5 documents will be ignored.
    ngram_range = (1,3), # looking at single word, double-word-group (bigram), and triple-word-group (trigram). From 1 to 3-term pairs. Could change the number to customize the size of the terms you wish to see. E.g.: (2, 3) would mean only searching for bigrams and trigrams.
    stop_words = "english"
)

In [23]:
# This variable grabs only the values (or the file content) from the "titleContentDictionary" dictionary, since vectorizer only takes in content and not file name. 
contentAsDictValues = titleContentDictionary.values() # Type: <class "dict_values"> . This is NOT a list and should not be passed into "vectorizer.fit_transform" in the next cell!

# This variable stores the content of the files and should be passed into "vectorizer.fit_transform" in the next file.
contentAsList = []

# This "for" loop is to turn the <class "dict_values"> variable "contentAsDictValues" into the <class "list> variable "contentAsList"
for chapter in titleContentDictionary.values():
    contentAsList.append(chapter)
#FOR DEBUG: print(type(contentAsList))

<class 'list'>


In [28]:
'''
THESE VARIABLES ARE FOR DEBUG PURPOSES
test1 = "Los Angeles is a city of quartz, reflecting the lights from a sun that no longer melts even the snowflakes falling during an afternoon in June of 2026."
test2 = "San Francisco is a city of quartz, reflecting the lights from a sun that no longer melts even the snowflakes falling during an afternoon in June of 2026."
test3 = "Detroit is a city of diamonts, reflecting the lights from a moon that no longer melts ice in June of 2025."
test4 = "Detroit is a city of diamonts, reflecting the lights from a moon that no longer melts ice in June of 2025."
'''

'''
USE THIS CODE IF YOU ARE STORING YOUR CONTENT IN A JSON RATHER THAN A DICTIONARY
jsonAsList = []

with open("/Users/Jerry/Desktop/Submission Diary entries", "r") as file:
    stories = json.load(file)
    for story in stories.values():
        jsonAsList.append(story)
'''

vectors = vectorizer.fit_transform(contentAsList)
featuresNames = vectorizer.get_feature_names_out() #no parameters passed in.
dense = vectors.todense()
denseList = dense.tolist()

#this list will be a list of lists --- i.e.: [[content, content], [content, content], [content, content]] that stores the words deemed as significant by the TF-IDF calculation.
allKeywords = []
'''to interpret the "allKeywords" list of lists: the index number of one list is the same as the index number of the file content in the "contentAsList" list. For example, allKeywords[0] calls the first list in the "allKeywords" list of lists. The words in allKeywords[0] are the significant words from the file content stored in contentAsList[0].
'''

for term in denseList:
    x = 0
    keywords = []
    for word in term:
        if word > 0:
            keywords.append(featuresNames[x])
        x += 1
    allKeywords.append(keywords)

# This "for" loop will print out the signifciant words
for list in allKeywords:
    print(list)

'''
THIS CODE WILL PRINT OUT ALL UNIQUE WORDS DEEMED AS SGINIFICANT FROM THE ENTIRE FOLDER.

allKeywordsver2Set = set()
for list in allKeywords:
    temporarySet = set()
    for word in list:
        temporarySet.add(word)
    for word in temporarySet:
        allKeywordsver2Set.add(word)
print(allKeywordsver2Set)
'''

{'led', 'menu', 'distance', 'said', 'present', 'reading', 'year old', 'described', 'covered', 'vote', 'law', 'finished', 'written', 'social', 'lips', 'lot', 'violent', 'considered', 'light', 'shot', 'bad', 'trouble', 'violence', 'sofa', 'media', 'coffee', 'felt', 'water', 'guess', 'respect', 'en route', 'door', 'things', 'glad', 'thousand', 'problem', 'information', 'wing', 'department', 'struck', 'moved', 'eye', 'old man', 'party', 'mayi', 'lost', 'ben abbes', 'powerful', 'sky', 'question', 'muslim brotherhood', 'want', 'ago', 'tone', 'enjoyed', 'man', 'let', 'months', 'marie', 'impossible', 'local', 'bottle', 'strange', 'opened', 'morning', 'lived', 'cold', 'expressed', 'come', 'socialists', 'did', 'dressed', 'close', 'experience', 'aside', 'ready', 'personal', 'failure', 'circumstances', 'smooth', 'devoted', 'face', 'open', 'sorbonne', 'met', 'win', 'pure', 'moment', 'hotel', 'leave', 'yes', 'idea', 'slowly', 'elegant', 'la', 'shopping', 'living room', 'rate', 'france', 'paper', 'tu