In [None]:
!pip install constellate-client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Collecting Dataset

In [None]:
# Importing your dataset with a dataset ID
import constellate

# Default dataset is "Shakespeare Quarterly," 1950-present
dataset_id = "7e41317e-740f-e86a-4729-20dab492e925"

# Fetch the sampled dataset (1500 documents) that matches `dataset_id`.
# The .get_dataset() method downloads a gzipped JSON lines (JSONL) file
# to the /data folder and returns a string holding the file name and location.
dataset_file = constellate.get_dataset(dataset_id)

# To work with the full dataset instead (up to a limit of 25,000 documents),
# request it first in the builder environment. See the Constellate Client
# documentation at: https://constellate.org/docs/constellate-client
# Then use the `constellate.download` method shown below.
# dataset_file = constellate.download(dataset_id, 'jsonl')

Constellate: use and download of datasets is covered by the Terms & Conditions of Use: https://constellate.org/terms-and-conditions/
All documents from JSTOR published in Shakespeare Quarterly from 1950 - 2020. 6745 documents.
INFO:root:File /content/data/7e41317e-740f-e86a-4729-20dab492e925-sampled-jsonl.jsonl.gz exists. Not re-downloading.


# Apply Pre-Processing Filters (if available)

In [None]:
# Look for a pre-processed CSV file of filtered dataset IDs.
# Without one, the analysis runs on the full dataset and
# may take longer to complete.
import pandas as pd
import os

pre_processed_file_name = f'data/pre-processed_{dataset_id}.csv'

if not os.path.exists(pre_processed_file_name):
    # No filter available: every document will be analyzed
    use_filtered_list = False
    print('No pre-processed CSV file found. Full dataset will be used.')
else:
    # Keep only the "id" column values; they select which documents to analyze
    df = pd.read_csv(pre_processed_file_name)
    filtered_id_list = df["id"].tolist()
    use_filtered_list = True
    print(f'Pre-Processed CSV found. Successfully read in {len(df)} documents.')

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
No pre-processed CSV file found. Full dataset will be used.


# Define a Unigram Processing Function

In this step, we gather the unigrams. If there is a Pre-Processing Filter, we will only analyze documents from the filtered ID list. We will also process each unigram, assessing them individually. We will complete the following tasks:

- Lowercase all tokens

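- Remove tokens in a stopwords list (note: the code in this notebook does not apply a stopwords list by default, so common words such as "that" and "with" remain in the results)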

- Remove tokens with fewer than 4 characters

- Remove tokens with non-alphabetic characters

We can define this process in a function.

In [None]:
# Define a function that will process individual tokens.
# A token is returned only if it passes every check below;
# failing any single check returns `None` instead of the token.

def process_token(token, min_length=4, stop_words=None):
    """Lowercase a token and return it only if it passes every filter.

    Args:
        token: The unigram (a string) to process.
        min_length: Minimum number of characters a token needs in order
            to be kept. Defaults to 4, so tokens with fewer than 4
            characters are dropped.
        stop_words: Optional collection of lowercase words to drop.
            Defaults to None, which applies no stop word filtering.

    Returns:
        The lowercased token, or None if the token was filtered out.
    """
    token = token.lower()
    if len(token) < min_length: # Too short: do not return token
        return None
    if not token.isalpha(): # Contains non-alphabetic characters: do not return token
        return None
    if stop_words is not None and token in stop_words: # Stop word: do not return token
        return None
    return token # Every check passed, return the lowercased token

# Collect lists of Document IDs, Titles, and Unigrams

In [None]:
documents = [] # A list that will contain all of our unigrams
document_ids = [] # A list that will contain all of our document ids
document_titles = [] # A list that will contain all of our titles

# Build the lookup set once so that each membership test does not have to
# scan the whole `filtered_id_list`. `None` means "no filter in use".
filtered_ids = set(filtered_id_list) if use_filtered_list is True else None

for document in constellate.dataset_reader(dataset_file):
    document_id = document['id'] # Temporarily store the document id for this document
    document_title = document['title'] # Temporarily store the document title for this document
    # Skip documents not in our filtered ID list
    if filtered_ids is not None and document_id not in filtered_ids:
        continue
    # Fall back to an empty dict (not a list) so that `.items()` below
    # still works when a document has no "unigramCount" data
    unigrams = document.get("unigramCount") or {}
    processed_document = [] # Temporarily store the unigrams for this document
    for gram, count in unigrams.items():
        clean_gram = process_token(gram)
        if clean_gram is None:
            continue
        # Add the unigram as many times as it was counted
        processed_document.extend([clean_gram] * count)
    # Only keep documents that still have at least one unigram after filtering
    if processed_document:
        document_ids.append(document_id)
        document_titles.append(document_title)
        documents.append(processed_document)

In [None]:
# Show the unigrams collected for a particular document
# Change the value of n to see a different document
n = 0

print(document_titles[n])
# Preview only the first 100 unigrams; a full document can be very long
# and would flood the cell output
documents[n][:100]

Review Article


['social',
 'notable',
 'discuss',
 'voluminous',
 'least',
 'least',
 'distillation',
 'have',
 'have',
 'since',
 'hope',
 'never',
 'never',
 'prob',
 'spite',
 'among',
 'rare',
 'dealt',
 'actually',
 'taking',
 'taking',
 'truer',
 'evolved',
 'competitor',
 'education',
 'pressure',
 'excellence',
 'cambridge',
 'students',
 'fashionable',
 'shakespeare',
 'shakespeare',
 'usher',
 'study',
 'essential',
 'guneratne',
 'tradition',
 'intervals',
 'harshly',
 'organization',
 'consider',
 'historiographers',
 'female',
 'operations',
 'reviews',
 'considering',
 'shows',
 'participants',
 'established',
 'denigrating',
 'synchronized',
 'only',
 'only',
 'before',
 'chronology',
 'times',
 'treatment',
 'archaeopteryx',
 'entirely',
 'entirely',
 'screened',
 'might',
 'even',
 'even',
 'even',
 'fail',
 'response',
 'interest',
 'interest',
 'falsifies',
 'method',
 'talents',
 'that',
 'that',
 'that',
 'that',
 'that',
 'that',
 'that',
 'that',
 'that',
 'that',
 'that',
 'th

In [None]:
# Count how often each word occurs in a given document
# by loading the document into a Counter object

# Counter does the word frequency counting for us
from collections import Counter

word_freq = Counter()
word_freq.update(documents[1]) # Change documents index to see a different document
word_freq.most_common(25)

[('with', 12),
 ('gentle', 6),
 ('have', 5),
 ('that', 5),
 ('shakespeare', 4),
 ('jupiter', 4),
 ('this', 4),
 ('rosalind', 4),
 ('most', 4),
 ('name', 4),
 ('first', 4),
 ('orlando', 4),
 ('scene', 3),
 ('reading', 3),
 ('when', 3),
 ('ganymede', 3),
 ('good', 2),
 ('next', 2),
 ('folio', 2),
 ('half', 2),
 ('modern', 2),
 ('feel', 2),
 ('lovemaking', 2),
 ('texts', 2),
 ('your', 2)]

# Using Gensim to Compute "Term Frequency-Inverse Document Frequency"

# Creating a Gensim Dictionary

A gensim dictionary is a kind of master list of all the words across all the documents in our corpus. Each unique word is assigned an ID in the gensim dictionary. The result is a set of key/value pairs of unique tokens and their unique IDs.

In [None]:
import gensim

# Build the gensim dictionary from every document's list of unigrams
dictionary = gensim.corpora.Dictionary(documents=documents)

INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English
INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(96666 unique tokens: ['about', 'abundance', 'academy', 'acceptance', 'accom']...) from 1499 documents (total 2055259 corpus positions)


The gensim dictionary stores a unique identifier (starting with 0) for every unique token in the corpus. The gensim dictionary does not contain information on word frequencies; it only catalogs all the unique words in the corpus. You can see the unique ID for each token in the text using the `.token2id` attribute.

In [None]:
# Preview the first 100 (token, id) pairs; the full dictionary holds
# tens of thousands of entries and would flood the cell output
list(dictionary.token2id.items())[:100]

[('about', 0),
 ('abundance', 1),
 ('academy', 2),
 ('acceptance', 3),
 ('accom', 4),
 ('accommodated', 5),
 ('account', 6),
 ('accounts', 7),
 ('accurate', 8),
 ('achievement', 9),
 ('achievements', 10),
 ('acknowledge', 11),
 ('action', 12),
 ('actually', 13),
 ('adaptations', 14),
 ('additional', 15),
 ('advanced', 16),
 ('afterword', 17),
 ('alimentary', 18),
 ('already', 19),
 ('although', 20),
 ('american', 21),
 ('among', 22),
 ('amplifications', 23),
 ('ance', 24),
 ('anthony', 25),
 ('anticipated', 26),
 ('appear', 27),
 ('appeared', 28),
 ('appears', 29),
 ('appellation', 30),
 ('approach', 31),
 ('appropriates', 32),
 ('appropriating', 33),
 ('approximate', 34),
 ('archaeopteryx', 35),
 ('arguments', 36),
 ('arsenio', 37),
 ('articulate', 38),
 ('asides', 39),
 ('assayed', 40),
 ('assortment', 41),
 ('assume', 42),
 ('asta', 43),
 ('attached', 44),
 ('attempts', 45),
 ('attendant', 46),
 ('attitude', 47),
 ('award', 48),
 ('ball', 49),
 ('become', 50),
 ('been', 51),
 ('befo

In [None]:
# Get the value for the key 'people'. The number returned is the
# gensim dictionary ID for the token.
# Nothing (None) is returned if there is no token matching 'people'.
# A default of 0 would be ambiguous, because 0 is itself a valid
# dictionary ID (IDs start at 0).

dictionary.token2id.get('people')

2391

In [None]:
# Find the token associated with a token id number
token_id = 2391

# Look the id up directly instead of scanning every dictionary entry.
# If the token id matches an entry, print out the associated token;
# if it does not, nothing is printed.
token = dictionary.get(token_id)
if token is not None:
    print(token)

people


# Creating a Bag of Words Corpus

In [None]:
# Create a bag of words corpus: one bag of words for each document
bow_corpus = [dictionary.doc2bow(document) for document in documents]

print('Bag of words corpus created successfully.')

# The list comprehension could also be written as a for loop
# bow_corpus = []
# for document in documents:
#     bow_corpus.append(dictionary.doc2bow(document))

Bag of words corpus created successfully.


In [None]:
# Look at the bag of words corpus entry for one document, n
# Set n to a different value to look at another document
n = 50

# A slice of the first 25 (token id, count) items for document n
bow_corpus[n][:25]

[(69, 1),
 (209, 2),
 (365, 3),
 (403, 6),
 (511, 1),
 (527, 1),
 (542, 1),
 (710, 1),
 (716, 1),
 (780, 2),
 (802, 1),
 (832, 1),
 (895, 2),
 (900, 1),
 (937, 2),
 (966, 1),
 (967, 1),
 (1011, 2),
 (1777, 2),
 (1903, 1),
 (2082, 1),
 (2456, 2),
 (3433, 1),
 (4095, 1),
 (4909, 1)]

In [None]:
# For each token id and count in the bag of words corpus entry for document n,
# print the corresponding word from the Gensim dictionary and its count.
# The loop variable is named `token_id` so that Python's builtin `id`
# function is not overwritten for the rest of the notebook.
for token_id, count in bow_corpus[n]:
    print(dictionary[token_id].ljust(25), count)

business                  1
folger                    2
quarterly                 3
shakespeare               6
with                      1
association               1
dealing                   1
addressed                 1
america                   1
correspondence            2
editor                    1
founded                   1
library                   2
literary                  1
other                     2
publication               1
published                 1
should                    2
matters                   2
same                      1
books                     1
sent                      2
review                    1
accepted                  1
offered                   1
relating                  1
manuscripts               2
patricia                  1
subscriptions             1


# Create the TfidfModel

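The next step is to create the TF-IDF model, which sets the parameters for our implementation of TF-IDF. In our TF-IDF example, the formula for TF-IDF was:

$$\text{tf-idf}(t, d) = \text{tf}(t, d) \cdot \log \frac{N}{\text{df}(t)}$$

where $\text{tf}(t, d)$ is the number of times term $t$ occurs in document $d$, $N$ is the total number of documents, and $\text{df}(t)$ is the number of documents that contain $t$. The model below is configured with `smartirs='atc'`, gensim's SMART notation for augmented term frequency, inverse document frequency, and cosine normalization, so its scores are scaled differently from the raw formula.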

In [None]:
# Fit our gensim TF-IDF model on the bag of words corpus
model = gensim.models.TfidfModel(
    corpus=bow_corpus,
    smartirs='atc',
)

INFO:gensim.models.tfidfmodel:collecting document frequencies
INFO:gensim.models.tfidfmodel:PROGRESS: processing document #0
INFO:gensim.models.tfidfmodel:calculating IDF weights for 1499 documents and 96665 features (965947 matrix non-zeros)


In [None]:
# Create TF-IDF scores for the ``bow_corpus`` using our model.
# Indexing the model with the whole corpus produces the scored corpus
# that the following cells read one document at a time.
corpus_tfidf = model[bow_corpus]

In [None]:
# Show the TF-IDF scores for the first 25 tokens of the nth document
# Set n to a different value to look at another document
n = 0

corpus_tfidf[n][:25]

[(0, 0.004165359318460266),
 (1, 0.05273152277773158),
 (2, 0.03896359725500704),
 (3, 0.03249886655811404),
 (4, 0.07677324222557771),
 (5, 0.05534224658180443),
 (6, 0.014382272407544904),
 (7, 0.027113784416215946),
 (8, 0.0373379713845312),
 (9, 0.03620665872940413),
 (10, 0.0491437025430276),
 (11, 0.03344972020380123),
 (12, 0.016698377189814092),
 (13, 0.0212699539322715),
 (14, 0.04509821793214101),
 (15, 0.026893808282929015),
 (16, 0.034346331097227095),
 (17, 0.050577386719316234),
 (18, 0.07677324222557771),
 (19, 0.01735223344771443),
 (20, 0.012660105724594202),
 (21, 0.019753429292405013),
 (22, 0.012142896457206141),
 (23, 0.08178276354379453),
 (24, 0.04477049198716163)]

In [None]:
# Display the tokens instead of the gensim dictionary IDs.
# The loop variable is named `token_id` so that Python's builtin `id`
# function is not overwritten for the rest of the notebook.
for token_id, score in corpus_tfidf[n][:25]:
    print(dictionary[token_id].ljust(20), score)

about                0.004165359318460266
abundance            0.05273152277773158
academy              0.03896359725500704
acceptance           0.03249886655811404
accom                0.07677324222557771
accommodated         0.05534224658180443
account              0.014382272407544904
accounts             0.027113784416215946
accurate             0.0373379713845312
achievement          0.03620665872940413
achievements         0.0491437025430276
acknowledge          0.03344972020380123
action               0.016698377189814092
actually             0.0212699539322715
adaptations          0.04509821793214101
additional           0.026893808282929015
advanced             0.034346331097227095
afterword            0.050577386719316234
alimentary           0.07677324222557771
already              0.01735223344771443
although             0.012660105724594202
american             0.019753429292405013
among                0.012142896457206141
amplifications       0.08178276354379453
ance     

# Find Top Terms in a Single Document

In [None]:
# Sort the (token id, score) tuples in our tf-idf scores list
# to find the top terms of a single document

# Choosing a document by its index number
# Change n to see a different document
n = 0

def Sort(tfidf_tuples):
    """Return the tuples ordered from the highest to the lowest tf-idf score.

    Args:
        tfidf_tuples: An iterable of tuples whose second value is the
            tf-idf score.

    Returns:
        A new list sorted on the second value of each tuple, highest
        first. The input is left unmodified, so sorting a document's
        scores never reorders the data they came from.
    """
    return sorted(tfidf_tuples, key=lambda pair: pair[1], reverse=True)

# Print the document id and title
print('Title: ', document_titles[n])
print('ID: ', document_ids[n])
print('----------------------------------------')

# List the top twenty tokens in our example document by their TF-IDF scores
# First we sort the tokens with their scores
most_significant_terms = Sort(corpus_tfidf[n])[:20]

# Next we print the list, replacing the token ids with the tokens.
# The loop variable is named `token_id` so that Python's builtin `id`
# function is not overwritten for the rest of the notebook.
for token_id, score in most_significant_terms:
    print(dictionary[token_id].ljust(20), score)

Title:  Review Article
ID:  http://www.jstor.org/stable/23025656
----------------------------------------
vardac               0.09636970325498885
archaeopteryx        0.09034659680155203
arsenio              0.09034659680155203
bergmans             0.09034659680155203
commemora            0.09034659680155203
concentrat           0.09034659680155203
contingen            0.09034659680155203
disjunc              0.09034659680155203
falsifies            0.09034659680155203
frugoni              0.09034659680155203
godzillas            0.09034659680155203
guneratne            0.09034659680155203
novelizations        0.09034659680155203
pathe                0.09034659680155203
porten               0.09034659680155203
practi               0.09034659680155203
rearguard            0.09034659680155203
synetic              0.09034659680155203
vitagraph            0.09034659680155203
buchanan             0.08189645623582076


We also analyze across the entire corpus to find the most unique terms. These are terms that appear in a particular text, but rarely or never appear in other texts. (Often, these will be proper names since a particular article may mention a name often but the name may rarely appear in other articles. There’s also a fairly good chance these will be typos or errors in optical character recognition.)

In [None]:
# Record, for every term, the highest TF-IDF score it reaches in any document
td = {}
for doc_scores in corpus_tfidf:
    for token_id, score in doc_scores:
        term = dictionary.get(token_id)
        # Keep the new score only when it beats the best one seen so far
        if score > td.get(term, 0):
            td[term] = score

In [None]:
# Order the (term, score) items of ``td`` from the highest score to the
# lowest and keep them in a new variable, ``sorted_td``
sorted_td = list(td.items())
sorted_td.sort(key=lambda pair: pair[1], reverse=True)

# Print the top 25 terms in the entire corpus
for term, weight in sorted_td[:25]:
    print(term.ljust(20), weight)

cwtrnca              0.9777852589476882
terence              0.9498900312074794
nuimber              0.8319747660464009
tuft                 0.7868376014650895
emblemata            0.7852131956320808
ouderdom             0.7766128344659354
wenceslaus           0.7639576845819294
gaiicanus            0.7601563052782292
reproducedfrmtefgr   0.7572397678450393
dicus                0.7265126251019742
houwelyck            0.6967260500651924
hrotsvits            0.6912337411707761
gibraltar            0.6631087398173557
comediae             0.6456051905232666
penshurst            0.6366533423710458
duodekas             0.6325106929571689
emblematum           0.6325106929571689
jike                 0.6313999059233509
evenyng              0.6293035863369911
mornyng              0.6293035863369911
alcmena              0.6252388155874612
amphitryo            0.6252388155874612
captivi              0.6206171107588102
johan                0.6192255141749697
dormitory            0.6090366868084331


# Display Most Significant Term for each Document

In [None]:
# For each document, print the ID, most significant/unique word, and TF/IDF score

# Only the documents at the first `max_docs` positions of the corpus are reported
max_docs = 11

# `doc_index` is used as the loop variable so that this cell does not
# overwrite `n`, the document selector used by the other cells
for doc_index, doc in enumerate(corpus_tfidf):
    # Check the limit first, so that skipping an empty document
    # can never bypass the stopping condition
    if doc_index >= max_docs:
        break
    if len(doc) < 1: # No scored terms for this document
        continue
    word_id, score = max(doc, key=lambda x: x[1])
    print(document_ids[doc_index], dictionary.get(word_id), score)

http://www.jstor.org/stable/23025656 vardac 0.09636970325498885
http://www.jstor.org/stable/2866842 atomies 0.16414829725545108
http://www.jstor.org/stable/2870361 ishrat 0.09742685996867113
http://www.jstor.org/stable/2869042 birbeck 0.08884924780770011
http://www.jstor.org/stable/2866396 recitative 0.08891242784008163
http://www.jstor.org/stable/2868260 auff 0.0706386429797704
http://www.jstor.org/stable/2866945 fhall 0.10940495221853316
http://www.jstor.org/stable/2866486 tarras 0.14019047478795563
http://www.jstor.org/stable/2870020 moriarty 0.0839037873576494
http://www.jstor.org/stable/2867158 mavortio 0.19436992001388878
http://www.jstor.org/stable/2868374 viliam 0.07751320685198493


# Ranking documents by TF-IDF Score for a Search Word

In [None]:
# Set a limit on the number of documents analyzed
limit = 1000

from collections import defaultdict

# Maps each term to a list of (document index, TF-IDF score) pairs
terms_to_docs = defaultdict(list)
for doc_id, doc in enumerate(corpus_tfidf):
    # Stop before processing, so that exactly `limit` documents
    # (indexes 0 through limit - 1) are analyzed
    if doc_id >= limit:
        break
    for term_id, value in doc:
        term = dictionary.get(term_id)
        terms_to_docs[term].append((doc_id, value))

In [None]:
# Pick a unigram to discover its score across documents
search_term = 'coriolanus'

# Display a list of documents and scores for the search term,
# ordered from the highest score to the lowest

matching = terms_to_docs.get(search_term)

# Test for a missing term explicitly instead of catching every exception,
# so that unrelated errors are not reported as "not found"
if not matching:
    print('Search term not found. Change the term or expand the corpus size.')
else:
    for doc_id, score in sorted(matching, key=lambda x: x[1], reverse=True):
        print(document_titles[doc_id].ljust(50), score)

Front Matter                                       0.07233120491971268
Review Article                                     0.06992737046641462
Review Article                                     0.06175695973766614
Review Article                                     0.05481377815152557
The Great Lakes Shakespeare Festival               0.05260194079913104
Review Article                                     0.0523969885483399
Front Matter                                       0.051669346840501076
Review Article                                     0.050001972055823504
The New Arden Coriolanus                           0.04874219511638209
Now Then, Now Then, What's Going on Here?          0.048729668040474586
Notes and Comments                                 0.046098952115329515
Review Article                                     0.04413483928415584
Front Matter                                       0.042208706494066855
Review Article                                     0.04086851827888139
Sh