In [1]:
import nltk
import numpy as np
import matplotlib.pyplot as plt

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
# One shared lemmatizer instance, created once so my_tokenizer can reuse it
# for every title instead of constructing a new one per call.
wordnet_lemmatizer = WordNetLemmatizer()

In [4]:
# Load the corpus: one book title per line, surrounding whitespace removed.
# The context manager closes the file as soon as the list is built instead of
# leaving the handle open until the garbage collector gets to it.
with open('all_book_titles.txt') as titles_file:
    titles = [line.strip() for line in titles_file]

In [7]:
# Baseline stopword list: NLTK's English stopwords, held in a set so the
# membership tests in the tokenizer are O(1).
stops = {*stopwords.words('english')}

In [8]:
# Domain-specific additions: words that show up in many book titles as
# publishing boilerplate and so say nothing about a book's subject.
# The union builds a new set, so re-running this cell gives the same result.
stops = stops | {
    # edition / volume markers
    'edition', 'series', 'vol', 'volume',
    'second', 'third', 'fourth',
    # generic textbook wording
    'introduction', 'application', 'approach', 'brief',
    'fundamental', 'guide', 'essential',
    # bundle / format wording
    'card', 'access', 'package', 'plus', 'etext', 'printed',
}

In [13]:
def my_tokenizer(s):
    """Convert a raw title string into a list of cleaned, lemmatized tokens.

    Steps, applied to each token produced by ``nltk.tokenize.word_tokenize``
    on the lower-cased input:

    1. drop tokens of two characters or fewer;
    2. drop tokens that are stopwords in their surface form;
    3. lemmatize with the module-level ``wordnet_lemmatizer``;
    4. drop lemmas that are stopwords;
    5. drop lemmas containing any digit.

    Parameters
    ----------
    s : str
        The text to tokenize.

    Returns
    -------
    list of str
        Surviving lemmas, in their original order (duplicates are kept).
    """
    kept = []
    for token in nltk.tokenize.word_tokenize(s.lower()):
        # Remove short words that are not useful
        if len(token) <= 2:
            continue

        # Check the surface form first: lemmatization can rewrite a stopword
        # into something that is no longer in the list (e.g. "was" can become
        # "wa"), which would let it slip past a lemma-only check.
        if token in stops:
            continue

        # Put the word into its base form
        lemma = wordnet_lemmatizer.lemmatize(token)

        # Check again after lemmatization so inflected forms of stopwords
        # (e.g. a plural of a domain-specific stopword) are removed too.
        if lemma in stops:
            continue

        # Remove anything containing a digit ("3rd", "2nd", years, ...)
        if any(ch.isdigit() for ch in lemma):
            continue

        kept.append(lemma)

    return kept

In [14]:
# binary=True records whether a term occurs in a title rather than how often.
# Because a custom tokenizer is supplied, the vectorizer's default regex
# token_pattern is never used; passing token_pattern=None states that
# explicitly and avoids scikit-learn's "token_pattern will not be used" warning.
vectorizer = CountVectorizer(
    binary=True,
    tokenizer=my_tokenizer,
    token_pattern=None,
)

In [15]:
# Learn the vocabulary from the titles and build the document-term matrix
# in one step (one row per title, one column per vocabulary term).
X = vectorizer.fit_transform(titles)

In [16]:
# Index -> word lookup, used later to label the plotted points:
# position i holds the vocabulary term for feature column i of the vectorizer.

index_word_map = vectorizer.get_feature_names_out()

In [17]:
# Flip the matrix so that rows are terms and columns are documents.
# NOTE: X is rebound to its own transpose, so running this cell a second time
# without re-running the fit_transform cell flips it back to documents x terms.
X = X.transpose()

In [18]:
# Reduce each row of X to two latent components so it can be drawn in 2-D.
# n_components=2 is the library default, spelled out because the plot below
# indexes exactly two columns of Z. random_state pins the default randomized
# solver so that re-running the notebook reproduces the same coordinates.
svd = TruncatedSVD(n_components=2, random_state=0)
Z = svd.fit_transform(X)

In [20]:
# One-time setup, left commented out so a full re-run does not reinstall:
# %pip install plotly==5.10.0

Collecting plotly
  Downloading plotly-5.10.0-py2.py3-none-any.whl (15.2 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.10.0 tenacity-8.0.1


In [21]:
import plotly.express as px

In [24]:
# Plot every term at its first two SVD coordinates, labelled with the term.
# A title and axis labels are set so the figure can be read on its own.
# size_max is omitted: it only scales markers when a `size` argument is given.
fig = px.scatter(
    x=Z[:, 0],
    y=Z[:, 1],
    text=index_word_map,
    title='Book-title terms projected onto the first two SVD components',
    labels={'x': 'SVD component 1', 'y': 'SVD component 2'},
)
# Put each label above its marker so the text does not cover the point.
fig.update_traces(textposition='top center')
fig.show()