## Preliminaries

In [1]:
import os, pickle
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

Load cleaned Novel 450 data (only English novels)

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open("./novel450/novel450_cleaned.pkl", 'rb') as f:
    # Later cells iterate over `docs` and call `.split(' ')` on each item,
    # i.e. it is used as a collection of space-delimited strings.
    docs = pickle.load(f)

## Document-term matrix

Document-term matrix with base Python

In [3]:
def base_dtm(docs):
    """Build a dense document-term count matrix using only built-in types.

    Every document is tokenized by splitting on single spaces.

    Parameters
    ----------
    docs : iterable of str
        The documents; traversed more than once.

    Returns
    -------
    dtm : list of list of int
        One row per document and one column per vocabulary term; each cell
        holds how many times that term occurs in that document.
    vocab : list of str
        Column labels for ``dtm``. The order is the iteration order of the
        underlying set and is therefore arbitrary.
    """
    # First pass: gather every distinct token in the corpus.
    vocab = set()
    for text in docs:
        vocab |= set(text.split(' '))

    # Freeze the column order once and remember each term's column position.
    terms = list(vocab)
    column_of = {term: col for col, term in enumerate(terms)}

    # Second pass: fill one zero-initialised row per document.
    dtm = []
    for text in docs:
        row = [0] * len(terms)
        for token in text.split(' '):
            row[column_of[token]] += 1
        dtm.append(row)

    return dtm, terms

In [4]:
%%time
# Build the list-of-lists DTM and its vocabulary with the pure-Python helper.
dtm, vocab = base_dtm(docs)

CPU times: user 4.12 s, sys: 140 ms, total: 4.26 s
Wall time: 4.26 s


Check our work

In [5]:
# Rebind `dtm` to a DataFrame whose columns are labelled with the vocabulary terms.
dtm = pd.DataFrame(dtm, columns=vocab)
# Sanity check: the number of columns should equal the vocabulary length.
print("Document-term matrix shape:", dtm.shape, "\nVocab length:", len(vocab))

Document-term matrix shape: (150, 86626) 
Vocab length: 86626


In [6]:
# Preview the first 10 rows and the first 10 columns.
dtm.iloc[:10, :10]

Unnamed: 0,aesir,legacied,assizes,splintering,hugged,finely,succored,piratical,inkhorn,fastening
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,2,1,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,1,0,0,0
4,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,3,0,0,0,2
7,0,0,0,0,0,0,1,0,0,5
8,0,0,0,0,0,3,0,0,0,3
9,0,0,0,0,0,0,0,0,0,0


Document-term matrix with NumPy and SciPy

In [7]:
def np_dtm(docs):
    """Build a sparse document-term count matrix with NumPy and SciPy.

    Every document is tokenized by splitting on single spaces.

    Parameters
    ----------
    docs : sequence of str
        The documents. Must support ``len()`` and be iterable twice.

    Returns
    -------
    dtm : scipy.sparse.coo_matrix
        Matrix of shape ``(len(docs), len(vocab))`` with dtype ``np.intc``;
        entry ``(i, j)`` is the number of times ``vocab[j]`` occurs in
        ``docs[i]``.
    vocab : numpy.ndarray
        Column labels for ``dtm``. The order is the iteration order of the
        underlying set and is therefore arbitrary.
    """
    # Pass 1: collect the vocabulary and count how many (document, term)
    # pairs will be stored, so the COO buffers can be allocated exactly once.
    vocab = set()
    n_nonzero = 0
    for doc in docs:
        unique_terms = set(doc.split(' '))
        vocab |= unique_terms
        n_nonzero += len(unique_terms)

    vocab = np.array(list(vocab))
    # `vocab` keeps its original order; the sorter lets searchsorted run a
    # binary search over it as if it were sorted.
    vocab_sorter = np.argsort(vocab)

    ndocs = len(docs)
    nvocab = len(vocab)

    data = np.empty(n_nonzero, dtype=np.intc)
    rows = np.empty(n_nonzero, dtype=np.intc)
    cols = np.empty(n_nonzero, dtype=np.intc)

    # Pass 2: fill the buffers one document at a time.
    idx = 0
    for doc_idx, doc in enumerate(docs):
        tokens = doc.split(' ')
        # Position in sorted order -> position in `vocab` (the column index).
        term_indices = vocab_sorter[np.searchsorted(vocab, tokens, sorter=vocab_sorter)]
        unique_indices, counts = np.unique(term_indices, return_counts=True)

        idx_end = idx + len(unique_indices)

        data[idx:idx_end] = counts
        cols[idx:idx_end] = unique_indices
        # The row number is just the loop position; a scalar broadcasts over
        # the slice, so no per-document scan over all document ids is needed.
        rows[idx:idx_end] = doc_idx

        idx = idx_end

    dtm = coo_matrix((data, (rows, cols)), shape=(ndocs, nvocab), dtype=np.intc)
    return dtm, vocab

In [8]:
%%time
# Build the sparse (COO) DTM and its vocabulary array with the NumPy/SciPy helper.
dtm, vocab = np_dtm(docs)

CPU times: user 9 s, sys: 310 ms, total: 9.3 s
Wall time: 9.31 s


Check our work

In [9]:
# Densify the sparse matrix and rebind `dtm` to a DataFrame labelled with the vocabulary.
# Because `dtm` is rebound, re-running this cell on its own fails: a DataFrame
# has no `.toarray()` method, so the np_dtm cell must be re-run first.
dtm = pd.DataFrame(dtm.toarray(), columns=vocab)
# Sanity check: the number of columns should equal the vocabulary length.
print("Document-term matrix shape:", dtm.shape, "\nVocab length:", len(vocab))

Document-term matrix shape: (150, 86626) 
Vocab length: 86626


In [10]:
# Preview the first 10 rows and the first 10 columns.
dtm.iloc[:10, :10]

Unnamed: 0,aesir,legacied,assizes,splintering,hugged,finely,succored,piratical,inkhorn,fastening
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,2,1,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,1,0,0,0
4,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,3,0,0,0,2
7,0,0,0,0,0,0,1,0,0,5
8,0,0,0,0,0,3,0,0,0,3
9,0,0,0,0,0,0,0,0,0,0


In [11]:
# Print the first 25 vocabulary terms, one per line.
for term in vocab[:25]:
    print(term)

aesir
legacied
assizes
splintering
hugged
finely
succored
piratical
inkhorn
fastening
siliceous
enfiladed
abode
indifferency
watching
unhorse
arguer
disallow
handedly
twelfthly
cujuslibet
propinquity
exquisites
ichthyological
academician


Make the equivalent of a `gensim` dictionary

In [12]:
# Corpus-wide frequency of every term: column sums of the DTM, flattened to 1-D.
term_freq = np.ravel(dtm.sum(axis=0))
# Pair each term with its total count; both sequences are in column order.
term_dict = dict(zip(vocab, term_freq))

In [13]:
# Look up the corpus-wide count of a single term.
term_dict['writer']

532

In [14]:
# Look up the corpus-wide count of another term.
term_dict['patience']

1040

Let's compare the speed of the above methods with `scikit-learn`

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# Instantiate CountVectorizer with no arguments, i.e. its default settings.
vec = CountVectorizer()

In [17]:
%%time
# Fit the vectorizer on the raw document strings and build its count matrix.
# NOTE(review): the recorded shape has 86617 columns versus 86626 for the
# split(' ')-based builders above — presumably because CountVectorizer applies
# its own tokenization rather than splitting on spaces; confirm before
# treating the timings as a like-for-like comparison.
X = vec.fit_transform(docs)
X.shape

CPU times: user 3.34 s, sys: 34.6 ms, total: 3.37 s
Wall time: 3.37 s


(150, 86617)

## Document-term matrix spin-offs

Regardless of what method we use to construct a document-term matrix, let's quickly review how to build other data structures from one. First, let's start with a fresh document-term matrix.

In [18]:
# Rebuild the sparse DTM so `dtm` is a coo_matrix again (it was rebound to a DataFrame earlier).
dtm, vocab = np_dtm(docs)

With that made, we can make a term co-occurrence matrix (each entry sums, over all documents, the product of the two terms' counts in that document)

In [19]:
%%time
tt = dtm.T.dot(dtm)
tt = pd.DataFrame(tt.toarray(), columns=vocab, index=vocab)
np.fill_diagonal(tt.values, 0)

CPU times: user 1min 6s, sys: 41.8 s, total: 1min 48s
Wall time: 2min 6s


In [20]:
# Preview the first 10 rows and the first 10 columns of the term-term matrix.
tt.iloc[:10, :10]

Unnamed: 0,aesir,legacied,assizes,splintering,hugged,finely,succored,piratical,inkhorn,fastening
aesir,0,0,0,0,0,0,0,0,3,3
legacied,0,0,0,0,0,2,0,0,0,1
assizes,0,0,0,0,0,8,0,1,0,3
splintering,0,0,0,0,2,6,0,3,0,9
hugged,0,0,0,2,0,109,0,4,2,64
finely,0,2,8,6,109,0,1,5,11,145
succored,0,0,0,0,0,1,0,0,0,7
piratical,0,0,1,3,4,5,0,0,0,46
inkhorn,3,0,0,0,2,11,0,0,0,13
fastening,3,1,3,9,64,145,7,46,13,0


And here's a document-document matrix (i.e. the dot product of two documents' term-count vectors; when the counts are binary, this is the number of terms the two documents share)

In [21]:
%%time
dd = dtm.dot(dtm.T)
dd = pd.DataFrame(dd.toarray())
np.fill_diagonal(dd.values, 0)

CPU times: user 175 ms, sys: 27.3 ms, total: 202 ms
Wall time: 211 ms


In [22]:
# Preview the first 10 rows and the first 10 columns of the document-document matrix.
dd.iloc[:10, :10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,319233,332032,708957,66275,66416,155917,287463,634880,108597
1,319233,0,1226432,2623068,236484,218177,541581,1082758,2191551,346878
2,332032,1226432,0,3636596,244028,238299,596526,1124730,2512144,424297
3,708957,2623068,3636596,0,527393,536012,1354912,2562971,5435557,867297
4,66275,236484,244028,527393,0,51845,163449,231644,614496,80832
5,66416,218177,238299,536012,51845,0,136622,241493,538251,90554
6,155917,541581,596526,1354912,163449,136622,0,655831,1877004,200943
7,287463,1082758,1124730,2562971,231644,241493,655831,0,2439067,335009
8,634880,2191551,2512144,5435557,614496,538251,1877004,2439067,0,784921
9,108597,346878,424297,867297,80832,90554,200943,335009,784921,0


Let's construct a toy example just to show how these two structures work a bit more clearly

In [23]:
# A 3-document x 3-term toy DTM with binary counts. A plain ndarray is used
# instead of np.matrix, which NumPy no longer recommends; the resulting
# DataFrame is the same.
m = np.array([[0, 1, 1], [1, 0, 1], [0, 1, 1]])
toy_dtm = pd.DataFrame(m, columns=['t1', 't2', 't3'], index=['d1', 'd2', 'd3'])
toy_dtm

Unnamed: 0,t1,t2,t3
d1,0,1,1
d2,1,0,1
d3,0,1,1


Term-term:

In [24]:
# Term-term product of the toy DTM (terms x terms).
toy_tt = toy_dtm.T.dot(toy_dtm)
# Zero the diagonal on a writable copy of the data and rebuild the frame.
# Writing through `DataFrame.values` relies on it being a writable view,
# which fails under pandas Copy-on-Write (read-only array).
toy_tt_values = toy_tt.to_numpy(copy=True)
np.fill_diagonal(toy_tt_values, 0)
toy_tt = pd.DataFrame(toy_tt_values, index=toy_tt.index, columns=toy_tt.columns)
toy_tt

Unnamed: 0,t1,t2,t3
t1,0,0,1
t2,0,0,2
t3,1,2,0


Document-document:

In [25]:
# Document-document product of the toy DTM (documents x documents).
toy_dd = toy_dtm.dot(toy_dtm.T)
# Zero the diagonal on a writable copy of the data and rebuild the frame.
# Writing through `DataFrame.values` relies on it being a writable view,
# which fails under pandas Copy-on-Write (read-only array).
toy_dd_values = toy_dd.to_numpy(copy=True)
np.fill_diagonal(toy_dd_values, 0)
toy_dd = pd.DataFrame(toy_dd_values, index=toy_dd.index, columns=toy_dd.columns)
toy_dd

Unnamed: 0,d1,d2,d3
d1,0,1,2
d2,1,0,1
d3,2,1,0


## Topic modeling

Just to carry things through, let's build a quick topic model with one of our document-term matrices

In [26]:
import lda
import pyLDAvis
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation notices from the libraries used below; consider
# narrowing the filter to the specific category or module that is noisy.
warnings.filterwarnings('ignore')
# Set up pyLDAvis for use inside the notebook.
pyLDAvis.enable_notebook()

In [43]:
%%capture
# Fit an LDA model with 20 topics and 500 iterations on the current `dtm`.
# random_state=357 is passed presumably to make the fit reproducible.
model = lda.LDA(n_topics=20, n_iter=500, random_state=357).fit(dtm)

The vocabulary length should match the number of term columns in the fitted model

In [44]:
# Guard: the second dimension of model.topic_word_ must equal the vocabulary length.
assert len(vocab) == model.topic_word_.shape[1]

Look at top terms for each topic

In [45]:
# Convert the vocabulary to an array once so it can be fancy-indexed per topic.
vocab_arr = np.array(vocab)
for idx, topic_dist in enumerate(model.components_):
    # Term indices ordered from highest to lowest weight within this topic.
    ranked = np.argsort(topic_dist)[::-1]
    # NOTE(review): this keeps the 9 highest-weight terms (same as the slice
    # `[:-10:-1]`), not 10 — confirm that 9 is the intended count.
    topic_words = vocab_arr[ranked[:9]]
    print("Topic {}: {}".format(idx, ' '.join(topic_words)))

Topic 0: slope house year friend love word thought matter told
Topic 1: dog eye head young foot wood fire began back
Topic 2: day thought word room house moment friend place left
Topic 3: hand back eye head voice door red face night
Topic 4: god love heart life soul day word world eye
Topic 5: country house place horse gentleman hae part day town
Topic 6: thing life girl people thought world asked book felt
Topic 7: hand sword castle answered head art father noble honor
Topic 8: mind friend life character nature heart world part day
Topic 9: mother father day thing thought child poor home heart
Topic 10: day dollar money street city thing business office place
Topic 11: young house room day gentleman archer friend dear wife
Topic 12: eye night appeared length voice moment door heard thought
Topic 13: cried young dear answered thing honor give gentleman hand
Topic 14: dear mother young aunt feeling sister child room happy
Topic 15: boy thing back hand give put head night place
Topic 16:

Assemble pieces required for pyLDAvis

In [46]:
def get_doc_lengths(docs):
    """Return the number of space-separated tokens in each document.

    Parameters
    ----------
    docs : iterable of str
        Documents, tokenized here by splitting on single spaces.

    Returns
    -------
    numpy.ndarray
        One token count per document, in input order.
    """
    return np.array([len(text.split(' ')) for text in docs])

In [47]:
# Token count per document, using split(' ') tokenization.
doc_lengths = get_doc_lengths(docs)

In [48]:
# Collect the keyword arguments for pyLDAvis.prepare one entry at a time.
vis_data = {}
vis_data['topic_term_dists'] = model.components_
vis_data['doc_topic_dists'] = model.doc_topic_
vis_data['doc_lengths'] = doc_lengths
vis_data['vocab'] = vocab
# `term_freq` was computed in an earlier cell from a DTM DataFrame; it must be
# in the same column order as `vocab` for the term frequencies to line up.
vis_data['term_frequency'] = term_freq

In [49]:
# Unpack the assembled pieces as keyword arguments to pyLDAvis.prepare.
vis = pyLDAvis.prepare(**vis_data)

In [50]:
# Display the prepared visualization object in the notebook.
pyLDAvis.display(vis)