In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import decomposition
from glob import glob
import os

Using english literature as corpus

https://liferay.de.dariah.eu/tatom/datasets.html

In [8]:
filenames = []
for folder in ["british-fiction-corpus"]: 
    filenames.extend(glob("data/" + folder + "/*.txt"))

In [10]:
vectorizer = TfidfVectorizer(input='filename', stop_words='english')
dtm = vectorizer.fit_transform(filenames).toarray()
vocab = np.array(vectorizer.get_feature_names())
dtm.shape, len(vocab)

((27, 55035), 55035)

Using NMF

In [12]:
clf = decomposition.NMF(n_components=10, random_state=1)

W1 = clf.fit_transform(dtm)
H1 = clf.components_

In [13]:
num_top_words=8

def show_topics(a):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]

In [14]:
def get_all_topic_words(H):
    top_indices = lambda t: {i for i in np.argsort(t)[:-num_top_words-1:-1]}
    topic_indices = [top_indices(t) for t in H]
    return sorted(set.union(*topic_indices))

In [15]:
ind = get_all_topic_words(H1)

In [16]:
vocab[ind]

array(['adams', 'allworthy', 'bounderby', 'catherine', 'cathy',
       'corporal', 'crawley', 'darcy', 'did', 'dorothea', 'earnshaw',
       'edgar', 'elinor', 'emma', 'father', 'finn', 'glegg', 'good',
       'gradgrind', 'hareton', 'heathcliff', 'jones', 'joseph', 'know',
       'lady', 'laura', 'like', 'linton', 'little', 'll', 'lopez',
       'louisa', 'lydgate', 'lyndon', 'maggie', 'man', 'marianne',
       'micawber', 'miss', 'mr', 'mrs', 'old', 'osborne', 'pendennis',
       'philip', 'phineas', 'quoth', 'said', 'sissy', 'sophia', 'sparsit',
       'stephen', 'thought', 'time', 'tis', 'toby', 'tom', 'trim',
       'tulliver', 'uncle', 'wakem', 'weston', 'wharton'], dtype='<U31')

In [17]:
show_topics(H1)

['mr said lydgate mrs dorothea micawber little know',
 'said little like did time know good thought',
 'adams jones said allworthy sophia lady joseph mr',
 'elinor mr emma darcy mrs weston marianne miss',
 'maggie tulliver said tom glegg philip mr wakem',
 'heathcliff linton hareton catherine earnshaw cathy edgar ll',
 'toby said uncle father corporal quoth tis trim',
 'phineas said lopez mr finn man wharton laura',
 'bounderby gradgrind sparsit said mr sissy louisa stephen',
 'crawley said lyndon pendennis old little osborne lady']

Export the data

using FileLink and FileLinks you can run Excel spreadsheets in Jupyter

In [18]:
from IPython.display import FileLink, FileLinks

In [19]:
np.savetxt("britlit_W.csv", W1, delimiter=",", fmt='%.14f')
FileLink('britlit_W.csv')

In [20]:
np.savetxt("britlit_H.csv", H1[:,ind], delimiter=",", fmt='%.14f')
FileLink('britlit_H.csv')

In [21]:
np.savetxt("britlit_raw.csv", dtm[:,ind], delimiter=",", fmt='%.14f')
FileLink('britlit_raw.csv')

Using SVD

In [22]:
U, s, V = decomposition.randomized_svd(dtm, 10)
ind = get_all_topic_words(V)

In [25]:
vocab[ind]

array(['adams', 'allworthy', 'bounderby', 'bretton', 'catherine',
       'crawley', 'crimsworth', 'darcy', 'dashwood', 'did', 'elinor',
       'elizabeth', 'elton', 'emma', 'finn', 'fleur', 'good', 'gradgrind',
       'hareton', 'heathcliff', 'hunsden', 'jennings', 'jones', 'joseph',
       'knightley', 'know', 'lady', 'linton', 'little', 'lopez', 'louisa',
       'madame', 'maggie', 'man', 'marianne', 'miss', 'monsieur', 'mr',
       'mrs', 'pelet', 'phineas', 'said', 'sissy', 'sophia', 'sparsit',
       'toby', 'tulliver', 'uncle', 'weston'], dtype='<U31')

In [26]:
show_topics(H1)

['mr said lydgate mrs dorothea micawber little know',
 'said little like did time know good thought',
 'adams jones said allworthy sophia lady joseph mr',
 'elinor mr emma darcy mrs weston marianne miss',
 'maggie tulliver said tom glegg philip mr wakem',
 'heathcliff linton hareton catherine earnshaw cathy edgar ll',
 'toby said uncle father corporal quoth tis trim',
 'phineas said lopez mr finn man wharton laura',
 'bounderby gradgrind sparsit said mr sissy louisa stephen',
 'crawley said lyndon pendennis old little osborne lady']

In [27]:
np.savetxt("britlit_U.csv", U, delimiter=",", fmt='%.14f')
FileLink('britlit_U.csv')

In [28]:
np.savetxt("britlit_V.csv", V[:,ind], delimiter=",", fmt='%.14f')
FileLink('britlit_V.csv')

In [29]:
np.savetxt("britlit_raw_svd.csv", dtm[:,ind], delimiter=",", fmt='%.14f')
FileLink('britlit_raw_svd.csv')

In [30]:
np.savetxt("britlit_S.csv", np.diag(s), delimiter=",", fmt='%.14f')
FileLink('britlit_S.csv')