In [None]:
# Reference: https://nbviewer.jupyter.org/github/rare-technologies/gensim/blob/develop/docs/notebooks/atmodel_tutorial.ipynb

In [2]:
# Download NIPS data
# wget streams the archive to stdout ("-O -") and the shell redirect stores it as
# nips12raw_str602.tgz in the working directory. Re-running this cell downloads
# the file again and overwrites the previous copy.
!wget -O - 'http://www.cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' > nips12raw_str602.tgz

--2018-07-26 12:14:30--  http://www.cs.nyu.edu/~roweis/data/nips12raw_str602.tgz
Resolving www.cs.nyu.edu (www.cs.nyu.edu)... 128.122.49.30
Connecting to www.cs.nyu.edu (www.cs.nyu.edu)|128.122.49.30|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz [following]
--2018-07-26 12:14:33--  https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz
Resolving cs.nyu.edu (cs.nyu.edu)... 128.122.49.30
Connecting to cs.nyu.edu (cs.nyu.edu)|128.122.49.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12851423 (12M) [application/x-gzip]
Saving to: ‘STDOUT’


2018-07-26 12:14:55 (616 KB/s) - written to stdout [12851423/12851423]



In [3]:
# Unzip dataset, which is nicely annotated with authors
import tarfile

# Unpack every member of the downloaded archive into the current directory.
# The with-block closes the archive handle even if extraction fails part-way;
# the previous version left the tar file open.
# NOTE(review): extraction trusts the member paths stored in the archive, so
# only run this on an archive obtained from a source you trust.
filename = 'nips12raw_str602.tgz'
with tarfile.open(filename, 'r:gz') as tar:
    tar.extractall(path='.')

In [6]:
import os, re
from smart_open import smart_open

# Folder containing all NIPS papers.
data_dir = './nipstxt/'  # Set this path to the data on your machine.

# Folders containing individual NIPS papers ("nips00" ... "nips12").
yrs = ['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
dirs = ['nips' + yr for yr in yrs]

# Get all document texts and their corresponding IDs.
# docs[k] holds the text of the paper whose ID is doc_ids[k]; an ID has the
# form "<yr>_<number>", e.g. "00_123".
docs = []
doc_ids = []
for yr_dir in dirs:
    # os.listdir() returns names in arbitrary order; sort them so that the
    # position of each document (and everything derived from it) is the same
    # on every machine and every run.
    files = sorted(os.listdir(data_dir + yr_dir))
    for filen in files:
        # Get document ID: the first run of digits in the file name.
        id_match = re.search('[0-9]+', filen)
        if id_match is None:
            # No number in the name, so this is not a paper file; skip it
            # instead of crashing on the missing match.
            continue
        doc_ids.append(yr_dir[4:] + '_' + str(int(id_match.group())))

        # Read document text.
        # Note: ignoring characters that cause encoding errors.
        with smart_open(data_dir + yr_dir + '/' + filen, 'rb', encoding='utf-8', errors='ignore') as fid:
            txt = fid.read()

        # Replace every whitespace character (newline, tab, etc.) with a space.
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        txt = re.sub(r'\s', ' ', txt)

        docs.append(txt)

In [9]:
# Number of documents that were read. The expression is live (not commented
# out) so that the cell source actually produces the output recorded below it.
len(docs)

1740

In [12]:
from smart_open import smart_open
# filenames = [data_dir + 'idx/a' + yr + '.txt' for yr in yrs]  # Using the years defined in previous cell.

# Get all author names and their corresponding document IDs.
# author2doc maps an author name (whitespace removed) to a list of documents.
author2doc = dict()
for yr in yrs:
    # The files "a00.txt" and so on contain the author-document mappings.
    filename = data_dir + 'idx/a' + yr + '.txt'
    # Open inside a with-block so each index file is closed once it is read.
    with smart_open(filename, 'rb', errors='ignore', encoding='utf-8') as fin:
        for line in fin:
            # Each line corresponds to one author. Fields are comma-separated:
            # the first two are the parts of the name, the rest are document IDs.
            contents = line.split(',')
            author_name = (contents[1] + contents[0]).strip()
            # Remove any whitespace to reduce redundant author names.
            author_name = re.sub(r'\s', '', author_name)
            # Get document IDs for author.
            ids = [c.strip() for c in contents[2:]]
            # Create the author's list on first sight, then add the documents,
            # prefixed with the year so they match the IDs in doc_ids.
            author2doc.setdefault(author_name, []).extend(
                [yr + '_' + doc_num for doc_num in ids]
            )

# Use an integer ID in author2doc, instead of the IDs provided in the NIPS dataset.
# Mapping from ID of document in NIPS dataset, to an integer ID (its index in doc_ids).
doc_id_dict = dict(zip(doc_ids, range(len(doc_ids))))
# Replace NIPS IDs by integer IDs. Only values of existing keys are replaced,
# so updating the dict while looping over it is safe.
# A KeyError here means an index file names a document that was not read.
for author, nips_ids in author2doc.items():
    author2doc[author] = [doc_id_dict[nips_id] for nips_id in nips_ids]

In [13]:
import spacy
# Load the small English pipeline by its full package name. The 'en' shortcut
# link used previously only exists in spaCy 2.x installs and was removed in
# spaCy 3; the full name works in both.
# NOTE(review): assumes 'en' pointed at en_core_web_sm (what
# `python -m spacy download en` installs) -- confirm for your environment.
nlp = spacy.load('en_core_web_sm')

In [14]:
%%time
# Run every raw text through the spaCy pipeline and reduce it to a list of
# strings: lemmas of the kept tokens followed by multi-token named entities.
processed_docs = []
# n_threads is no longer passed: spaCy deprecated it in v2.1 and rejects it
# in v3. It does not affect the produced tokens.
for spacy_doc in nlp.pipe(docs, batch_size=100):
    ents = spacy_doc.ents  # Named entities.

    # Keep only words (no numbers, no punctuation), drop stopwords, and
    # lemmatize what is left. A separate name is used for the token list so
    # the spaCy document is not overwritten by a different kind of object.
    tokens = [token.lemma_ for token in spacy_doc if token.is_alpha and not token.is_stop]

    # Add named entities, but only if they are a compound of more than one word.
    tokens.extend([str(entity) for entity in ents if len(entity) > 1])

    processed_docs.append(tokens)

# This step is slow: the recorded run used ~28 min of CPU time (~20 min wall time).

CPU times: user 28min 1s, sys: 7min 28s, total: 35min 29s
Wall time: 20min 38s


In [15]:
# Rebind `docs` to the tokenized documents (the raw texts are no longer
# reachable through this name) and drop the temporary name.
# Because `processed_docs` is deleted here, re-running this cell on its own
# raises NameError; re-run the preprocessing cell first.
docs = processed_docs
del processed_docs

In [16]:
# Compute bigrams.
from gensim.models import Phrases
# Add bigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for doc_tokens in docs:
    # A token containing '_' is treated as a bigram produced by the phrase
    # model. Collect the additions first, then append them to the document.
    found_bigrams = [tok for tok in bigram[doc_tokens] if '_' in tok]
    doc_tokens.extend(found_bigrams)



In [17]:
# Create a dictionary representation of the documents, and filter out frequent and rare words.

from gensim.corpora import Dictionary
# Thresholds used to filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 20

# Build the dictionary from the tokenized documents, then remove rare and
# common tokens.
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_above=max_freq, no_below=min_wordcount)

# Looking up one ID "initializes" dictionary.id2token, which is needed later.
# The result is assigned to `_` so the cell displays nothing.
_ = dictionary[0]

In [18]:
# Vectorize data.

# Convert every tokenized document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, docs))

In [19]:
# Report how many authors, vocabulary entries and documents were prepared.
for template, count in (
    ('Number of authors: %d', len(author2doc)),
    ('Number of unique tokens: %d', len(dictionary)),
    ('Number of documents: %d', len(corpus)),
):
    print(template % count)

Number of authors: 2479
Number of unique tokens: 7478
Number of documents: 1740


In [20]:
from gensim.models import AuthorTopicModel
# Time one fit of the author-topic model with passes=1 and iterations=1 and a
# fixed random_state, binding the result to `model`.
# NOTE(review): the trailing backslashes presumably keep the whole call on one
# logical line for the %time line magic -- confirm before reformatting.
%time model = AuthorTopicModel(corpus=corpus, num_topics=10, id2word=dictionary.id2token, \
                author2doc=author2doc, chunksize=2000, passes=1, eval_every=0, \
                iterations=1, random_state=1)

CPU times: user 2.11 s, sys: 80.8 ms, total: 2.19 s
Wall time: 2.2 s


In [21]:
%%time
# Train five models that differ only in their random_state and store each one
# together with its summed topic coherence as a (model, coherence) pair.
model_list = []
for i in range(5):
    model = AuthorTopicModel(
        corpus=corpus,
        num_topics=10,
        id2word=dictionary.id2token,
        author2doc=author2doc,
        chunksize=2000,
        passes=100,
        gamma_threshold=1e-10,
        eval_every=0,
        iterations=1,
        random_state=i,
    )
    # Element 1 of every top_topics() entry is added up into one score.
    top_topics = model.top_topics(corpus)
    tc = sum(topic[1] for topic in top_topics)
    model_list.append((model, tc))


CPU times: user 5min, sys: 6.72 s, total: 5min 7s
Wall time: 5min 8s


In [22]:
# Keep the (model, coherence) pair whose coherence score is the highest.
best_pair = max(model_list, key=lambda pair: pair[1])
model, tc = best_pair
print('Topic coherence: %.3e' % tc)


Topic coherence: -1.089e+01


In [24]:
# Save model.
# Persists the object currently bound to `model` under 'model.atmodel'; the
# path is relative, so the file is written to the working directory.
model.save('model.atmodel')

In [25]:
# Load model.
# Rebinds `model` to the model stored in 'model.atmodel' (relative path).
# NOTE(review): gensim's load presumably unpickles the file -- only load
# model files from a source you trust; confirm against the gensim docs.
model = AuthorTopicModel.load('model.atmodel')

In [35]:
# Display up to 10 topics; per the recorded output each entry is a pair of
# (topic id, string of weight*"word" terms joined by " + ").
model.show_topics(num_topics=10)

[(0,
  u'0.013*"signal" + 0.010*"control" + 0.008*"response" + 0.008*"movement" + 0.008*"field" + 0.008*"visual" + 0.008*"motor" + 0.007*"filter" + 0.007*"position" + 0.006*"target"'),
 (1,
  u'0.008*"neuron" + 0.008*"f" + 0.008*"memory" + 0.008*"matrix" + 0.007*"w" + 0.007*"threshold" + 0.006*"let" + 0.005*"capacity" + 0.005*"layer" + 0.005*"g"'),
 (2,
  u'0.010*"gaussian" + 0.008*"mixture" + 0.007*"density" + 0.007*"likelihood" + 0.006*"prior" + 0.006*"sample" + 0.006*"data" + 0.006*"matrix" + 0.006*"noise" + 0.005*"component"'),
 (3,
  u'0.009*"generalization" + 0.009*"rule" + 0.007*"bound" + 0.006*"f" + 0.005*"w" + 0.005*"finite" + 0.005*"sequence" + 0.005*"let" + 0.004*"theorem" + 0.004*"optimal"'),
 (4,
  u'0.030*"neuron" + 0.014*"spike" + 0.012*"cell" + 0.010*"synaptic" + 0.008*"frequency" + 0.008*"response" + 0.007*"firing" + 0.007*"potential" + 0.006*"delay" + 0.006*"circuit"'),
 (5,
  u'0.021*"image" + 0.009*"representation" + 0.006*"face" + 0.006*"object" + 0.006*"recognitio