In [1]:
# import and setup modules we'll be using in this notebook
import logging
import itertools

import numpy as np
import gensim

# Send log records to the console at INFO level.
# NOTE: basicConfig() does nothing when the root logger already has handlers
# (IPython may have installed one), in which case the format below is not applied.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# ipython sometimes messes up the logging setup; restore the root level through the
# public API rather than by assigning the attribute, so that any cached level checks
# inside the logging module are refreshed as well
logging.getLogger().setLevel(logging.INFO)

def head(stream, n=10):
    """Convenience helper: collect at most the first `n` items of `stream` into a plain list.

    Works on any iterable; a generator passed in is advanced by the items taken.
    """
    leading_items = itertools.islice(stream, n)
    return [item for item in leading_items]

In [2]:
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    """Split `text` into tokens with `simple_preprocess`, dropping any token found in `STOPWORDS`.

    Returns the surviving tokens as a list, in their original order.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue  # stopword: skip it
        kept.append(word)
    return kept

def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple.

    `dump_file` is the path handed to `smart_open`; the file is kept open while the
    generator is being iterated and is closed when the generator finishes or is
    discarded.

    Two kinds of pages are skipped:

    * meta pages, i.e. titles starting with one of the ignored namespace prefixes
      (``Wikipedia:``, ``Category:``, ``File:``, ...);
    * short articles, i.e. fewer than 50 tokens left after `filter_wiki` + `tokenize`.
    """
    # build the prefixes once per call; str.startswith accepts a tuple of candidates
    ignore_prefixes = tuple(
        ns + ':'
        for ns in 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    )
    # context manager so the dump's file handle is released instead of leaking
    with smart_open(dump_file) as dump:
        for title, text, _pageid in _extract_pages(dump):
            # cheap title test first: avoids running the expensive markup filtering
            # and tokenization on pages that would be thrown away anyway
            if title.startswith(ignore_prefixes):
                continue  # ignore various meta-articles
            tokens = tokenize(filter_wiki(text))
            if len(tokens) < 50:
                continue  # ignore short articles
            yield title, tokens

In [4]:
# only use simplewiki in this tutorial (fewer documents)
# the full wiki dump is exactly the same format, but larger
# NOTE: machine-specific absolute path -- edit this one constant to point at your copy of the dump
WIKI_DUMP_PATH = '/Users/viral.parikh/Desktop/External_Datasets/kaggle/20newsgroup/simplewiki-20140623-pages-articles.xml.bz2'

# lazy generator over the whole dump; nothing is read until it is iterated
stream = iter_wiki(WIKI_DUMP_PATH)

# preview the first 8 articles using a separate, fresh generator so `stream` stays unconsumed
for title, tokens in itertools.islice(iter_wiki(WIKI_DUMP_PATH), 8):
    # a single pre-formatted argument prints the same under Python 2 and Python 3
    print('%s %s' % (title, tokens[:10]))  # print the article title and its first ten tokens

April [u'april', u'fourth', u'month', u'year', u'comes', u'march', u'days', u'april', u'begins', u'day']
August [u'august', u'eighth', u'month', u'year', u'gregorian', u'calendar', u'coming', u'july', u'september', u'days']
Art [u'painting', u'renoir', u'work', u'art', u'art', u'activity', u'creation', u'people', u'importance', u'attraction']
A [u'page', u'letter', u'alphabet', u'indefinite', u'article', u'article', u'grammar', u'uses', u'disambiguation', u'letter']
Air [u'air', u'fan', u'air', u'air', u'earth', u'atmosphere', u'clear', u'gas', u'living', u'things']
Autonomous communities of Spain [u'spain', u'divided', u'parts', u'called', u'autonomous', u'communities', u'autonomous', u'means', u'autonomous', u'communities']
Alan Turing [u'statue', u'alan', u'turing', u'rebuild', u'machine', u'alan', u'turing', u'alan', u'mathison', u'turing']
Alanis Morissette [u'alanis', u'nadine', u'morissette', u'born', u'june', u'grammy', u'award', u'winning', u'canadian', u'american']


In [9]:
# display the generator object itself; evaluating the bare name does not read any articles
stream

<generator object iter_wiki at 0x109ccd8c0>

In [10]:
# toy example of an id -> word mapping; the integer ids do not have to be contiguous
id2word = {
    0: u'word',
    2: u'profit',
    300: u'another_word',
}

In [11]:
# display the toy id -> word mapping
id2word

{0: u'word', 2: u'profit', 300: u'another_word'}

In [12]:
# Lazily stream just the token list of every article (titles are discarded).
# This is a one-shot generator: once something has iterated over it, it is exhausted
# and this cell must be re-run to get a fresh stream.
doc_stream = (
    article_tokens
    for _title, article_tokens
    in iter_wiki('/Users/viral.parikh/Desktop/External_Datasets/kaggle/20newsgroup/simplewiki-20140623-pages-articles.xml.bz2')
)

In [13]:
# display the generator object; evaluating the bare name does not consume any documents
doc_stream

<generator object <genexpr> at 0x109ccdf50>

In [14]:
# Build the gensim Dictionary from the streamed token lists in a single pass.
# `doc_stream` is a generator, so this call consumes it: re-run the cell that creates
# `doc_stream` before re-running this one, otherwise there are no documents left to read.
# %time reports how long the build takes (several minutes in the recorded run below).
%time id2word_wiki = gensim.corpora.Dictionary(doc_stream)
print(id2word_wiki)

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:adding document #10000 to Dictionary(148230 unique tokens: [u'fawn', u'\u03c9\u0431\u0440\u0430\u0434\u043e\u0432\u0430\u043d\u043d\u0430\u0467', u'refreshable', u'yollar\u0131', u'idaira']...)
INFO:gensim.corpora.dictionary:adding document #20000 to Dictionary(225175 unique tokens: [u'biennials', u'sowela', u'mdbg', u'clottes', u'idaira']...)
INFO:gensim.corpora.dictionary:adding document #30000 to Dictionary(286134 unique tokens: [u'biennials', u'sowela', u'mdbg', u'clottes', u'klatki']...)
INFO:gensim.corpora.dictionary:adding document #40000 to Dictionary(375996 unique tokens: [u'biennials', u'sowela', u'mdbg', u'biysk', u'sermersheim']...)
INFO:gensim.corpora.dictionary:built Dictionary(409123 unique tokens: [u'biennials', u'sowela', u'mdbg', u'biysk', u'sermersheim']...) from 48321 documents (total 10370158 corpus positions)


CPU times: user 3min 36s, sys: 1.42 s, total: 3min 37s
Wall time: 3min 37s
Dictionary(409123 unique tokens: [u'biennials', u'sowela', u'mdbg', u'biysk', u'sermersheim']...)


In [None]:
# drop rare and overly common words: keep only tokens that appear in at least
# 20 documents and in no more than 10% of all documents (modifies the dictionary in place)
id2word_wiki.filter_extremes(
    no_above=0.1,  # maximum fraction of documents a token may appear in
    no_below=20,   # minimum number of documents a token must appear in
)
print(id2word_wiki)