In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim import corpora

In [2]:
# Three short example sentences used as the toy corpus
my_docs = [
    "I don't know what tomorrow brings?",
    "Can I know? can you know?",
    " I do not like darkness",
]

# Tokenize every document with Gensim's 'simple_preprocess'.
# It lowercases the text and splits it into word tokens. Very short tokens are
# dropped, which is why the single-character 'I' (and the 't' of "don't") do not
# appear in the result. This is NOT stop-word removal: common words such as
# 'what', 'can', 'do' and 'not' are kept.
tokenized_list = list(map(simple_preprocess, my_docs))
print(tokenized_list)

[['don', 'know', 'what', 'tomorrow', 'brings'], ['can', 'know', 'can', 'you', 'know'], ['do', 'not', 'like', 'darkness']]


In [3]:
# Create an empty Dictionary object; it is filled with tokens later, when the
# corpus is built with doc2bow(..., allow_update=True).
mydict = corpora.Dictionary()
# Bare last expression: the notebook displays the object's repr
mydict

<gensim.corpora.dictionary.Dictionary at 0x2110911b940>

In [8]:
# Create my Corpus (bag-of-words vectors) for the tokens in the list defined earlier.
# The Dictionary is (re)created in this cell on purpose: doc2bow(..., allow_update=True)
# mutates the Dictionary it is called on (new tokens are added and its internal corpus
# statistics are updated), so re-running the cell against a Dictionary built in an
# earlier cell would feed it the same documents again. Starting from a fresh
# Dictionary keeps the cell idempotent under re-runs.
mydict = corpora.Dictionary()
mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list]

# Another interesting corpus helper to look at is 'BoWCorpus'; check it out with a toy example.
# NOTE(review): 'BoWCorpus' is neither defined nor imported in this notebook; confirm where it comes from.

# Print a summary of the dictionary (number of unique tokens and a sample of them);
# the full token -> id mapping is available as mydict.token2id
print(mydict) 

Dictionary(11 unique tokens: ['brings', 'don', 'know', 'tomorrow', 'what']...)


In [9]:
# Token -> id map: the integer id the Dictionary assigned to each unique token
print(mydict.token2id)

{'brings': 0, 'don': 1, 'know': 2, 'tomorrow': 3, 'what': 4, 'can': 5, 'you': 6, 'darkness': 7, 'do': 8, 'like': 9, 'not': 10}


In [10]:
# Print the raw bag-of-words corpus: one list per document of (token_id, count) pairs.
# The ids are not human readable on their own; look them up in the Dictionary
# (mydict[token_id]) to recover the actual tokens.
print(mycorpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(2, 2), (5, 2), (6, 1)], [(7, 1), (8, 1), (9, 1), (10, 1)]]


In [11]:
# Human-readable view of the corpus: swap every token id for its token,
# giving one list of (token, count) pairs per document.
word_counts = [
    [(mydict[token_id], freq) for token_id, freq in bow]
    for bow in mycorpus
]
print(word_counts)

[[('brings', 1), ('don', 1), ('know', 1), ('tomorrow', 1), ('what', 1)], [('know', 2), ('can', 2), ('you', 1)], [('darkness', 1), ('do', 1), ('like', 1), ('not', 1)]]


In [12]:
# Persist the Dictionary and the bag-of-words corpus so they can be reloaded later.
# Both paths are relative, so the files are written to the current working directory.
mydict.save('mydict.dict')  # save dict to disk
corpora.MmCorpus.serialize('bow_corpus.mm', mycorpus)  # save corpus to disk
# Confirmation message once both writes have returned
print("SAVED Dictionary & CORPUS")

SAVED Dictionary & CORPUS


In [13]:
# Load them back for later use
# (loaded_dict is only loaded here; nothing in this cell reads it afterwards)
loaded_dict = corpora.Dictionary.load('mydict.dict')

print("loading them back & printing them")
corpus = corpora.MmCorpus('bow_corpus.mm')

# Print the reloaded corpus one document per line, each as a list of (token_id, count) pairs.
# Note that the reloaded counts are displayed as floats (1.0, 2.0), not the ints
# that were in the in-memory corpus.
for line in corpus:
    print(line)

loading them back & printing them
[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0)]
[(2, 2.0), (5, 2.0), (6, 1.0)]
[(7, 1.0), (8, 1.0), (9, 1.0), (10, 1.0)]
