## Gensim Tutorial 
#### Gensim is billed as a Natural Language Processing package for processing texts, working with word-vector models (such as Word2Vec, FastText, etc.), and building topic models.

In [6]:
import gensim
from gensim import corpora
from pprint import pprint

In [None]:
# First sample: a news excerpt broken into four fragments, each treated as one document.
documents = [
    "The Saudis are preparing a report that will acknowledge that",
    "Saudi journalist Jamal Khashoggi's death was the result of an",
    "interrogation that went wrong, one that was intended to lead",
    "to his abduction from Turkey, according to two sources.",
]

# Second sample: the continuation of the excerpt, split into six fragments.
documents_2 = [
    "One source says the report will likely conclude that",
    "the operation was carried out without clearance and",
    "transparency and that those involved will be held",
    "responsible. One of the sources acknowledged that the",
    "report is still being prepared and cautioned that",
    "things could change.",
]

In [None]:
# Minimal one-sentence samples. These rebind `documents` and `documents_2`,
# so the outputs shown further down are based on these two sentences.
documents = [
    "This is a document.",
]

documents_2 = [
    "This is not a document.",
]


### To work on text documents, Gensim requires that the words (a.k.a. tokens) be converted to unique ids. To achieve that, Gensim lets you create a Dictionary object that maps each word to a unique id.

### The dictionary object is typically used to create a ‘bag of words’ Corpus. It is this Dictionary and the bag-of-words (Corpus) that are used as inputs to topic modeling and other models that Gensim specializes in.

## How to create a Dictionary

In [12]:
# Whitespace-tokenize every sentence; str.split() already returns the list of words
texts = [sentence.split() for sentence in documents]
texts_2 = [sentence.split() for sentence in documents_2]

In [13]:
# Inspect the tokens; note that plain whitespace splitting keeps punctuation
# attached to the word (e.g. 'document.').
print(texts)

[['This', 'is', 'a', 'document.']]


In [14]:
# Create dictionary: build the token -> integer id mapping from the tokenized documents
dictionary = corpora.Dictionary(texts)

In [15]:
# Printing the Dictionary gives a summary: the unique-token count plus a preview of the tokens
print(dictionary)

Dictionary(4 unique tokens: ['This', 'a', 'document.', 'is'])


In [16]:
# token2id is the mapping from each token to its integer id
print(dictionary.token2id)

{'This': 0, 'a': 1, 'document.': 2, 'is': 3}


In [17]:
# Extend the existing dictionary with the tokens of the second document set
dictionary.add_documents(texts_2)

In [18]:
# The vocabulary now also contains the token that only occurs in documents_2 ('not')
print(dictionary)

Dictionary(5 unique tokens: ['This', 'a', 'document.', 'is', 'not'])


In [19]:
# The previously assigned ids are unchanged here; the new token received the next id (4)
print(dictionary.token2id)

{'This': 0, 'a': 1, 'document.': 2, 'is': 3, 'not': 4}


## How to create a bag of words corpus in gensim?

In [13]:
from gensim.utils import simple_preprocess

In [14]:
# Nine short titles used as the sample corpus for the bag-of-words walkthrough.
my_docs = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Tokenize each title with gensim's simple_preprocess (default settings).
tokenized_list = [simple_preprocess(title) for title in my_docs]
print(tokenized_list)

[['human', 'machine', 'interface', 'for', 'lab', 'abc', 'computer', 'applications'], ['survey', 'of', 'user', 'opinion', 'of', 'computer', 'system', 'response', 'time'], ['the', 'eps', 'user', 'interface', 'management', 'system'], ['system', 'and', 'human', 'system', 'engineering', 'testing', 'of', 'eps'], ['relation', 'of', 'user', 'perceived', 'response', 'time', 'to', 'error', 'measurement'], ['the', 'generation', 'of', 'random', 'binary', 'unordered', 'trees'], ['the', 'intersection', 'graph', 'of', 'paths', 'in', 'trees'], ['graph', 'minors', 'iv', 'widths', 'of', 'trees', 'and', 'well', 'quasi', 'ordering'], ['graph', 'minors', 'survey']]


In [15]:
# Start from an empty Dictionary; it is populated while the corpus is built in the next step
my_dictionary = corpora.Dictionary()

In [16]:
# Convert each tokenized document to bag-of-words: a list of (token_id, count) pairs.
# allow_update=True also adds unseen tokens to my_dictionary as a side effect,
# which is how the initially empty dictionary gets filled.
mycorpus = [my_dictionary.doc2bow(doc, allow_update=True) for doc in tokenized_list]
print(mycorpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)], [(5, 1), (12, 1), (14, 1), (15, 1), (16, 1), (17, 1)], [(4, 1), (8, 1), (12, 2), (15, 1), (18, 1), (19, 1), (20, 1)], [(8, 1), (10, 1), (13, 1), (14, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(8, 1), (17, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1)], [(8, 1), (17, 1), (29, 1), (31, 1), (32, 1), (33, 1), (34, 1)], [(8, 1), (18, 1), (29, 1), (31, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1)], [(11, 1), (31, 1), (36, 1)]]


In [65]:
# Summary of the dictionary that was filled while building the corpus
print(my_dictionary)

Dictionary(41 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...)


In [66]:
# Full token -> id mapping; these ids are the first element of each pair in mycorpus
print(my_dictionary.token2id)

{'abc': 0, 'applications': 1, 'computer': 2, 'for': 3, 'human': 4, 'interface': 5, 'lab': 6, 'machine': 7, 'of': 8, 'opinion': 9, 'response': 10, 'survey': 11, 'system': 12, 'time': 13, 'user': 14, 'eps': 15, 'management': 16, 'the': 17, 'and': 18, 'engineering': 19, 'testing': 20, 'error': 21, 'measurement': 22, 'perceived': 23, 'relation': 24, 'to': 25, 'binary': 26, 'generation': 27, 'random': 28, 'trees': 29, 'unordered': 30, 'graph': 31, 'in': 32, 'intersection': 33, 'paths': 34, 'iv': 35, 'minors': 36, 'ordering': 37, 'quasi': 38, 'well': 39, 'widths': 40}


## How to save a gensim dictionary and corpus to disk and load them back?

In [17]:
# Persist both artifacts using relative paths (i.e. into the current working directory)
my_dictionary.save('mydict.dict')  # save dict to disk
corpora.MmCorpus.serialize('mycorpus.mm', mycorpus)  # save corpus to disk

In [90]:
# Load them back
loaded_dict = corpora.Dictionary.load('mydict.dict')
# Print the Dictionary summary to confirm the round trip preserved the vocabulary
print(loaded_dict)

Dictionary(41 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...)


In [92]:
# Re-open the serialized corpus from disk and print it one document at a time.
# Note: the counts come back as floats (1.0, 2.0) rather than the ints that were written.
corpus = corpora.MmCorpus('mycorpus.mm')
print(corpus)
for line in corpus:
    print(line)

MmCorpus(9 documents, 41 features, 65 non-zero entries)
[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)]
[(2, 1.0), (8, 2.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0), (14, 1.0)]
[(5, 1.0), (12, 1.0), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0)]
[(4, 1.0), (8, 1.0), (12, 2.0), (15, 1.0), (18, 1.0), (19, 1.0), (20, 1.0)]
[(8, 1.0), (10, 1.0), (13, 1.0), (14, 1.0), (21, 1.0), (22, 1.0), (23, 1.0), (24, 1.0), (25, 1.0)]
[(8, 1.0), (17, 1.0), (26, 1.0), (27, 1.0), (28, 1.0), (29, 1.0), (30, 1.0)]
[(8, 1.0), (17, 1.0), (29, 1.0), (31, 1.0), (32, 1.0), (33, 1.0), (34, 1.0)]
[(8, 1.0), (18, 1.0), (29, 1.0), (31, 1.0), (35, 1.0), (36, 1.0), (37, 1.0), (38, 1.0), (39, 1.0), (40, 1.0)]
[(11, 1.0), (31, 1.0), (36, 1.0)]


## How to create the TFIDF matrix using gensim?

In [80]:
from gensim import models
import numpy as np

# Toy corpus for the TF-IDF walkthrough: four short headlines that share most of
# their vocabulary, which makes the effect of term rarity easy to see.
documents = ["news about food campaign",
             "news of sport campaign",
             "news of sport campaign sport activities",
             "news of food campaign campaign campaign"
            ]


In [81]:
# Create the Dictionary from the documents, each tokenized with simple_preprocess
mydict = corpora.Dictionary([simple_preprocess(line) for line in documents])
print(mydict)
print(mydict.token2id)

Dictionary(7 unique tokens: ['about', 'campaign', 'food', 'news', 'of']...)
{'about': 0, 'campaign': 1, 'food': 2, 'news': 3, 'of': 4, 'sport': 5, 'activities': 6}


In [82]:
# Create the Corpus: one bag-of-words list of (token_id, count) pairs per document.
# NOTE: this rebinds `corpus`, which previously held the MmCorpus loaded from disk.
corpus = [mydict.doc2bow(simple_preprocess(line)) for line in documents]
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(1, 1), (3, 1), (4, 1), (5, 1)], [(1, 1), (3, 1), (4, 1), (5, 2), (6, 1)], [(1, 3), (2, 1), (3, 1), (4, 1)]]


In [83]:
# Show each document's raw term counts, with token ids translated back to words
for doc in corpus:
    print([[mydict[token_id], count] for token_id, count in doc])

[['about', 1], ['campaign', 1], ['food', 1], ['news', 1]]
[['campaign', 1], ['news', 1], ['of', 1], ['sport', 1]]
[['campaign', 1], ['news', 1], ['of', 1], ['sport', 2], ['activities', 1]]
[['campaign', 3], ['food', 1], ['news', 1], ['of', 1]]


In [84]:
# Create the TF-IDF model.
# smartirs takes a SMART-notation string; 'ntc' is presumably raw term frequency (n),
# idf document weighting (t) and cosine normalization (c) -- confirm the exact
# variants against the gensim TfidfModel documentation.
tfidf = models.TfidfModel(corpus, smartirs='ntc')

In [85]:
# Show the TF-IDF weights, rounded to two decimals. Tokens that occur in every
# document ('campaign', 'news') do not appear in the output.
for doc in tfidf[corpus]:
    print([[mydict[token_id], np.around(weight, decimals=2)] for token_id, weight in doc])

[['about', 0.89], ['food', 0.45]]
[['of', 0.38], ['sport', 0.92]]
[['of', 0.15], ['sport', 0.7], ['activities', 0.7]]
[['food', 0.92], ['of', 0.38]]


## How to use gensim downloader API to load datasets?

In [18]:
import gensim.downloader as api

# Get information about the model or dataset
# NOTE(review): this call is not the cell's last expression and its return value
# is discarded, so nothing from it is displayed.
api.info('glove-wiki-gigaword-50')
# Download (presumably cached locally after the first call -- confirm in the downloader docs)
# NOTE: despite the variable name `w2v_model`, the model requested here is GloVe.
w2v_model = api.load("glove-wiki-gigaword-50")
# Words most similar to 'blue', displayed as (word, similarity) pairs
w2v_model.most_similar('blue')

[('red', 0.8901657462120056),
 ('black', 0.8648406863212585),
 ('pink', 0.845291793346405),
 ('green', 0.8346816301345825),
 ('yellow', 0.8320707082748413),
 ('purple', 0.8293111324310303),
 ('white', 0.8225342035293579),
 ('orange', 0.8114302158355713),
 ('bright', 0.799933910369873),
 ('colored', 0.7876655459403992)]

## How to create bigrams and trigrams using Phraser models?