In [1]:
import pandas as pd
import numpy as np

What sort of text inputs can gensim handle? The input text typically comes in three different forms:
1. As sentences stored in Python's native list object.
2. As one single text file, small or large.
3. In multiple text files.

In [3]:
import gensim
from gensim import corpora
from pprint import pprint

In [4]:
# First mini-corpus: one news paragraph split into four short "documents".
documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]

# Second mini-corpus, kept separate so it can be added to the dictionary later.
documents_2 = ["One source says the report will likely conclude that", 
                "the operation was carried out without clearance and", 
                "transparency and that those involved will be held", 
                "responsible. One of the sources acknowledged that the", 
                "report is still being prepared and cautioned that", 
                "things could change."]

In [5]:
# Plain whitespace tokenisation: case and punctuation are kept exactly as written.
texts = [doc.split() for doc in documents]
pprint(texts)

[['The',
  'Saudis',
  'are',
  'preparing',
  'a',
  'report',
  'that',
  'will',
  'acknowledge',
  'that'],
 ['Saudi',
  'journalist',
  'Jamal',
  "Khashoggi's",
  'death',
  'was',
  'the',
  'result',
  'of',
  'an'],
 ['interrogation',
  'that',
  'went',
  'wrong,',
  'one',
  'that',
  'was',
  'intended',
  'to',
  'lead'],
 ['to',
  'his',
  'abduction',
  'from',
  'Turkey,',
  'according',
  'to',
  'two',
  'sources.']]


In [9]:
# Build a gensim Dictionary that gives every unique token in `texts` an integer id.
dictionary = corpora.Dictionary(texts)
print(dictionary)

Dictionary<33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...>


In [10]:
# Show the token -> integer id mapping built from the first corpus.
print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}


In [11]:
# Same whitespace tokenisation, applied to the second mini-corpus.
text_2 = [doc.split() for doc in documents_2]
print(text_2)

[['One', 'source', 'says', 'the', 'report', 'will', 'likely', 'conclude', 'that'], ['the', 'operation', 'was', 'carried', 'out', 'without', 'clearance', 'and'], ['transparency', 'and', 'that', 'those', 'involved', 'will', 'be', 'held'], ['responsible.', 'One', 'of', 'the', 'sources', 'acknowledged', 'that', 'the'], ['report', 'is', 'still', 'being', 'prepared', 'and', 'cautioned', 'that'], ['things', 'could', 'change.']]


In [12]:
# Extend the existing dictionary in place with the tokens of the second corpus.
dictionary.add_documents(text_2)

In [15]:
# Ids 0-32 are unchanged; tokens first seen in documents_2 were appended from id 33 onwards.
print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32, 'One': 33, 'conclude': 34, 'likely': 35, 'says': 36, 'source': 37, 'and': 38, 'carried': 39, 'clearance': 40, 'operation': 41, 'out': 42, 'without': 43, 'be': 44, 'held': 45, 'involved': 46, 'those': 47, 'transparency': 48, 'acknowledged': 49, 'responsible.': 50, 'sources': 51, 'being': 52, 'cautioned': 53, 'is': 54, 'prepared': 55, 'still': 56, 'change.': 57, 'could': 58, 'things': 59}


In [16]:
from gensim.utils import simple_preprocess

In [17]:
# simple_preprocess output (see below) is lowercased, has punctuation removed
# and no longer contains the one-letter token 'a'.
tokenized_text = list(map(simple_preprocess, documents))
print(tokenized_text)

[['the', 'saudis', 'are', 'preparing', 'report', 'that', 'will', 'acknowledge', 'that'], ['saudi', 'journalist', 'jamal', 'khashoggi', 'death', 'was', 'the', 'result', 'of', 'an'], ['interrogation', 'that', 'went', 'wrong', 'one', 'that', 'was', 'intended', 'to', 'lead'], ['to', 'his', 'abduction', 'from', 'turkey', 'according', 'to', 'two', 'sources']]


In [19]:
# Start from an empty dictionary and let doc2bow grow it (allow_update=True)
# while each document is encoded as a list of (token id, count) pairs.
mydict = corpora.Dictionary()
mycorpus = [
    mydict.doc2bow(tokens, allow_update=True)
    for tokens in tokenized_text
]
print(mycorpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1)], [(6, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)], [(5, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)], [(21, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1)]]


In [21]:
# Translate every (token id, count) pair back to (token, count) for readability.
word_counts = [
    [(mydict[token_id], n_occurrences) for token_id, n_occurrences in bow]
    for bow in mycorpus
]
print(word_counts)

[[('acknowledge', 1), ('are', 1), ('preparing', 1), ('report', 1), ('saudis', 1), ('that', 2), ('the', 1), ('will', 1)], [('the', 1), ('an', 1), ('death', 1), ('jamal', 1), ('journalist', 1), ('khashoggi', 1), ('of', 1), ('result', 1), ('saudi', 1), ('was', 1)], [('that', 2), ('was', 1), ('intended', 1), ('interrogation', 1), ('lead', 1), ('one', 1), ('to', 1), ('went', 1), ('wrong', 1)], [('to', 2), ('abduction', 1), ('according', 1), ('from', 1), ('his', 1), ('sources', 1), ('turkey', 1), ('two', 1)]]


In [23]:
import nltk
from nltk.corpus import stopwords
# English stop-word list from NLTK. It is not referenced by any other cell shown here.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded first — confirm.
stop_words = stopwords.words("english") 

In [24]:
class BoWCorpus(object):
    """Stream a text file as bag-of-words vectors, one document per line.

    The file is re-opened on every iteration, so the corpus can be traversed
    any number of times without holding the documents in memory.

    Parameters
    ----------
    path : str
        Path of a UTF-8 encoded text file; each line is one document.
    dictionary : gensim.corpora.Dictionary
        Dictionary used to map tokens to integer ids via ``doc2bow``.
    allow_update : bool, optional
        Forwarded to ``dictionary.doc2bow``. The default ``True`` lets the
        dictionary grow as lines are read (and it is updated again on every
        later pass). Pass ``False`` to encode against a fixed vocabulary.

    Yields
    ------
    list
        The value returned by ``dictionary.doc2bow`` for each line.
    """

    def __init__(self, path, dictionary, allow_update=True):
        self.path = path
        self.dictionary = dictionary
        self.allow_update = allow_update

    def __iter__(self):
        # The vocabulary lives only in self.dictionary: no module-level
        # dictionary is read or modified here, so the class works without any
        # particular global being defined.
        # `with` guarantees the handle is closed once iteration ends.
        with open(self.path, encoding='utf-8') as handle:
            for line in handle:
                tokenized_list = simple_preprocess(line, deacc=True)
                yield self.dictionary.doc2bow(
                    tokenized_list, allow_update=self.allow_update
                )

In [27]:
# Fresh, empty dictionary that is handed to the streaming corpus.
mydict = corpora.Dictionary()
bow_corpus = BoWCorpus('sample.txt', dictionary=mydict)
# Iterating opens 'sample.txt' and prints one bag-of-words vector per line of the file.
for line in bow_corpus:
    print(line)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 3), (30, 1), (31, 1), (32, 1), (33, 1), (34, 3), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 2), (42, 1), (43, 8), (44, 6), (45, 1), (46, 1), (47, 3), (48, 1), (49, 1), (50, 1), (51, 3), (52, 1), (53, 3), (54, 1), (55, 1)]


In [28]:
# Save the gensim dictionary and the bag-of-words corpus to disk.
# NOTE(review): serialize presumably iterates bow_corpus, re-reading 'sample.txt' — confirm.
mydict.save('mydict.dict')  # save dict to disk
corpora.MmCorpus.serialize('bow_corpus.mm', bow_corpus)  # save corpus to disk

In [29]:
# Load the dictionary and corpus back from the files written above.
# `loaded_dict` is not used by the later cells shown here; they keep indexing `mydict`.
loaded_dict = corpora.Dictionary.load('mydict.dict')
corpus = corpora.MmCorpus('bow_corpus.mm')
# The counts come back as floats (see the printed output).
for line in corpus:
    print(line)

[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 3.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0), (19, 1.0), (20, 1.0), (21, 1.0), (22, 1.0), (23, 1.0), (24, 1.0), (25, 1.0), (26, 1.0), (27, 1.0), (28, 2.0), (29, 3.0), (30, 1.0), (31, 1.0), (32, 1.0), (33, 1.0), (34, 3.0), (35, 1.0), (36, 1.0), (37, 1.0), (38, 1.0), (39, 1.0), (40, 1.0), (41, 2.0), (42, 1.0), (43, 8.0), (44, 6.0), (45, 1.0), (46, 1.0), (47, 3.0), (48, 1.0), (49, 1.0), (50, 1.0), (51, 3.0), (52, 1.0), (53, 3.0), (54, 1.0), (55, 1.0)]


In [30]:
# Create the TF-IDF weighted corpus with gensim.
from gensim import models

# Fit the model on the loaded corpus; `smartirs` selects the weighting scheme.
tfidf = models.TfidfModel(corpus, smartirs='ntc')
# Print every document as [token, weight] pairs, weights rounded to 2 decimals.
for doc in tfidf[corpus]:
    print([
        [mydict[term_id], np.around(weight, decimals=2)]
        for term_id, weight in doc
    ])

[['abduction', 0.07], ['according', 0.07], ['acknowledge', 0.07], ['acknowledged', 0.07], ['an', 0.07], ['and', 0.21], ['are', 0.07], ['be', 0.07], ['being', 0.07], ['carried', 0.07], ['cautioned', 0.07], ['change', 0.07], ['clearance', 0.07], ['conclude', 0.07], ['could', 0.07], ['death', 0.07], ['from', 0.07], ['held', 0.07], ['his', 0.07], ['intended', 0.07], ['interrogation', 0.07], ['involved', 0.07], ['is', 0.07], ['jamal', 0.07], ['journalist', 0.07], ['khashoggi', 0.07], ['lead', 0.07], ['likely', 0.07], ['of', 0.14], ['one', 0.21], ['operation', 0.07], ['out', 0.07], ['prepared', 0.07], ['preparing', 0.07], ['report', 0.21], ['responsible', 0.07], ['result', 0.07], ['saudi', 0.07], ['saudis', 0.07], ['says', 0.07], ['source', 0.07], ['sources', 0.14], ['still', 0.07], ['that', 0.55], ['the', 0.42], ['things', 0.07], ['those', 0.07], ['to', 0.21], ['transparency', 0.07], ['turkey', 0.07], ['two', 0.07], ['was', 0.21], ['went', 0.07], ['will', 0.21], ['without', 0.07], ['wrong',

In [32]:
import gensim.downloader as api
# Display the metadata record of the pre-trained 'glove-wiki-gigaword-50' vectors.
api.info('glove-wiki-gigaword-50')

{'num_records': 400000,
 'file_size': 69182535,
 'base_dataset': 'Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)',
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/glove-wiki-gigaword-50/__init__.py',
 'license': 'http://opendatacommons.org/licenses/pddl/',
 'parameters': {'dimension': 50},
 'description': 'Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).',
 'preprocessing': 'Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i <fname> -o glove-wiki-gigaword-50.txt`.',
 'read_more': ['https://nlp.stanford.edu/projects/glove/',
  'https://nlp.stanford.edu/pubs/glove.pdf'],
 'checksum': 'c289bc5d7f2f02c6dc9f2f9b67641813',
 'file_name': 'glove-wiki-gigaword-50.gz',
 'parts': 1}

In [None]:
# Load the pre-trained vectors and list the entries most similar to 'blue'.
# NOTE(review): presumably downloads the data on first use — confirm.
# This cell has no recorded output in the notebook.
w2v_model = api.load("glove-wiki-gigaword-50")
w2v_model.most_similar('blue')

In [33]:
# Build the bigram model from the tokenized documents.
bigram = gensim.models.phrases.Phrases(tokenized_text, min_count=3, threshold=10)
# Apply it to the first document. The printed tokens are identical to the input,
# i.e. no word pair was merged (presumably nothing in this tiny corpus reaches
# min_count=3 — confirm).
print(bigram[tokenized_text[0]])

['the', 'saudis', 'are', 'preparing', 'report', 'that', 'will', 'acknowledge', 'that']


In [34]:
# Build the trigram model by running Phrases over the bigram-transformed documents.
trigram = gensim.models.phrases.Phrases(bigram[tokenized_text], threshold=10)

# Apply bigram then trigram to the first document; the printed tokens are again
# identical to the input, so no phrases were merged.
print(trigram[bigram[tokenized_text[0]]])

['the', 'saudis', 'are', 'preparing', 'report', 'that', 'will', 'acknowledge', 'that']


In [2]:
def square_numbers(nums):
    """Return a list holding the square of each item in ``nums``, in order."""
    return [value * value for value in nums]


# Eager version: the full list of squares is built before anything is printed.
numbers = list(range(1, 6))
result = square_numbers(numbers)
print(result)

[1, 4, 9, 16, 25]


In [3]:
def square_numbers(nums):
    """Lazily yield the square of each item in ``nums``, in order."""
    yield from (value * value for value in nums)


# Lazy version: calling the function only creates a generator object,
# so printing it shows the object itself rather than the squares.
numbers = list(range(1, 6))
result = square_numbers(numbers)
print(result)

<generator object square_numbers at 0x000001FF544215F0>


In [4]:
# Iterating the generator is what actually computes the squares, one per step.
for s in result:
    print(s)

1
4
9
16
25


In [7]:
# List comprehension: every square is computed immediately and stored in a list.
squares = [n ** 2 for n in range(1, 6)]
print(squares)

[1, 4, 9, 16, 25]


In [9]:
# Generator expression: same squares, but each one is produced only on request.
generator = (n ** 2 for n in range(1, 6))
# next() pulls just the first value; the rest remain pending in the generator.
print(next(generator))

1


In [10]:
# list() drains whatever the generator has not yielded yet; the value already
# taken by next() in the previous cell is not repeated.
print(list(generator))

[4, 9, 16, 25]
