In [1]:
!pip install gensim



In [2]:
# open the text file as an object 
file = open('hamlet.txt', encoding ='utf-8')

In [3]:
text=file.read()

In [4]:
text



In [5]:
"avbcdd.awedc.bsdcd".split(".")

['avbcdd', 'awedc', 'bsdcd']

In [6]:
# Tokenize data: Handling punctuations and lowercasing the text 

from gensim.utils import simple_preprocess 

# preprocess the file to get a list of tokens 
token_list =[] 

for sentence in text.split('.'): 
    
  # the simple_preprocess function returns a list of each sentence 
  token_list.append(simple_preprocess(sentence, deacc = True))

- deacc (bool, optional) – Remove accent marks from tokens using deaccent()?

- (The deaccent() function is another utility function, documented at the link, which does exactly what the name and documentation suggest: removes accent marks from letters, so that, for example, 'é' becomes just 'e'.

- simple_preprocess(), per its documentation, to discard any tokens shorter than min_len=2 characters.

In [7]:
token_list[:5]

[['the',
  'tragedy',
  'of',
  'hamlet',
  'prince',
  'of',
  'denmark',
  'by',
  'william',
  'shakespeare',
  'dramatis',
  'personae',
  'claudius',
  'king',
  'of',
  'denmark'],
 ['marcellus', 'officer'],
 ['hamlet',
  'son',
  'to',
  'the',
  'former',
  'and',
  'nephew',
  'to',
  'the',
  'present',
  'king'],
 ['polonius', 'lord', 'chamberlain'],
 ['horatio', 'friend', 'to', 'hamlet']]

## Create Dictionary

In [8]:
from gensim import corpora 

# storing the extracted tokens into the dictionary 
my_dictionary = corpora.Dictionary(token_list) 

print(my_dictionary)

Dictionary(4593 unique tokens: ['by', 'claudius', 'denmark', 'dramatis', 'hamlet']...)


## Save and Load Dictionary

In [9]:
# save your dictionary to disk 
my_dictionary.save('dictionary.dict')

# load back 
load_dict = corpora.Dictionary.load('dictionary.dict')

print(load_dict)

Dictionary(4593 unique tokens: ['by', 'claudius', 'denmark', 'dramatis', 'hamlet']...)


In [11]:
# save your dictionary as text file 
from gensim.test.utils import get_tmpfile 

tmp_fname = get_tmpfile("dictionary1.txt") 

my_dictionary.save_as_text(tmp_fname)

# load your dictionary text file 
load_dict1 = corpora.Dictionary.load_from_text(tmp_fname)

print(load_dict1)

Dictionary(4593 unique tokens: ['abate', 'abatements', 'abhorred', 'ability', 'able']...)


## Bag of Words

In [12]:
# Converting to a bag of word corpus 
BoW_corpus =[my_dictionary.doc2bow(sent, allow_update = True) for sent in token_list] 

print(BoW_corpus[:5])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 3), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(13, 1), (14, 1)], [(4, 1), (5, 1), (10, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2)], [(21, 1), (22, 1), (23, 1)], [(4, 1), (20, 1), (24, 1), (25, 1)]]


In [13]:

my_dictionary.doc2bow(['the',
  'tragedy',
  'of',
  'hamlet',
  'prince',
  'of',
  'denmark',
  'by',
  'william',
  'shakespeare',
  'dramatis',
  'personae',
  'claudius',
  'king',
  'of',
  'denmark'], allow_update = True)

[(0, 1),
 (1, 1),
 (2, 2),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 3),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1)]

In [14]:

sorted(['the',
  'tragedy',
  'of',
  'hamlet',
  'prince',
  'of',
  'denmark',
  'by',
  'william',
  'shakespeare',
  'dramatis',
  'personae',
  'claudius',
  'king',
  'of',
  'denmark'])

['by',
 'claudius',
 'denmark',
 'denmark',
 'dramatis',
 'hamlet',
 'king',
 'of',
 'of',
 'of',
 'personae',
 'prince',
 'shakespeare',
 'the',
 'tragedy',
 'william']

In [15]:
my_dictionary[6]

'of'

In [16]:
# Word weight in Bag of Words corpus 
word_weight =[] 

for doc in BoW_corpus: 
    for id, freq in doc: 
        word_weight.append([my_dictionary[id], freq]) 
        
print(word_weight[:5])

[['by', 1], ['claudius', 1], ['denmark', 2], ['dramatis', 1], ['hamlet', 1]]


## TF-IDF

In [17]:
from gensim.models import TfidfModel
import numpy as np 

# create TF-IDF model 
tfIdf = TfidfModel(BoW_corpus, smartirs ='ntc')

In [18]:
tfIdf

<gensim.models.tfidfmodel.TfidfModel at 0x261b43f7ee0>

In [19]:
# TF-IDF Word Weight 
weight_tfidf =[] 
for doc in tfIdf[BoW_corpus]: 
    for id, tf_idf in doc: 
        weight_tfidf.append([my_dictionary[id], np.around(tf_idf, decimals = 3)]) 
print(weight_tfidf[:10])

[['by', 0.146], ['claudius', 0.31], ['denmark', 0.407], ['dramatis', 0.339], ['hamlet', 0.142], ['king', 0.117], ['of', 0.241], ['personae', 0.339], ['prince', 0.272], ['shakespeare', 0.339]]


## Word2Vec

In [20]:
# import Word2Vec model
from gensim.models import Word2Vec

# Create Word2vec object
model = Word2Vec(sentences=token_list,  # tokenized sentences
                 vector_size=100,  
                 window=5, 
                 min_count=1, 
                 workers=4, 
                 sg=0) # CBOW 

#Save model
model.save("word2vec.model")

- min_count is for pruning the internal dictionary. Words that appear only once or twice in a billion-word corpus are probably uninteresting typos and garbage.
- workers , the last of the major parameters (full list here) is for training parallelization.
- Window: Maximum distance between the current and predicted word within a sentence.
- sg: stands for skip-gram; 0- CBOW; 1:skipgram

In [21]:
# Load trained Word2Vec model
model = Word2Vec.load("word2vec.model")

In [22]:
# Generate vector
vector = model.wv['think']  # returns numpy array

In [23]:
vector

array([-0.06536394,  0.13783662,  0.0178107 ,  0.05624513,  0.1327443 ,
       -0.25557443,  0.1571361 ,  0.37711096, -0.15743077, -0.17653891,
       -0.07194799, -0.1689529 , -0.01948159,  0.08003864,  0.10482447,
       -0.13295038,  0.1347952 , -0.13882028, -0.10407022, -0.3790161 ,
        0.15449278,  0.07505971,  0.13023914, -0.14541292, -0.04979572,
       -0.01073848, -0.16579717, -0.06898595, -0.22782661,  0.06700233,
        0.09944459, -0.03138614, -0.01740737, -0.20263763, -0.12945676,
        0.18899257, -0.0213289 , -0.11077302, -0.07002216, -0.22332807,
        0.04043547, -0.12619297, -0.06958359,  0.06246174,  0.20016743,
       -0.02095488, -0.07678098, -0.11940005,  0.04609959,  0.1124356 ,
        0.06797406, -0.1596255 , -0.06629136, -0.10634647, -0.15244715,
        0.07443058,  0.06856311,  0.07570249, -0.21991296, -0.00541837,
        0.0579852 ,  0.03655605,  0.0768728 ,  0.01769605, -0.12029076,
        0.16120638,  0.06965724,  0.12110123, -0.34502164,  0.15

In [24]:
vector.shape

(100,)

In [25]:
# Finding most similar words
model.wv.most_similar('present')

[('second', 0.9768619537353516),
 ('exit', 0.9760934710502625),
 ('soft', 0.975530743598938),
 ('friends', 0.9752652049064636),
 ('bear', 0.9752048254013062),
 ('madness', 0.9749588966369629),
 ('pastoral', 0.9748015999794006),
 ('this', 0.9747998118400574),
 ('soul', 0.9747369289398193),
 ('should', 0.9746885299682617)]

## Doc2Vec

In [26]:
documents=text.split(".")

In [27]:
documents[:5]

['THE TRAGEDY OF HAMLET, PRINCE OF DENMARK\n\n\nby William Shakespeare\n\n\n\nDramatis Personae\n\n  Claudius, King of Denmark',
 '\n  Marcellus, Officer',
 '\n  Hamlet, son to the former, and nephew to the present king',
 '\n  Polonius, Lord Chamberlain',
 '\n  Horatio, friend to Hamlet']

In [28]:
from collections import namedtuple

# Transform data (you can add more data preprocessing steps) 
docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
for i, text in enumerate(documents):
    words = text.lower().split()
    tags = [i]
    docs.append(analyzedDocument(words, tags))

print(docs[:2])

[AnalyzedDocument(words=['the', 'tragedy', 'of', 'hamlet,', 'prince', 'of', 'denmark', 'by', 'william', 'shakespeare', 'dramatis', 'personae', 'claudius,', 'king', 'of', 'denmark'], tags=[0]), AnalyzedDocument(words=['marcellus,', 'officer'], tags=[1])]


In [29]:
from gensim.models import doc2vec

model = doc2vec.Doc2Vec(docs,
                        vector_size=100,
                        window=5, 
                        min_count=1, 
                        workers=4,
                        dm=0) # PV-DBOW

- dm ({1,0}, optional) – Defines the training algorithm. If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
- vector_size (int, optional) – Dimensionality of the feature vectors.
- window (int, optional) – The maximum distance between the current and predicted word within a sentence.
- alpha (float, optional) – The initial learning rate.

In [30]:
vector=model.infer_vector(['the', 'tragedy', 'of', 'hamlet,', 'prince', 'of', 'denmark', 'by', 'william', 
                    'shakespeare', 'dramatis', 'personae', 'claudius,', 'king', 'of', 'denmark'])

In [31]:
vector

array([ 3.73992249e-02, -2.89477576e-02,  1.20215170e-01,  2.22050902e-02,
       -2.65924931e-02, -4.15853672e-02,  5.43003529e-02,  1.63022038e-02,
       -9.90091935e-02, -7.53495321e-02, -1.54466793e-01,  1.37175605e-01,
        8.24811086e-02,  7.66080171e-02,  1.57768149e-02,  4.01052460e-02,
        2.29448274e-01,  2.52265204e-02,  7.12625906e-02, -6.36832556e-05,
       -4.79852855e-02,  7.39841759e-02,  8.16715509e-02,  3.08757350e-02,
        9.74139795e-02,  3.67657803e-02, -2.18390115e-02,  3.33741866e-02,
        1.04154809e-03, -4.80791219e-02, -5.82959540e-02, -9.58849564e-02,
       -3.93800326e-02, -5.52165918e-02, -9.71637145e-02, -4.11860906e-02,
       -3.10108531e-02,  1.33002372e-02, -9.01232753e-03,  4.76599159e-03,
       -2.12833527e-02,  4.73276079e-02,  1.11856288e-03,  6.59171790e-02,
       -2.62828320e-02, -2.23690700e-02,  1.79640219e-01, -1.13296896e-01,
       -2.32352167e-02,  2.05143735e-01,  4.70596515e-02, -1.57281850e-02,
        4.13605198e-02, -

In [32]:
vector.shape

(100,)

## Google Word2Vec

In [33]:
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

Download GoogleNews vectors: https://stackoverflow.com/questions/46433778/import-googlenews-vectors-negative300-bin

In [43]:
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

model.wv['reforms'].shape

FileNotFoundError: [Errno 2] No such file or directory: 'GoogleNews-vectors-negative300.bin'

## Stanford Glove

In [34]:
import gensim.downloader as api

List of Twitter Glove Embeddings

- glove-twitter-25 (104 MB)
- glove-twitter-50 (199 MB)
- glove-twitter-100 (387 MB)
- glove-twitter-200 (758 MB)

In [35]:
# download the model and return as object ready for use
model_glove_twitter = api.load("glove-twitter-25")



In [36]:
model_glove_twitter['reforms'].shape

(25,)

In [37]:
model_glove_twitter['reforms']

array([ 0.37207  ,  0.91542  , -1.6257   , -0.15803  ,  0.38455  ,
       -1.3252   , -0.74057  , -2.095    ,  1.0401   , -0.0027519,
        0.33633  , -0.085222 , -2.1703   ,  0.91529  ,  0.77599  ,
       -0.87018  , -0.97346  ,  0.68114  ,  0.71777  , -0.99392  ,
        0.028837 ,  0.24823  , -0.50573  , -0.44954  , -0.52987  ],
      dtype=float32)

In [38]:
# get similar items
model_glove_twitter.most_similar("policies",topn=10)

[('policy', 0.9484812617301941),
 ('reforms', 0.9403934478759766),
 ('laws', 0.9401204586029053),
 ('government', 0.923071026802063),
 ('regulations', 0.9168933629989624),
 ('economy', 0.9110006093978882),
 ('immigration', 0.9105909466743469),
 ('legislation', 0.9089650511741638),
 ('govt', 0.9054747223854065),
 ('regulation', 0.9050779342651367)]

## Facebook FastText

In [39]:
# Import FastText 
from gensim.models import FastText

# Create FastText Model object
model = FastText(vector_size=50, window=3, min_count=1)  # instantiate

# Build Vocab
model.build_vocab(token_list)

# Train FastText model
model.train(token_list, total_examples=len(token_list), epochs=10)  # train

(232131, 308470)

In [40]:
model.wv['policy']

array([-0.20984098, -0.10337741,  0.12326135, -0.09302633, -0.18535946,
        0.15095477, -0.18418005,  0.08778601,  0.03103931,  0.4522604 ,
        0.1949227 ,  0.01506678,  0.39085543,  0.0465074 , -0.2180361 ,
        0.2073521 ,  0.01931712, -0.04996003,  0.03581789, -0.26337218,
       -0.29202676, -0.1849381 ,  0.21226208,  0.08450359,  0.0207683 ,
       -0.05268253, -0.3643701 ,  0.06985143, -0.28303897,  0.09640918,
        0.12008975,  0.14689307, -0.1224826 ,  0.4763004 ,  0.06799921,
        0.14705272, -0.19285509,  0.04355754, -0.07744131,  0.01111095,
        0.01965019,  0.2556003 ,  0.02419364,  0.18420711,  0.11553071,
       -0.29061547,  0.2728347 , -0.03224691,  0.08732051,  0.24020532],
      dtype=float32)

In [41]:
# Finding most similar words
model.wv.most_similar('present')

[('presently', 0.9999943971633911),
 ('presentment', 0.9999939799308777),
 ('tent', 0.999990701675415),
 ('indentures', 0.9999898672103882),
 ('ostentation', 0.9999896287918091),
 ('definement', 0.99998939037323),
 ('instrument', 0.99998939037323),
 ('instance', 0.9999889135360718),
 ('station', 0.9999887347221375),
 ('sent', 0.9999885559082031)]