# One hot encoding using Pytorch

In [1]:
import torch
torch.manual_seed(0)

<torch._C.Generator at 0x7f166211b570>

In [None]:
def text_to_onehot(text: str, corpus: str):
    # Read input file
    vocab = []
    with open(corpus, encoding="utf8") as f:
        passage = f.read()
        sentences = passage.lower().replace(".","").split('\n')
        for sentence in sentences:
            for word in sentence.split():
                if word not in vocab:
                    vocab.append(word)
    print(f'No. of words in our vocabulary: {len(vocab)}')
    print(vocab, '\n')

    # split the provided text into words
    txt = text.split()

    # Extract ranks out of word list
    rank_list = [rank for rank, _ in enumerate(vocab)]

    # Encode rank as one-hot vectors
    vocab_dict = {}
    text_vec = torch.zeros(len(txt), len(vocab)) # length of vocab is embedding size
    vocab_vec = torch.zeros(len(vocab), len(vocab))
    vocab_vec[torch.arange(len(vocab)), rank_list] = 1
    
    # create dict for mapping word to one-hot vector
    for index, word in enumerate(vocab):
            vocab_dict[word] = vocab_vec[index]
    
    # for each word of text in vocab assign its one-hot value
    for index, word in enumerate(txt):
        if word in vocab:
            text_vec[index] = vocab_dict[word]
    print(f'One hot vector of your text based on corpus:\n {text_vec}')

In [None]:
text_to_onehot('man eats biscuits and scares dog','corpus.txt')

No. of words in our vocabulary: 5
['dog', 'eats', 'biscuits', 'man', 'bites'] 

One hot vector of your text based on corpus:
 tensor([[0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]])


**Words in our text which are not in corpus gets all zero one-hot vector. Thats OOV**.

# BOW Model using sklearn

In [None]:
processed_docs = []
with open('corpus.txt', encoding="utf8") as f:
    passage = f.read()
processed_docs = passage.lower().replace(".","").split('\n')
processed_docs

['dog eats biscuits ', 'man eats biscuits ', 'dog bites man ', 'man bites dog']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#look at the documents list
print("Our corpus: ", processed_docs)

count_vect = CountVectorizer()
#Build a BOW representation for the corpus
bow_rep = count_vect.fit_transform(processed_docs)

#Look at the vocabulary mapping
print("Our vocabulary: ", count_vect.vocabulary_)

#see the BOW rep for first 2 documents
print("BoW representation for 'dog bites man': ", bow_rep[0].toarray())
print("BoW representation for 'man bites dog: ", bow_rep[1].toarray())

#Get the representation using this vocabulary, for a new text
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation:", temp.toarray())

Our corpus:  ['dog eats biscuits', 'man eats biscuits', 'dog bites man', 'man bites dog']
Our vocabulary:  {'dog': 2, 'eats': 3, 'biscuits': 0, 'man': 4, 'bites': 1}
BoW representation for 'dog bites man':  [[1 0 1 1 0]]
BoW representation for 'man bites dog:  [[1 0 0 1 1]]
Bow representation: [[0 0 2 0 0]]


Here we are considering the frequency of words into account. However, sometimes, we don't care about frequency much, but only want to know whether a word appeared in a text or not. That is, each document is represented as a vector of 0s and 1s. We use the option binary=True in CountVectorizer for this purpose. This results in a different representation for the same sentence.

# Bag of N-grams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


#Ngram vectorization example with count vectorizer and uni, bi, trigrams
count_vect = CountVectorizer(ngram_range=(1,3))

#Build a BOW representation for the corpus
bow_rep = count_vect.fit_transform(processed_docs)

#Look at the vocabulary mapping
print("Our vocabulary: ", count_vect.vocabulary_)

#see the BOW rep for first 2 documents
print("BoW representation for 'dog bites man': ", bow_rep[0].toarray())
print("BoW representation for 'man bites dog: ", bow_rep[1].toarray())

#Get the representation using this vocabulary, for a new text
temp = count_vect.transform(["dog and dog are friends"])

print("Bow representation:", temp.toarray())

Our vocabulary:  {'dog': 4, 'eats': 9, 'biscuits': 0, 'dog eats': 7, 'eats biscuits': 10, 'dog eats biscuits': 8, 'man': 11, 'man eats': 14, 'man eats biscuits': 15, 'bites': 1, 'dog bites': 5, 'bites man': 3, 'dog bites man': 6, 'man bites': 12, 'bites dog': 2, 'man bites dog': 13}
BoW representation for 'dog bites man':  [[1 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0]]
BoW representation for 'man bites dog:  [[1 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1]]
Bow representation: [[0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0]]


Note that the number of features (and hence the size of the feature vector) increased a lot for the same data, compared to the ther single word based representations!!

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
bow_rep_tfidf = tfidf.fit_transform(processed_docs)

# IDF for all words in the vocabulary
print("IDF for all words in the vocabulary",tfidf.idf_)
print("-"*10)

# All words in the vocabulary.
print("All words in the vocabulary",tfidf.get_feature_names())
print("-"*10)

# TFIDF representation for all documents in our corpus 
print("TFIDF representation for all documents in our corpus\n",bow_rep_tfidf.toarray()) 
print("-"*10)

temp = tfidf.transform(["dog and man are friends"])
print("Tfidf representation:\n", temp.toarray())

IDF for all words in the vocabulary [1.51082562 1.51082562 1.22314355 1.51082562 1.22314355]
----------
All words in the vocabulary ['biscuits', 'bites', 'dog', 'eats', 'man']
----------
TFIDF representation for all documents in our corpus
 [[0.61366674 0.         0.49681612 0.61366674 0.        ]
 [0.61366674 0.         0.         0.61366674 0.49681612]
 [0.         0.65782931 0.53256952 0.         0.53256952]
 [0.         0.65782931 0.53256952 0.         0.53256952]]
----------
Tfidf representation:
 [[0.         0.         0.70710678 0.         0.70710678]]


# Pre-trained word2vec model

Google News Dataset

In [None]:
!wget -P /tmp/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-10-19 12:04:08--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.250.62
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.250.62|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘/tmp/input/GoogleNews-vectors-negative300.bin.gz’


2020-10-19 12:04:26 (88.0 MB/s) - ‘/tmp/input/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [None]:
import warnings # This module ignores the various types of warnings generated
warnings.filterwarnings("ignore") 

import os # This module provides a way of using operating system dependent functionality

import psutil # This module helps in retrieving information on running processes and system resource utilization
process = psutil.Process(os.getpid())
from psutil import virtual_memory
mem = virtual_memory()

import time # This module is used to calculate the time

In [None]:
from gensim.models import Word2Vec, KeyedVectors
pretrainedpath = '/tmp/input/GoogleNews-vectors-negative300.bin.gz'

# Load W2V model. This will take some time, but it is a one time effort! 
pre = process.memory_info().rss
print("Memory used in GB before Loading the Model: %0.2f"%float(pre/(10**9))) #Check memory usage before loading the model
print('-'*10)

start_time = time.time() # Start the timer
ttl = mem.total # Total memory available

w2v_model = KeyedVectors.load_word2vec_format(pretrainedpath, binary=True) # load the model
print("%0.2f seconds taken to load"%float(time.time() - start_time)) # Calculate the total time elapsed since starting the timer
print('-'*10)

print('Finished loading Word2Vec')
print('-'*10)

post = process.memory_info().rss
print("Memory used in GB after Loading the Model: {:.2f}".format(float(post/(10**9)))) # Calculate the memory used after loading the model
print('-'*10)

print("Percentage increase in memory usage: {:.2f}% ".format(float((post/pre)*100))) # Percentage increase in memory after loading the model
print('-'*10)

print("Numver of words in vocablulary: ",len(w2v_model.vocab)) # Number of words in the vocabulary.

Memory used in GB before Loading the Model: 0.17
----------
123.91 seconds taken to load
----------
Finished loading Word2Vec
----------
Memory used in GB after Loading the Model: 5.01
----------
Percentage increase in memory usage: 3027.79% 
----------
Numver of words in vocablulary:  3000000


In [None]:
#Let us examine the model by knowing what the most similar words are, for a given word!
w2v_model.most_similar('beautiful')

[('gorgeous', 0.8353004455566406),
 ('lovely', 0.810693621635437),
 ('stunningly_beautiful', 0.7329413890838623),
 ('breathtakingly_beautiful', 0.7231341004371643),
 ('wonderful', 0.6854087114334106),
 ('fabulous', 0.6700063943862915),
 ('loveliest', 0.6612576246261597),
 ('prettiest', 0.6595001816749573),
 ('beatiful', 0.6593326330184937),
 ('magnificent', 0.6591402292251587)]

In [None]:
# What if I am looking for a word that is not in this vocabulary?
w2v_model['kaunhotum?']

KeyError: ignored

# Train our Embedding on WikiCorpus using GenSim

In [None]:
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')

In [None]:
# define training data
# Genism word2vec requires that a format of ‘list of lists’ be provided for training where every document contained in a list.
# Every list contains lists of tokens of that document.
corpus = [["dog","eats","biscuits"],["man", "eats","biscuits"],['dog','bites','man'], ["man", "bites" ,"dog"]]

#Training the model
model_cbow = Word2Vec(corpus, min_count=1,sg=0) # using CBOW Architecture for trainnig
model_skipgram = Word2Vec(corpus, min_count=1,sg=1)# using skipGram Architecture for training

### Continuous Bag of Words (CBOW)

In [None]:
#Summarize the loaded model
print(model_cbow)

#Summarize vocabulary
words = list(model_cbow.wv.vocab)
print(words)

#Acess vector for one word
print(model_cbow['dog'])

Word2Vec(vocab=5, size=100, alpha=0.025)
['dog', 'eats', 'biscuits', 'man', 'bites']
[-2.4027475e-03 -3.6447307e-03 -2.5443786e-03 -4.1847485e-03
  3.4096520e-04  3.8469373e-03  1.8951070e-03  2.2128995e-03
 -2.9074475e-03 -1.7413594e-03  3.1497036e-03 -2.7298685e-03
  7.7649608e-04  4.7321985e-03  2.3332597e-03 -3.6133968e-03
  1.3149050e-03 -2.8958269e-03  2.6008310e-03  4.3246713e-03
 -2.4437143e-03  1.7236971e-03  2.1966003e-04  3.1828312e-03
 -2.0580818e-03 -4.7907140e-03  4.7229971e-03 -2.5205074e-03
  2.5771598e-03  3.5903896e-03  4.5418474e-03 -2.8112877e-04
  2.3598003e-03 -1.4024939e-03 -2.3889376e-03  5.6103169e-04
  3.2694149e-03 -4.5257406e-03 -2.4695154e-03  3.8101565e-04
  2.5956191e-03 -3.1432405e-03  1.3197927e-03  3.2038018e-04
 -1.3719224e-03  4.2563415e-04  3.5662181e-04 -9.2187780e-04
  1.6659779e-04  3.8907481e-03 -3.2526883e-03  2.4699846e-03
  3.8279819e-03 -2.9219778e-03 -2.3369635e-03 -2.8545228e-03
 -3.0512323e-03  2.6372201e-03  2.9952941e-03 -2.5206900e-03


In [None]:
#Compute similarity 
print("Similarity between eats and bites:",model_cbow.similarity('eats', 'bites'))
print("Similarity between eats and man:",model_cbow.similarity('eats', 'man'))

Similarity between eats and bites: 0.16608451
Similarity between eats and man: 0.076966956


From the above similarity scores we can conclude that eats is more similar to bites than man.

### SkipGram

In [None]:
#Summarize the loaded model
print(model_skipgram)

#Summarize vocabulary
words = list(model_skipgram.wv.vocab)
print(words)

#Acess vector for one word
print(model_skipgram['dog'])

Word2Vec(vocab=5, size=100, alpha=0.025)
['dog', 'eats', 'biscuits', 'man', 'bites']
[-2.4027475e-03 -3.6447307e-03 -2.5443786e-03 -4.1847485e-03
  3.4096520e-04  3.8469373e-03  1.8951070e-03  2.2128995e-03
 -2.9074475e-03 -1.7413594e-03  3.1497036e-03 -2.7298685e-03
  7.7649608e-04  4.7321985e-03  2.3332597e-03 -3.6133968e-03
  1.3149050e-03 -2.8958269e-03  2.6008310e-03  4.3246713e-03
 -2.4437143e-03  1.7236971e-03  2.1966003e-04  3.1828312e-03
 -2.0580818e-03 -4.7907140e-03  4.7229971e-03 -2.5205074e-03
  2.5771598e-03  3.5903896e-03  4.5418474e-03 -2.8112877e-04
  2.3598003e-03 -1.4024939e-03 -2.3889376e-03  5.6103169e-04
  3.2694149e-03 -4.5257406e-03 -2.4695154e-03  3.8101565e-04
  2.5956191e-03 -3.1432405e-03  1.3197927e-03  3.2038018e-04
 -1.3719224e-03  4.2563415e-04  3.5662181e-04 -9.2187780e-04
  1.6659779e-04  3.8907481e-03 -3.2526883e-03  2.4699846e-03
  3.8279819e-03 -2.9219778e-03 -2.3369635e-03 -2.8545228e-03
 -3.0512323e-03  2.6372201e-03  2.9952941e-03 -2.5206900e-03


In [None]:
# Compute similarity 
print("Similarity between eats and bites:",model_skipgram.similarity('eats', 'bites'))
print("Similarity between eats and man:",model_skipgram.similarity('eats', 'man'))

Similarity between eats and bites: 0.16608451
Similarity between eats and man: 0.076966956


### Training Your Embedding on Wiki Corpus

In [None]:
!mkdir -p data/en/
!wget -P data/en/ https://dumps.wikimedia.org/enwiki/20201001/enwiki-20201001-pages-articles-multistream14.xml-p13159683p14324602.bz2

--2020-10-19 13:01:51--  https://dumps.wikimedia.org/enwiki/20201001/enwiki-20201001-pages-articles-multistream14.xml-p13159683p14324602.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.7, 2620:0:861:1:208:80:154:7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 290813119 (277M) [application/octet-stream]
Saving to: ‘data/en/enwiki-20201001-pages-articles-multistream14.xml-p13159683p14324602.bz2’


2020-10-19 13:02:47 (4.99 MB/s) - ‘data/en/enwiki-20201001-pages-articles-multistream14.xml-p13159683p14324602.bz2’ saved [290813119/290813119]



In [None]:
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
import time

In [None]:
#Preparing the Training data
wiki = WikiCorpus('data/en/enwiki-20201001-pages-articles-multistream14.xml-p13159683p14324602.bz2', 
                  lemmatize=False, dictionary={})
sentences = list(wiki.get_texts())

#### Hyperparameters
sg - Selecting the training algorithm: 1 for skip-gram else its 0 for CBOW. Default is CBOW.

min_count- Ignores all words with total frequency lower than this.

### CBOW


In [None]:
start = time.time()
word2vec_cbow = Word2Vec(sentences,min_count=10, sg=0)
end = time.time()

print("CBOW Model Training Complete.\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

CBOW Model Training Complete.
Time taken for training is:0.11 hrs 


In [None]:
#Summarize the loaded model
print(word2vec_cbow)
print("-"*30)

#Summarize vocabulary
words = list(word2vec_cbow.wv.vocab)
print(words)
print("-"*30)

#Acess vector for one word
print(word2vec_cbow['film'])
print("-"*30)

#Compute similarity 
print("Similarity between film and drama:",word2vec_cbow.similarity('film', 'drama'))
print("Similarity between film and dog:",word2vec_cbow.similarity('film', 'dog'))
print("-"*30)

Word2Vec(vocab=110266, size=100, alpha=0.025)
------------------------------
------------------------------
[ 2.7565804  -1.8998233   0.24396713 -5.0423594   1.8422738  -0.65177923
 -0.381312   -3.5065     -0.17528316  0.42131978 -3.004592   -0.03992306
  1.0394735  -0.8676751  -2.662261   -5.280323   -1.9426721   0.870471
 -2.0519125  -1.0287151   0.96150005  0.19526835 -2.1714604  -0.24984294
 -1.3098907   3.0635867  -0.334538    0.5344503   2.5159805   2.7793784
 -2.7574193  -0.32226488  2.4316955   1.5914464   0.4714155   2.0277789
 -0.24139051  3.060302    0.2473282   0.9530375   0.01355092 -0.17880818
  0.332892   -1.3293512   0.84251946 -2.766573    4.6083274  -0.7066732
 -1.4106652  -5.053077    1.9278134  -1.2711664  -1.6480677   1.2864221
  0.01805678 -2.0292292   2.2416604  -1.4383305   1.484246   -2.8953426
  2.842085    1.1541405  -1.109909    1.1531509  -2.1794612  -0.36965168
  1.0378665  -1.457691    1.6681366   1.1144332   0.75728685  1.0150515
  2.1407397   2.0408773 

In [None]:
# save model
from gensim.models import Word2Vec, KeyedVectors   
word2vec_cbow.wv.save_word2vec_format('word2vec_cbow.bin', binary=True)

# # load model
# new_modelword2vec_cbow = Word2Vec.load('word2vec_cbow.bin')
# print(word2vec_cbow)

### SkipGram

In [None]:
start = time.time()
word2vec_skipgram = Word2Vec(sentences,min_count=10, sg=1)
end = time.time()

print("SkipGram Model Training Complete\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

SkipGram Model Training Complete
Time taken for training is:0.37 hrs 


In [None]:
#Summarize the loaded model
print(word2vec_skipgram)
print("-"*30)

#Summarize vocabulary
words = list(word2vec_skipgram.wv.vocab)
print(words)
print("-"*30)

#Acess vector for one word
print(word2vec_skipgram['film'])
print("-"*30)

#Compute similarity 
print("Similarity between film and drama:", word2vec_skipgram.similarity('film', 'drama'))
print("Similarity between film and dog:",word2vec_skipgram.similarity('film', 'dog'))
print("-"*30)

Word2Vec(vocab=110266, size=100, alpha=0.025)
------------------------------
------------------------------
[ 0.2527833   0.07515053  0.7889645  -0.507733    0.13911958  0.3110742
  0.74493986 -0.3381282   0.12717879  0.24379423 -0.14013283  0.0083418
 -0.44349337 -0.1409569  -0.44693506 -0.1847089  -0.4280918  -0.52475286
 -0.03781946 -0.08712761  0.14771219  0.10999966 -0.24923426 -0.7507172
  0.6672947  -0.26096573  0.09615204 -0.03629035  0.16850083 -0.05096848
 -0.5004212  -0.21589129 -0.00953319 -0.06309711 -0.39762866 -0.09908433
  0.04289831  0.40278488  0.07224961 -0.56756383 -0.2571967  -0.57452816
 -0.00802177  0.178831    0.1296128  -0.31905633  0.17212953  0.40551314
 -0.4912367  -0.8221196   0.19269785 -0.14150034 -0.28215203 -0.31388947
 -0.05485747  0.18723874 -0.2309089   0.25268131 -0.24869195 -0.48261717
 -0.02445791 -0.15431958 -0.41577035 -0.03329036  0.03251792  0.4148097
  0.18134974 -0.2828539   0.35029912  0.36573496  0.02962436 -0.34553134
 -0.66009223  0.6483

**Skipgram took more time to train than BOW, any guess??**