# Word2Vec Model - Self-Trained

In [40]:
import numpy as np
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Toy corpus: three short coffee reviews, one string per document.
corpus = [
    'this coffee is very bad and is expensive',
    'this coffee is not bad and is cheap',
    'this coffee is not amazing and is not affordable',
]

In [33]:
# Tokenize every review on whitespace: one list of words per document,
# in the same order as `corpus`.
list_of_sentance = [sentance.split() for sentance in corpus]
print(list_of_sentance)

[['this', 'coffee', 'is', 'very', 'bad', 'and', 'is', 'expensive'], ['this', 'coffee', 'is', 'not', 'bad', 'and', 'is', 'cheap'], ['this', 'coffee', 'is', 'not', 'amazing', 'and', 'is', 'not', 'affordable']]


In [46]:
# Train a small Word2Vec model on the tokenized reviews.
#   vector_size=100 : stated explicitly because downstream cells build sentence vectors of this width
#   min_count=1     : keep every word; the toy corpus is far too small to drop rare words
#   seed, workers=1 : a fixed seed plus a single worker thread removes the thread-scheduling
#                     jitter that otherwise makes the learned vectors differ from run to run
#                     (exact reproducibility across interpreter launches also needs PYTHONHASHSEED)
w2v_model = Word2Vec(list_of_sentance, vector_size=100, min_count=1, seed=1, workers=1)

# Sanity check: the model can return nearest neighbours for a known word.
print(w2v_model.wv.most_similar('cheap'))

# All words the model holds a vector for.
w2v_words = list(w2v_model.wv.index_to_key)
print(w2v_words)

[('and', 0.19918270409107208), ('coffee', 0.0749121904373169), ('bad', 0.060690056532621384), ('expensive', 0.044891368597745895), ('not', 0.03371307626366615), ('is', 0.027107110247015953), ('very', 0.02671417035162449), ('affordable', 0.008842861279845238), ('this', -0.0684460774064064), ('amazing', -0.1445155292749405)]
['is', 'not', 'and', 'coffee', 'this', 'bad', 'affordable', 'amazing', 'cheap', 'expensive', 'very']


# Average Word2Vec Model

In [48]:

# Average Word2Vec: represent each review by the unweighted mean of its word vectors.
vector_size = w2v_model.wv.vector_size  # read the width from the model instead of hard-coding 100
known_words = set(w2v_words)            # set membership is O(1); scanning the list is O(vocabulary) per word

sent_vectors = []  # one averaged vector per sentence/review, in corpus order
for sent in tqdm(list_of_sentance):
    sent_vec = np.zeros(vector_size)  # running sum of the word vectors in this review
    cnt_words = 0                     # number of words in this review that have a vector
    for word in sent:
        if word in known_words:
            sent_vec += w2v_model.wv[word]
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words  # stays all-zero when no word of the review is in the vocabulary
    sent_vectors.append(sent_vec)

print(len(sent_vectors))     # number of rows: one per document in the corpus
print(len(sent_vectors[0]))  # number of columns: the Word2Vec vector size

100%|██████████| 3/3 [00:00<00:00, 3002.37it/s]

3
100





# TF-IDF Weighted Word2Vec

In [49]:
# Fit a TF-IDF vectorizer on the raw reviews to obtain the IDF value of each word.
model = TfidfVectorizer()
model.fit(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2. Prefer `get_feature_names_out()`
# and fall back to the old name only on releases that predate it.
if hasattr(model, "get_feature_names_out"):
    feature_names = model.get_feature_names_out()
else:
    feature_names = model.get_feature_names()

# Build a lookup with the word as key and its IDF as value.
dictionary = dict(zip(feature_names, list(model.idf_)))

In [43]:
# TF-IDF weighted Word2Vec: each review becomes a weighted mean of its word vectors,
# where the weight of a word is  idf(word) * tf(word in this review).

# Vocabulary (column names) of the fitted TfidfVectorizer. `get_feature_names()` was
# removed in scikit-learn 1.2, so use `get_feature_names_out()` whenever it exists.
if hasattr(model, "get_feature_names_out"):
    tfidf_feat = list(model.get_feature_names_out())
else:
    tfidf_feat = list(model.get_feature_names())

# Only words known to BOTH models can be weighted. A set gives O(1) membership tests
# instead of scanning two lists for every word.
usable_words = set(w2v_words).intersection(tfidf_feat)
vector_size = w2v_model.wv.vector_size  # read the width from the model instead of hard-coding 100

tfidf_sent_vectors = []  # one TF-IDF weighted vector per sentence/review, in corpus order
for sent in tqdm(list_of_sentance):
    sent_vec = np.zeros(vector_size)  # running weighted sum of word vectors
    weight_sum = 0                    # running sum of the weights applied
    for word in sent:
        if word in usable_words:
            vec = w2v_model.wv[word]
            # dictionary[word]             -> IDF of the word over the whole corpus
            # sent.count(word) / len(sent) -> term frequency of the word in this review
            # NOTE(review): the loop visits every occurrence, so a word repeated k times
            # contributes its TF-IDF weight k times - confirm this is intended.
            tf_idf = dictionary[word] * (sent.count(word) / len(sent))
            sent_vec += vec * tf_idf
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum  # normalise by total weight; stays all-zero when no word was usable
    tfidf_sent_vectors.append(sent_vec)

100%|██████████| 3/3 [00:00<00:00, 2997.36it/s]


In [50]:
# Number of TF-IDF weighted sentence vectors produced (one is appended per review).
print(len(tfidf_sent_vectors))   # check the number of rows

3


In [51]:
# Spot-check: show the TF-IDF weighted Word2Vec vector of the first review.
print(tfidf_sent_vectors[0])    # index 0 = review 1

[-2.17668086e-03  2.87485251e-04  4.60832264e-04  1.41068221e-03
 -4.30968042e-03 -3.77369269e-03  1.54168978e-03  5.95426867e-03
 -3.95025615e-03 -3.29107748e-03  3.29879719e-03 -1.19683723e-03
 -9.02379144e-04  3.46718890e-04 -8.91748690e-04 -6.62799837e-04
  2.16563512e-03 -1.26463881e-03 -2.69984581e-03 -5.88545786e-03
  8.32185579e-04  5.86956014e-04  4.68625108e-03  1.17613927e-03
  1.20826747e-03 -1.68935600e-03  1.55392750e-04  2.99458320e-03
 -2.85441464e-03 -3.26503726e-04 -1.46479376e-04 -1.32396257e-03
  3.57426716e-03 -2.12316555e-03 -7.74603864e-04  1.88076661e-03
  4.57043870e-03 -1.06206014e-03 -8.51976531e-04 -5.00718657e-04
 -1.84547632e-03  1.71285769e-03 -5.54864535e-03 -2.24105491e-04
  3.83053012e-04 -1.10052999e-03 -2.90348600e-03  4.36247947e-03
  8.90942183e-04  3.31094123e-03 -2.05762030e-03  1.76389095e-03
 -2.39405994e-03  8.00728418e-04  1.44202046e-03 -3.57695446e-03
  4.61037317e-04 -4.12075947e-03 -4.09142425e-04  1.53882782e-03
 -5.06672412e-04  2.53423