In [49]:
import pickle
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tensorflow as tf
import torch
from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [3]:
# Mount Google Drive inside the Colab runtime; the pickles read and written
# by this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the pre-processed documents from Drive.
# (the "_lem" suffix suggests lemmatised rather than stemmed text - TODO confirm)
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open("/content/drive/MyDrive/data_lem.pkl", 'rb') as pickle_file:
    data_lem = pickle.load(pickle_file)

In [5]:
# Build the corpus: every document becomes a list of whitespace-separated tokens.
corpus_gen = []
for text in data_lem:
    corpus_gen.append(text.split())

In [8]:
# Train a Word2Vec model on the tokenised corpus.
W2V_VECTOR_SIZE = 100  # dimensionality of each word vector
W2V_MIN_COUNT = 566    # minimum total frequency for a token to enter the vocabulary
                       # (the reason for this threshold is not stated in this notebook)
model = Word2Vec(corpus_gen, size = W2V_VECTOR_SIZE, min_count = W2V_MIN_COUNT)

In [9]:
# Persist the trained model. The path is relative, so the file is written to the
# current working directory rather than to the mounted Drive used for the pickles.
model.save("word2vec.model")

In [10]:
# Display the tokens that made it into the model's vocabulary.
model.wv.vocab.keys()
#from exercise 3 for comparison (note: exercise 3 was stemmed data)
#dict_keys(['call', 'car', 'dai', 'engin', 'info', 'know', 'look', 'mail', 'small', 'thank', 'wonder', 'year', 'answer', 'base'

#not identical, but quite similar

dict_keys(['car', 'wonder', 'see', 'day', 'look', 'early', 'call', 'small', 'body', 'know', 'model', 'year', 'history', 'info', 'mail', 'thank', 'report', 'number', 'experience', 'send', 'message', 'speed', 'rate', 'add', 'card', 'disk', 'request', 'network', 'base', 'answer', 'guy', 'washington', 'edu', 'question', 'mac', 'give', 'start', 'life', 'way', 'new', 'machine', 'bit', 'maybe', 'anybody', 'expect', 'hear', 'suppose', 'access', 'price', 'line', 'like', 'go', 'display', 'probably', 'get', 'feel', 'well', 'great', 'good', 'opinion', 'people', 'use', 'take', 'size', 'money', 'hit', 'real', 'play', 'figure', 'actually', 'advance', 'email', 'post', 'news', 'time', 'truth', 'uucp', 'write', 'article', 'chip', 'far', 'low', 'level', 'stuff', 'pretty', 'nice', 'require', 'address', 'phone', 'information', 'com', 'thing', 'person', 'sense', 'world', 'clear', 'memory', 'error', 'yes', 'quote', 'software', 'check', 'right', 'value', 'set', 'code', 'tell', 'second', 'idea', 'apr', 'john',

In [24]:
# Sanity check: count how often "car" occurs among the vocabulary keys.
list(model.wv.vocab.keys()).count("car")

1

In [13]:
# Number of distinct tokens in the vocabulary.
len(model.wv.vocab)

430

In [12]:
# Look up the word vector for "car" once and show it together with its length.
# The length equals the `size` specified when the model was created.
car_vector = model.wv["car"]
car_vector, len(car_vector)

(array([-0.7688538 , -0.57045364,  0.93715423,  1.0327036 ,  1.5292839 ,
        -0.5854819 ,  0.7999106 , -1.5903491 ,  0.19504394, -1.0066195 ,
        -0.778624  ,  1.9578811 , -0.16155276, -0.86648524, -1.5033013 ,
         0.2883837 , -0.24667431,  0.7404138 , -0.48677817,  1.6359075 ,
         0.26505637,  1.3500357 , -1.4137801 , -0.9562595 ,  0.36940202,
        -0.63888687,  1.4475542 ,  0.69659925,  0.8651052 ,  1.3410618 ,
         0.3179075 , -1.3100004 ,  0.5637202 ,  0.19256493,  0.00328407,
         0.16411987,  0.3021263 ,  0.5035178 ,  0.2511811 ,  1.1637733 ,
         0.03522966,  1.2979157 ,  0.3174995 ,  0.382272  ,  0.86844105,
         0.5246218 , -0.4062733 , -0.04315323, -0.46838313, -0.05402102,
         0.96688247,  0.40157902, -1.6837243 , -0.9279718 , -0.15396546,
         0.45112318,  0.26185328,  0.11979344,  0.9996492 , -0.06004533,
        -1.6735586 , -0.7739195 ,  0.29014513,  1.2851517 ,  1.4389242 ,
         0.01577799,  0.31093442,  1.3179872 ,  0.9

In [16]:
# Query the single nearest vocabulary word for the combination "bike" + "machine".
model.wv.most_similar(positive=["bike","machine"], topn=1)
#shows the most similar word to both bike & machine
#(one would have expected car in there maybe?)

[('stuff', 0.6519047021865845)]

In [18]:
# Same query as before, but return the five nearest words.
model.wv.most_similar(positive=["bike","machine"], topn=5)
#these all seem reasonable; although running a machine has a different meaning than running a bike

[('stuff', 0.6519047021865845),
 ('buy', 0.6404200792312622),
 ('run', 0.6102454662322998),
 ('fast', 0.6102071404457092),
 ('speed', 0.6068271398544312)]

In [20]:
#different words: five nearest neighbours of "car" + "house"
model.wv.most_similar(positive=["car","house"], topn=5)
#these all seem reasonable; while some are referring to a king ("police"), some are associative to both words ("bike")
# NOTE(review): "king" above may be a typo (perhaps "kind") - confirm with the author

[('police', 0.5742180347442627),
 ('white', 0.5732223391532898),
 ('black', 0.5005381107330322),
 ('bike', 0.47213566303253174),
 ('go', 0.47046229243278503)]

In [36]:
# Check whether 'enlighten' (a token of the first document) is in the vocabulary;
# a count of 0 means it is absent (e.g. dropped by the min_count threshold).
list(model.wv.vocab.keys()).count('enlighten')

0

In [81]:
# Represent every document by the mean of the Word2Vec vectors of its
# in-vocabulary tokens.
# NOTE: documents without a single in-vocabulary token are skipped, so when that
# happens the positions in corpus_embedded no longer line up one-to-one with
# corpus_gen (relevant if the rows are later joined with per-document labels).
word_vectors = model.wv
corpus_embedded = []
for doc in corpus_gen:
  # Dict membership is a constant-time lookup. The previous
  # list(model.wv.vocab.keys()).count(word) rebuilt and scanned the entire
  # vocabulary for every token of every document.
  docs = [word_vectors[word] for word in doc if word in word_vectors.vocab]
  if len(docs) != 0: corpus_embedded.append(np.mean(docs, axis = 0)) #len 0 is created by docs that have no words that are in the model!

In [83]:
# Tabulate the document embeddings: one row per embedded document,
# one column per embedding dimension. Show the first rows as a preview.
WordtoVecModel = pd.DataFrame(corpus_embedded)
WordtoVecModel.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.250444,-0.145671,0.3686,0.077028,0.247628,-0.101664,0.102999,-0.641151,0.25556,-0.547234,...,0.08916,-0.340357,-0.084376,-0.322424,0.151908,-0.240132,-0.108613,-0.291957,0.285783,-0.365435
1,-0.189367,-0.439202,0.20579,-0.049759,0.052459,-0.114537,-0.103828,0.09227,0.526709,-0.040958,...,-0.47363,-0.289011,0.233007,-0.236756,-0.132272,-0.138544,-0.382944,-0.608249,-0.160897,-0.0473
2,-0.28922,0.03407,0.028316,-0.176336,-0.211872,-0.069242,0.054079,-0.224646,0.253906,-0.054512,...,0.096934,-0.379228,0.221882,-0.186829,-0.080898,0.099022,-0.067428,-0.308073,-0.030986,-0.034373
3,-0.19214,-0.214018,-0.131627,0.146716,-0.098666,-0.102768,-0.192296,-0.115287,-0.164313,0.233914,...,-0.150026,0.152291,0.255472,-0.199071,0.181075,-0.002915,0.041878,-0.225609,-0.379897,-0.169778
4,-0.328924,0.028505,0.210276,-0.069066,-0.263026,-0.245952,-0.343816,-0.085552,-0.030569,0.268099,...,-0.226007,0.000967,0.135864,-0.08156,-0.074654,0.401027,-0.040739,-0.07363,0.019213,-0.07208


In [85]:
# Save the Word2Vec document-embedding DataFrame to Drive as .pkl
with open('/content/drive/MyDrive/df_w2v.pkl', 'wb') as w2v_file:
    pickle.dump(WordtoVecModel, w2v_file)

In [86]:
# First row of the table: the mean embedding of the first embedded document.
WordtoVecModel.iloc[0,:]

0    -0.250444
1    -0.145671
2     0.368600
3     0.077028
4     0.247628
        ...   
95   -0.240132
96   -0.108613
97   -0.291957
98    0.285783
99   -0.365435
Name: 0, Length: 100, dtype: float32

In [103]:
# Find the vocabulary word whose vector is most cosine-similar to the mean
# embedding of the first document. (Props to Philipp for the original one-liner.)
first_doc_vector = WordtoVecModel.iloc[0,:]
similarity_to_words = model.wv.cosine_similarities(first_doc_vector, model.wv.vectors)
index = np.argmax(similarity_to_words)
index, model.wv.index2word[index]

(60, 'car')

# Part 2

In [120]:
# Wrap every tokenised document in a TaggedDocument, tagged with its position
# in corpus_gen.
corpus_tagged = [
  TaggedDocument(doc_tokens, [doc_index])
  for doc_index, doc_tokens in enumerate(corpus_gen)
]

In [121]:
# Peek at the first four tagged documents.
corpus_tagged[:4]

[TaggedDocument(words=['car', 'wonder', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'small', 'addition', 'bumper', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'engine', 'specs', 'year', 'production', 'car', 'history', 'info', 'funky', 'looking', 'car', 'mail', 'thank'], tags=[0]),
 TaggedDocument(words=['clock', 'poll', 'final', 'final', 'clock', 'report', 'acceleration', 'clock', 'upgrade', 'fair', 'number', 'brave', 'soul', 'upgrade', 'clock', 'oscillator', 'share', 'experience', 'poll', 'send', 'brief', 'message', 'detailing', 'experience', 'procedure', 'speed', 'attain', 'cpu', 'rate', 'speed', 'add', 'card', 'adapters', 'heat', 'sink', 'hour', 'usage', 'day', 'floppy', 'disk', 'functionality', 'floppy', 'especially', 'request', 'summarize', 'day', 'add', 'network', 'knowledge', 'base', 'clock', 'upgrade', 'haven', 'answer', 'poll', 'thank', 'guy', 'kuo', 'guykuo', 'washington', 'edu'], tags=[1]),
 Tagged

In [122]:
# Train a Doc2Vec model on the tagged corpus, using the same vector size and
# frequency threshold as the Word2Vec model.
D2V_VECTOR_SIZE = 100
D2V_MIN_COUNT = 566
tagged_model = Doc2Vec(corpus_tagged, vector_size = D2V_VECTOR_SIZE, min_count = D2V_MIN_COUNT)

In [131]:
# Infer the embedding of the first document from its token list
# (`.words` is the first field of a TaggedDocument).
first_doc_tokens = corpus_tagged[0].words
inf_vec = tagged_model.infer_vector(first_doc_tokens)

In [127]:
# Show the inferred Doc2Vec vector of the first document.
inf_vec

array([-0.00982174, -0.03790557,  0.08960464,  0.05358515,  0.07837495,
       -0.08305655,  0.00857049, -0.06249359,  0.00467596, -0.04073333,
       -0.00561744,  0.07903958,  0.03598593,  0.04725059, -0.03266205,
       -0.00085888,  0.05856785,  0.03837515, -0.06242437,  0.12611246,
        0.04323708,  0.00097824, -0.07547297, -0.07893962,  0.04142608,
        0.01708958,  0.0457562 , -0.01369079,  0.00346038,  0.08628903,
       -0.01616589, -0.00733301, -0.0738913 ,  0.07258812,  0.03019334,
       -0.07906841,  0.02202205,  0.06804267, -0.0565364 , -0.00689948,
       -0.07738908,  0.05986636, -0.01849759,  0.09466633,  0.05493022,
        0.0085044 , -0.07182146, -0.02088423, -0.08697141, -0.07794274,
        0.06348237,  0.02667676, -0.1432374 , -0.05611106,  0.00245078,
        0.02615167, -0.0240259 ,  0.07789084,  0.04461633, -0.00860584,
       -0.10254473, -0.03691643,  0.0495472 ,  0.06145537, -0.00442888,
        0.014801  , -0.01124022,  0.11131298,  0.05582102, -0.01

In [128]:
# Show the Word2Vec mean embedding of the first embedded document
# (presumably for comparison with inf_vec).
corpus_embedded[0]

array([-2.50443995e-01, -1.45671219e-01,  3.68599772e-01,  7.70283341e-02,
        2.47628212e-01, -1.01663612e-01,  1.02999248e-01, -6.41150653e-01,
        2.55560011e-01, -5.47234058e-01, -5.18729448e-01,  4.13301289e-01,
       -1.06957182e-01, -1.74564719e-01, -3.68398488e-01,  1.75920278e-01,
        3.46154310e-02, -1.85210782e-03, -2.89226919e-01,  7.48682201e-01,
       -4.21790481e-02,  2.78001338e-01, -5.20258307e-01, -2.72893369e-01,
        1.41813368e-01, -6.78571314e-02,  4.56432581e-01,  1.66515917e-01,
        2.66882449e-01,  4.15113449e-01,  2.03316644e-01, -3.10972512e-01,
        1.44162089e-01,  1.29583329e-01,  6.80604354e-02,  2.94016600e-02,
        2.40566418e-01,  3.65773499e-01, -4.03370857e-01,  3.85075808e-01,
       -4.55330871e-02,  2.70523101e-01,  2.16713503e-01,  2.46971369e-01,
        9.72816199e-02,  1.49014980e-01, -2.43425816e-01, -2.31367588e-01,
       -4.03888762e-01, -3.73605788e-01,  2.89890707e-01,  3.47398877e-01,
       -9.73893821e-01, -

In [134]:
# Infer one Doc2Vec embedding per document from its tokens; the tags are not needed here.
DoctoVecModel = [tagged_model.infer_vector(tagged_doc.words) for tagged_doc in corpus_tagged]

In [135]:
# Save the list of inferred Doc2Vec document vectors to Drive as .pkl
with open('/content/drive/MyDrive/df_d2v.pkl', 'wb') as d2v_file:
    pickle.dump(DoctoVecModel, d2v_file)