In [2]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

In [None]:

#! Loading preprocessed reviews
# Each element of `cleaned` is one line of the corpus file, with its line
# terminator still attached (nothing here strips it).
with open("/corpus_/corpus_trimmed", "r") as corpus_file:
    cleaned = list(corpus_file)
# Number of lines read.
len(cleaned)

In [26]:
# Preview the first four entries of the loaded corpus.
cleaned[0:4]

['trying saris dark whilst sitting banks dunaj river bratislava beer deep dark amber foamy white head laced way glass smell slightly malty crazy taste burst sweet malt mouthfeel good lighter body good carbonation overall sweet good beer style worth try schawtzbier fans tap hotel restaurant stary smokovec slovakia served ari glass slight hourglass shape a: looks solid rootbeer float-type head 3 fingers toffee colored head gorgeous clumpy lacing opaque dark chocolate brown red hues edges shallow spots s: smokey chocolate coffee malts dominate nose t: actually delicious complex beer abv chocolate coffee pronounced taste dig m: low mid carb perfect thick overly coating palate wet faintly dry finish o: enjoyable beer find future huge fan style found good far on-tap whilst bratislava slovakia drank branded saris pint glass nice cola like black tan head small good retention lot better look standard pale lager opinion gives mild coffee barely smell contains hints caramel possibly nuts light va

In [None]:

#! tokenizing them for training
# Split each review on whitespace into a list of tokens.
# The cell rebinds `cleaned`, so it is guarded to be safe to re-run: entries
# that are already token lists are left untouched instead of raising
# AttributeError ('list' object has no attribute 'split').
cleaned = [doc.split() if isinstance(doc, str) else doc for doc in cleaned]

In [None]:

#! loading a pretrained model
# Read the 840B-token, 300-dimensional GloVe vectors from their plain-text
# dump. `no_header=True` tells the loader not to expect a leading header line
# (presumably because this raw GloVe file has none -- confirm for this file).
glove_vectors = KeyedVectors.load_word2vec_format(
    "glove_pretrained/glove.840B.300d.txt",
    no_header=True,
    binary=False,
)

In [None]:

#! creating base model class. Vector size should match that of pretrained vectors
# Take the dimensionality from the loaded GloVe vectors rather than hard-coding
# it, so switching to another GloVe file (e.g. a 200d one) cannot leave the two
# out of sync.
base_model = Word2Vec(
    vector_size=glove_vectors.vector_size,
    min_count=5,
    epochs=15,
)
# Initial vocabulary comes from the tokenized review corpus.
base_model.build_vocab(cleaned)
# Captured now, before any vocabulary update, so that it reflects the number
# of review documents that will actually be passed to train().
total_examples = base_model.corpus_count

In [None]:

#! add pretrained GloVe's vocabulary
# build_vocab() expects an iterable of sentences, each sentence being a list of
# tokens. The GloVe vocabulary is therefore passed as ONE pseudo-sentence.
# Passing the bare list of strings would treat every word as a sentence and
# its individual characters as the tokens.
# NOTE(review): each GloVe word occurs only once in this pseudo-sentence, so
# with min_count=5 words absent from the review corpus may still be filtered
# out -- confirm this is the intended behaviour.
# NOTE(review): nothing in the visible cells copies the GloVe vectors into
# base_model.wv; if warm-starting from GloVe is intended, verify where that
# happens.
base_model.build_vocab([list(glove_vectors.key_to_index.keys())], update=True)


In [None]:
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    '''Report the training loss of each epoch as it finishes.

    The value read from ``model.get_latest_training_loss()`` is treated as a
    running total: the figure printed for an epoch is the difference between
    the current reading and the reading stored at the end of the previous
    epoch.
    '''

    def __init__(self):
        # Reading stored at the end of the previous epoch (none seen yet).
        self.loss_to_be_subed = 0
        # Zero-based index of the epoch that will be reported next.
        self.epoch = 0

    def on_epoch_end(self, model):
        '''Print the loss accrued since the last call, then advance the counter.'''
        running_total = model.get_latest_training_loss()
        epoch_loss = running_total - self.loss_to_be_subed
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        # Remember the current reading so the next epoch reports only its own share.
        self.loss_to_be_subed = running_total
        self.epoch += 1

In [None]:

#! Training the model
# Train on the tokenized reviews for the number of epochs configured on the
# model. Loss tracking is switched on so the callback can print a per-epoch
# figure.
base_model.train(
    cleaned,
    callbacks=[callback()],
    compute_loss=True,
    epochs=base_model.epochs,
    total_examples=total_examples,
)
# Keep a direct handle on the trained word vectors.
base_model_wv = base_model.wv

---
---
### Testing Models
---
---

In [3]:
# The loaded objects are queried through `.wv` further down, i.e. these files
# hold full trained models rather than bare KeyedVectors, so they are loaded
# with the model class instead of KeyedVectors.load.
# One model per GloVe variant: 840B/300d, 42B/300d and 6B/200d.
md_840B = Word2Vec.load("finetuned/glove.840B.300d.ft.model")
md_42B = Word2Vec.load("finetuned/glove.42B.300d.ft.model")
md_6B = Word2Vec.load("finetuned/glove.6B.200d.ft.model")

In [4]:
import pandas as pd

In [5]:
# Query word and how many nearest neighbours to fetch from each model.
top, word = 20, "cheap"

# One two-column frame (neighbour, similarity) per model; the column names are
# prefixed with the model tag so the frames can sit side by side.
_6b, _42b, _840b = (
    pd.DataFrame(
        model.wv.similar_by_key(word, topn=top),
        columns=[f"{tag}_word", f"{tag}_similarity"],
    )
    for tag, model in (("6b", md_6B), ("42b", md_42B), ("840b", md_840B))
)

# Side-by-side comparison, aligned on neighbour rank.
pd.concat([_6b, _42b, _840b], axis=1)

Unnamed: 0,6b_word,6b_similarity,42b_word,42b_similarity,840b_word,840b_similarity
0,cheep,0.769602,cheep,0.736627,cheep,0.710659
1,crappy,0.722729,crappy,0.669794,crappy,0.684083
2,shitty,0.716751,shitty,0.657957,horrible,0.614213
3,cheapo,0.642173,cheapo,0.614901,shitty,0.611913
4,swill,0.627762,cheaply,0.603771,swill,0.592604
5,horrible,0.611165,horrible,0.578439,cheapo,0.592304
6,nasty,0.606931,swill,0.571536,macro,0.575125
7,cheaply,0.606869,macro,0.563824,terrible,0.573023
8,rotgut,0.595265,nasty,0.558905,awful,0.570081
9,macro,0.593115,rotgut,0.557519,nasty,0.569548
