# GloVe (Gensim)

For looking at word vectors, we'll use **Gensim**. **Gensim** isn't really a deep learning package. It's a package for for word and text similarity modeling, which started with (LDA-style) topic models and grew into SVD and neural word representations. But its efficient and scalable, and quite widely used.   We gonna use **GloVe** embeddings, downloaded at [the Glove page](https://nlp.stanford.edu/projects/glove/). They're inside [this zip file](https://nlp.stanford.edu/data/glove.6B.zip)

In [68]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec
import time
import numpy as np
import torch
import pickle

In [59]:
import nltk
from nltk.corpus import reuters
from nltk.corpus import stopwords
import string
import re

In [60]:
#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
glove_file = datapath('glove.6B.100d.txt')  #search on the google
model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [61]:
# Define the training corpus
corpus = reuters.sents(categories="grain")

In [62]:
# Save the model
model_save_path = 'model/gensim_model.pkl'
pickle.dump(model,open(model_save_path,'wb'))

In [63]:
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

double_punctuation_pattern = re.compile(r'[' + re.escape(string.punctuation) + ']{2,}')

def clean_corpus(corpus):
    cleaned_corpus = []
    for sentence in corpus:
        cleaned_sentence = []
        for word in sentence:
            word = word.lower()
            if word not in stop_words and word not in punctuation and not double_punctuation_pattern.match(word):
                cleaned_sentence.append(word)
        cleaned_corpus.append(cleaned_sentence)
    return cleaned_corpus

cleaned_corpus = clean_corpus(corpus)
cleaned_corpus

[['china',
  'daily',
  'says',
  'vermin',
  'eat',
  '7',
  '12',
  'pct',
  'grain',
  'stocks',
  'survey',
  '19',
  'provinces',
  'seven',
  'cities',
  'showed',
  'vermin',
  'consume',
  'seven',
  '12',
  'pct',
  'china',
  'grain',
  'stocks',
  'china',
  'daily',
  'said'],
 ['also',
  'said',
  'year',
  '1',
  '575',
  'mln',
  'tonnes',
  '25',
  'pct',
  'china',
  'fruit',
  'output',
  'left',
  'rot',
  '2',
  '1',
  'mln',
  'tonnes',
  '30',
  'pct',
  'vegetables'],
 ['paper',
  'blamed',
  'waste',
  'inadequate',
  'storage',
  'bad',
  'preservation',
  'methods'],
 ['said',
  'government',
  'launched',
  'national',
  'programme',
  'reduce',
  'waste',
  'calling',
  'improved',
  'technology',
  'storage',
  'preservation',
  'greater',
  'production',
  'additives'],
 ['paper', 'gave', 'details'],
 ['thai',
  'trade',
  'deficit',
  'widens',
  'first',
  'quarter',
  'thailand',
  'trade',
  'deficit',
  'widened',
  '4',
  '5',
  'billion',
  'baht',


In [64]:
len(cleaned_corpus)

4113

In [65]:
# Custom callback to log loss and time every 100 epochs
class EpochLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0  # Initialize epoch counter
        self.start_time = time.time()  # Record the start time
        self.previous_loss = 0  # Track the previous cumulative loss

    def on_epoch_end(self, model):
        self.epoch += 1  # Increment epoch counter
        if self.epoch % 100 == 0:  # Log every 100 epochs
            elapsed_time = time.time() - self.start_time  # Calculate elapsed time
            cumulative_loss = model.get_latest_training_loss()  # Get cumulative loss
            epoch_loss = cumulative_loss - self.previous_loss  # Calculate per-epoch loss
            self.previous_loss = cumulative_loss  # Update previous loss for next calculation
            print(f"Epoch {self.epoch:5} | Loss: {epoch_loss:.6f} | Time: {int(elapsed_time // 60)}m {int(elapsed_time % 60)}s")

In [66]:
# Training parameters
window_size = 2  # Context window size
emb_size = 2  # Embedding size
min_count = 2  # Ignore words with frequency less than min_count

# Initialize the callback
epoch_logger = EpochLogger()

# Train the Word2Vec model with the callback
model = Word2Vec(
    sentences=cleaned_corpus,  # Corpus to train on
    vector_size=emb_size,  # Size of the word embeddings
    window=window_size,  # Context window size
    min_count=min_count,  # Minimum word frequency threshold
    workers=22,  # Number of worker threads for parallel processing
    epochs=1000,  # Number of training epochs
    compute_loss=True,  # Enable loss computation
    callbacks=[epoch_logger]  # Add the callback for logging
    )

Epoch   100 | Loss: 2070179.125000 | Time: 0m 3s
Epoch   200 | Loss: 1990935.375000 | Time: 0m 6s
Epoch   300 | Loss: 1872034.500000 | Time: 0m 9s
Epoch   400 | Loss: 1874788.000000 | Time: 0m 13s
Epoch   500 | Loss: 1521846.000000 | Time: 0m 16s
Epoch   600 | Loss: 1355779.000000 | Time: 0m 19s
Epoch   700 | Loss: 1351014.000000 | Time: 0m 23s
Epoch   800 | Loss: 1341355.000000 | Time: 0m 26s
Epoch   900 | Loss: 1335077.000000 | Time: 0m 29s
Epoch  1000 | Loss: 1329423.000000 | Time: 0m 32s


## Save the model and Args

In [70]:
# Save the model
model_save_path = 'model/gensim_glove.pkl'
pickle.dump(model,open(model_save_path,'wb'))