# Unsupervised Word2Vec with Negative Sampling

PyTorch implementation of Negative Sampling for Word2Vec introduced in **Distributed Representations of Words and Phrases and their Compositionality (NIPS 2013)**

# Table of contents
* [Preamble](#Preamble)
* [Data preparation](#Data-preparation)
    * [Load raw data](#Load-raw-data)
    * [Train-test split](#Train-test-split)
    * [Setup vocabulary utils](#Setup-vocabulary-utils)
* [PyTorch Dataset class](#PyTorch-Dataset-class)
* [Word2Vec Module](#Word2Vec-Module)
    * [Embedding module](#Embedding-module)
    * [Sampling utilities](#Sampling-utilities)
    * [Module testing](#Module-testing)
* [Training](#Training)

## Preamble

In [None]:
# Preamble
import time, random
import os, sys
import math
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from tqdm import tqdm

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Mini-batch size passed to the DataLoaders of this notebook.
BATCH_SIZE = 64

# Data preparation
The word embeddings are trained on the Yelp review dataset: https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset

## Load raw data

In [None]:
# Load the raw reviews. The file is JSON Lines: one JSON object per line.
DATA_PATH = "/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json"

cnt = 1569264 # Size of YELP 2015 dataset
# cnt = 10000

# The context manager closes the file even when a line fails to parse.
# zip() against range(cnt) stops reading once `cnt` reviews were collected.
with open(DATA_PATH, encoding="utf-8") as data_file:
    data = [json.loads(line) for _, line in zip(range(cnt), data_file)]

df = pd.DataFrame(data)

print("Number of datapoints:", len(df))
df.head()

## Train-test split

In [None]:
# Shuffle the row positions with a fixed seed so the split is reproducible.
df_size = len(df)
idx = list(range(df_size))
random.Random(555).shuffle(idx)

# 80% train, 10% validation; the test split takes whatever is left.
train_num = int(df_size * 0.8)
val_num = int(df_size * 0.1)
test_num = int(df_size * 0.1)

# print(train_num, val_num, test_num)

val_end = train_num + val_num
train_idx = idx[:train_num]
val_idx = idx[train_num:val_end]
test_idx = idx[val_end:]

train_df, val_df, test_df = df.iloc[train_idx], df.iloc[val_idx], df.iloc[test_idx]

print('Size of trainset:', len(train_df))
print('Size of valset:', len(val_df))
print('Size of testset:', len(test_df))

## Setup vocabulary utils
Here the frequency counter, the vocab object and the tokenizer are defined.

Reference: https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

In [None]:
# Set up Vocab
# Source: https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab, build_vocab_from_iterator

from collections import Counter

# torchtext's "basic_english" tokenizer; shared by the frequency counter and the dataset class.
tokenizer = get_tokenizer("basic_english")
            
def get_counter(df_lists, tokenize= None):
    '''
    Count token frequencies over the 'text' column of several DataFrames.

    df_lists: iterable of DataFrames, each with a 'text' column of strings.
    tokenize: optional callable mapping a string to a list of tokens.
              Defaults to the module-level `tokenizer`.

    Returns a collections.Counter mapping token -> number of occurrences.
    '''
    if tokenize is None:
        tokenize = tokenizer
    counter = Counter()
    for _df in df_lists:
        # Iterate the column directly: `_df.iloc[i]` would build a complete
        # row Series for every review just to read a single field.
        for text in _df['text']:
            counter.update(tokenize(text))
    return counter

# counter = get_counter([train_df, val_df]) - train_df is too large already.
# Token frequencies are taken from the training split only.
counter = get_counter([train_df])

# NOTE: this rebinds the name `vocab` from the imported torchtext factory
# function to the Vocab object that the factory returns.
# min_freq is the minimum count a token needs to enter the vocabulary.
vocab = vocab(
    counter,
    specials= ["<unk>"],
    min_freq= 6
)
# Tokens that are not in the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

vocab_size = len(vocab)
print('Vocabulary size:', vocab_size)

# Persist both objects to disk.
torch.save(vocab, 'vocab.pth')
torch.save(counter, 'counter.pth')

# PyTorch Dataset class
In this section, the `YELPDataset` class is defined, together with the zero-padding batch function `collate_batch`.

In [None]:
# YELP Dataset
def collate_batch(batch):
    '''
    Merge (text, label) samples into right-zero-padded batch tensors.
    Consult: https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

    Returns (texts, labels, mask). texts has shape (B, L) where L is the
    longest text of the batch, labels stacks the per-sample labels, and mask
    is a (B, L) float tensor holding 1 on real tokens and 0 on padding.
    '''
    sequences = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]

    longest = max(len(seq) for seq in sequences)

    padded = []
    keep = []
    for seq in sequences:
        n_tokens = seq.shape[0]
        n_missing = longest - n_tokens
        if n_missing > 0:
            # Zero-padding text, only on the right side.
            seq = F.pad(seq, (0, n_missing), 'constant', 0)
        padded.append(seq)
        # Ones over the real tokens followed by zeros over the padding.
        keep.append(torch.cat((torch.ones(n_tokens), torch.zeros(n_missing))))

    return torch.stack(padded), torch.stack(targets), torch.stack(keep)

class YELPDataset(Dataset):
    '''
    Map-style dataset over a review DataFrame with 'text' and 'stars' columns.

    Each item is a pair (token_ids, label): token_ids is an int64 tensor of
    vocabulary indices for the review text, label is an int64 scalar tensor
    holding `stars - 1`.

    df: DataFrame with 'text' and 'stars' columns.
    vocab: callable mapping a list of tokens to a list of indices; must
           support len().
    tokenizer: callable mapping a string to a list of tokens.
    df_sort: when True, rows are reordered by decreasing text length in
             characters (presumably to reduce padding inside a batch).
    '''
    def __init__(self, df, vocab, tokenizer, df_sort= True):
        self.df = df
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.len_vocab = len(vocab)
        if df_sort:
            self.sort_df_by_txt_len()

    def sort_df_by_txt_len(self):
        '''Reorder self.df so that the longest texts come first.'''
        # Read the column once instead of building a row Series per review.
        neg_lengths = np.fromiter(
            (-len(text) for text in self.df['text']),
            dtype= np.int64,
            count= len(self.df),
        )
        self.df = self.df.iloc[np.argsort(neg_lengths)]

    def __len__(self):
        return len(self.df)

    def text_pipeline(self, x):
        '''Raw string -> list of vocabulary indices.'''
        return self.vocab(self.tokenizer(x))

    def label_pipeline(self, x):
        '''Star rating -> zero-based class index.'''
        return int(x) - 1

    def __getitem__(self, idx):
        # One positional row lookup serves both fields.
        row = self.df.iloc[idx]

        txt = torch.tensor(self.text_pipeline(row['text']), dtype= torch.int64)
        label = torch.tensor(self.label_pipeline(row['stars']), dtype= torch.int64)

        return (txt, label)

In [None]:
# Dataset, Dataloader
trainset, valset, testset = (
    YELPDataset(split_df, vocab, tokenizer)
    for split_df in (train_df, val_df, test_df)
)

# The three loaders share one configuration; sample order is kept as-is.
loader_kwargs = dict(batch_size= BATCH_SIZE, shuffle= False,
                     pin_memory= True, collate_fn= collate_batch)
trainloader = DataLoader(trainset, **loader_kwargs)
valloader = DataLoader(valset, **loader_kwargs)
testloader = DataLoader(testset, **loader_kwargs)

# Peek at the first training batch to confirm the tensor shapes.
for X, y, mask in trainloader:
    print("Shape of Texts:", X.shape)
    print("Shape of Labels:", y.shape)
    print("Shape of Mask:", mask.shape)
    break

# Word2Vec Module

In [None]:
# Task configs
# V: number of rows of each embedding table.
VOCAB_SIZE = len(vocab)
# E: dimension of every word vector.
EMBEDDING_SIZE = 200
# C: number of outside words per center word, both sides together.
# OutsideSampling asserts that this value is even.
CONTEXT_WINDOW = 10
# K: number of negative samples drawn per (center, outside) slot.
NUM_SAMPLES = 5

## Embedding module

In [None]:
# Word2vec embedding module
class NegSamplingEmbedding(nn.Module):
    '''
    Skip-gram with negative sampling: two embedding tables plus the loss.

    Vocab_size: V
    Embedding_size: E
    Text_length: L
    Batch_size: B
    Context window: C
    Number of random samples: K

    Consult: https://github.com/mindspore-courses/DeepNLP-models-MindSpore/
            blob/main/notebooks/02.Skip-gram-Negative-Sampling.ipynb
    '''
    def __init__(self, vocab_size, embedding_size):
        super(NegSamplingEmbedding, self).__init__()
        # forward() looks center words up in V and outside / random words in U.
        self.U = nn.Embedding(vocab_size, embedding_size) # Outside embedding
        self.V = nn.Embedding(vocab_size, embedding_size) # Center embedding
        self.LogSig = nn.LogSigmoid()

    def forward(self, wc, wo, wk, mask_c, mask_o, check_shape= False):
        '''
        Return the negative-sampling loss of one batch as a scalar tensor.

        wc: center word indices. Shape: (B, L)
        wo: outside word indices. Shape: (B, L, C)
        wk: random (negative) word indices. Shape: (B, L, C, K)
        mask_c: 1 on real center tokens, 0 on padding. Shape: (B, L).
                None treats every position as a real token.
        mask_o: 1 on real outside tokens, 0 on padding. Shape: (B, L, C)
        check_shape: print the intermediate shapes when True.

        The loss is averaged over the center positions selected by mask_c.
        '''
        vc = self.V(wc) # Center embedding. Shape: (B, L, E)
        uo = self.U(wo) # Outside embedding. Shape: (B, L, C, E)
        uk = self.U(wk) # Random embedding. Shape: (B, L, C, K, E)

        if check_shape:
            B = uk.shape[0]
            L = uk.shape[1]
            C = uk.shape[2]
            K = uk.shape[3]
            E = uk.shape[4]
            print(f"Basic shapes: B = {B}; L = {L}; C = {C}; K = {K}; E = {E}")
            print('*********************************')
            print('Shape of vc:', vc.shape)
            print('Shape of uo:', uo.shape)
            print('Shape of uk:', uk.shape)
            print('*********************************')
        cmp1 = torch.einsum('blce,ble->blc', uo, vc) # Shape: (B, L, C)
        cmp2 = torch.einsum('blcke,ble->blck', uk, vc) # Shape: (B, L, C, K)

        cmp1 = self.LogSig(cmp1) * mask_o # Shape: (B, L, C)
        cmp2 = self.LogSig(-cmp2) # Shape: (B, L, C, K)
        cmp2 = torch.einsum('blck->blc', cmp2) * mask_o # Shape: (B, L, C)

        cmp1 = torch.einsum('blc->bl', cmp1) # Shape: (B, L)
        cmp2 = torch.einsum('blc->bl', cmp2) # Shape: (B, L)

        per_position = cmp1 + cmp2 # Shape: (B, L)
        if mask_c is None:
            loss = torch.mean(per_position)
        else:
            # Padded center positions must not contribute to the objective.
            mask_c = mask_c.to(per_position.dtype)
            # clamp() avoids 0/0 when a batch holds padding only.
            loss = torch.sum(per_position * mask_c) / mask_c.sum().clamp(min= 1)

        if check_shape:
            print('Shape of cmp1:', cmp1.shape)
            print('Shape of cmp2:', cmp2.shape)
            print('Shape of LOSS:', loss.shape)
        return -loss

## Sampling utilities

In [None]:
# Negative Sampling
def NoiseDistribution(counter, vocab, Z= 0.001):
    '''
    Per-index counts and noise weights P_n(w) = [U(w)^0.75]/Z .

    counter: mapping token -> count.
    vocab: gives the index of a token via vocab[token] and supports len().
    Z: divisor applied to the powered unigram distribution.

    Returns (Fr, P). Fr[i] is the summed count of all tokens that map to
    index i, and P[i] = (Fr[i] / sum(Fr)) ** 0.75 / Z.
    '''
    freq = np.zeros(len(vocab))
    for token, count in counter.items():
        freq[vocab[token]] += count

    unigram = freq / freq.sum()
    noise = np.power(unigram, 0.75) / Z
    return freq, noise

def SubSamplingMask(Fr, t= 1e-5, *, enabled= False):
    '''
    Subsampling mask: one 0/1 entry per vocabulary index.

    Fr: 1-D array of per-index counts.
    t: subsampling threshold.
    enabled: subsampling is switched off by default, in which case every
             entry of the mask is 1.

    When enabled, index i is kept (mask 1) with probability
    min(1, sqrt(t / f_i)), where f_i = Fr[i] / sum(Fr) is the relative
    frequency. Indices with a zero count are always kept, so no division
    by zero can occur.

    Returns a float array of shape (len(Fr),).
    '''
    Fr = np.asarray(Fr, dtype= float)
    masks = np.ones(Fr.shape[0])

    total = Fr.sum()
    if not enabled or total <= 0:
        return masks

    rel_freq = Fr / total
    seen = rel_freq > 0
    # Clip at 1: rare words would otherwise yield a "probability" above 1.
    keep_prob = np.minimum(1.0, np.sqrt(t / rel_freq[seen]))
    masks[seen] = np.random.binomial(1, keep_prob)

    return masks

def SamplingDistribution(vocab, P):
    '''
    Construct sampling vocabulary
    Consult: https://github.com/mindspore-courses/DeepNLP-models-MindSpore/
            blob/main/notebooks/02.Skip-gram-Negative-Sampling.ipynb

    vocab: only len(vocab) is used.
    P: per-index weights; index i appears int(P[i]) times in the table.

    Returns a 1-D integer array in which every index i < len(vocab) is
    repeated int(P[i]) times, in increasing order of i. Drawing uniformly
    from this table samples indices proportionally to the truncated weights.

    Raises IndexError when P has fewer entries than the vocabulary and
    ValueError when a weight is NaN or infinite.
    '''
    vocab_size = len(vocab)
    weights = np.asarray(P, dtype= float)
    if weights.shape[0] < vocab_size:
        raise IndexError("P has fewer entries than the vocabulary")
    weights = weights[:vocab_size]
    if not np.all(np.isfinite(weights)):
        raise ValueError("P must contain only finite values")

    # Truncate towards zero like int(); negative weights contribute nothing.
    repeats = np.maximum(weights.astype(np.int64), 0)
    # A single np.repeat replaces growing a Python list by concatenation,
    # which copied the whole table once per vocabulary entry.
    return np.repeat(np.arange(vocab_size), repeats)

def NegativeSampling(unigram_table, k):
    '''
    Draw k entries of unigram_table at uniformly random positions.

    Positions come from np.random.choice, so NumPy's global random state
    determines the result. Returns the drawn entries as a torch tensor.
    '''
    table_size = unigram_table.shape[0]
    positions = np.random.choice(table_size, k)
    drawn = unigram_table[positions]
    return torch.tensor(drawn)

def OutsideSampling(text, C, K, vocab, Fr= None, P= None, unigram_table= None):
    '''
    Construction of wo, mask_o.
    <text> is expected to have type of torch.Tensor with shape of (L,)

    For every position i, wo[i] holds the C/2 tokens left of text[i]
    followed by the C/2 tokens right of it. Slots that fall outside the
    text are filled with index 0 and receive mask_o = 0; all other slots
    receive mask_o = 1.

    K, vocab, Fr, P and unigram_table are not used by this function.

    Returns (wo, mask_o), both of shape (L, C); mask_o is a float tensor.

    Unigram frequency: Fr
    Noise distribution: P

    Text length: L
    Context window: C
    Number of random samples: K
    Embedding size: E
    '''
    assert C % 2 == 0 # Context windows must be divisible by 2
    hC = C // 2 # Half window

    L = text.shape[0]
    txt = F.pad(text, (hC, hC), 'constant', 0) # index-0 padding on both sides

    # Row i of `windows` is txt[i : i+C+1]: left half, center word, right half.
    windows = txt.unfold(0, C + 1, 1) # Shape: (L, C+1)
    # Drop the center column to keep only the outside words.
    wo = torch.cat((windows[:, :hC], windows[:, (hC+1):]), dim= 1) # Shape: (L, C)

    # Position in `text` that every context slot was read from.
    offsets = torch.cat((torch.arange(-hC, 0), torch.arange(1, hC + 1)))
    source = torch.arange(L, device= text.device).unsqueeze(1) + offsets.to(text.device)
    # A slot is real only when its source position lies inside the text.
    mask_o = ((source >= 0) & (source < L)).to(torch.float32) # Shape: (L, C)
    # NOTE(review): zeros appended to `text` by batch collation cannot be
    # told apart from real tokens here and therefore stay unmasked.

    return wo, mask_o

def BatchSampling(texts, C, K, vocab, Fr= None, P= None, unigram_table= None):
    '''
    Construction of wc, wo, wk, mask_o.
    <texts> is expected to be a torch.Tensor with shape (B, L)

    texts: batch of token indices.
    C, K: context window and number of random samples per context slot.
    vocab, Fr, P: only needed when unigram_table is None; the table is then
                  built from them for this batch.
    unigram_table: 1-D table of vocabulary indices to draw random words from.

    Returns (wc, wo, wk, mask_o) with shapes (B, L), (B, L, C),
    (B, L, C, K) and (B, L, C). wc is `texts` itself.

    Raises ValueError when the sampling table is empty.

    Unigram frequency: Fr
    Noise distribution: P

    Batch size: B
    Text length: L
    Context window: C
    Number of random samples: K
    Embedding size: E
    '''
    wc = texts # Shape: (B, L)
    B, L = texts.shape[0], texts.shape[1]

    wo = [] # Shape after stacking: (B, L, C)
    mask_o = [] # Shape after stacking: (B, L, C)

    # Get outside-tokens with zero-padding
    for row in texts:
        cur_wo, cur_mask_o = OutsideSampling(row, C, K, vocab,
                                              Fr, P, unigram_table)
        wo.append(cur_wo)
        mask_o.append(cur_mask_o)

    # Batch-level subsampling
    if unigram_table is None:
        assert P is not None and Fr is not None
        masks = SubSamplingMask(Fr)
        unigram_table = SamplingDistribution(vocab, P*masks)

    # Index a tensor view of the table instead of indexing the NumPy array
    # with a torch tensor.
    table = torch.as_tensor(unigram_table)
    if table.shape[0] == 0:
        raise ValueError("unigram_table is empty; cannot draw negative samples")

    # Random sampling over unigram_table with shape of (B, L, C, K)
    idx = torch.randint(low= 0, high= table.shape[0], size= (B, L, C, K))
    wk = table[idx]

    # Tensorize
    wo = torch.stack(wo)
    mask_o = torch.stack(mask_o)

    return wc, wo, wk, mask_o

## Module testing

In [None]:
# Defining model, noise distribution
Fr, P = NoiseDistribution(counter, vocab)
model = NegSamplingEmbedding(VOCAB_SIZE, EMBEDDING_SIZE).to(DEVICE)

# Table of vocabulary indices that the random (negative) words are drawn from.
masks = SubSamplingMask(Fr)
unigram_table = SamplingDistribution(vocab, P*masks)

# Dimension checking: push one batch through sampling and the model, then stop.
for X, y, mask_c in trainloader:
    print('Shape of X:', X.shape)
    print('Shape of mask:', mask_c.shape)
    X, y, mask_c = X.to(DEVICE), y.to(DEVICE), mask_c.to(DEVICE)
    wc, wo, wk, mask_o = BatchSampling(X, 
                                       C= CONTEXT_WINDOW,
                                       K= NUM_SAMPLES,
                                       vocab= vocab,
                                       unigram_table= unigram_table)
    # Make sure every model input lives on the same device as the model.
    wc, wo, wk = wc.to(DEVICE), wo.to(DEVICE), wk.to(DEVICE)
    mask_o = mask_o.to(DEVICE)
    print('Shape of wc:', wc.shape)
    print('Shape of wo:', wo.shape)
    print('Shape of wk:', wk.shape)
    print('*********************************')
    # check_shape=True makes the model print its intermediate shapes.
    loss = model(wc, wo, wk, mask_c, mask_o, check_shape= True)
    print('*********************************')
    print('LOSS =', loss)
    break

# Training

In [None]:
# Training configs
LR = 1e-3
EPOCHS = 1
# Total number of batches over all epochs; used as the scheduler horizon.
ITER = EPOCHS * len(trainloader)
OPTIMIZER = torch.optim.AdamW(model.parameters(), lr= LR)
# Cosine annealing over ITER steps; train() steps the scheduler once per batch.
# NOTE: the chained assignment also binds the module-level name `lr_scheduler`.
SCHEDULER = lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(OPTIMIZER, T_max = ITER)

In [None]:
# Train procedures
def train(trainloader, model, optimizer, scheduler, vocab, unigram_table):
    '''
    Run one pass over trainloader and update model after every batch.

    trainloader: yields (texts, labels, mask_c) batches; labels are ignored.
    model: module called as model(wc, wo, wk, mask_c, mask_o) -> scalar loss.
    optimizer, scheduler: both are stepped once per batch.
    vocab, unigram_table: forwarded to BatchSampling.

    Every 1000 iterations the batch loss is recorded and printed, and the
    weights are written to 'word2vec.pth' whenever that loss is the lowest
    one recorded so far. After the last batch the same check is applied to
    the loss of the final batch.

    Returns the list of recorded losses as Python floats. The list is empty
    when trainloader yields no batches.
    '''
    model.train()
    tloss = []
    cur_loss = 1e9
    last_loss = None
    for i, (X, _, mask_c) in enumerate(trainloader):
        X, mask_c = X.to(DEVICE), mask_c.to(DEVICE)
        wc, wo, wk, mask_o = BatchSampling(X,
                                   C= CONTEXT_WINDOW,
                                   K= NUM_SAMPLES,
                                   vocab= vocab,
                                   unigram_table= unigram_table)
        wc, wo, wk = wc.to(DEVICE), wo.to(DEVICE), wk.to(DEVICE)
        mask_o = mask_o.to(DEVICE)

        loss = model(wc, wo, wk, mask_c, mask_o)
        # Keep a detached handle only; converting to float on every batch
        # would force a device synchronisation each iteration.
        last_loss = loss.detach()

        if i % 1000 == 0:
            tloss.append(loss.item())
            print(f'Iter {i}, loss = ',tloss[-1])
            if tloss[-1] < cur_loss:
                cur_loss = tloss[-1]
                print('Saving model...')
                torch.save(model.state_dict(), 'word2vec.pth')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Compare against the loss of the final batch. Re-testing tloss[-1] here
    # could never succeed, and it raised IndexError for an empty loader.
    if last_loss is not None:
        final_loss = last_loss.item()
        if final_loss < cur_loss:
            cur_loss = final_loss
            torch.save(model.state_dict(), 'word2vec.pth')
    return tloss

In [None]:
# TRAINING
iter_loss = []
epoch_loss = []

for t in range(EPOCHS):
    print(f'Epoch {t} starts.')
    # Epoch-level subsampling: the sampling table is rebuilt for every epoch.
    Fr, P = NoiseDistribution(counter, vocab)
    masks = SubSamplingMask(Fr)
    unigram_table = SamplingDistribution(vocab, P*masks)

    tloss = train(trainloader, model, OPTIMIZER, SCHEDULER, vocab, unigram_table)

    # Collect the recorded losses and their per-epoch average.
    iter_loss.extend(tloss)
    epoch_loss.append(sum(tloss) / len(tloss))

    print(f'Epoch {t}: LOSS = {epoch_loss[-1]}')

# Plot every recorded training loss in recording order.
fig, axes = plt.subplots()
axes.set_xlabel('Iteration')
axes.set_ylabel('Loss')
axes.plot(iter_loss, label = 'train-loss')
axes.legend()
plt.show()