# Practical NLP Tutorial: Word Embeddings

## Basics

### Tensor Attributes and Methods

In [None]:
# `torch` is imported here because this is the first cell that uses it; the
# notebook's main import cell only appears later, so without this line a fresh
# kernel run from the top would fail with a NameError.
import torch

# A 2-D float tensor holding one row of three values.
x = torch.tensor([[4., 5., 6.]])

print('x:', x)
print('x.ndim:', x.ndim)          # number of dimensions
print('x.shape:', x.shape)        # size along each dimension
print('x.size():', x.size())      # method form, same information as .shape
print('x.dtype:', x.dtype)        # element type
print('x.device:', x.device)      # device the data lives on

x: tensor([[4., 5., 6.]])
x.ndim: 2
x.shape: torch.Size([1, 3])
x.size(): torch.Size([1, 3])
x.dtype: torch.float32
x.device: cpu


In [None]:
# Create a tensor with an explicit dtype and device. The input list mixes ints
# and a float (6.), but dtype=torch.long makes the result an integer tensor.
# NOTE: device 'cuda:0' means this cell needs a CUDA-enabled runtime.
y = torch.tensor([[4, 5, 6.]], dtype=torch.long, device=torch.device('cuda:0'))
print('y:', y)
print('y.dtype:', y.dtype)
print('y.device:', y.device)

y: tensor([[4, 5, 6]], device='cuda:0')
y.dtype: torch.int64
y.device: cuda:0


In [None]:
# Two equivalent ways to convert the float tensor `x` to 64-bit integers.
x_long = x.to(torch.long)   # to() returns a copy if a conversion is needed
print('x_long.dtype:', x_long.dtype)

x_long = x.long()           # shorthand for the same conversion
print('x_long.dtype:', x_long.dtype)

x_long.dtype: torch.int64
x_long.dtype: torch.int64


In [None]:
# Two equivalent ways to move `x` to the first CUDA device.
# NOTE: both calls need a CUDA-enabled runtime.
x_gpu = x.to(torch.device('cuda:0'))  # to() returns a copy if a conversion is needed
print('x_gpu.device:', x_gpu.device)

x_gpu = x.cuda()                      # shorthand for moving to the GPU
print('x_gpu.device:', x_gpu.device)

x_gpu.device: cuda:0
x_gpu.device: cuda:0


### Tensor Conversion

In [None]:
# Round trip between Python lists, torch tensors and numpy arrays.
x = torch.tensor([[4., 5., 6.]])    # torch.tensor accepts a (nested) python list

print(x.tolist())    # returns a (nested) python list
print(x.numpy())    # returns a numpy array

[[4.0, 5.0, 6.0]]

In [None]:
# A tensor holding exactly one value (here with shape 1x1).
y = torch.tensor([[7.]])

print('y:', y)
print('y.tolist():', y.tolist())   # keeps the nesting: [[7.0]]
print('y.item():', y.item())    # when y holds a single value, item()
                                # returns it as a plain python number

y: tensor([[7.]])
y.tolist(): [[7.0]]
y.item(): 7.0


### Calculating gradients

In [None]:
# X is plain input data; W and b are created with requires_grad=True so that
# autograd records operations on them and can compute their gradients.
X = torch.tensor([1., 2])
W = torch.tensor([3., 4], requires_grad=True)
b = torch.tensor([5.], requires_grad=True)
W  # last expression of the cell: displays W

tensor([3., 4.], requires_grad=True)

In [None]:
# Forward computation. Each result carries a grad_fn that records how it was produced.
a = X * W           # element-wise product
print('a:', a)
Y = a + b           # the one-element b is broadcast across both elements of a
print('Y:', Y)

a: tensor([3., 8.], grad_fn=<MulBackward0>)
Y: tensor([ 8., 13.], grad_fn=<AddBackward0>)


In [None]:
# Reduce Y to a single scalar so backward() can be called without arguments.
loss = Y.sum()
print('loss:', loss)

loss.backward()   # loss must be a single value for backward()
                  # to be called without an explicit gradient argument

loss: tensor(21., grad_fn=<SumBackward0>)


In [None]:
# Gradients of `loss` with respect to the leaf tensors, filled in by loss.backward().
# NOTE(review): for a single backward pass of loss = sum(X * W + b) one would
# expect W.grad == X and b.grad == 2. The saved output shows larger values,
# presumably because gradients accumulated over repeated backward() calls in
# the session that produced it. Confirm with a fresh run.
print('W.grad:', W.grad)
print('b.grad:', b.grad)

W.grad: tensor([2., 4.])
b.grad: tensor([6.])


### Applying gradients

In [None]:
from torch.optim import SGD

# Plain stochastic gradient descent over the two leaf tensors. The imported
# `SGD` name is used directly so the import above is not dead code.
optimizer = SGD([W, b], lr=0.01)
# One update step: each parameter moves by -lr * its current .grad.
optimizer.step()

In [None]:
# W and b were modified in place by optimizer.step().
print('updated W:', W)
print('updated b:', b)

updated W: tensor([2.9800, 3.9600], requires_grad=True)
updated b: tensor([4.9400], requires_grad=True)


## Let's Dive Deeper

In [None]:
from IPython.display import clear_output
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import pandas as pd

torch.manual_seed(1)

<torch._C.Generator at 0x7f3d249526d0>

Links to the PyTorch documentation:

* [torch.nn](https://pytorch.org/docs/stable/nn.html)
* [torch.nn.functional](https://pytorch.org/docs/stable/nn.functional.html)
* [torch.optim](https://pytorch.org/docs/stable/optim.html)

In [None]:
#@title Upload `kaggle.json`
# We use kaggle library to download dataset directly to colab notebook
!pip install -q kaggle

from google.colab import files
files.upload()

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/

clear_output()
print('done')

### N-Gram Language Modeling

In [None]:
#@title Download and unzip the dataset
!kaggle datasets download -d aminghd/large-corpus-of-farsi-poems
!unzip large-corpus-of-farsi-poems
clear_output()
print('done')

done


In [None]:
# Read the whole Hafez corpus into one string. The encoding is given explicitly
# because the text is Farsi and the platform default encoding is not guaranteed
# to be UTF-8.
with open('hafez_norm.txt', 'r', encoding='utf-8') as f:
    hafez_poems = f.read()

In [None]:
# Whitespace tokenisation. From here on `hafez_poems` is a list of words.
# NOTE: this rebinds the name, so the cell cannot be re-run without first
# re-running the cell that reads the file.
hafez_poems = hafez_poems.split()
vocab = set(hafez_poems)
# Enumerate the vocabulary in sorted order so every word gets the same index on
# every run; iterating a bare set of strings yields an arbitrary order that can
# differ between interpreter sessions.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

In [None]:
# Number of preceding words used as context for predicting the next word.
CONTEXT_SIZE = 2
# Size of each learned word-embedding vector.
EMBEDDING_DIM = 10

In [None]:
# One training sample per position that has CONTEXT_SIZE words before it:
# (the preceding CONTEXT_SIZE words ordered nearest-first, the word itself).
ngrams = [
    (hafez_poems[start:start + CONTEXT_SIZE][::-1], hafez_poems[start + CONTEXT_SIZE])
    for start in range(len(hafez_poems) - CONTEXT_SIZE)
]
# Peek at the first few samples.
ngrams[:3]

[(['یا', 'الا'], 'ایها'),
 (['ایها', 'یا'], 'الساقی'),
 (['الساقی', 'ایها'], 'ادر')]

In [None]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The context word indices are embedded, the embeddings of one context are
    concatenated, and two linear layers map them to one score per vocabulary word.

    Args:
        vocab_size: number of distinct words (embedding rows and output scores).
        embedding_dim: size of each word embedding.
        context_size: number of context words per sample.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Score every vocabulary word as the continuation of each context.

        Args:
            inputs: integer tensor of word indices, either a single context of
                shape (context_size,) or a batch of shape (batch, context_size).

        Returns:
            Raw (unnormalised) scores of shape (batch, vocab_size); a single
            context yields shape (1, vocab_size). No softmax is applied.
        """
        embeds = self.embeddings(inputs)
        if inputs.dim() == 1:
            # Single context: concatenate its embeddings into one row.
            embeds = embeds.view(1, -1)
        else:
            # Batch of contexts: keep the batch axis, concatenate per sample.
            embeds = embeds.view(embeds.shape[0], -1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        return out


In [None]:
# Train on the first GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Per-epoch total loss, appended to by the training loop.
losses = []
# Cross-entropy between the model's output scores and the target word index.
loss_function = nn.CrossEntropyLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001)

In [None]:
# Train the n-gram model for 200 epochs, one (context, target) sample per
# optimizer step. The outer bar tracks epochs, the inner bar tracks samples.
epoch_pbar = tqdm(range(200))
for epoch in epoch_pbar:
  # Running sums used for the average loss shown on the progress bars.
  total_loss = 0
  total_count = 0
  step_pbar = tqdm(ngrams, total=len(ngrams))
  for context, target in step_pbar:
    # Map the context words to their vocabulary indices and move them to the device.
    context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long).to(device)

    # Clear gradients left over from the previous step.
    model.zero_grad()
    # Despite the name, these are the raw scores returned by the model's last
    # linear layer; the loss function receives them together with the target index.
    log_probs = model(context_idxs)
    loss = loss_function(log_probs, torch.tensor([word_to_ix[target]]).to(device))

    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    total_count += 1
    step_pbar.set_postfix(loss=loss.item(), average_loss=total_loss/total_count)
  epoch_pbar.set_postfix(total_loss=total_loss, average_loss=total_loss/total_count)
  # Keep the summed loss of this epoch for later inspection.
  losses.append(total_loss)


### Using Pre-Trained Embeddings

In [None]:
#@title Download and unzip the dataset
!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis
!unzip twitter-entity-sentiment-analysis
clear_output()
print('done')

done


In [None]:
# The CSV is read with header=None, so the column names are supplied here.
data = pd.read_csv('twitter_training.csv', header=None, names=['tweet_id', 'entity', 'label', 'content'])
data  # display the frame (pandas truncates long frames when rendering)

Unnamed: 0,tweet_id,entity,label,content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [None]:
# Distinct sentiment labels present in the training data.
data.label.unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

Links to some pretrained embedding models:

* [Google Pretrained Embeddings](https://code.google.com/archive/p/word2vec/)
* [Word2Vec 400M Tweets Embedding model](https://github.com/loretoparisi/word2vec-twitter)
* [Farsi Pretrained Embeddings](https://nlpdataset.ir/farsi/pre-trained_embeddings.html)
* [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/)


In [None]:
#@title Download pretrained model
!wget -c "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip"
!unzip wiki-news-300d-1M.vec.zip

--2022-02-27 08:57:05--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.74.142, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2022-02-27 09:06:20 (1.17 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   


In [None]:
import gensim

# Load the downloaded vectors, stored in word2vec text format, as KeyedVectors.
word2vec = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec')

# NOTE(review): `.vocab` and `.vocab[word].index` look like the gensim 3.x
# KeyedVectors API. Newer gensim releases presumably need a different
# attribute; confirm against the installed version.
print ("# of words", len(word2vec.vocab))
print ("# of vectors", len(word2vec.vectors))
# Look up the row of "king" in the vector matrix and show its first 10 components.
print ("the first 10 elements of embedding vector for the word king:",
       word2vec.vectors[word2vec.vocab["king"].index][:10])

# of words 999994
# of vectors 999994
the first 10 elements of embedding vector for the word king: [ 0.1082  0.0445 -0.0384  0.0011 -0.0888  0.0713 -0.0696 -0.0477  0.0071
 -0.0408]


In [None]:
# Register a '<PAD>' token with an all-zero 300-dimensional vector; it is used
# later to pad short tweets to a fixed length.
# NOTE(review): `add` looks like the gensim 3.x API; confirm for newer versions.
word2vec.add(['<PAD>'], [np.zeros(300)])

In [None]:
# Word -> row index in the vector matrix, read from gensim's vocabulary entries.
word2ix = {word: entry.index for word, entry in word2vec.vocab.items()}
# Reverse lookup: row index -> word.
ix2word = dict(zip(word2ix.values(), word2ix.keys()))
# The vector matrix as a torch float tensor, used as embedding weights.
weights = torch.FloatTensor(word2vec.vectors)

In [None]:
from sklearn import preprocessing
import numpy as np

class TwitterSentimentAnalysisDataset(torch.utils.data.Dataset):
  """Tweets and sentiment labels read from a header-less CSV file.

  Each item is a pair ``(token_ids, label_id)``:
    * ``token_ids``: tensor of ``max_len`` indices into the module-level
      ``word2ix`` vocabulary. Tokens missing from ``word2ix`` are dropped,
      longer tweets are truncated, shorter ones are padded with ``'<PAD>'``.
    * ``label_id``: integer class id produced by a ``LabelEncoder`` fitted on
      the ``label`` column of this file.

  Args:
    csv_file: path of the CSV file to load (columns: tweet_id, entity, label,
      content; no header row).
    max_len: fixed number of token ids returned per tweet (default 32).
  """

  def __init__(self, csv_file, max_len=32):
    super(TwitterSentimentAnalysisDataset, self).__init__()
    self.max_len = max_len
    # Read the file that was requested. Previously the training CSV path was
    # hard-coded here, so every dataset silently contained the training data.
    self.df = pd.read_csv(csv_file, header=None, names=['tweet_id', 'entity', 'label', 'content'])
    # Rows without text or label cannot be used.
    self.df = self.df.dropna(axis=0, subset=['content', 'label'])
    self.df.content = self.df.content.astype('string')
    # NOTE(review): the encoder is fitted per file; ids are presumably
    # consistent across files only when they contain the same set of labels.
    self.label_encoder = preprocessing.LabelEncoder().fit(self.df['label'])

  def __len__(self):
    """Number of usable rows (after dropping rows with missing text/label)."""
    return self.df.shape[0]

  def __getitem__(self, index):
    """Return ``(token_ids, label_id)`` tensors for the row at ``index``."""
    # Column 3 is 'content': whitespace-tokenise and keep only known words.
    tokens = self.df.iloc[index, 3].split()
    token_ids = [word2ix[token] for token in tokens if token in word2ix]
    # Force a fixed length: truncate, then pad with the '<PAD>' index if short.
    token_ids = token_ids[:self.max_len]
    missing = self.max_len - len(token_ids)
    if missing > 0:
      token_ids.extend([word2ix['<PAD>']] * missing)

    # Column 2 is 'label': convert the string label to its integer id.
    label = self.df.iloc[index, 2]
    label_id = self.label_encoder.transform([label])[0]

    return torch.tensor(token_ids), torch.tensor(label_id)

# One dataset object per split, each constructed with the CSV path of its split.
train_dataset = TwitterSentimentAnalysisDataset('twitter_training.csv')
validation_dataset = TwitterSentimentAnalysisDataset('twitter_validation.csv')

In [None]:
from torch.utils.data import DataLoader

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps validation
# passes deterministic and comparable between epochs.
validation_dataloader = DataLoader(validation_dataset, batch_size=64, shuffle=False)


In [None]:
class TwitterSentimentAnalysisModel(nn.Module):
  """Sentiment classifier on top of frozen pretrained word embeddings.

  A linear layer over the sequence axis collapses the token embeddings of a
  tweet into a single vector, which two further linear layers map to class
  scores.

  Args:
    embedding_weights: 2-D float tensor (vocab_size, embedding_dim) of
      pretrained vectors. ``nn.Embedding.from_pretrained`` keeps them frozen
      by default.
    seq_len: fixed number of token ids per input sample (default 32).
    hidden_dim: width of the hidden layer (default 100).
    num_classes: number of output classes (default 4).
  """

  def __init__(self, embedding_weights, seq_len=32, hidden_dim=100, num_classes=4):
    super(TwitterSentimentAnalysisModel, self).__init__()
    self.embedding = nn.Embedding.from_pretrained(embedding_weights)
    # Take the embedding width from the weights instead of hard-coding 300.
    embedding_dim = embedding_weights.shape[1]
    self.linear1 = nn.Linear(seq_len, 1)
    self.linear2 = nn.Linear(embedding_dim, hidden_dim)
    self.linear3 = nn.Linear(hidden_dim, num_classes)

  def forward(self, inputs):
    """Compute class scores for a batch of token-id sequences.

    Args:
      inputs: integer tensor of shape (batch, seq_len).

    Returns:
      Raw logits of shape (batch, num_classes). No softmax is applied because
      ``nn.CrossEntropyLoss`` expects unnormalised scores; taking ``argmax``
      over the logits gives the same predictions as over probabilities.
    """
    x = self.embedding(inputs)         # (batch, seq_len, embedding_dim)
    x = x.transpose(1, -1)             # (batch, embedding_dim, seq_len)
    x = self.linear1(x)                # (batch, embedding_dim, 1)
    x = F.relu(x)
    # Drop only the trailing size-1 axis. A dim-less squeeze would also drop
    # the batch axis when the batch holds a single sample.
    x = torch.squeeze(x, dim=-1)       # (batch, embedding_dim)
    x = self.linear2(x)
    x = F.relu(x)
    x = self.linear3(x)
    return x


In [None]:
# Train on the first GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Per-epoch total training loss, appended to by the training loop.
losses = []
loss_function = nn.CrossEntropyLoss()
model = TwitterSentimentAnalysisModel(weights).to(device)
# Keep the pretrained embedding frozen. The attribute is `requires_grad`;
# the former spelling `required_grad` only created an unused attribute and
# had no effect on training.
model.embedding.weight.requires_grad = False
optimizer = optim.AdamW(model.parameters(), lr=0.001)

In [None]:
from sklearn import metrics

# Train for 10 epochs; after each epoch run a full pass over the validation
# loader and report its average loss and accuracy on the progress bar.
epoch_pbar = tqdm(range(10))
for epoch in epoch_pbar:
    # Running sums for the average training loss of this epoch.
    total_loss = 0
    total_count = 0
    
    # ---- training pass ----
    model.train()
    step_pbar = tqdm(train_dataloader, total=len(train_dataloader))
    for content, label in step_pbar:
        context_idxs = content.to(device)

        # Clear gradients left over from the previous batch.
        model.zero_grad()
        # Despite the name, this is simply whatever the model's forward()
        # returns for the batch; it is not a tensor of log-probabilities.
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, label.to(device))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_count += 1
        step_pbar.set_postfix(loss=loss.item(), average_loss=total_loss/total_count)
        
        
    # ---- validation pass (no gradient tracking) ----
    with torch.no_grad():
      validation_loss = 0
      validation_count = 0
      
      # Predictions and ground-truth labels collected over all validation batches.
      y_pred = np.array([])
      y_true = np.array([])
          
      model.eval()
      step_pbar = tqdm(validation_dataloader, total=len(validation_dataloader))
      step_pbar.set_description("Evaluation")

      for content, label in step_pbar:
          context_idxs = content.to(device)
          log_probs = model(context_idxs)
          
          # Predicted class = index of the highest score along the last axis.
          y_pred = np.append(y_pred, torch.argmax(log_probs, dim=-1).cpu().numpy())
          y_true = np.append(y_true, label.numpy())
          loss = loss_function(log_probs, label.to(device))
          
          validation_loss += loss.item()
          validation_count += 1
          step_pbar.set_postfix(loss=loss.item())
          
      
      # Show the epoch-level validation summary on the finished bar, then
      # update and close it.
      step_pbar.set_postfix(average_loss=validation_loss/validation_count, 
                            accuracy=metrics.accuracy_score(y_true, y_pred))
      step_pbar.update()
      step_pbar.close()
      
      # Epoch-level training summary; `losses` keeps the summed training loss.
      epoch_pbar.set_postfix(total_loss=total_loss, average_loss=total_loss/total_count)
      losses.append(total_loss)
