# Course AI Homework 5
In Homework 5, we will train our own CBOW Word2Vec embedding on the WikiText2 dataset (a small dataset).
- If possible, change the Runtime option above to GPU (max 12 hours per user).
- Save and submit this notebook with its outputs, together with the model and vocab files you trained.
- You are not allowed to use other Python files or to import a pretrained model.

In [None]:
! pip uninstall torch -y
! pip install torch==2.3.0

: 

In [None]:
# YOU should run this command if you will train the model in COLAB environment
! pip install datasets transformers torchtext==0.18.0

In [None]:
import argparse
import yaml
import os
import torch
import torch.nn as nn
import torchtext
torchtext.disable_torchtext_deprecation_warning()

import json
import numpy as np

from functools import partial
from torch.utils.data import DataLoader
from torchtext.data import to_map_style_dataset
from torchtext.data.utils import get_tokenizer

from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import WikiText2 # WikiText103

import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

from datasets import load_dataset



In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch_seed_numb = 0
# torch.manual_seed seeds the default generator on all devices (CPU and CUDA).
# Seeding only CUDA would leave CPU-side randomness unseeded: the model is
# constructed on the CPU before being moved to `device`, and DataLoader
# shuffling draws from the default CPU generator.
torch.manual_seed(torch_seed_numb)

In [None]:
device

In [None]:
# If you use the Google Colab environment, your Google Drive is mounted here so
# the model and vocab can be saved to it. Outside Colab the `google.colab`
# package does not exist, so fall back to a folder in the current working
# directory instead of crashing (later cells need `root_dir` to be defined).
try:
    from google.colab import drive
except ImportError:
    root_dir = os.path.join(os.getcwd(), 'course_ai_hw5')
else:
    drive.mount('/content/drive')
    root_dir = '/content/drive/MyDrive/course_ai_hw5'

### Constant Setting

In [None]:
# Tunable settings -- feel free to change any of them.

# --- data loading ---
train_batch_size = 96
val_batch_size = 96
shuffle = True

# --- optimisation ---
optimizer = 'Adam'
learning_rate = 0.025
epochs = 50

# --- output sub-directory for the trained artifacts ---
result_dir = 'weights/'

# --- CBOW architecture and vocabulary ---
# number of context words taken on EACH side of the target word
CBOW_N_WORDS = 4

# tokens rarer than this are dropped from the vocabulary
MIN_WORD_FREQUENCY = 50
# texts are truncated to this many tokens before windows are extracted
MAX_SEQUENCE_LENGTH = 256

# embedding vector size and the max_norm handed to nn.Embedding
EMBED_DIMENSION = 300
EMBED_MAX_NORM = 1

In [None]:
# Resolve the output directory under the project root and make sure it exists.
result_dir = os.path.join(root_dir, result_dir)
# exist_ok=True creates the directory (and parents) in one race-free call
# instead of a separate "exists?" check followed by makedirs.
os.makedirs(result_dir, exist_ok=True)


## Prepare dataset and vocab

In [None]:
# Load the raw WikiText-2 corpus and keep a handle on each of its three splits.
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
train_dataset, val_dataset, test_dataset = (
    datasets[split_name] for split_name in ("train", "validation", "test")
)


In [None]:
# Let's print one example
train_dataset['text'][11]

As you can see, we need to clean up the sentences, lowercase them, tokenize them, and change each word into an index (one-hot vector). Before going through the whole process, we need to create a vocab set using the training dataset.

In [None]:
tokenizer = get_tokenizer("basic_english", language="en")

# TODO 1): make vocabulary
# Hint) use function: build_vocab_from_iterator, use train_dataset set special tokens.. etc


def yield_tokens(data_iter):
    """Lazily tokenize each text of ``data_iter``, yielding one token list per text."""
    yield from map(tokenizer, data_iter)


# Build the vocabulary from the training split only; tokens seen fewer than
# MIN_WORD_FREQUENCY times are left out.
training_token_stream = yield_tokens(train_dataset['text'])
vocab = build_vocab_from_iterator(
    training_token_stream,
    specials=["<unk>", "<pad>"],
    min_freq=MIN_WORD_FREQUENCY,
)
# Any token missing from the vocabulary is looked up as "<unk>".
vocab.set_default_index(vocab["<unk>"])

We need a collate function to turn the dataset into the CBOW training format. The collate function should slide a window over each text in the batch and build the train/test examples. Each example should consist of CBOW_N_WORDS words on each side (left and right) as the input, and the word in the center as the target output.  
Make the collate function return the CBOW dataset as tensors.

In [None]:
def text_pipeline(x):
    """Tokenize the sentence ``x`` and convert its words to vocab indexes."""
    return vocab(tokenizer(x))

![cbow](https://user-images.githubusercontent.com/74028313/204695601-51d44a38-4bd3-4a69-8891-2854aa57c034.png)

In [None]:
def collate(batch, text_pipeline, n_words=None, max_sequence_length=None):
    """Turn a batch of raw texts into CBOW (context, target) training tensors.

    A window of ``2 * n_words + 1`` tokens is slid over every text. For each
    position, the ``n_words`` tokens on the left and the ``n_words`` tokens on
    the right form the input, and the token in the middle is the target.

    Args:
        batch: iterable of raw text strings.
        text_pipeline: callable mapping one text to a list of token indexes.
        n_words: context size on each side; defaults to ``CBOW_N_WORDS``.
        max_sequence_length: texts are truncated to this many tokens before
            windows are extracted; defaults to ``MAX_SEQUENCE_LENGTH``. A falsy
            value (e.g. ``0``) disables truncation.

    Returns:
        ``(batch_input, batch_output)``: ``torch.long`` tensors of shape
        ``(num_examples, 2 * n_words)`` and ``(num_examples,)``. When no text
        in the batch is long enough, ``num_examples`` is 0 but the input
        tensor still has two dimensions.
    """
    if n_words is None:
        n_words = CBOW_N_WORDS
    if max_sequence_length is None:
        max_sequence_length = MAX_SEQUENCE_LENGTH
    window = 2 * n_words + 1

    batch_input, batch_output = [], []

    # TODO 2): make collate function
    for text in batch:
        token_ids = text_pipeline(text)
        if max_sequence_length:
            token_ids = token_ids[:max_sequence_length]

        # Texts shorter than one full window give an empty range and are
        # therefore skipped without a separate length check.
        for start in range(len(token_ids) - window + 1):
            center = start + n_words
            left_context = token_ids[start:center]
            right_context = token_ids[center + 1:start + window]
            batch_input.append(left_context + right_context)
            batch_output.append(token_ids[center])

    # The explicit reshape keeps the input 2-D even when there are no examples;
    # torch.tensor([]) alone would have shape (0,).
    batch_input = torch.tensor(batch_input, dtype=torch.long).reshape(
        len(batch_output), 2 * n_words
    )
    batch_output = torch.tensor(batch_output, dtype=torch.long)

    return batch_input, batch_output

In [None]:
# Training batches: raw texts are converted to CBOW examples by `collate`.
train_dataloader = DataLoader(
    train_dataset['text'],
    batch_size=train_batch_size,
    shuffle=shuffle,
    collate_fn=partial(collate, text_pipeline=text_pipeline),
)

# Validation batches are NOT shuffled: with shuffling, the grouping of texts
# into batches changes on every pass, so the per-epoch mean of batch losses
# would not be computed over the same batches from one epoch to the next.
val_dataloader = DataLoader(
    val_dataset['text'],
    batch_size=val_batch_size,
    shuffle=False,
    collate_fn=partial(collate, text_pipeline=text_pipeline),
)

## Make CBOW Model
![image](https://user-images.githubusercontent.com/74028313/204701161-cd9df4bf-78b8-4b4d-b8b7-ed4a3b5c3922.png)

The CBOW model's main idea is to predict the center (target) word from its context words. As you can see in the simple architecture above, the 2 × CBOW_N_WORDS input words are projected onto the projection layer. Converting each word into an embedding requires a look-up table, for which we will use torch's Embedding module. After combining the context embeddings, the model uses a shallow linear neural network to predict the target word and compares the result with the center word's index using cross-entropy loss. Finally, the embedding layer (look-up table) of the trained model itself serves as the embedding that represents the words.

In [None]:
class CBOW_Model(nn.Module):
    """Continuous bag-of-words model: an embedding table followed by one linear layer.

    Args:
        vocab_size: number of rows in the embedding table and number of output
            scores produced per example.
        EMBED_DIMENSION: size of each embedding vector.
        EMBED_MAX_NORM: forwarded to ``nn.Embedding`` as ``max_norm``.
    """

    def __init__(self, vocab_size: int, EMBED_DIMENSION, EMBED_MAX_NORM):
        super().__init__()
        # TODO 3-1): make CBOW model using nn.Embedding and nn.Linear function
        # Look-up table: token index -> embedding vector.
        self.embeddings = nn.Embedding(
            vocab_size,
            EMBED_DIMENSION,
            max_norm=EMBED_MAX_NORM,
        )
        # Projects the pooled context vector to one score per vocabulary entry.
        self.linear = nn.Linear(EMBED_DIMENSION, vocab_size)

    def forward(self, _inputs):
        """Return one score per vocabulary entry for each example.

        ``_inputs`` holds token indexes; the looked-up embeddings are averaged
        over dimension 1 before the linear layer.
        (assumes ``_inputs`` is shaped (batch, context_words) -- TODO confirm)
        """
        # TODO 3-2): make forward function
        context_vectors = self.embeddings(_inputs)
        pooled_context = context_vectors.mean(dim=1)
        return self.linear(pooled_context)

## Train the model

Let's make the _train_epoch and _validate_epoch functions to train the CBOW model.  
- model.train() and model.eval() switch the mode of some parts of the model (Dropout, BatchNorm, etc.) so that they behave differently during training and inference.
- There is an lr_scheduler option which changes the learning rate from epoch to epoch. Try the option if you are interested.

In [None]:
# One embedding row / one output score per vocabulary entry.
vocab_size = len(vocab)

model = CBOW_Model(
    vocab_size=vocab_size,
    EMBED_DIMENSION=EMBED_DIMENSION,
    EMBED_MAX_NORM=EMBED_MAX_NORM,
)
loss_function = nn.CrossEntropyLoss()
# Note: this rebinds `optimizer` from the name string set in the constants
# cell to the actual optimizer object.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
class Train_CBOW:
    """Training loop wrapper for the CBOW model.

    Runs ``epochs`` passes of training followed by validation, records the
    mean batch loss of every pass in ``self.loss`` and can save the model and
    the loss history into ``model_dir``.

    Args:
        model: the ``nn.Module`` to train; it is moved to ``device``.
        epochs: number of train + validate passes.
        train_dataloader: iterable of ``(batch_input, batch_output)`` tensors.
        val_dataloader: iterable of ``(batch_input, batch_output)`` tensors.
        loss_function: callable ``(output, batch_output) -> scalar loss tensor``.
        optimizer: optimizer updating the model parameters.
        device: device the model and the batches are moved to.
        model_dir: directory where ``model.pt`` and ``loss.json`` are written.
        lr_scheduler: optional scheduler stepped once at the end of each epoch.
    """

    def __init__(
        self,
        model,
        epochs,
        train_dataloader,
        val_dataloader,
        loss_function,
        optimizer,
        device,
        model_dir,
        lr_scheduler = None
    ):
        self.model = model
        self.epochs = epochs
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.loss_function = loss_function
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self.device = device
        self.model_dir = model_dir

        # per-epoch mean losses, one entry appended per epoch
        self.loss = {"train": [], "val": []}
        self.model.to(self.device)

    def train(self):
        """Run all epochs, printing the train and validation loss after each."""
        for epoch in range(self.epochs):
            self._train_epoch()
            self._validate_epoch()
            print(
                "Epoch: {}/{}, Train Loss={:.5f}, Val Loss={:.5f}".format(
                    epoch + 1,
                    self.epochs,
                    self.loss["train"][-1],
                    self.loss["val"][-1],
                )
            )
            if self.lr_scheduler is not None:
                self.lr_scheduler.step()

    def _run_epoch(self, dataloader, training):
        """Do one pass over ``dataloader`` and return the mean batch loss.

        Parameters are updated only when ``training`` is true. Batches with no
        examples are skipped. Returns ``nan`` if every batch was empty.
        """
        self.model.train(training)  # train(False) is the same as eval()
        batch_losses = []

        # Gradients are tracked only while training.
        with torch.set_grad_enabled(training):
            for batch_input, batch_output in dataloader:
                # A batch made only of too-short texts has zero examples; its
                # mean-reduced loss would be NaN and poison the epoch average.
                if batch_output.numel() == 0:
                    continue

                batch_input = batch_input.to(self.device)
                batch_output = batch_output.to(self.device)

                if training:
                    self.optimizer.zero_grad()

                output = self.model(batch_input)
                loss = self.loss_function(output, batch_output)

                if training:
                    loss.backward()
                    self.optimizer.step()

                batch_losses.append(loss.item())

        if not batch_losses:
            return float("nan")
        # plain float keeps the loss history JSON-serialisable
        return float(np.mean(batch_losses))

    def _train_epoch(self):
        """Train for one epoch and append its mean loss to ``self.loss["train"]``."""
        # TODO 4-1):
        self.loss["train"].append(self._run_epoch(self.train_dataloader, True))

    def _validate_epoch(self):
        """Evaluate for one epoch and append its mean loss to ``self.loss["val"]``."""
        # TODO 4-2):
        self.loss["val"].append(self._run_epoch(self.val_dataloader, False))

    def save_model(self):
        """Save the whole model object to ``<model_dir>/model.pt``."""
        os.makedirs(self.model_dir, exist_ok=True)
        model_path = os.path.join(self.model_dir, "model.pt")
        torch.save(self.model, model_path)

    def save_loss(self):
        """Write the loss history to ``<model_dir>/loss.json``."""
        os.makedirs(self.model_dir, exist_ok=True)
        loss_path = os.path.join(self.model_dir, "loss.json")
        with open(loss_path, "w") as fp:
            json.dump(self.loss, fp)

In [None]:
# Option: you could add and change the lr_scheduler.
# Note: the trainer below is created with lr_scheduler=None; pass `scheduler`
# there instead to actually use this schedule.
def _lr_decay(epoch):
    """Multiplicative learning-rate factor for the given epoch index."""
    return 0.95 ** epoch


scheduler = LambdaLR(optimizer, lr_lambda=_lr_decay)

In [None]:
# Wire everything into the trainer; no learning-rate scheduler is used here.
trainer = Train_CBOW(
    model=model,
    loss_function=loss_function,
    optimizer=optimizer,
    lr_scheduler=None,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    epochs=epochs,
    device=device,
    model_dir=result_dir,
)

# Run the full training loop (prints one line per epoch).
trainer.train()
print("Training finished.")


In [None]:
# Persist the vocabulary into the result directory ...
vocab_path = os.path.join(result_dir, "vocab.pt")
torch.save(vocab, vocab_path)

# ... together with the trained model and the loss history.
trainer.save_model()
trainer.save_loss()

### Result
Let's run inference with the trained word embedding and visualize it.

In [None]:
import pandas as pd
import sys

from sklearn.manifold import TSNE
import plotly.graph_objects as go

sys.path.append("../")

In [None]:
result_dir

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# reload saved model and vocab
# Both files hold whole pickled Python objects (not just tensors), so they
# need weights_only=False. It is passed explicitly because the default of this
# flag differs between PyTorch releases.
# NOTE(review): unpickling can execute arbitrary code -- only load files you
# produced yourself.
model = torch.load(
    os.path.join(result_dir, "model.pt"),
    map_location=device,
    weights_only=False,
)
vocab = torch.load(os.path.join(result_dir, "vocab.pt"), weights_only=False)

# the word embedding is the weight of the model's embedding layer
# (its first registered parameter)
embeddings = model.embeddings.weight
embeddings = embeddings.detach().cpu().numpy()

# normalization: scale every row to unit L2 norm
norms = (embeddings ** 2).sum(axis=1, keepdims=True) ** (1 / 2)
# leave an all-zero row unchanged instead of dividing by zero
norms[norms == 0] = 1.0
embeddings_norm = embeddings / norms
embeddings_norm.shape



### Make t-SNE graph of trained embedding and color numeric values

In [None]:
embeddings_df = pd.DataFrame(embeddings_norm)
fig = go.Figure()
# TODO 5-1) : make 2-d t-SNE graph of all vocabs and color only for numeric values(others, just color black)

# Project every embedding to 2-D.
tsne = TSNE(n_components=2, random_state=0, perplexity=30)
embeddings_tsne = tsne.fit_transform(embeddings_norm)

words = vocab.get_itos()

# str.isdecimal() is used instead of str.isdigit(): isdigit() also accepts
# characters such as superscript digits that int() cannot parse.
is_numeric = [word.isdecimal() for word in words]
numeric_mask = np.array(is_numeric, dtype=bool)
numeric_words = [word for word, flag in zip(words, is_numeric) if flag]
other_words = [word for word, flag in zip(words, is_numeric) if not flag]
# colour value of a numeric word: its integer value, clipped to at most 9
numeric_colors = [min(int(word), 9) for word in numeric_words]

# Non-numeric words get their own plain black trace. Putting them on the
# colour scale with an out-of-range value would clamp them to the lowest
# colour and make them indistinguishable from "0".
fig.add_trace(go.Scatter(
    x=embeddings_tsne[~numeric_mask, 0],
    y=embeddings_tsne[~numeric_mask, 1],
    mode='markers',
    marker=dict(size=5, color='black'),
    text=other_words,
    hoverinfo='text',
    showlegend=False,
))

# Numeric words are drawn second (on top) and coloured by value.
fig.add_trace(go.Scatter(
    x=embeddings_tsne[numeric_mask, 0],
    y=embeddings_tsne[numeric_mask, 1],
    mode='markers',
    marker=dict(
        size=5,
        color=numeric_colors,
        colorscale='Viridis',
        showscale=True,
        cmin=0,
        cmax=9,
        colorbar=dict(title=dict(text="numeric value (9 = 9 or more)")),
    ),
    text=numeric_words,
    hoverinfo='text',
    showlegend=False,
))

fig.update_layout(
    title="t-SNE Visualization of Word Embeddings (Numeric words colored)",
    xaxis_title="t-SNE Component 1",
    yaxis_title="t-SNE Component 2",
    width=1000,
    height=800
)

fig.show()

fig.write_image(os.path.join(result_dir, "image.png"))


### Find top N similar words


In [None]:
def find_top_similar(word: str, vocab, embeddings_norm, topN: int = 10) -> dict:
    """Return the ``topN`` vocabulary words most similar to ``word``.

    Similarity is the dot product between rows of ``embeddings_norm``; this
    equals cosine similarity when the rows are unit-normalised.

    Args:
        word: query word. Its row index is obtained with ``vocab[word]``, so
            for a word missing from the vocabulary the result depends on what
            ``vocab`` returns for unknown words.
        vocab: object supporting ``vocab[word] -> index`` and
            ``vocab.lookup_token(index) -> word``.
        embeddings_norm: 2-D array with one embedding row per vocabulary index.
        topN: maximum number of neighbours to return.

    Returns:
        Dict mapping each similar word to its similarity score, ordered from
        most to least similar. The query's own row is never included. Empty
        when ``topN <= 0``.
    """
    # TODO 5-2) : make function returning top n similiar words and similarity scores
    topN_dict = {}
    if topN <= 0:
        return topN_dict

    word_idx = vocab[word]
    similarities = np.dot(embeddings_norm, embeddings_norm[word_idx])

    # Descending order; the stable sort resolves ties by lower index first.
    ranked_indices = np.argsort(-similarities, kind="stable")

    # topN + 1 candidates are enough because exactly one row (the query's own)
    # can be skipped.
    for idx in ranked_indices[:topN + 1]:
        idx = int(idx)
        # Exclude the query by INDEX, not by string: an unknown word may map to
        # another token's row (e.g. "<unk>"), which a string comparison would
        # let through as a trivial similarity-1.0 match.
        if idx == word_idx:
            continue
        topN_dict[vocab.lookup_token(idx)] = float(similarities[idx])
        if len(topN_dict) == topN:
            break

    return topN_dict


In [None]:
# Show the nearest neighbours of "english" with their similarity scores.
similar_to_english = find_top_similar("english", vocab, embeddings_norm)
for word, sim in similar_to_english.items():
    print("{}: {:.3f}".format(word, sim))


### Result Report

Save the Colab notebook with its results and submit it together with your trained model file, vocab file, and t-SNE result image as a .zip archive. Double-check that your submitted notebook file contains the results.

You can change the CBOW model parameters, the training parameters, and other details if you want.