<a href="https://colab.research.google.com/github/yfzhang3/2048/blob/main/v2_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load & Preprocess the data

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
from torchtext.data.utils import get_tokenizer
import nltk
from nltk.tokenize import word_tokenize
import random
from sklearn.metrics import precision_score, recall_score, f1_score

#
import sys
sys.path.append('./sample_data/')

# from challenge_dataset import ChallengeDataset
# TODO: confirm which directory `challenge_dataset` lives in relative to this notebook before enabling the import above.

In [None]:
nltk.download('punkt')  # Download the tokenizer data (only need to do this once)

torch.manual_seed(1)  # set a manual seed for training reproducibility for random parameters

# Load the training file. Each line is expected to hold an original sentence
# and its corrupted counterpart separated by a tab. Undecodable bytes are
# replaced rather than raising (errors='replace').
training_path = 'train.txt'
with open(training_path, 'r', encoding='utf-8', errors='replace') as file:
    lines = file.readlines()

# Split every line at the tab. Blank lines and lines without a tab yield fewer
# than two fields; indexing pair[1] on those would raise IndexError, so they
# are dropped here and the number of dropped lines is reported.
rows = [line.strip().split('\t') for line in lines]
data = [pair for pair in rows if len(pair) >= 2]
skipped = len(rows) - len(data)
if skipped:
    print(f"Skipped {skipped} line(s) without a tab-separated sentence pair")
print(data[:5])

# Separate original and corrupted
original_sentences = [pair[0] for pair in data]
corrupted_sentences = [pair[1] for pair in data]
print(original_sentences[:3])

# Tokenize the original sentences (one token list per sentence)
tokenized_sentences = [word_tokenize(sentence) for sentence in original_sentences]

num_sentences = len(tokenized_sentences)
print(num_sentences)
print(tokenized_sentences[:3])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


FileNotFoundError: ignored

### Tokenize Data

In [None]:
import torchtext
from torchtext.data.utils import get_tokenizer

# Customize hyperparameters
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
EMDEDDING_DIM = 100  # (sic) spelling kept: the model-construction cell reads this name

# Flatten data into 1 list
all_words = [word for sentence in tokenized_sentences for word in sentence]
print(all_words[:2])

# Unique tokens across all tokenized sentences
vocab = set(all_words)
vocab_size = len(vocab)

# Dictionary that maps words to their corresponding indices.
# Indices are assigned in sorted order: iterating a set of strings directly
# gives an order that can change from one interpreter run to the next (string
# hashing is randomized), which would make the word <-> index mapping, and
# therefore any trained embedding matrix, irreproducible.
word_to_ix = {word: ix for ix, word in enumerate(sorted(vocab))}
# Dictionary that maps indices back to words (exact inverse of word_to_ix)
ix_to_word = {ix: word for word, ix in word_to_ix.items()}

# Creating training data: for every position that has CONTEXT_SIZE words on
# both sides, pair the surrounding words (context) with the centre word
# (target). Sentences shorter than 2 * CONTEXT_SIZE + 1 tokens contribute nothing.
data = []
for sentence in tokenized_sentences:
    for i in range(CONTEXT_SIZE, len(sentence) - CONTEXT_SIZE):
        context = (
            sentence[i - CONTEXT_SIZE:i]  # Context words to the left
            + sentence[i + 1:i + CONTEXT_SIZE + 1]  # Context words to the right
        )
        target = sentence[i]
        data.append((context, target))

print(data[:5])

In [None]:
# Shuffle the (context, target) pairs and split them into training,
# validation and test sets.
import random

# Set seed for reproducibility
random_seed = 3
random.seed(random_seed)
random.shuffle(data)

# `data` holds (context, target) pairs, not sentences; the variable name is
# kept as-is in case other cells read it.
total_sentences_len = len(data)
print("number of (context, target) pairs", total_sentences_len)

# Define the proportions for training, validation, and testing sets
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2  # not used directly: the test set is whatever remains

# Calculate the sizes of each subset from the number of pairs being split.
# (Sizing from the sentence count instead would not match the list that is
# actually sliced below, so the ratios would not be honoured.)
train_size = int(train_ratio * total_sentences_len)
val_size = int(val_ratio * total_sentences_len)

# Split the shuffled pairs into three contiguous, non-overlapping subsets
train_set = data[:train_size]
val_set = data[train_size:train_size + val_size]
test_set = data[train_size + val_size:]

### Defining the Neural Network

In [None]:
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are summed into a single context
    vector, which is passed through a hidden layer (ReLU) and projected to
    log-probabilities over the vocabulary.

    Args:
        vocab_size: number of distinct words (rows of the embedding table and
            size of the output distribution).
        embedding_dim: dimensionality of each word embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()

        # out: 1 x embedding_dim (after the context embeddings are summed)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()

        # out: 1 x vocab_size
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities of the target word given its context.

        Args:
            inputs: long tensor of word indices, either 1-D
                ``(num_context_words,)`` for a single example or 2-D
                ``(batch, num_context_words)`` for a batch.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for 1-D input, or
            ``(batch, vocab_size)`` for 2-D input; each row is a
            log-softmax distribution over the vocabulary.
        """
        embeds = self.embeddings(inputs)
        # Sum over the context-word axis so the prediction depends on the
        # whole context rather than on each context word separately.
        context_vector = embeds.sum(dim=-2)
        if context_vector.dim() == 1:
            # Single example: add the batch dimension the loss expects.
            context_vector = context_vector.unsqueeze(0)
        out = self.linear1(context_vector)    # (batch, 128)
        out = self.activation_function1(out)  # ReLU, shape unchanged
        out = self.linear2(out)               # (batch, vocab_size)
        out = self.activation_function2(out)  # log-softmax over the vocabulary
        return out

    def get_word_emdedding(self, word):
        """Return the embedding of `word` as a ``(1, embedding_dim)`` tensor.

        The index is looked up in the module-level ``word_to_ix`` mapping, so
        a word missing from that mapping raises ``KeyError``.
        """
        index = torch.tensor([word_to_ix[word]])
        return self.embeddings(index).view(1, -1)

# Functions from pytorch docs
def make_context_vector(context, word_to_ix):
    """Convert context words into a 1-D long tensor of their vocabulary indices.

    Args:
        context: iterable of words.
        word_to_ix: mapping from word to integer index; every word in
            `context` must be a key.

    Returns:
        ``torch.long`` tensor with one index per context word, in order.
    """
    indices = []
    for word in context:
        indices.append(word_to_ix[word])
    return torch.tensor(indices, dtype=torch.long)

# Example: index tensor for the context words of the first (context, target) pair in `data`
make_context_vector(data[0][0], word_to_ix)  # example

### Creating, Training, and Evaluating the Model

In [None]:
model = CBOW(vocab_size, EMDEDDING_DIM)

loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.005)

for epoch in range(10):
    # TRAINING
    model.train()
    total_loss = 0.0  # plain float: running sum for reporting only

    # Train on the training split only, so the validation pairs below are
    # genuinely unseen.
    for context, target in train_set:
        # Vector mapping each word in context to its index
        context_vector = make_context_vector(context, word_to_ix)

        # Gradients accumulate by default; clear them before each example.
        optimizer.zero_grad()

        # Passes context vector through model -> predicted log probabilities
        log_probs = model(context_vector)

        # One target index per row the model returned, so the loss is
        # well-formed regardless of how many rows come back.
        target_tensor = torch.full(
            (log_probs.shape[0],), word_to_ix[target], dtype=torch.long
        )
        loss = loss_function(log_probs, target_tensor)

        # Update after every example. Summing loss tensors over the whole
        # epoch would keep every example's autograd graph alive until a
        # single end-of-epoch step.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}: training loss {total_loss:.4f}")

    # VALIDATION (after each epoch)
    model.eval()  # Set model to evaluation mode
    correct = 0
    with torch.no_grad():
        for context, target in val_set:
            context_vector = make_context_vector(context, word_to_ix)
            log_probs = model(context_vector)

            # Collapse the rows into one score per vocabulary word, then take
            # the highest-scoring word as the prediction.
            predicted = log_probs.sum(dim=0).argmax().item()

            if predicted == word_to_ix[target]:
                correct += 1

    total = len(val_set)
    # An empty validation set reports 0.0 instead of dividing by zero.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy (Validation) - Epoch {epoch + 1}: {accuracy:.4f}")