In [91]:
import os
import string
from collections import Counter
from torchtext.data.utils import get_tokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


In [92]:
def read_and_tokenize(file_path):
    """Read one UTF-8 text file and split its cleaned contents into tokens.

    The raw text is passed through ``preprocess_text`` first and the result is
    handed to torchtext's ``basic_english`` tokenizer.

    Args:
        file_path: Path of the text file to read.

    Returns:
        Whatever the tokenizer produces for the cleaned text of the whole file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    # Strip punctuation / lowercase before tokenizing
    cleaned_text = preprocess_text(raw_text)
    tokenize = get_tokenizer('basic_english')
    return tokenize(cleaned_text)

# Function to preprocess the text
def preprocess_text(text):
    """Lowercase ``text`` and delete every character in ``string.punctuation``.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with ASCII punctuation removed; all other
        characters (including whitespace) are kept as they are.
    """
    # Translation table that maps each punctuation character to "delete"
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    return lowered.translate(strip_punctuation)

# Function to tokenize all files in a given directory
def tokenize_directory(directory_path):
    """Tokenize every ``.txt`` file in a directory into one flat token list.

    Files are visited in sorted name order. ``os.listdir`` makes no promise
    about ordering, and because the tokens of consecutive files are
    concatenated into a single stream, the order decides which words end up
    next to each other at file boundaries. Sorting keeps the stream (and
    anything built from it) reproducible across machines and runs.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A list with the tokens of all ``.txt`` files, file after file. Empty
        when the directory holds no ``.txt`` files.
    """
    tokenized_texts = []
    for file_name in sorted(os.listdir(directory_path)):
        # Only plain-text files belong to the corpus
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(directory_path, file_name)
        tokenized_texts.extend(read_and_tokenize(file_path))
    return tokenized_texts

In [93]:
# Locations of the three dataset splits (relative to the working directory)
train_folder = 'inf265_v24_project03_data/data_train'
val_folder = 'inf265_v24_project03_data/data_val'
test_folder = 'inf265_v24_project03_data/data_test'

# Flatten each split into a single list of tokens (train, then val, then test)
tokenized_train, tokenized_val, tokenized_test = (
    tokenize_directory(split_folder)
    for split_folder in (train_folder, val_folder, test_folder)
)

# Report how many tokens each split contains
print(f"Number of words in training data: {len(tokenized_train)}")
print(f"Number of words in validation data: {len(tokenized_val)}")
print(f"Number of words in test data: {len(tokenized_test)}")

Number of words in training data: 2348007
Number of words in validation data: 43758
Number of words in test data: 107587


In [94]:
def build_vocabulary(tokenized_texts, min_freq=100):
    """Build a word -> index vocabulary from a token sequence.

    Index 0 is reserved for the ``'<unk>'`` placeholder. Every other word that
    occurs at least ``min_freq`` times receives the next free index, in order
    of first appearance, so the indices are always the contiguous range
    ``0 .. vocab_size - 1``.

    A token that is literally ``'<unk>'`` is never given a second entry: it
    shares the reserved index 0. (Previously such a token overwrote the
    reserved entry, leaving index 0 unused and the largest index equal to
    ``vocab_size``.)

    Args:
        tokenized_texts: Iterable of tokens.
        min_freq: Minimum number of occurrences for a word to be kept.

    Returns:
        A tuple ``(vocab, total_words, distinct_words, vocab_size)`` where
        ``vocab`` maps words to indices, ``total_words`` is the number of
        tokens counted, ``distinct_words`` the number of different tokens and
        ``vocab_size`` is ``len(vocab)`` (the ``'<unk>'`` entry included).
    """
    unk_token = '<unk>'
    word_freq = Counter(tokenized_texts)
    vocab = {unk_token: 0}  # Reserved entry for out-of-vocabulary words
    for word, freq in word_freq.items():
        if freq >= min_freq and word != unk_token:
            # len(vocab) is always the next unused index
            vocab[word] = len(vocab)
    total_words = sum(word_freq.values())
    distinct_words = len(word_freq)
    vocab_size = len(vocab)
    return vocab, total_words, distinct_words, vocab_size

# Derive the vocabulary and the corpus statistics from the training split only
vocab, total_words, distinct_words, vocab_size = build_vocabulary(
    tokenized_train,
    min_freq=100,
)


In [95]:
# Summary statistics for the training corpus
print(f"Total number of words in the training dataset: {total_words}")
print(f"Number of distinct words in the training dataset: {distinct_words}")
print(f"Size of the defined vocabulary (words appearing at least 100 times): {vocab_size}")

print("\nComments on Results:")
# vocab_size also counts the reserved '<unk>' entry, which is not a corpus
# word. Leave it out before comparing against the number of distinct words,
# otherwise a corpus with exactly one rare word is reported as fully covered.
kept_words = vocab_size - 1
if kept_words < distinct_words:
    print("The defined vocabulary is smaller than the total number of distinct words,")
    print("indicating that not all words appear frequently enough to be included.")
else:
    print("All distinct words appear frequently and are included in the vocabulary.")

Total number of words in the training dataset: 2348007
Number of distinct words in the training dataset: 52585
Size of the defined vocabulary (words appearing at least 100 times): 2082

Comments on Results:
The defined vocabulary is smaller than the total number of distinct words,
indicating that not all words appear frequently enough to be included.


In [96]:
class CBOWModel(nn.Module):
    """Continuous bag-of-words model: mean of context embeddings -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output classes.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Lookup table: word index -> embedding vector
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # Maps the pooled context vector to one score per vocabulary entry
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each input row.

        Args:
            inputs: Tensor of word indices. The embeddings are averaged over
                dim 1, so a (batch, context) layout is expected.

        Returns:
            Tensor of log-probabilities, normalized along dim 1.
        """
        pooled = self.embeddings(inputs).mean(dim=1)  # average over the context window
        scores = self.linear(pooled)
        return F.log_softmax(scores, dim=1)

In [97]:
class CBOWDataset(Dataset):
    """(context, target) index pairs for CBOW training, built eagerly.

    Every position that has ``context_size`` tokens on both sides becomes a
    target; the first and last ``context_size`` tokens only ever appear as
    context. Words missing from ``word_to_ix`` are mapped to its ``'<unk>'``
    entry.

    Args:
        tokenized_texts: Indexable sequence of tokens.
        word_to_ix: Mapping from word to index; must contain ``'<unk>'``.
        context_size: Number of context words taken on each side of a target.
    """

    def __init__(self, tokenized_texts, word_to_ix, context_size):
        self.word_to_ix = word_to_ix  # kept so callers can reach the mapping

        def to_index(word):
            # Out-of-vocabulary words fall back to the '<unk>' index
            return word_to_ix.get(word, word_to_ix['<unk>'])

        # Positions of the context words relative to the target (0 = target itself)
        offsets = [delta for delta in range(-context_size, context_size + 1) if delta != 0]
        stop = len(tokenized_texts) - context_size

        self.data = []
        for center in range(context_size, stop):
            neighbours = [to_index(tokenized_texts[center + delta]) for delta in offsets]
            self.data.append((neighbours, to_index(tokenized_texts[center])))

    def __len__(self):
        """Number of (context, target) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of ``torch.long`` tensors (context, target)."""
        neighbours, center = self.data[idx]
        context_tensor = torch.tensor(neighbours, dtype=torch.long)
        target_tensor = torch.tensor(center, dtype=torch.long)
        return context_tensor, target_tensor


In [98]:
def compute_class_weights(word_to_ix, dataset):
    """Compute 'balanced' class weights for the targets of ``dataset``.

    Targets are converted to plain integers before being handed to
    scikit-learn. Iterating a dataset may yield them as 0-dim tensors (as
    ``CBOWDataset`` does), and a Python list of tensors is not a reliable
    label array: tensors hash by identity and the list is slow to convert.

    Args:
        word_to_ix: Mapping from word to class index. Its values, in mapping
            order, are passed as the ``classes`` argument.
        dataset: Iterable of ``(context, target)`` pairs where ``target`` can
            be converted with ``int()``.

    Returns:
        A ``torch.float`` tensor with the weights returned by
        ``compute_class_weight`` for ``classes`` in that order.

    Note:
        Any error raised by ``compute_class_weight`` (for example about
        classes that never occur as a target - TODO confirm for the installed
        scikit-learn version) propagates to the caller.
    """
    # int() unwraps 0-dim tensors and NumPy scalars alike
    all_targets = np.array([int(target) for _, target in dataset], dtype=np.int64)

    # Every index of the vocabulary is a candidate class
    classes = np.array(list(word_to_ix.values()))

    class_weights = compute_class_weight('balanced', classes=classes, y=all_targets)
    return torch.tensor(class_weights, dtype=torch.float)

In [99]:
# Run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Word -> index mapping, numbered in the vocabulary's insertion order
word_to_ix = {word: i for i, word in enumerate(vocab)}

# (context, target) samples with two context words on each side of the target
train_data = CBOWDataset(tokenized_train, word_to_ix, context_size=2)
val_data = CBOWDataset(tokenized_val, word_to_ix, context_size=2)

# NOTE: the class weights for the loss are computed in a later cell.

# Shuffle only the training batches. Validation just accumulates a loss, so a
# fixed order keeps the evaluation pass deterministic.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)

# Validation loss per embedding dimension, filled in by the training function
model_losses = {}

In [100]:
def test_compute_class_weights():
    # Directly use the correct class indices and targets as numpy arrays
    all_targets = np.array([target for _, target in train_data])  # Make sure this is an np.array
    class_indices = np.array(list(word_to_ix.values()))  # Also ensure this is an np.array

    try:
        class_weights = compute_class_weight('balanced', classes=class_indices, y=all_targets)
        print("Class weights:", class_weights)
        return torch.tensor(class_weights, dtype=torch.float)
    except Exception as e:
        print("Error computing class weights:", e)

# Compute the per-class loss weights from the training targets; the training
# cell below reads them from the global `class_weights`.
class_weights = test_compute_class_weights()


Class weights: [3.01304916e-03 7.45505344e-03 1.71758027e-02 ... 5.78340107e+00
 4.66017855e+00 4.33755080e+00]


In [101]:
def train_cbow_models(vocab_size, embedding_dims, epochs, learning_rate):
    """Train one CBOW model per embedding size and return the best size.

    Uses the notebook-level globals ``device``, ``class_weights``,
    ``train_loader`` and ``val_loader``. Each model's average validation loss
    is also written to the global ``model_losses`` dict, but the winner is
    chosen only among the sizes trained in this call, so entries left over
    from earlier runs cannot be selected.

    Args:
        vocab_size: Number of classes / embedding rows passed to ``CBOWModel``.
        embedding_dims: Iterable of embedding dimensions to try.
        epochs: Number of passes over ``train_loader`` per model.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The embedding dimension with the lowest average validation loss.

    Raises:
        ValueError: If ``embedding_dims`` is empty.
    """
    embedding_dims = list(embedding_dims)
    if not embedding_dims:
        raise ValueError("embedding_dims must contain at least one embedding dimension")

    # The model outputs log-probabilities. CrossEntropyLoss applies log_softmax
    # again, which leaves already-normalized log-probabilities unchanged.
    criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

    # Losses of this call only; used to pick the winner
    run_losses = {}

    for embedding_dim in embedding_dims:
        # Initialize the model
        model = CBOWModel(vocab_size, embedding_dim).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        # Train the model
        for epoch in range(epochs):
            model.train()
            total_loss = 0
            for batch_idx, (context, target) in enumerate(train_loader):
                context, target = context.to(device), target.to(device)
                optimizer.zero_grad()
                log_probs = model(context)
                loss = criterion(log_probs, target)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                # Debug output for the very first batch only; printing on every
                # batch of the first epoch floods the notebook output.
                if epoch == 0 and batch_idx == 0:
                    print(f"Sample logits: {log_probs[0][:10]}")
                    print(f"Sample targets: {target[:10]}")
            # total_loss is the sum of the per-batch losses of this epoch
            print(f'Epoch {epoch}, Loss: {total_loss}')

        # Evaluate the model
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for context, target in val_loader:
                context, target = context.to(device), target.to(device)
                log_probs = model(context)
                loss = criterion(log_probs, target)
                val_loss += loss.item()
        avg_val_loss = val_loss / len(val_loader)
        run_losses[embedding_dim] = avg_val_loss
        model_losses[embedding_dim] = avg_val_loss  # keep the global record up to date
        print(f'Model with embedding dim {embedding_dim}, Validation Loss: {avg_val_loss}')

    # Select the best model among those trained in this call
    best_embedding_dim = min(run_losses, key=run_losses.get)
    print(f'The best model has an embedding dimension of {best_embedding_dim} with a validation loss of {run_losses[best_embedding_dim]}')

    return best_embedding_dim




In [102]:
# Hyperparameters for the embedding-size sweep
embedding_dims = [10, 12, 16]  # Candidate embedding dimensions
epochs = 5
learning_rate = 0.01
context_size = 2  # Context window size (not passed to train_cbow_models)
vocab_size = len(vocab)

best_embedding_dim = train_cbow_models(
    vocab_size=vocab_size,
    embedding_dims=embedding_dims,
    epochs=epochs,
    learning_rate=learning_rate,
)

print(f"The best model uses an embedding dimension of {best_embedding_dim}.")

Epoch 0, Loss: 519514.82663822174


KeyboardInterrupt: 