## CBoW-Model

Implemented with:
- Adam optimizer
- Early stopping, dropout, and L2 regularization (weight decay)


In [29]:
import requests
from io import StringIO
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import defaultdict
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np

In [30]:
import os

# Location of the preprocessed comments CSV in the project repository.
url = 'https://raw.githubusercontent.com/tobiaswtzl/dlss-project24/main/data/preprocessed/comments.csv'

# SECURITY: credentials must never be committed to source. The access token is
# read from the GITHUB_TOKEN environment variable instead; any token that was
# previously hard-coded here should be treated as leaked and revoked.
github_token = os.environ.get('GITHUB_TOKEN')
headers = {'Authorization': f'token {github_token}'} if github_token else {}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as CSV below.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
data = StringIO(response.text)

comments = pd.read_csv(data)

In [31]:
# Two-stage split with a fixed seed: 30 % is held out first, then that
# hold-out is halved into validation and test (roughly 70 / 15 / 15 overall).
train_df, temp_df = train_test_split(comments, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


def _cleaned_texts(frame):
    """Return the non-null entries of the ``cleaned`` column as a list of str."""
    return frame["cleaned"].dropna().astype(str).tolist()


# The vocabulary is built from ALL comments; otherwise tokens that only occur
# in the validation/test splits would be missing from the word-index lookup.
total_comments_list = _cleaned_texts(comments)

train_list = _cleaned_texts(train_df)
val_list = _cleaned_texts(val_df)
test_list = _cleaned_texts(test_df)

In [32]:
# Tokenise on whitespace: every document becomes a list of words
total_corpus = list(map(str.split, total_comments_list))
corpus_train = list(map(str.split, train_list))
corpus_val = list(map(str.split, val_list))
corpus_test = list(map(str.split, test_list))

# Word frequencies over the full corpus (first-seen order is preserved)
vocab = defaultdict(int)
for word in (token for sentence in total_corpus for token in sentence):
    vocab[word] += 1

# Keep only words that occur at least `min_count` times
min_count = 1
vocab = {token: freq for token, freq in vocab.items() if freq >= min_count}

# Lookup tables in both directions; indices follow the vocabulary order
word_to_index = {token: position for position, token in enumerate(vocab)}
index_to_word = dict(enumerate(vocab))

# Tabular view of the vocabulary for inspection
vocab_df = pd.DataFrame(list(vocab.items()), columns=['Word', 'Count'])
vocab_df

Unnamed: 0,Word,Count
0,nice,20
1,try,97
2,but,1350
3,false,42
4,equivalency,5
...,...,...
14815,exorbitant,1
14816,commercially,1
14817,lcoe,1
14818,troublesome,1


In [33]:
# Hyperparameters

# Data / model
context_size = 2        # number of words taken on each side of the target
embedding_dim = 50      # dimensionality of each word vector

# Optimisation
batch_size = 64
epochs = 50             # upper bound on the number of training epochs
learning_rate = 1e-3
weight_decay = 1e-4     # L2 regularization factor

# Early stopping
patience = 5            # epochs without validation improvement before stopping

In [34]:
# Function to create context-target pairs
def create_context_target_pairs(text, context_size):
    """Build CBOW training examples from tokenised sentences.

    For every position that has ``context_size`` words available on both
    sides, the surrounding words form the context and the word at that
    position is the target. Sentences shorter than ``2 * context_size + 1``
    words therefore contribute no pairs.

    Args:
        text: Iterable of sentences, each a sequence of words.
        context_size: Number of words taken from each side of the target.

    Returns:
        A list of ``(context, target)`` tuples, where ``context`` holds the
        left neighbours followed by the right neighbours.
    """
    return [
        (
            sentence[pos - context_size:pos] + sentence[pos + 1:pos + context_size + 1],
            sentence[pos],
        )
        for sentence in text
        for pos in range(context_size, len(sentence) - context_size)
    ]

# Build the (context, target) examples separately for each data split
train_pairs, val_pairs, test_pairs = [
    create_context_target_pairs(split_corpus, context_size)
    for split_corpus in (corpus_train, corpus_val, corpus_test)
]

In [35]:
# Dataset and DataLoader definition
class Word2VecDataset(Dataset):
    """Dataset that serves (context, target) word pairs as index tensors.

    Args:
        pairs: Sequence of ``(context_words, target_word)`` tuples.
        word_to_index: Mapping from each word to its integer index. Every
            word appearing in ``pairs`` must be present, otherwise item
            access raises ``KeyError``.
    """

    def __init__(self, pairs, word_to_index):
        self.pairs = pairs
        self.word_to_index = word_to_index

    def __len__(self):
        """Return the number of (context, target) pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(context_indices, target_index)`` as ``torch.long`` tensors."""
        context_words, target_word = self.pairs[idx]
        lookup = self.word_to_index
        context_ids = [lookup[token] for token in context_words]
        return (
            torch.tensor(context_ids, dtype=torch.long),
            torch.tensor(lookup[target_word], dtype=torch.long),
        )

# One dataset per split, all sharing the same word-index mapping
train_dataset = Word2VecDataset(train_pairs, word_to_index)
val_dataset = Word2VecDataset(val_pairs, word_to_index)
test_dataset = Word2VecDataset(test_pairs, word_to_index)

# Only the training loader shuffles; validation and test keep a fixed order
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [36]:
# CBOW model definition
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are averaged, passed through dropout,
    and projected by a single linear layer onto the vocabulary.

    Args:
        vocab_size: Number of distinct words (embedding rows and output units).
        embedding_dim: Dimensionality of each word embedding.
        dropout_prob: Dropout probability applied to the averaged context
            embedding. Defaults to 0.5, the value that was previously
            hard-coded.
    """

    def __init__(self, vocab_size, embedding_dim, dropout_prob=0.5):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, vocab_size)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, context):
        """Return log-probabilities over the vocabulary for each context.

        Args:
            context: Integer tensor of word indices; the mean is taken over
                dimension 1, so it is used here as (batch, context_words).

        Returns:
            Tensor of log-probabilities normalised along dimension 1.
        """
        embeds = self.embeddings(context)
        # Average the context word vectors into one vector per example
        combined = torch.mean(embeds, dim=1)
        out = self.linear1(self.dropout(combined))
        log_probs = torch.log_softmax(out, dim=1)
        return log_probs

# Initialize model, loss function, and optimizer with L2 regularization
vocab_size = len(word_to_index)
model = CBOW(vocab_size, embedding_dim)
loss_function = nn.NLLLoss()  # the model's forward pass returns log-probabilities
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Early stopping state
best_val_loss = float('inf')
epochs_no_improve = 0
# Snapshot of the weights from the epoch with the lowest validation loss.
best_state_dict = None

# Lists to store the per-epoch mean loss values
train_losses = []
val_losses = []

# Training loop with early stopping
for epoch in range(epochs):
    # Training phase (dropout active)
    total_loss = 0
    model.train()
    for context_idxs, target_idx in train_dataloader:
        optimizer.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, target_idx)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    train_losses.append(total_loss / len(train_dataloader))

    # Validation phase (dropout disabled, no gradient tracking)
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for context_idxs, target_idx in val_dataloader:
            log_probs = model(context_idxs)
            loss = loss_function(log_probs, target_idx)
            val_loss += loss.item()

    val_loss /= len(val_dataloader)
    val_losses.append(val_loss)

    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {train_losses[-1]:.4f}, Validation Loss: {val_loss:.4f}")

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # Clone the tensors: state_dict() returns references to the live
        # parameters, which later optimizer steps would otherwise overwrite.
        best_state_dict = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            break

# Roll back to the best checkpoint; without this the model would keep the
# weights of the last epoch, which may be past the validation optimum.
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)

# Display learned word vectors.
# detach() is the supported way to read parameter values without autograd
# tracking (instead of the legacy `.data`), and cpu() keeps the NumPy
# conversion working if the model has been moved to another device.
word_embeddings = model.embeddings.weight.detach().cpu().numpy()
for word, idx in word_to_index.items():
    print(f"{word}: {word_embeddings[idx]}")

# Plot the per-epoch training and validation loss curves on one figure
plt.figure(figsize=(10, 6))
for curve, curve_label in ((train_losses, 'Training Loss'), (val_losses, 'Validation Loss')):
    # Epochs are numbered from 1 on the x-axis
    epoch_axis = range(1, len(curve) + 1)
    plt.plot(epoch_axis, curve, label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()

Epoch 1/50, Training Loss: 7.8542, Validation Loss: 7.2085
Epoch 2/50, Training Loss: 7.0404, Validation Loss: 6.9796
Epoch 3/50, Training Loss: 6.8645, Validation Loss: 6.8621
Epoch 4/50, Training Loss: 6.7610, Validation Loss: 6.7855
Epoch 5/50, Training Loss: 6.6923, Validation Loss: 6.7309
Epoch 6/50, Training Loss: 6.6425, Validation Loss: 6.6910
Epoch 7/50, Training Loss: 6.6049, Validation Loss: 6.6620
Epoch 8/50, Training Loss: 6.5775, Validation Loss: 6.6396
Epoch 9/50, Training Loss: 6.5569, Validation Loss: 6.6209
Epoch 10/50, Training Loss: 6.5375, Validation Loss: 6.6073
Epoch 11/50, Training Loss: 6.5249, Validation Loss: 6.5963
Epoch 12/50, Training Loss: 6.5132, Validation Loss: 6.5864


In [None]:
# Function to evaluate the model
def evaluate_model(model, dataloader, loss_function):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for context_idxs, target_idx in dataloader