<div style="text-align: center;" >
<h1 style="margin-top: 0.2em; margin-bottom: 0.1em;">Skipgram Model</h1>
<h4 style="margin-top: 0.7em; margin-bottom: 0.3em; font-style:italic">



</div>
<br>

In [1]:
import requests
from io import StringIO
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import defaultdict
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np

In [2]:
url = 'https://raw.githubusercontent.com/tobiaswtzl/dlss-project24/main/data/preprocessed/comments.csv'

headers = {
    'Authorization': 'token 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'
}

response = requests.get(url, headers=headers)
data = StringIO(response.text)

comments = pd.read_csv(data)

In [3]:
# Splitting the data into train, validation, and test sets
train_df, temp_df = train_test_split(comments, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

#Adding all comments for generating the vocabulary. If not a error occurs when tokens missing
total_comments_list = comments["cleaned"].dropna().astype(str).tolist()

train_list = train_df["cleaned"].dropna().astype(str).tolist()
val_list = val_df["cleaned"].dropna().astype(str).tolist()
test_list = test_df["cleaned"].dropna().astype(str).tolist()

In [4]:
# Ensure each entry is a string and split each sentence into words
total_corpus = [doc.split() for doc in total_comments_list]
corpus_train = [doc.split() for doc in train_list]
corpus_val = [doc.split() for doc in val_list]
corpus_test = [doc.split() for doc in test_list]

# Create a vocabulary: count occurrences of each word
vocab = defaultdict(int)
for sentence in total_corpus:
    for word in sentence:
        vocab[word] += 1

# Remove infrequent words from the vocabulary
min_count = 1
vocab = {word: count for word, count in vocab.items() if count >= min_count}

# Create word to index and index to word mappings
word_to_index = {word: idx for idx, (word, _) in enumerate(vocab.items())}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Create DataFrame from vocabulary
vocab_df = pd.DataFrame(list(vocab.items()), columns=['Word', 'Count'])
vocab_df

Unnamed: 0,Word,Count
0,first,109
1,no,394
2,one,409
3,here,166
4,is,3006
...,...,...
12308,qje,4
12309,huntington,1
12310,exhaustion,1
12311,jel,1


In [5]:
# Parameters
context_size = 2
embedding_dim = 50
batch_size = 64
epochs = 50
learning_rate = 0.001
weight_decay = 1e-4  # L2 regularization factor
patience = 5  # Patience for early stopping

In [6]:
# Function to create context-target pairs for Skip-gram
def create_context_target_pairs(text, context_size):
    pairs = []
    for sentence in text:
        for i in range(len(sentence)):
            target = sentence[i]
            context = sentence[max(0, i - context_size):i] + sentence[i + 1:i + context_size + 1]
            for ctx in context:
                pairs.append((target, ctx))
    return pairs

train_pairs = create_context_target_pairs(corpus_train, context_size)
val_pairs = create_context_target_pairs(corpus_val, context_size)
test_pairs = create_context_target_pairs(corpus_test, context_size)

In [7]:
# Dataset and DataLoader definition
class Word2VecDataset(Dataset):
    def __init__(self, pairs, word_to_index):
        self.pairs = pairs
        self.word_to_index = word_to_index

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        target, context = self.pairs[idx]
        target_idx = torch.tensor(self.word_to_index[target], dtype=torch.long)
        context_idx = torch.tensor(self.word_to_index[context], dtype=torch.long)
        return target_idx, context_idx

train_dataset = Word2VecDataset(train_pairs, word_to_index)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = Word2VecDataset(val_pairs, word_to_index)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

test_dataset = Word2VecDataset(test_pairs, word_to_index)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
# Skip-gram model definition
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, vocab_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, target):
        embeds = self.embeddings(target)
        out = self.linear1(self.dropout(embeds))
        log_probs = torch.log_softmax(out, dim=1)
        return log_probs

# Initialize model, loss function, and optimizer with L2 regularization
vocab_size = len(word_to_index)
model = SkipGram(vocab_size, embedding_dim)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Early stopping parameters
best_val_loss = float('inf')
epochs_no_improve = 0

# Lists to store loss values
train_losses = []
val_losses = []

# Training loop with early stopping
for epoch in range(epochs):
    total_loss = 0
    model.train()
    for target_idx, context_idx in train_dataloader:
        model.zero_grad()
        log_probs = model(target_idx)
        loss = loss_function(log_probs, context_idx)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    train_losses.append(total_loss / len(train_dataloader))
    
    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for target_idx, context_idx in val_dataloader:
            log_probs = model(target_idx)
            loss = loss_function(log_probs, context_idx)
            val_loss += loss.item()
    
    val_loss /= len(val_dataloader)
    val_losses.append(val_loss)
    
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {train_losses[-1]:.4f}, Validation Loss: {val_loss:.4f}")
    
    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            break

# Display learned word vectors
word_embeddings = model.embeddings.weight.data.numpy()
for word, idx in word_to_index.items():
    print(f"{word}: {word_embeddings[idx]}")

# Plotting the training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()

KeyError: 'pond'

In [None]:
# Function to evaluate the model
def evaluate_model(model, dataloader, loss_function):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for target_idx, context_idx in dataloader:
            log_probs = model(target_idx)
            loss = loss_function(log_probs, context_idx)
            total_loss += loss.item()

            # Get the index of the max log-probability
            _, predicted_idx = torch.max(log_probs, dim=1)
            correct_predictions += (predicted_idx == context_idx).sum().item()
            total_predictions += context_idx.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_predictions
    return avg_loss, accuracy

# Evaluate on validation and test datasets
val_loss, val_accuracy = evaluate_model(model, val_dataloader, loss_function)
test_loss, test_accuracy = evaluate_model(model, test_dataloader, loss_function)

print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")