In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

In [None]:
url = 'https://raw.githubusercontent.com/tobiaswtzl/dlss-project24/main/data/preprocessed/comments.csv'

headers = {
    'Authorization': 'token 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'
}

response = requests.get(url, headers=headers)
data = StringIO(response.text)

comments = pd.read_csv(data)
comments = comments[:500]

In [None]:
train_df, temp_df = train_test_split(comments, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_list = train_df["cleaned"].dropna().astype(str).tolist()
val_list = val_df["cleaned"].dropna().astype(str).tolist()
test_list = test_df["cleaned"].dropna().astype(str).tolist()

In [2]:
# Ensure each entry is a string and split each sentence into words
corpus_train = [doc.split() for doc in train_list]
corpus_val = [doc.split() for doc in val_list]
corpus_test = [doc.split() for doc in test_list]

# Create a vocabulary: count occurrences of each word
vocab = defaultdict(int)
for sentence in corpus_train:
    for word in sentence:
        vocab[word] += 1

# Remove infrequent words from the vocabulary
min_count = 1
vocab = {word: count for word, count in vocab.items() if count >= min_count}

# Create word to index and index to word mappings
word_to_index = {word: idx for idx, (word, _) in enumerate(vocab.items())}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Create DataFrame from vocabulary
vocab_df = pd.DataFrame(list(vocab.items()), columns=['Word', 'Count'])
vocab_df

In [3]:
# Parameter
context_size = 2
embedding_dim = 10
batch_size = 2
epochs = 100

# Wortindexierung
vocab = set(text)
vocab_size = len(vocab)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}

In [4]:
# Kontext-Ziel-Paare erstellen
def create_context_target_pairs(text, context_size):
    pairs = []
    for i in range(context_size, len(text) - context_size):
        context = text[i - context_size:i] + text[i + 1:i + context_size + 1]
        target = text[i]
        pairs.append((context, target))
    return pairs

pairs = create_context_target_pairs(text, context_size)

# Dataset und DataLoader definieren
class Word2VecDataset(Dataset):
    def __init__(self, pairs, word_to_ix):
        self.pairs = pairs
        self.word_to_ix = word_to_ix

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        context, target = self.pairs[idx]
        context_idxs = torch.tensor([self.word_to_ix[word] for word in context], dtype=torch.long)
        target_idx = torch.tensor(self.word_to_ix[target], dtype=torch.long)
        return context_idxs, target_idx

dataset = Word2VecDataset(pairs, word_to_ix)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [5]:
# CBOW-Modell definieren
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        embeds = self.embeddings(context)
        combined = torch.mean(embeds, dim=1)
        out = self.linear1(combined)
        log_probs = torch.log_softmax(out, dim=1)
        return log_probs

# Modell initialisieren, Verlustfunktion und Optimierer definieren
model = CBOW(vocab_size, embedding_dim)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Training
for epoch in range(epochs):
    total_loss = 0
    for context_idxs, target_idx in dataloader:
        model.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, target_idx)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

# Gelernten Wortvektoren anzeigen
word_embeddings = model.embeddings.weight.data.numpy()
for word, idx in word_to_ix.items():
    print(f"{word}: {word_embeddings[idx]}")

Epoch 10/100, Loss: 10.3332
Epoch 20/100, Loss: 10.2715
Epoch 30/100, Loss: 10.2103
Epoch 40/100, Loss: 10.1495
Epoch 50/100, Loss: 10.0891
Epoch 60/100, Loss: 10.0292
Epoch 70/100, Loss: 9.9699
Epoch 80/100, Loss: 9.9109
Epoch 90/100, Loss: 9.8523
Epoch 100/100, Loss: 9.7943
continuous: [ 0.7950865  -1.2442249   3.1364465   0.35984868  0.35414985 -1.3477583
 -0.02608252  1.1959985   1.6642789   2.183412  ]
of: [-0.7939346   1.8061262  -1.0235156  -1.2668781   2.6324344   0.78999037
 -1.1714833   0.77110845 -0.22305082 -0.7369075 ]
word2vec: [-0.32894796  1.2133502   1.0604112   1.2307067  -0.12916546 -0.08325298
  2.0379024   0.23629397 -1.4854918   0.89328647]
PyTorch: [ 0.06804388  1.3175411   0.04018736  0.31990677  0.66031647 -1.893849
 -0.8772095  -0.17910883 -1.2881038  -2.9745314 ]
learning: [-0.7830033  -0.8982925  -0.60106516 -0.11992139  0.28616536 -1.8692768
  0.4851108   1.0124747  -0.17118877  1.1019868 ]
in: [ 0.21814209 -1.1415067  -1.5153449   0.643912    0.60991466 -2