In [1]:
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch

In [2]:
import nltk
from nltk.corpus import brown
from nltk.tokenize import word_tokenize

nltk.download('brown')
nltk.download('punkt')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ws-\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ws-\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the 'news' category from Brown corpus
news_texts = brown.sents(categories='news')

# Flatten the list of sentences into a single list of words
tokens = [word.lower() for sent in news_texts for word in sent]

In [4]:
# Create a word-to-index mapping
word_to_idx = {word: idx for idx, word in enumerate(set(tokens))}
vocab_size = len(word_to_idx)

In [5]:
def generate_skipgram_data(tokens, word_to_idx, window_size=2):
    data = []
    for i, target_word in enumerate(tokens):
        for j in range(i - window_size, i + window_size + 1):
            if j != i and 0 <= j < len(tokens):
                context_word = tokens[j]
                data.append((word_to_idx[target_word], word_to_idx[context_word]))
    return data

# Generate skip-gram pairs
skipgram_data = generate_skipgram_data(tokens, word_to_idx, window_size=2)

In [6]:
class SkipGramDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx][0]), torch.tensor(self.data[idx][1])

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        embed = self.embeddings(inputs)
        output = self.output(embed)
        return output

def train_skipgram(data, word_to_idx, window_size=2, embedding_dim=100, num_epochs=10, batch_size=32):
    dataset = SkipGramDataset(data)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = SkipGramModel(len(word_to_idx), embedding_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}')
    return model

# Train the model
trained_model = train_skipgram(skipgram_data, word_to_idx, window_size=2, embedding_dim=100, num_epochs=10, batch_size=32)

Epoch 1, Loss: 7.377390171671634
Epoch 2, Loss: 6.666394365184726
Epoch 3, Loss: 6.463412736879044
Epoch 4, Loss: 6.326357844602332
Epoch 5, Loss: 6.218093001700244
Epoch 6, Loss: 6.129263788570747
Epoch 7, Loss: 6.052483778219898
Epoch 8, Loss: 5.984150841165936
Epoch 9, Loss: 5.92403409262925
Epoch 10, Loss: 5.871559888442167


In [10]:
# Set the model to evaluation mode
trained_model.eval()

SkipGramModel(
  (embeddings): Embedding(13112, 100)
  (output): Linear(in_features=100, out_features=13112, bias=True)
)

In [11]:
import pickle

In [12]:
#dump the model to a file
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

NameError: name 'model' is not defined