In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.decomposition import PCA
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore[import]
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Toy training corpus: three short sentences, one string per sentence.
corpus = [
    'The cat sat on the mat',
    'The dog ran in the park',
    'The bird sang in the tree',
]

In [4]:
# Convert the corpus to a sequence of integers
from sklearn.preprocessing import LabelEncoder
from collections import Counter

In [5]:
# Build the word -> integer-id vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# Encode every sentence as a list of word ids (one inner list per sentence).
# In the recorded output the ids start at 1 and 'The'/'the' share id 1.
sequences = tokenizer.texts_to_sequences(corpus)

In [6]:
# Inspect the encoded corpus: one list of integer word ids per sentence.
sequences

[[1, 3, 4, 5, 1, 6], [1, 7, 8, 2, 1, 9], [1, 10, 11, 2, 1, 12]]

In [None]:
# Print the encoded sentences with a descriptive label.
print("Sequences of words in the corpus:", sequences)

Sequences of words in the corpus: [[1, 3, 4, 5, 1, 6], [1, 7, 8, 2, 1, 9], [1, 10, 11, 2, 1, 12]]


In [8]:
# Parameters
# Word ids in the recorded `sequences` output start at 1, so the +1 makes room
# for index 0 when vocab_size is used to size the embedding and output layers.
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size
embedding_size = 10  # Size of the word embeddings
window_size = 2  # Context window size: words taken on EACH side of the target

In [9]:
# Build CBOW training pairs: (surrounding word ids, centre word id)
contexts, targets = [], []

for sequence in sequences:
    # Only positions with a full window on both sides are used as centres,
    # so every context has exactly 2 * window_size entries.
    last_center = len(sequence) - window_size
    for i in range(window_size, last_center):
        left, right = sequence[i - window_size:i], sequence[i + 1:i + 1 + window_size]
        contexts.append(left + right)
        targets.append(sequence[i])

In [10]:
len(sequences), len(contexts), len(targets)

(3, 6, 6)

In [11]:
contexts

[[1, 3, 5, 1],
 [3, 4, 1, 6],
 [1, 7, 2, 1],
 [7, 8, 1, 9],
 [1, 10, 2, 1],
 [10, 11, 1, 12]]

In [12]:
targets

[4, 5, 8, 2, 11, 2]

In [13]:
# Pack the pairs into NumPy arrays: X has one row of context ids per example,
# y has the matching centre-word id for each row.
X, y = np.array(contexts), np.array(targets)

In [14]:
# Create Dataset and DataLoader
class CBOWDataset(Dataset):
    """Map-style dataset of (context word ids, target word id) pairs for CBOW.

    Args:
        contexts: Array-like of integer word ids, one row per training
            example (anything ``torch.tensor`` accepts).
        targets: Array-like of integer word ids, one entry per example.

    Raises:
        ValueError: If ``contexts`` and ``targets`` do not contain the same
            number of examples.
    """

    def __init__(self, contexts, targets):
        self.contexts = torch.tensor(contexts, dtype=torch.long)
        self.targets = torch.tensor(targets, dtype=torch.long)
        # Fail fast on misaligned inputs: __len__ is driven by targets only, so
        # a mismatch would otherwise surface later as an IndexError (too few
        # contexts) or as silently ignored rows (too many contexts).
        if self.contexts.shape[:1] != self.targets.shape[:1]:
            raise ValueError(
                "contexts and targets must contain the same number of examples, "
                f"got {tuple(self.contexts.shape[:1])} and {tuple(self.targets.shape[:1])}"
            )

    def __len__(self):
        """Return the number of training examples."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the ``(context ids, target id)`` tensors stored at ``idx``."""
        return self.contexts[idx], self.targets[idx]


In [15]:
# Wrap the NumPy pairs in the dataset and iterate over them in shuffled
# mini-batches of two examples.
dataset = CBOWDataset(X, y)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


In [None]:
# Define CBOW model in PyTorch
class CBOWModel(nn.Module):
    """Continuous bag-of-words network: embed, average, project to vocabulary."""

    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        # Lookup table with one embedding_size-dimensional vector per word id.
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        # Maps the pooled context vector to one score per vocabulary entry.
        self.linear = nn.Linear(embedding_size, vocab_size)

    def forward(self, inputs):
        """Return unnormalised vocabulary scores for each row of context ids.

        The embeddings are averaged along dim 1, which is the context-position
        axis when ``inputs`` is laid out as (batch, context_len) as in this
        notebook's DataLoader batches.
        """
        context_vectors = self.embeddings(inputs)
        pooled = context_vectors.mean(dim=1)
        return self.linear(pooled)

In [26]:
# Initialize the model, loss function, and optimizer
# Seed torch's global RNG first: weight initialisation and the DataLoader's
# shuffling both draw from it, so without a seed every run gives different losses.
torch.manual_seed(42)
model = CBOWModel(vocab_size=vocab_size, embedding_size=embedding_size)
criterion = nn.CrossEntropyLoss()  # expects raw scores (logits) and class ids
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
epochs = 100
model.train()  # make training mode explicit
for epoch in range(epochs):
    total_loss = 0.0  # sum of per-batch mean losses for this epoch
    for context, target in dataloader:
        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the average batch loss every 10th epoch.
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss/len(dataloader)}')

Epoch 0, Loss: 2.5277430216471353
Epoch 10, Loss: 1.3286320368448894
Epoch 20, Loss: 0.6813733577728271
Epoch 30, Loss: 0.35370368758837384
Epoch 40, Loss: 0.2033375451962153
Epoch 50, Loss: 0.12779705474774042
Epoch 60, Loss: 0.08518798028429349
Epoch 70, Loss: 0.05930993209282557
Epoch 80, Loss: 0.04336586408317089
Epoch 90, Loss: 0.03280031184355418


In [29]:
# Integer tensor of shape (2, 4, 10) with values drawn from [0, 10).
# NOTE(review): presumably mirrors a batch of embedded contexts
# (batch_size=2, 4 context words, embedding_size=10) to illustrate mean(dim=1).
random_tensor = torch.randint(0, 10, (2,4,10))
random_tensor

tensor([[[4, 6, 3, 0, 1, 0, 2, 5, 4, 1],
         [5, 3, 7, 7, 1, 6, 4, 8, 8, 6],
         [9, 2, 3, 1, 7, 1, 5, 4, 3, 8],
         [9, 3, 4, 8, 2, 9, 9, 4, 9, 7]],

        [[5, 7, 9, 1, 9, 7, 1, 2, 5, 3],
         [3, 1, 8, 5, 6, 3, 3, 3, 3, 6],
         [8, 3, 6, 5, 3, 9, 0, 2, 3, 7],
         [8, 5, 7, 8, 1, 3, 2, 6, 6, 5]]])

In [32]:
# Convert random_tensor (integers from randint) to floating point, then average
# over dim 1: the 4 rows of each example collapse into one 10-element row,
# giving the (2, 10) result shown in the recorded output.
random_tensor_float = random_tensor.float()
random_tensor_float.mean(dim=1)


tensor([[6.7500, 3.5000, 4.2500, 4.0000, 2.7500, 4.0000, 5.0000, 5.2500, 6.0000,
         5.5000],
        [6.0000, 4.0000, 7.5000, 4.7500, 4.7500, 5.5000, 1.5000, 3.2500, 4.2500,
         5.2500]])

In [35]:
# skipgram 
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore[import]


# Toy training corpus: three short sentences, one string per sentence.
corpus = [
    'The cat sat on the mat',
    'The dog ran in the park',
    'The bird sang in the tree',
]

# Convert the corpus to a sequence of integers
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# Build the word -> integer-id vocabulary and encode each sentence as a list of ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

print("Sequences of words in the corpus:", sequences)

# Parameters
# Word ids in the printed sequences start at 1, so the +1 makes room for
# index 0 when vocab_size is used to size the embedding and output layers.
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size
embedding_size = 10  # Size of the word embeddings
window_size = 2  # Context window size: words taken on EACH side of the centre word

# Generate target-context pairs for Skip-gram
def generate_skipgram_data(sequences, window_size):
    """Build (centre word, context word) training pairs for Skip-gram.

    For every position that has ``window_size`` words on both sides, one pair
    is emitted per surrounding word: the centre word is the model input and
    the surrounding word is the label to predict.

    Args:
        sequences: Iterable of sentences, each a list of integer word ids.
        window_size: Number of words taken on each side of the centre word.

    Returns:
        Tuple ``(inputs, labels)`` of 1-D ``int64`` NumPy arrays of equal
        length. ``inputs[k]`` is a centre word id and ``labels[k]`` is one of
        its context word ids. Both arrays are empty (but still integer typed)
        when no position has a full window.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    center_words = []    # model inputs: the centre word, repeated once per neighbour
    context_labels = []  # model labels: the neighbouring words
    for sequence in sequences:
        for i in range(window_size, len(sequence) - window_size):
            neighbours = sequence[i - window_size:i] + sequence[i + 1:i + window_size + 1]
            center_words.extend([sequence[i]] * len(neighbours))
            context_labels.extend(neighbours)

    # Explicit dtype keeps empty results integer typed (np.array([]) is float64).
    return (np.array(center_words, dtype=np.int64),
            np.array(context_labels, dtype=np.int64))

# X holds the centre-word ids fed to the model; y holds the context-word ids
# it has to predict (one pair per centre/neighbour combination).
X, y = generate_skipgram_data(sequences, window_size)

# Create Dataset and DataLoader
class SkipGramDataset(Dataset):
    """Map-style dataset of (input word id, label word id) pairs for Skip-gram.

    Args:
        contexts: Array-like of integer word ids used as model inputs. In this
            notebook it receives the first array returned by
            ``generate_skipgram_data``, i.e. the centre words.
        targets: Array-like of integer word ids used as labels, one per input.

    Raises:
        ValueError: If ``contexts`` and ``targets`` do not contain the same
            number of examples.
    """

    def __init__(self, contexts, targets):
        self.contexts = torch.tensor(contexts, dtype=torch.long)
        self.targets = torch.tensor(targets, dtype=torch.long)
        # Fail fast on misaligned inputs: __len__ is driven by targets only, so
        # a mismatch would otherwise surface later as an IndexError (too few
        # contexts) or as silently ignored entries (too many contexts).
        if self.contexts.shape[:1] != self.targets.shape[:1]:
            raise ValueError(
                "contexts and targets must contain the same number of examples, "
                f"got {tuple(self.contexts.shape[:1])} and {tuple(self.targets.shape[:1])}"
            )

    def __len__(self):
        """Return the number of training pairs."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the ``(input id, label id)`` tensors stored at ``idx``."""
        return self.contexts[idx], self.targets[idx]

# Wrap the pair arrays in the dataset and iterate over them in shuffled
# mini-batches of two pairs.
dataset = SkipGramDataset(X, y)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Define Skip-gram model in PyTorch
class SkipGramModel(nn.Module):
    """Skip-gram network: embed the input word, project to vocabulary scores."""

    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        # Lookup table with one embedding_size-dimensional vector per word id.
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        # Maps a word vector to one score per vocabulary entry.
        self.linear = nn.Linear(embedding_size, vocab_size)

    def forward(self, inputs):
        """Return unnormalised vocabulary scores for every word id in ``inputs``."""
        word_vectors = self.embeddings(inputs)
        return self.linear(word_vectors)

# Initialize the model, loss function, and optimizer
# Seed torch's global RNG first: weight initialisation and the DataLoader's
# shuffling both draw from it, so without a seed every run gives different losses.
torch.manual_seed(42)
model = SkipGramModel(vocab_size=vocab_size, embedding_size=embedding_size)
criterion = nn.CrossEntropyLoss()  # expects raw scores (logits) and class ids
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
# Each centre word is paired with several different context labels, so the
# loss is expected to plateau above zero rather than converge to it.
epochs = 100
model.train()  # make training mode explicit
for epoch in range(epochs):
    total_loss = 0.0  # sum of per-batch mean losses for this epoch
    for context, target in dataloader:
        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the average batch loss every 10th epoch.
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss/len(dataloader)}')

Sequences of words in the corpus: [[1, 3, 4, 5, 1, 6], [1, 7, 8, 2, 1, 9], [1, 10, 11, 2, 1, 12]]
Epoch 0, Loss: 2.6340874632199607
Epoch 10, Loss: 1.6211028695106506
Epoch 20, Loss: 1.4906717638174694
Epoch 30, Loss: 1.4755606005589168
Epoch 40, Loss: 1.4599988708893459
Epoch 50, Loss: 1.4655363708734512
Epoch 60, Loss: 1.4557153582572937
Epoch 70, Loss: 1.4393378893534343
Epoch 80, Loss: 1.461005449295044
Epoch 90, Loss: 1.4402139087518055
