## 词袋模型

In [1]:
texts = [
    "I love natural language processing.",
    "I love machine learning.",
    "I love coding in Python and Java.",
    "I love Java.",
    "I love Java, I don't love C++.",
    "I don't love Java."
]

In [5]:
words = [word for text in texts for word in text.split()]
print(words)

['I', 'love', 'natural', 'language', 'processing.', 'I', 'love', 'machine', 'learning.', 'I', 'love', 'coding', 'in', 'Python', 'and', 'Java.', 'I', 'love', 'Java.', 'I', 'love', 'Java,', 'I', "don't", 'love', 'C++.', 'I', "don't", 'love', 'Java.']


In [6]:
vocabulary = {}
for word in words:
    if word not in vocabulary:
        vocabulary[word] = len(vocabulary)

vocabulary

{'I': 0,
 'love': 1,
 'natural': 2,
 'language': 3,
 'processing.': 4,
 'machine': 5,
 'learning.': 6,
 'coding': 7,
 'in': 8,
 'Python': 9,
 'and': 10,
 'Java.': 11,
 'Java,': 12,
 "don't": 13,
 'C++.': 14}

In [None]:
bows = []
for text in texts:
    bow = [0] * len(vocabulary)
    for word in text.split():
        bow[vocabulary[word]] += 1
    bows.append(bow)

bows

### 相似度

In [None]:
def cosine_similarity(vector1, vector2):
    dot_product = sum(v1 * v2 for v1, v2 in zip(vector1, vector2))
    magnitude1 = sum(v ** 2 for v in vector1) ** 0.5
    magnitude2 = sum(v ** 2 for v in vector2) ** 0.5
    return dot_product / (magnitude1 * magnitude2)


print(texts[0])
print(texts[1])
cosine_similarity(bows[0], bows[1])

## skip-gram

In [None]:
sentences = [text.split() for text in texts]
sentences

In [None]:
vocabulary = {}
for sentence in sentences:
    for word in sentence:
        if word not in vocabulary:
            vocabulary[word] = len(vocabulary)
vocabulary

### One-hot 编码

In [None]:
import torch


def one_hot_encoding(index, vocab_size):
    one_hot = torch.zeros(vocab_size)
    one_hot[index] = 1
    return one_hot


one_hot_encoding(vocabulary['I'], len(vocabulary))

In [None]:
import torch


def generate_training_data(sentences, window_size):
    center_words = []
    target_words = []
    for sentence in sentences:
        indices = [vocabulary[word] for word in sentence]

        for center_index in range(len(indices)):
            start = max(0, center_index - window_size)
            end = min(len(indices), center_index + window_size + 1)

            for context_index in range(start, end):
                if context_index != center_index:
                    center_words.append(indices[center_index])
                    target_words.append(indices[context_index])

    return torch.tensor(center_words), torch.tensor(target_words)


widow_size = 2
center_words, target_words = generate_training_data(sentences, widow_size)
center_words, target_words

In [None]:
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(center_words, target_words)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

### 构建模型

In [None]:
import torch.nn as nn
import torch.optim as optim


class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.input_embeddings = nn.Linear(vocab_size, embedding_dim, bias=False)
        self.output_embeddings = nn.Linear(embedding_dim, vocab_size, bias=False)

    def forward(self, center_word):
        center_word_onehot = one_hot_encoding(center_word, vocab_size).view(1, -1)
        hidden = self.input_embeddings(center_word_onehot)
        return self.output_embeddings(hidden)


embedding_dim = 10
vocab_size = len(vocabulary)
model = SkipGram(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    for center_words, target_words in dataloader:
        output = model(center_words)
        loss = criterion(output, target_words)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch: {epoch + 1}/{epochs}, Loss: {loss.item()}")

In [None]:
word_vectors = model.input_embeddings.weight.T
print(word_vectors.shape)

In [None]:
for word, index in vocabulary.items():
    print(f"{word}: {word_vectors[index]}")

In [None]:
import numpy as np


def encode_sentence(sentence):
    vector = [word_vectors[:, vocabulary[word]].detach().numpy() for word in sentence.split()]
    print(vector)
    return np.mean(vector, axis=0)


encode_sentence("I love machine learning.")


### 嵌入层优化

In [None]:
import torch.nn as nn
import torch.optim as optim


class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.input_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_embeddings = nn.Linear(embedding_dim, vocab_size)

    def forward(self, center_word):
        hidden = self.input_embeddings(center_word)
        return self.output_embeddings(hidden)


embedding_dim = 10
vocab_size = len(vocabulary)
model = SkipGram(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    for center_words, target_words in dataloader:
        output = model(center_words)
        loss = criterion(output, target_words)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch: {epoch + 1}/{epochs}, Loss: {loss.item()}")

In [None]:
word_vectors = model.input_embeddings.weight.T
for word, index in vocabulary.items():
    print(f"{word}: {word_vectors[index]}")

## Cbow

In [2]:
sentences = [text.split() for text in texts]
sentences


[['I', 'love', 'natural', 'language', 'processing.'],
 ['I', 'love', 'machine', 'learning.'],
 ['I', 'love', 'coding', 'in', 'Python', 'and', 'Java.'],
 ['I', 'love', 'Java.'],
 ['I', 'love', 'Java,', 'I', "don't", 'love', 'C++.'],
 ['I', "don't", 'love', 'Java.']]

In [7]:
vocabulary = {}
for sentence in sentences:
    for word in sentence:
        if word not in vocabulary:
            vocabulary[word] = len(vocabulary)
vocabulary

{'I': 0,
 'love': 1,
 'natural': 2,
 'language': 3,
 'processing.': 4,
 'machine': 5,
 'learning.': 6,
 'coding': 7,
 'in': 8,
 'Python': 9,
 'and': 10,
 'Java.': 11,
 'Java,': 12,
 "don't": 13,
 'C++.': 14}

In [8]:
def generate_training_data(sentences, window_size):
    context_words = []
    target_words = []
    for sentence in sentences:
        indices = [vocabulary[word] for word in sentence]

        for center_index in range(len(indices)):
            start = max(0, center_index - window_size)
            end = min(len(indices), center_index + window_size + 1)
            context=[]

            for context_index in range(start, end):
                if context_index != center_index:
                    context.append(indices[context_index])

                if len( context)>=1:
                    context_words.append(context)
                    target_words.append(indices[center_index])

    return context_words, target_words

window_size = 2
context_words, target_words = generate_training_data(sentences, window_size)
context_words, target_words

([[1, 2],
  [1, 2],
  [0, 2, 3],
  [0, 2, 3],
  [0, 2, 3],
  [0, 2, 3],
  [0, 1, 3, 4],
  [0, 1, 3, 4],
  [0, 1, 3, 4],
  [0, 1, 3, 4],
  [0, 1, 3, 4],
  [1, 2, 4],
  [1, 2, 4],
  [1, 2, 4],
  [1, 2, 4],
  [2, 3],
  [2, 3],
  [2, 3],
  [1, 5],
  [1, 5],
  [0, 5, 6],
  [0, 5, 6],
  [0, 5, 6],
  [0, 5, 6],
  [0, 1, 6],
  [0, 1, 6],
  [0, 1, 6],
  [0, 1, 6],
  [1, 5],
  [1, 5],
  [1, 5],
  [1, 7],
  [1, 7],
  [0, 7, 8],
  [0, 7, 8],
  [0, 7, 8],
  [0, 7, 8],
  [0, 1, 8, 9],
  [0, 1, 8, 9],
  [0, 1, 8, 9],
  [0, 1, 8, 9],
  [0, 1, 8, 9],
  [1, 7, 9, 10],
  [1, 7, 9, 10],
  [1, 7, 9, 10],
  [1, 7, 9, 10],
  [1, 7, 9, 10],
  [7, 8, 10, 11],
  [7, 8, 10, 11],
  [7, 8, 10, 11],
  [7, 8, 10, 11],
  [7, 8, 10, 11],
  [8, 9, 11],
  [8, 9, 11],
  [8, 9, 11],
  [8, 9, 11],
  [9, 10],
  [9, 10],
  [9, 10],
  [1, 11],
  [1, 11],
  [0, 11],
  [0, 11],
  [0, 11],
  [0, 1],
  [0, 1],
  [0, 1],
  [1, 12],
  [1, 12],
  [0, 12, 0],
  [0, 12, 0],
  [0, 12, 0],
  [0, 12, 0],
  [0, 1, 0, 13],
  [0, 1, 0, 13],

In [9]:
max_context_size = max(len(context) for context in context_words)
padded_contexts = []
for context in context_words:
    padded=context+[0]*(max_context_size-len(context))
    padded_contexts.append(padded)

padded_contexts

[[1, 2, 0, 0],
 [1, 2, 0, 0],
 [0, 2, 3, 0],
 [0, 2, 3, 0],
 [0, 2, 3, 0],
 [0, 2, 3, 0],
 [0, 1, 3, 4],
 [0, 1, 3, 4],
 [0, 1, 3, 4],
 [0, 1, 3, 4],
 [0, 1, 3, 4],
 [1, 2, 4, 0],
 [1, 2, 4, 0],
 [1, 2, 4, 0],
 [1, 2, 4, 0],
 [2, 3, 0, 0],
 [2, 3, 0, 0],
 [2, 3, 0, 0],
 [1, 5, 0, 0],
 [1, 5, 0, 0],
 [0, 5, 6, 0],
 [0, 5, 6, 0],
 [0, 5, 6, 0],
 [0, 5, 6, 0],
 [0, 1, 6, 0],
 [0, 1, 6, 0],
 [0, 1, 6, 0],
 [0, 1, 6, 0],
 [1, 5, 0, 0],
 [1, 5, 0, 0],
 [1, 5, 0, 0],
 [1, 7, 0, 0],
 [1, 7, 0, 0],
 [0, 7, 8, 0],
 [0, 7, 8, 0],
 [0, 7, 8, 0],
 [0, 7, 8, 0],
 [0, 1, 8, 9],
 [0, 1, 8, 9],
 [0, 1, 8, 9],
 [0, 1, 8, 9],
 [0, 1, 8, 9],
 [1, 7, 9, 10],
 [1, 7, 9, 10],
 [1, 7, 9, 10],
 [1, 7, 9, 10],
 [1, 7, 9, 10],
 [7, 8, 10, 11],
 [7, 8, 10, 11],
 [7, 8, 10, 11],
 [7, 8, 10, 11],
 [7, 8, 10, 11],
 [8, 9, 11, 0],
 [8, 9, 11, 0],
 [8, 9, 11, 0],
 [8, 9, 11, 0],
 [9, 10, 0, 0],
 [9, 10, 0, 0],
 [9, 10, 0, 0],
 [1, 11, 0, 0],
 [1, 11, 0, 0],
 [0, 11, 0, 0],
 [0, 11, 0, 0],
 [0, 11, 0, 0],
 [0, 1, 0, 0]

In [10]:
import torch
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(torch.tensor(padded_contexts), torch.tensor(target_words))
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [11]:
import torch.nn as nn
import torch.optim as optim
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.input_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_embeddings = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_words):
        hidden = self.input_embeddings(context_words)
        avg=torch.mean(hidden, dim=1)
        return self.output_embeddings(avg)

embedding_dim=100
vocab_size=len(vocabulary)

model=CBOW(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 10
for epoch in range(epochs):
    for context_words, target_words in dataloader:
        output = model(context_words)
        loss = criterion(output, target_words)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch: {epoch + 1}/{epochs}, Loss: {loss.item()}")

Epoch: 1/10, Loss: 2.883824348449707
Epoch: 2/10, Loss: 3.0268871784210205
Epoch: 3/10, Loss: 2.508455753326416
Epoch: 4/10, Loss: 0.5566090941429138
Epoch: 5/10, Loss: 0.48134422302246094
Epoch: 6/10, Loss: 0.44803065061569214
Epoch: 7/10, Loss: 1.4029945135116577
Epoch: 8/10, Loss: 0.2548027038574219
Epoch: 9/10, Loss: 1.8012924194335938
Epoch: 10/10, Loss: 2.1545863151550293


In [12]:
word_vectors = model.input_embeddings.weight.T
for word, index in vocabulary.items():
    print(f"{word}: {word_vectors[index]}")

I: tensor([ 1.3686,  0.3975,  0.9415,  0.6726, -1.7063,  0.4028,  0.0643,  1.2155,
        -0.1305, -1.9735, -1.1331, -1.2892, -0.0151,  1.2197, -1.6659],
       grad_fn=<SelectBackward0>)
love: tensor([-1.5445, -0.2746, -0.2169,  0.5176, -0.9260, -0.0592,  0.5421, -0.1473,
        -0.5126,  0.1111,  1.2941,  1.8491,  1.7538,  0.1905,  0.6758],
       grad_fn=<SelectBackward0>)
natural: tensor([ 0.4634, -0.8749, -1.1605, -0.3061,  0.2505, -1.0818,  0.6324, -0.6324,
        -0.7074, -0.2094, -0.1390,  1.8150, -1.0685, -0.8418,  2.3039],
       grad_fn=<SelectBackward0>)
language: tensor([-0.0440,  0.1713,  0.1589,  0.6060,  1.6509,  0.6998,  1.4455, -0.0582,
        -0.8986,  1.4521, -0.7155, -2.3919, -1.2118,  1.1085, -1.1462],
       grad_fn=<SelectBackward0>)
processing.: tensor([-0.6503, -0.1953,  0.9942, -1.7584, -0.7329, -0.6782,  0.4042, -0.0207,
         0.5383, -1.6510,  0.2370,  2.1545,  0.7503,  0.8720,  1.3051],
       grad_fn=<SelectBackward0>)
machine: tensor([-0.1227, -1.