这部分实现了CBOW词嵌入

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1234)

<torch._C.Generator at 0x2186ad609d0>

In [3]:
word_to_ix = {'data': 0, 'science': 1}
word_to_ix

{'data': 0, 'science': 1}

In [4]:
embeds = nn.Embedding(2, 5)
embeds

Embedding(2, 5)

In [5]:
lookup_tensor = torch.tensor([word_to_ix['data']], dtype=torch.long)
lookup_tensor


tensor([0])

In [6]:
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[ 0.0461,  0.4024, -1.0115,  0.2167, -0.6123]],
       grad_fn=<EmbeddingBackward>)


In [7]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

In [8]:
test_sentence = """Alternate data extraction is when actual data is not available to the user, but the user can use the Internet to fetch data that is publicly available, and search for relevant information. For example, if I want to buy a laptop, I want to compare the price of the laptop on various online portals. I have one system scrape the price information from various websites and provide a summary of the prices to me. This process is called alternate data collection using web scraping, text processing and natural language processing.""".split()

In [9]:
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)
           ]
print(trigrams[:3])

[(['Alternate', 'data'], 'extraction'), (['data', 'extraction'], 'is'), (['extraction', 'is'], 'when')]


In [10]:
vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

In [11]:
vocab

{'Alternate',
 'For',
 'I',
 'Internet',
 'This',
 'a',
 'actual',
 'alternate',
 'and',
 'available',
 'available,',
 'but',
 'buy',
 'called',
 'can',
 'collection',
 'compare',
 'data',
 'example,',
 'extraction',
 'fetch',
 'for',
 'from',
 'have',
 'if',
 'information',
 'information.',
 'is',
 'language',
 'laptop',
 'laptop,',
 'me.',
 'natural',
 'not',
 'of',
 'on',
 'one',
 'online',
 'portals.',
 'price',
 'prices',
 'process',
 'processing',
 'processing.',
 'provide',
 'publicly',
 'relevant',
 'scrape',
 'scraping,',
 'search',
 'summary',
 'system',
 'text',
 'that',
 'the',
 'to',
 'use',
 'user',
 'user,',
 'using',
 'various',
 'want',
 'web',
 'websites',
 'when'}

In [12]:
word_to_ix

{'have': 0,
 'actual': 1,
 'portals.': 2,
 'information': 3,
 'called': 4,
 'various': 5,
 'fetch': 6,
 'available,': 7,
 'price': 8,
 'to': 9,
 'but': 10,
 'if': 11,
 'one': 12,
 'system': 13,
 'when': 14,
 'information.': 15,
 'the': 16,
 'search': 17,
 'data': 18,
 'that': 19,
 'for': 20,
 'buy': 21,
 'summary': 22,
 'collection': 23,
 'user,': 24,
 'provide': 25,
 'and': 26,
 'Alternate': 27,
 'I': 28,
 'prices': 29,
 'is': 30,
 'process': 31,
 'For': 32,
 'natural': 33,
 'laptop': 34,
 'not': 35,
 'using': 36,
 'user': 37,
 'web': 38,
 'publicly': 39,
 'a': 40,
 'compare': 41,
 'laptop,': 42,
 'websites': 43,
 'processing.': 44,
 'scraping,': 45,
 'example,': 46,
 'available': 47,
 'Internet': 48,
 'scrape': 49,
 'alternate': 50,
 'use': 51,
 'from': 52,
 'me.': 53,
 'can': 54,
 'relevant': 55,
 'online': 56,
 'want': 57,
 'This': 58,
 'text': 59,
 'on': 60,
 'extraction': 61,
 'of': 62,
 'language': 63,
 'processing': 64}

In [24]:
class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
        
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
    

In [25]:
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=1e-3)
losses
loss_function
model

NGramLanguageModeler(
  (embeddings): Embedding(65, 10)
  (linear1): Linear(in_features=20, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=65, bias=True)
)

In [21]:
optimizer

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.001
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [26]:
for epoch in range(10):
    total_loss = 0
    for context, target in trigrams:
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        
        model.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)

    print(losses)

[372.0579733848572]
[372.0579733848572, 369.7833273410797]
[372.0579733848572, 369.7833273410797, 367.53419947624207]
[372.0579733848572, 369.7833273410797, 367.53419947624207, 365.3097529411316]
[372.0579733848572, 369.7833273410797, 367.53419947624207, 365.3097529411316, 363.1089813709259]
[372.0579733848572, 369.7833273410797, 367.53419947624207, 365.3097529411316, 363.1089813709259, 360.93003273010254]
[372.0579733848572, 369.7833273410797, 367.53419947624207, 365.3097529411316, 363.1089813709259, 360.93003273010254, 358.77337193489075]
[372.0579733848572, 369.7833273410797, 367.53419947624207, 365.3097529411316, 363.1089813709259, 360.93003273010254, 358.77337193489075, 356.63872599601746]
[372.0579733848572, 369.7833273410797, 367.53419947624207, 365.3097529411316, 363.1089813709259, 360.93003273010254, 358.77337193489075, 356.63872599601746, 354.52529549598694]
[372.0579733848572, 369.7833273410797, 367.53419947624207, 365.3097529411316, 363.1089813709259, 360.93003273010254, 35

In [28]:
print(context_idxs)

tensor([33, 63])


In [29]:
model

NGramLanguageModeler(
  (embeddings): Embedding(65, 10)
  (linear1): Linear(in_features=20, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=65, bias=True)
)

In [30]:
model(context_idxs)

tensor([[-4.1876, -4.0142, -4.0249, -4.3971, -4.0504, -4.0775, -3.7069, -4.4298,
         -4.1983, -3.7377, -4.7659, -4.3369, -4.6147, -4.4581, -3.7786, -4.2146,
         -3.7002, -4.5736, -3.5534, -4.6636, -4.0253, -4.3277, -4.3748, -4.4652,
         -4.3315, -4.7749, -4.0555, -3.8370, -4.0306, -3.8088, -4.2948, -4.3594,
         -4.0110, -4.6627, -3.9585, -4.0355, -4.0777, -4.3592, -4.4203, -4.4619,
         -3.7170, -4.4907, -4.4841, -4.5382, -3.5864, -4.1611, -4.5496, -4.1264,
         -4.4953, -4.6569, -4.6671, -3.6328, -3.9784, -4.4483, -4.4728, -4.5906,
         -4.1571, -4.1133, -4.2543, -4.0717, -4.2258, -4.5225, -4.3673, -4.4489,
         -3.7132]], grad_fn=<LogSoftmaxBackward>)

In [31]:
print(log_probs)

tensor([[-4.1866, -4.0129, -4.0243, -4.3958, -4.0501, -4.0764, -3.7059, -4.4281,
         -4.1977, -3.7362, -4.7646, -4.3360, -4.6129, -4.4574, -3.7782, -4.2132,
         -3.6987, -4.5742, -3.5513, -4.6626, -4.0238, -4.3262, -4.3736, -4.4656,
         -4.3309, -4.7744, -4.0549, -3.8355, -4.0293, -3.8078, -4.2936, -4.3574,
         -4.0102, -4.6616, -3.9576, -4.0342, -4.0765, -4.3585, -4.4183, -4.4595,
         -3.7161, -4.4891, -4.4834, -4.5377, -3.6260, -4.1592, -4.5492, -4.1247,
         -4.4944, -4.6571, -4.6659, -3.6311, -3.9775, -4.4471, -4.4724, -4.5890,
         -4.1556, -4.1118, -4.2545, -4.0700, -4.2247, -4.5222, -4.3666, -4.4482,
         -3.7120]], grad_fn=<LogSoftmaxBackward>)


In [33]:
print(torch.tensor([word_to_ix[target]]))

tensor([44])


In [34]:
nn.NLLLoss()

NLLLoss()

In [35]:
loss_function

NLLLoss()

In [36]:
dir(loss_function)

['__call__',
 '__class__',
 '__constants__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_name',
 '_load_from_state_dict',
 '_load_state_dict_pre_hooks',
 '_modules',
 '_named_members',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replicate_for_data_parallel',
 '_save_to_state_dict',
 '_slow_forward',
 '_state_dict_hooks',
 '_version',
 'add_module',
 'apply',
 'bfloat16',
 'buffers',
 'children',
 'cpu',
 'cuda',
 'double',
 'dump_patches',
 'eval',
 'extra_repr',
 'float',
 'forward',
 'half',
 'ignore_index',
 '

这里实现CBOW词嵌入

In [69]:
import numpy as np
vocab = set(test_sentence)
vocab

{'Alternate',
 'For',
 'I',
 'Internet',
 'This',
 'a',
 'actual',
 'alternate',
 'and',
 'available',
 'available,',
 'but',
 'buy',
 'called',
 'can',
 'collection',
 'compare',
 'data',
 'example,',
 'extraction',
 'fetch',
 'for',
 'from',
 'have',
 'if',
 'information',
 'information.',
 'is',
 'language',
 'laptop',
 'laptop,',
 'me.',
 'natural',
 'not',
 'of',
 'on',
 'one',
 'online',
 'portals.',
 'price',
 'prices',
 'process',
 'processing',
 'processing.',
 'provide',
 'publicly',
 'relevant',
 'scrape',
 'scraping,',
 'search',
 'summary',
 'system',
 'text',
 'that',
 'the',
 'to',
 'use',
 'user',
 'user,',
 'using',
 'various',
 'want',
 'web',
 'websites',
 'when'}

In [38]:
vocab_size = len(vocab)
vocab_size

65

In [39]:

word_to_ix = {word: i for i, word in enumerate(vocab)}
word_to_ix

{'have': 0,
 'actual': 1,
 'portals.': 2,
 'information': 3,
 'called': 4,
 'various': 5,
 'fetch': 6,
 'available,': 7,
 'price': 8,
 'to': 9,
 'but': 10,
 'if': 11,
 'one': 12,
 'system': 13,
 'when': 14,
 'information.': 15,
 'the': 16,
 'search': 17,
 'data': 18,
 'that': 19,
 'for': 20,
 'buy': 21,
 'summary': 22,
 'collection': 23,
 'user,': 24,
 'provide': 25,
 'and': 26,
 'Alternate': 27,
 'I': 28,
 'prices': 29,
 'is': 30,
 'process': 31,
 'For': 32,
 'natural': 33,
 'laptop': 34,
 'not': 35,
 'using': 36,
 'user': 37,
 'web': 38,
 'publicly': 39,
 'a': 40,
 'compare': 41,
 'laptop,': 42,
 'websites': 43,
 'processing.': 44,
 'scraping,': 45,
 'example,': 46,
 'available': 47,
 'Internet': 48,
 'scrape': 49,
 'alternate': 50,
 'use': 51,
 'from': 52,
 'me.': 53,
 'can': 54,
 'relevant': 55,
 'online': 56,
 'want': 57,
 'This': 58,
 'text': 59,
 'on': 60,
 'extraction': 61,
 'of': 62,
 'language': 63,
 'processing': 64}

In [42]:
data = []
# 构建所有的环境及目标词
for i in range(2, len(test_sentence) - 2):
    context = [test_sentence[i - 2], test_sentence[i - 1], 
               test_sentence[i + 1], test_sentence[i + 2]
              ]
    target = test_sentence[i]
    data.append((context, target))
print(data[:5])

[(['Alternate', 'data', 'is', 'when'], 'extraction'), (['data', 'extraction', 'when', 'actual'], 'is'), (['extraction', 'is', 'actual', 'data'], 'when'), (['is', 'when', 'data', 'is'], 'actual'), (['when', 'actual', 'is', 'not'], 'data')]


In [84]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # 定义网络的投影层
        self.proj = nn.Linear(embedding_dim, 128)
        # 网络的输出层
        self.output = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        # 获取inputs中词的嵌入
        embeds = []
        for w in inputs:
            embed = self.embeddings(w).view(1, -1)
            embeds.append(embed)
        # 对嵌入求和：Wx1 + Wx2 + Wx3 + Wx4 = W(x1 + x2 + x3 + x4)
        embeds = sum(embeds)
        out = F.relu(self.proj(embeds))
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


In [87]:
# 定义损失函数和优化器
losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=1e-2)

In [90]:
for epoch in range(100):
    total_loss = 0
    for context, target in trigrams:
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        
        model.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(total_loss)

208.6863633543253
208.38795844838023
208.0932647176087
207.80281068757176
207.51611502841115
207.23331578448415
206.9541219882667
206.67829656228423
206.4057732336223
206.13634933158755
205.86997857689857
205.60662449523807
205.34641610831022
205.0891963802278
204.83483144268394
204.5834595542401
204.33500672876835
204.0893378406763
203.84638259001076
203.6062051448971
203.3684559520334
203.13323623500764
202.90072415024042
202.6708982102573
202.44377980194986
202.2192131653428
201.99729256518185
201.77779319509864
201.56055908463895
201.34588747099042
201.13416194170713
200.92486174590886
200.71767022833228
200.5127201974392
200.3098196387291
200.10889574885368
199.91011080890894
199.7134053464979
199.51872726529837
199.32613850571215
199.13555622659624
198.94699480198324
198.76053553260863
198.57607009075582
198.39408857002854
198.2140520568937
198.03632125258446
197.86037406884134
197.68625896796584
197.51395700499415
197.3434186745435
197.17473141103983
197.00768603384495
196.84243