In [None]:
import nltk
import csv
from nltk.corpus import brown  #　http://korpus.uib.no/icame/brown/bcm.html
from nltk.corpus import wordnet  # https://wordnet.princeton.edu/

nltk.download("brown")
nltk.download("wordnet")

print("段落个数：",len(brown.paras()))  # 段落
print("句子个数：",len(brown.sents()))  # 句子 
print("单词个数：",len(brown.words()))  # 单词
print("原始文本长度",len(brown.raw()))  # 原始文本

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
段落个数： 15667
句子个数： 57340
单词个数： 1161192
原始文本长度 9964284


In [None]:
num_train = 12000  # 从15667个段落中选择的12000个段落作为训练集
UNK_symbol = "<UNK>"
vocabulary = set([UNK_symbol])  #　词典
min_count = 5  # 单词最少出现次数

brown_corpus_train = []  # 存储每个段落的单词集
# 遍历每一个段落，解析出句子，再接着解析出单词
for idx, paragraph in enumerate(brown.paras()):
    if idx == num_train:
        break
    words = []
    for sentence in paragraph:
        for word in sentence:
            words.append(word.lower())  # 可能有重复的单词
    brown_corpus_train.append(words)


# 统计每个单词的词频，便于构建词典
word_term_frequency_train = {}
for words in brown_corpus_train:
  for word in words:
    word_term_frequency_train[word] = word_term_frequency_train.get(word,0) + 1

# 构建词典，加入词典的单词出现次数必须够多
for words in brown_corpus_train:
    for word in words:
        if word_term_frequency_train.get(word,0) >= min_count:
            vocabulary.add(word)

print("词典大小：",len(vocabulary))

词典大小： 12681


In [None]:
import numpy as np
x_train = []
y_train = []
x_test = []
y_test = []

# 建立单词-词典索引的字典，后续传入索引代替传入字符串
word_to_idx_mappings = {}
for idx, word in enumerate(vocabulary):
    word_to_idx_mappings[word] = idx

# 根据单词获得词典索引，如果单词不在词典中，返回<UNK>索引,0
def get_idx(word):
    return word_to_idx_mappings.get(word, word_to_idx_mappings["<UNK>"])

# 建立训练集和测试集
for idx, paragraph in enumerate(brown.paras()):
    for sentence in paragraph:
        for i, word in enumerate(sentence):
          # trigram模型，三个连续单词为一个序列
            if i+2 >= len(sentence):  # 序列中第三个单词无法获得
                break
            # trigram模型中，根据前两个预测第三个
            x_extract = [get_idx(sentence[i].lower()), get_idx(sentence[i+1].lower())]
            y_extract = [get_idx(sentence[i+2].lower())]
            # 训练集
            if idx < num_train:
                x_train.append(x_extract)
                y_train.append(y_extract)
            # 测试集
            else:
                x_test.append(x_extract)
                y_test.append(y_extract)

# 转成numpy对象
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)  
  
print("训练集输入和输出：", x_train.shape, y_train.shape)
print("测试集输入和输出：", x_test.shape, y_test.shape)

训练集输入和输出： (872823, 2) (872823, 1)
测试集输入和输出： (174016, 2) (174016, 1)


In [None]:
import torch
import multiprocessing
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import time   

# trigram模型，NNLM的基础
class TrigramNNmodel(nn.Module):
  # 在trigram中，context_size窗口大小为3-1=2，第三个单词是预测对象
  def __init__(self, vocabulary_size, embedding_dimension, context_size, hidden_unit):
    super(TrigramNNmodel, self).__init__()
    self.context_size = context_size
    self.embedding_dimension = embedding_dimension
    self.embeddings = nn.Embedding(vocabulary_size, embedding_dimension)
    self.linear1 = nn.Linear(context_size*embedding_dimension, hidden_unit, bias = True)  # 输入层-隐藏层
    self.linear2 = nn.Linear(hidden_unit, vocabulary_size, bias = True)  # 隐藏层-输出层

  def forward(self, inputs):
    x = self.embeddings(inputs).view((-1, self.context_size * self.embedding_dimension))  # shape：(1, (n-1)*m)
    hidden_layer_output = self.linear1(x) # shape:(h, 1)
    non_linear_output = torch.tanh(hidden_layer_output)  # shape:(h, 1)
    output_layer_output = self.linear2(non_linear_output)  # shape:(V, 1)
    y = F.log_softmax(output_layer_output, dim=1)  # shape:(V, 1)
    return y

In [None]:
gpu = 0 

# hyperparameters
EMBEDDING_DIMENSION = 200
CONTEXT_SIZE = 2
BATCH_SIZE = 256
HIDDEN_UNIT = 100

# 设置随机化种子，固定参数初始化，便于复现
# https://arxiv.org/pdf/2109.08203.pdf
torch.manual_seed(3407)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
available_workers = multiprocessing.cpu_count()
print("可用的核心数：",available_workers)

train_set = np.concatenate((x_train, y_train), axis=1)  # 训练集
test_set = np.concatenate((x_test, y_test), axis=1)

train_loader = DataLoader(train_set, batch_size = BATCH_SIZE, num_workers = available_workers)
test_loader = DataLoader(test_set, batch_size = BATCH_SIZE, num_workers = available_workers)

cuda
可用的核心数： 2


In [39]:
def get_accuracy_from_log_probs(log_probs, labels):
    probs = torch.exp(log_probs)
    predicted_label = torch.argmax(probs, dim=1)
    acc = (predicted_label == labels).float().mean()
    return acc

def evaluate(model, criterion, dataloader, gpu):
    model.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        dev_st = time.time()
        for it, data_tensor in enumerate(dataloader):
            context_tensor = data_tensor[:,0:2]
            target_tensor = data_tensor[:,2]
            context_tensor, target_tensor = context_tensor.cuda(gpu), target_tensor.cuda(gpu)
            log_probs = model(context_tensor)
            mean_loss += criterion(log_probs, target_tensor).item()
            mean_acc += get_accuracy_from_log_probs(log_probs, target_tensor)
            count += 1
            if it % 500 == 0: 
                print("Iteration {} complete. Mean Loss: {}; Mean Acc:{}; Time taken (s): {}".format(it, mean_loss / count, mean_acc / count, (time.time()-dev_st)))
                dev_st = time.time()

    return mean_acc / count, mean_loss / count


# negative log-likelihood loss
loss_function = nn.NLLLoss()

model = TrigramNNmodel(len(vocabulary), EMBEDDING_DIMENSION, CONTEXT_SIZE, HIDDEN_UNIT)
model.cuda(gpu)

optimizer = optim.Adam(model.parameters(), lr = 2e-3)


best_acc = 0
best_model_path = None
print(f"{'='*10}开始训练模型{'='*10}")

for epoch in range(5):
    st = time.time()
    print("\ntraining epoch:{}".format(epoch+1))
    for _, data_tensor in enumerate(train_loader):       
        context_tensor = data_tensor[:,0:2]  # 第一维表示样本数量，此处指batch_size个样本
        target_tensor = data_tensor[:,2]

        context_tensor = context_tensor.cuda(gpu)
        target_tensor = target_tensor.cuda(gpu)

        model.zero_grad()

        log_probs = model(context_tensor)

        acc = get_accuracy_from_log_probs(log_probs, target_tensor)

        loss = loss_function(log_probs, target_tensor)

        loss.backward()
        optimizer.step()

        if _ % 500 == 0: 
            print("Training Iteration {} of epoch {} complete. Loss: {}; Acc:{}; Time taken (s): {}".format(_, epoch, loss.item(), acc, (time.time()-st)))
            st = time.time()

    print(f"{'='*10}开始评估模型{'='*10}")
    test_acc, test_loss = evaluate(model, loss_function, test_loader, gpu)
    print("Epoch {} complete! Accuracy: {}; Loss: {}".format(epoch, test_acc, test_loss))
    if test_acc > best_acc:
        print("accuracy improved from {} to {}, saving model...".format(best_acc, test_acc))
        best_acc = test_acc
        path = 'model_epoch_{}.dat'.format(epoch)
        torch.save(model.state_dict(), path)


training epoch:1
Training Iteration 0 of epoch 0 complete. Loss: 9.508774757385254; Acc:0.0; Time taken (s): 0.08885073661804199
Training Iteration 500 of epoch 0 complete. Loss: 6.249395370483398; Acc:0.16796875; Time taken (s): 3.514956474304199
Training Iteration 1000 of epoch 0 complete. Loss: 6.142131328582764; Acc:0.140625; Time taken (s): 3.3961398601531982
Training Iteration 1500 of epoch 0 complete. Loss: 5.987728595733643; Acc:0.1484375; Time taken (s): 3.4260268211364746
Training Iteration 2000 of epoch 0 complete. Loss: 5.895061016082764; Acc:0.11328125; Time taken (s): 3.399430274963379
Training Iteration 2500 of epoch 0 complete. Loss: 6.144865036010742; Acc:0.15625; Time taken (s): 3.3996307849884033
Training Iteration 3000 of epoch 0 complete. Loss: 5.675825119018555; Acc:0.19140625; Time taken (s): 3.40194034576416
Dev Iteration 0 complete. Mean Loss: 5.021405220031738; Mean Acc:0.1953125; Time taken (s): 0.0893411636352539
Dev Iteration 500 complete. Mean Loss: 5.114

In [45]:
# 应用
model = TrigramNNmodel(len(vocabulary), EMBEDDING_DIMENSION, CONTEXT_SIZE, HIDDEN_UNIT)
model.load_state_dict(torch.load(path))
model.cuda(gpu)

cos = nn.CosineSimilarity(dim=0)

lm_similarities = {}

word_pairs = {('computer','keyboard'),('cat','dog'),('dog','car'),('keyboard','cat')}

for word_pair in word_pairs:
    w1 = word_pair[0]
    w2 = word_pair[1]
    words_tensor = torch.LongTensor([get_idx(w1),get_idx(w2)])
    words_tensor = words_tensor.cuda(gpu)
    words_embeds = model.embeddings(words_tensor)
    sim = cos(words_embeds[0],words_embeds[1])
    lm_similarities[word_pair] = sim.item()

print(lm_similarities)

{('cat', 'dog'): 0.024955740198493004, ('computer', 'keyboard'): -0.11399353295564651, ('keyboard', 'cat'): -0.01905217580497265, ('dog', 'car'): 0.09305672347545624}
