In [97]:
import torch
import torch.nn as nn
import torch.optim as optim
from TorchCRF import CRF
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

In [98]:
# 数据准备
sentences = [
    ["I", "live", "in", "New", "York", "."],
    ["Barack", "Obama", "was", "the", "president", "of", "the", "United", "States", "."],
    ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "around", "$", "1", "billion", "."],
    ["London", "is", "the", "capital", "of", "England", "."],
]
labels = [
    ["O", "O", "O", "B-LOC", "I-LOC", "O"],
    ["B-PER", "I-PER", "O", "O", "O", "O", "B-LOC", "I-LOC", "I-LOC", "O"],
    ["B-ORG", "O", "O", "O", "B-ORG", "I-ORG", "O", "O", "O", "O", "O", "O"],
    ["B-LOC", "O", "O", "O", "B-LOC", "I-LOC", "O"],
]

# 词汇表和标签编码
word_list = list(set(word for sentence in sentences for word in sentence))
tag_list = list(set(tag for label in labels for tag in label))

word_to_ix = {word: i for i, word in enumerate(word_list)}
tag_to_ix = {tag: i for i, tag in enumerate(tag_list)}
ix_to_tag = {i: tag for tag, i in tag_to_ix.items()}

In [99]:
ix_to_tag

{0: 'I-PER',
 1: 'B-ORG',
 2: 'I-ORG',
 3: 'B-LOC',
 4: 'B-PER',
 5: 'I-LOC',
 6: 'O'}

In [100]:
tag_to_ix

{'I-PER': 0,
 'B-ORG': 1,
 'I-ORG': 2,
 'B-LOC': 3,
 'B-PER': 4,
 'I-LOC': 5,
 'O': 6}

In [101]:
word_to_ix

{'was': 0,
 'I': 1,
 '1': 2,
 'the': 3,
 'Obama': 4,
 'New': 5,
 '.': 6,
 '$': 7,
 'capital': 8,
 'U.K.': 9,
 'in': 10,
 'Apple': 11,
 'York': 12,
 'Barack': 13,
 'president': 14,
 'is': 15,
 'at': 16,
 'billion': 17,
 'London': 18,
 'startup': 19,
 'buying': 20,
 'of': 21,
 'England': 22,
 'around': 23,
 'for': 24,
 'United': 25,
 'live': 26,
 'looking': 27,
 'States': 28}

In [102]:
# 数据准备，添加填充
def prepare_data(sentences, labels):
    max_len = max(len(sentence) for sentence in sentences)  # 找到最长的句子长度
    X = [[word_to_ix[word] for word in sentence] + [0] * (max_len - len(sentence)) for sentence in sentences]  # 使用0填充
    y = [[tag_to_ix[tag] for tag in label] + [tag_to_ix["O"]] * (max_len - len(label)) for label in labels]  # 使用O填充
    mask = [[1] * len(label) + [0] * (max_len - len(label)) for label in labels]  # 生成mask
    return X, y, mask

X, y, mask = prepare_data(sentences, labels)
X = torch.tensor(X, dtype=torch.long)
y = torch.tensor(y, dtype=torch.long)
mask = torch.tensor(mask, dtype=torch.bool)  # 转换为布尔类型

In [103]:
X[:5]

tensor([[ 1, 26, 10,  5, 12,  6,  0,  0,  0,  0,  0,  0,  0],
        [13,  4,  0,  3, 14, 21,  3, 25, 28,  6,  0,  0,  0],
        [11, 15, 27, 16, 20,  9, 19, 24, 23,  7,  2, 17,  6],
        [18, 15,  3,  8, 21, 22,  6,  0,  0,  0,  0,  0,  0]])

In [104]:
y[:5]

tensor([[6, 6, 6, 3, 5, 6, 6, 6, 6, 6, 6, 6, 6],
        [4, 0, 6, 6, 6, 6, 3, 5, 5, 6, 6, 6, 6],
        [1, 6, 6, 6, 1, 2, 6, 6, 6, 6, 6, 6, 6],
        [3, 6, 6, 6, 3, 5, 6, 6, 6, 6, 6, 6, 6]])

In [105]:
# 超参数
EMBEDDING_DIM =64
HIDDEN_DIM = 64

# BiLSTM + CRF 模型
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tag_size, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, bidirectional=True, batch_first=True)
        self.hidden_dim = hidden_dim
        self.fc = nn.Linear(hidden_dim, tag_size)
        self.crf = CRF(tag_size)

    def forward(self, sentences):
        embeddings = self.embedding(sentences)
        lstm_out, _ = self.lstm(embeddings)
        emissions = self.fc(lstm_out)
        return emissions

    def loss(self, sentences, tags, mask):
        emissions = self.forward(sentences)
        # 计算损失
        return self.crf(emissions, tags, mask=mask)

    def predict(self, sentences, mask):
        with torch.no_grad():
            emissions = self.forward(sentences)
            # 计算最优路径
            return self.crf.viterbi_decode(emissions, mask=mask)

In [106]:
# 模型训练
model = BiLSTM_CRF(len(word_list), len(tag_list), EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练
for epoch in range(20):
    model.train()
    optimizer.zero_grad()
    loss = model.loss(X, y, mask)
    loss_value = loss.mean()  # 确保损失是标量
    loss_value.backward()
    optimizer.step()
    print(f'Epoch {epoch + 1}, Loss: {loss_value.item()}')



Epoch 1, Loss: -18.066131591796875
Epoch 2, Loss: -18.38288116455078
Epoch 3, Loss: -18.702377319335938
Epoch 4, Loss: -19.02557373046875
Epoch 5, Loss: -19.353496551513672
Epoch 6, Loss: -19.687332153320312
Epoch 7, Loss: -20.02839469909668
Epoch 8, Loss: -20.37810516357422
Epoch 9, Loss: -20.737934112548828
Epoch 10, Loss: -21.109386444091797
Epoch 11, Loss: -21.49394989013672
Epoch 12, Loss: -21.89310073852539
Epoch 13, Loss: -22.30831527709961
Epoch 14, Loss: -22.741079330444336
Epoch 15, Loss: -23.19290542602539
Epoch 16, Loss: -23.665363311767578
Epoch 17, Loss: -24.16005516052246
Epoch 18, Loss: -24.678665161132812
Epoch 19, Loss: -25.222936630249023
Epoch 20, Loss: -25.79469108581543


In [108]:
# 测试
model.eval()
test_sentence = ["I", "was","in","in", "in", "London", "."]
max_len = max(len(sentence) for sentence in sentences)  # 重用 max_len
test_tensor = torch.tensor([[word_to_ix[word] for word in test_sentence] + [0] * (max_len - len(test_sentence))], dtype=torch.long)
test_mask = torch.tensor([[1] * len(test_sentence) + [0] * (max_len - len(test_sentence))], dtype=torch.bool)  # 转换为布尔类型
predicted_tags = model.predict(test_tensor, test_mask)

# 显示预测结果
predicted_labels = [ix_to_tag[tag] for tag in predicted_tags[0]]  # 获取第一个预测序列
print(list(zip(test_sentence, predicted_labels)))

[('I', 'B-ORG'), ('was', 'I-LOC'), ('in', 'B-ORG'), ('in', 'I-LOC'), ('in', 'B-ORG'), ('London', 'B-ORG'), ('.', 'I-LOC')]
