In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from gensim.models import Word2Vec
import numpy as np

# Train the Word2Vec model
def train_word2vec(sentences, embedding_dim, window=5, min_count=1, workers=4):
    """Train a gensim ``Word2Vec`` model on ``sentences``.

    Args:
        sentences: Corpus handed to ``Word2Vec`` unchanged (an iterable of
            token sequences).
        embedding_dim: Dimensionality of the learned vectors; forwarded as
            ``vector_size``.
        window: Forwarded to ``Word2Vec(window=...)``. Defaults to 5.
        min_count: Forwarded to ``Word2Vec(min_count=...)``. Defaults to 1.
        workers: Forwarded to ``Word2Vec(workers=...)``. Defaults to 4.

    Returns:
        The ``Word2Vec`` instance built from the corpus.
    """
    return Word2Vec(
        sentences,
        vector_size=embedding_dim,
        window=window,
        min_count=min_count,
        workers=workers,
    )

# Build the vocabulary and tag dictionaries
def build_vocab(sentences, labels, word2vec_model):
    """Build token/tag index maps and an embedding matrix aligned with them.

    Args:
        sentences: Iterable of sentences; each sentence is iterated to obtain
            its tokens (iterating a ``str`` yields single characters).
        labels: Iterable of tag sequences, one per sentence.
        word2vec_model: Object exposing ``vector_size`` and a ``wv`` mapping
            that supports ``in`` and ``[]`` lookups (e.g. a gensim Word2Vec).

    Returns:
        Tuple ``(word2idx, tag2idx, embedding_matrix)`` where
        ``embedding_matrix`` is a NumPy array of shape
        ``(len(word2idx), word2vec_model.vector_size)``.
    """
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    tag2idx = {'O': 0}

    # Assign the next free index to every token / tag on first sight.
    for sentence in sentences:
        for word in sentence:
            word2idx.setdefault(word, len(word2idx))

    for label in labels:
        for tag in label:
            tag2idx.setdefault(tag, len(tag2idx))

    # Build the embedding matrix, one row per vocabulary entry.
    vector_size = word2vec_model.vector_size
    embedding_matrix = np.zeros((len(word2idx), vector_size))
    for word, idx in word2idx.items():
        if word == '<PAD>':
            # Padding must contribute nothing downstream: keep its row at
            # zero instead of filling it with random noise.
            continue
        if word in word2vec_model.wv:
            embedding_matrix[idx] = word2vec_model.wv[word]
        else:
            # Tokens without a trained vector (e.g. '<UNK>') get a random one.
            embedding_matrix[idx] = np.random.randn(vector_size)

    return word2idx, tag2idx, embedding_matrix

# Load the training data
def read_data(file_path):
    """Parse a UTF-8, whitespace-delimited file into sentences and labels.

    Each non-blank line is split on whitespace. The first token is stored as
    that line's sentence and all remaining tokens as its label list.

    NOTE(review): only the first token of a line is kept as the sentence. If
    the input holds space-separated words rather than "sentence tag tag ..."
    rows, everything after the first word ends up in ``labels`` -- confirm
    the expected file format.

    Args:
        file_path: Path of the file to read.

    Returns:
        Tuple ``(sentences, labels)``: a list of strings and a parallel list
        of lists of strings.
    """
    sentences, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as corpus:
        for raw_line in corpus:
            tokens = raw_line.split()
            if not tokens:
                # Blank or whitespace-only line: nothing to record.
                continue
            first, *rest = tokens
            sentences.append(first)
            labels.append(rest)
    return sentences, labels

# Dataset loading and preprocessing
class CWSDataSet(Dataset):
    """Dataset that turns (sentence, tag sequence) pairs into fixed-length index tensors.

    Tokens missing from ``word2idx`` map to ``'<UNK>'``; tags missing from
    ``tag2idx`` map to ``'O'``. Both sequences are cut to ``max_len`` and
    right-padded (with ``'<PAD>'`` and ``'O'`` respectively) when shorter.
    """

    def __init__(self, sentences, labels, word2idx, tag2idx, max_len):
        self.sentences = sentences
        self.labels = labels
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.max_len = max_len

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def _fit_to_length(self, ids, fill_value):
        """Return ``ids`` truncated to ``max_len`` and right-padded with ``fill_value``."""
        # A negative shortfall multiplies the padding list down to [].
        shortfall = self.max_len - len(ids)
        return ids[:self.max_len] + [fill_value] * shortfall

    def __getitem__(self, idx):
        """Return ``(token_ids, tag_ids)`` tensors for the sample at ``idx``."""
        tokens = self.sentences[idx]
        tags = self.labels[idx]

        # Map tokens and tags to their integer indices.
        token_ids = [self.word2idx.get(tok, self.word2idx['<UNK>']) for tok in tokens]
        tag_ids = [self.tag2idx.get(tag, self.tag2idx['O']) for tag in tags]

        # Pad or truncate both sequences to the same fixed length.
        token_ids = self._fit_to_length(token_ids, self.word2idx['<PAD>'])
        tag_ids = self._fit_to_length(tag_ids, self.tag2idx['O'])

        return torch.tensor(token_ids), torch.tensor(tag_ids)

# Attention layer
class AttentionLayer(nn.Module):
    """Attention pooling that collapses dimension 1 of the input.

    Each position is projected through a linear layer and ``tanh``, scored
    against a learned context vector ``u``, and the softmax of those scores
    (taken over dimension 1) weights the sum of the original inputs.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.W = nn.Linear(input_dim, input_dim)
        self.u = nn.Parameter(torch.randn(input_dim))

    def forward(self, x):
        """Pool ``x`` over dim 1; the last dim of ``x`` must equal ``input_dim``."""
        hidden = self.W(x).tanh()
        scores = hidden @ self.u              # one scalar score per position
        weights = scores.softmax(dim=1)       # normalise across positions
        return (x * weights.unsqueeze(-1)).sum(dim=1)

# Hand-written linear-chain CRF layer
class CRF(nn.Module):
    """Linear-chain conditional random field over tag sequences.

    ``transitions[i, j]`` is the score of moving from tag ``i`` to tag ``j``;
    ``start_transitions`` / ``end_transitions`` score the first and last tag.

    Masks may be bool or numeric (non-zero means "real token"). They are
    expected to mark a contiguous prefix of each sequence (right padding)
    with the first position always valid.
    """

    def __init__(self, num_tags):
        super(CRF, self).__init__()
        self.num_tags = num_tags
        self.start_transitions = nn.Parameter(torch.randn(num_tags))
        self.end_transitions = nn.Parameter(torch.randn(num_tags))
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))

    def forward(self, emissions, tags, mask=None):
        """Return the negative log-likelihood of ``tags``, summed over the batch.

        Args:
            emissions: Float tensor ``(batch, seq_len, num_tags)``.
            tags: Integer (int64) tensor ``(batch, seq_len)`` of gold tags.
            mask: Optional ``(batch, seq_len)`` mask; ``None`` means every
                position is valid.

        Returns:
            Scalar tensor ``sum(log Z - score(gold path))``.
        """
        mask = self._prepare_mask(emissions, mask)
        score = self._compute_score(emissions, tags, mask)
        partition = self._compute_normalizer(emissions, mask)
        return torch.sum(partition - score)

    def _prepare_mask(self, emissions, mask):
        """Return ``mask`` as a bool tensor on the emissions' device (all True if None)."""
        if mask is None:
            return torch.ones(emissions.shape[:2], dtype=torch.bool, device=emissions.device)
        return mask.to(device=emissions.device).bool()

    def _compute_score(self, emissions, tags, mask):
        """Score of the gold path for every sequence; returns shape ``(batch,)``."""
        mask = self._prepare_mask(emissions, mask)
        weights = mask.to(emissions.dtype)
        batch_index = torch.arange(emissions.shape[0], device=emissions.device)

        # Emission score of the gold tag at every position -> (batch, seq_len).
        # gather picks one value per (batch, position); plain fancy indexing
        # with a (batch,) tag vector would produce a (batch, batch) tensor.
        gold_emissions = emissions.gather(2, tags.unsqueeze(-1)).squeeze(-1)
        score = self.start_transitions[tags[:, 0]] + (gold_emissions * weights).sum(dim=1)

        # Transition i -> i+1 only counts when position i+1 is a real token.
        gold_transitions = self.transitions[tags[:, :-1], tags[:, 1:]]
        score = score + (gold_transitions * weights[:, 1:]).sum(dim=1)

        # The end transition belongs to the last *valid* tag, not to the tag
        # stored at the final (possibly padded) position.
        last_index = (mask.long().sum(dim=1) - 1).clamp(min=0)
        last_tags = tags[batch_index, last_index]
        return score + self.end_transitions[last_tags]

    def _compute_normalizer(self, emissions, mask):
        """Log partition function (forward algorithm); returns shape ``(batch,)``."""
        mask = self._prepare_mask(emissions, mask)
        seq_length = emissions.shape[1]
        alpha = self.start_transitions + emissions[:, 0]
        for i in range(1, seq_length):
            emit_score = emissions[:, i].unsqueeze(1)      # (batch, 1, to)
            trans_score = self.transitions.unsqueeze(0)    # (1, from, to)
            next_alpha = self._log_sum_exp(alpha.unsqueeze(2) + trans_score + emit_score, dim=1)
            # Padded steps must not change alpha, otherwise the partition
            # covers more positions than the gold-path score does.
            alpha = torch.where(mask[:, i].unsqueeze(1), next_alpha, alpha)
        return self._log_sum_exp(alpha + self.end_transitions, dim=1)

    def _log_sum_exp(self, tensor, dim):
        """Numerically stable ``log(sum(exp(tensor)))`` along ``dim``."""
        max_score, _ = tensor.max(dim)
        return max_score + (tensor - max_score.unsqueeze(dim)).exp().sum(dim).log()

    def decode(self, emissions, mask=None):
        """Viterbi-decode the highest-scoring tag sequence.

        Args:
            emissions: Float tensor ``(batch, seq_len, num_tags)``.
            mask: Optional ``(batch, seq_len)`` mask; ``None`` means every
                position is valid.

        Returns:
            Integer tensor ``(batch, seq_len)``. Entries at masked positions
            repeat the last valid tag and should be ignored by the caller.
        """
        mask = self._prepare_mask(emissions, mask)
        batch_size, seq_length, num_tags = emissions.shape
        viterbi = self.start_transitions + emissions[:, 0]
        # Backpointer used on padded steps: every tag points to itself.
        identity = torch.arange(num_tags, device=emissions.device).unsqueeze(0).expand(batch_size, num_tags)
        backpointers = []

        for i in range(1, seq_length):
            viterbi_t = viterbi.unsqueeze(2) + self.transitions.unsqueeze(0) + emissions[:, i].unsqueeze(1)
            best_score, best_prev = viterbi_t.max(dim=1)
            step_valid = mask[:, i].unsqueeze(1)
            viterbi = torch.where(step_valid, best_score, viterbi)
            backpointers.append(torch.where(step_valid, best_prev, identity))

        # Trace back from the best final tag, then restore forward order.
        path = [(viterbi + self.end_transitions).argmax(dim=1, keepdim=True)]
        for backpointer in reversed(backpointers):
            path.append(backpointer.gather(1, path[-1]))
        path.reverse()

        return torch.cat(path, dim=1)

# ACNNC model: attention + CNN features feeding a CRF
class ACNNCModel(nn.Module):
    """Sentence encoder (attention pooling + 1-D convolution) with a CRF head.

    ``vocab_size`` and ``input_length`` are accepted but not used by this
    class. The embedding layer is created with ``Embedding.from_pretrained``
    using its default arguments.
    """

    def __init__(self, vocab_size, embedding_dim, embedding_matrix, input_length, num_classes, conv_filters, conv_kernel_size):
        super().__init__()
        pretrained = torch.FloatTensor(embedding_matrix)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        self.attention = AttentionLayer(embedding_dim)
        self.conv = nn.Conv1d(embedding_dim, conv_filters, conv_kernel_size)
        self.pool = nn.AdaptiveMaxPool1d(1)
        # The classifier sees the attention vector concatenated with the
        # pooled convolution features.
        self.fc = nn.Linear(conv_filters + embedding_dim, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.crf = CRF(num_classes)

    def forward(self, x):
        """Map token ids ``(batch, seq_len)`` to one score vector ``(batch, num_classes)``."""
        embedded = self.embedding(x)
        attended = self.attention(embedded)
        # Conv1d wants channels first: (batch, embedding_dim, seq_len).
        channels_first = embedded.permute(0, 2, 1)
        pooled = self.pool(self.conv(channels_first)).squeeze(-1)
        features = self.dropout(torch.cat((attended, pooled), dim=1))
        return self.fc(features)

    def forward_crf(self, x, tags=None, mask=None):
        """Return the CRF loss when ``tags`` is given, otherwise the decoded tags.

        NOTE(review): ``forward`` yields a single vector per sentence and it
        is repeated for every position, so all time steps share identical
        emissions -- confirm this is intended.
        """
        sentence_scores = self.forward(x)
        # Broadcast the sentence-level scores to (batch, seq_len, num_classes).
        emissions = sentence_scores.unsqueeze(1).repeat(1, x.size(1), 1)
        if mask is None:
            # Index 0 is treated as padding.
            mask = (x != 0).float()
        if tags is None:
            return self.crf.decode(emissions, mask)
        return self.crf(emissions, tags, mask)

# Load the datasets
# NOTE(review): these are absolute, machine-specific paths; the cell fails on
# any machine where these files do not exist.
train_sentences, train_labels = read_data("E:/Desktop/icwb2-data/training/pku_training.txt")
test_sentences, test_labels = read_data("E:/Desktop/icwb2-data/testing/pku_test.txt")

# Train the Word2Vec model on the training sentences
embedding_dim = 100
word2vec_model = train_word2vec(train_sentences, embedding_dim)

# Build the vocabulary and tag tables (from the training split only)
word2idx, tag2idx, embedding_matrix = build_vocab(train_sentences, train_labels, word2vec_model)

# Create the datasets and data loaders
max_len = 100  # maximum sequence length
batch_size = 32

# The test split reuses the training vocabulary and tag table.
train_dataset = CWSDataSet(train_sentences, train_labels, word2idx, tag2idx, max_len)
test_dataset = CWSDataSet(test_sentences, test_labels, word2idx, tag2idx, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Build the model
vocab_size = len(word2idx)
num_classes = len(tag2idx)
conv_filters = 64
conv_kernel_size = 3

model = ACNNCModel(vocab_size, embedding_dim, embedding_matrix, max_len, num_classes, conv_filters, conv_kernel_size)

# Define the optimizer (the loss itself is returned by model.forward_crf)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
def train(model, train_loader, optimizer, num_epochs):
    """Run ``num_epochs`` passes over ``train_loader``, updating ``model`` in place.

    For every batch the loss comes from ``model.forward_crf(sentences, labels)``.
    After each epoch the sum of the batch losses is printed.

    Args:
        model: Module exposing ``train()`` and ``forward_crf(sentences, labels)``.
        train_loader: Iterable of ``(sentences, labels)`` batches.
        optimizer: Optimizer bound to the model's parameters.
        num_epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for batch_sentences, batch_labels in train_loader:
            optimizer.zero_grad()
            batch_loss = model.forward_crf(batch_sentences, batch_labels)
            batch_loss.backward()
            optimizer.step()
            epoch_loss += batch_loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

# Evaluate the model
def evaluate(model, dataloader):
    """Print and return the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and gradients are disabled. Batches are
    counted while iterating, so ``dataloader`` may be any iterable of
    ``(sentences, labels)`` pairs, including one without ``__len__``.

    Args:
        model: Module exposing ``eval()`` and ``forward_crf(sentences, labels)``.
        dataloader: Iterable of ``(sentences, labels)`` batches.

    Returns:
        The average batch loss as a float, or ``nan`` when ``dataloader``
        yields no batches (instead of raising ``ZeroDivisionError``).
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for sentences, labels in dataloader:
            total_loss += model.forward_crf(sentences, labels).item()
            num_batches += 1
    average_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Average Loss: {average_loss:.4f}")
    return average_loss

# Train the model for a fixed number of epochs
num_epochs = 5
train(model, train_loader, optimizer, num_epochs)

# Evaluate the model on the test loader
evaluate(model, test_loader)


ModuleNotFoundError: No module named 'gensim'

In [2]:
pip install gensim

Collecting gensim
  Downloading gensim-4.3.2-cp38-cp38-win_amd64.whl (24.0 MB)
Collecting scipy>=1.7.0
  Downloading scipy-1.10.1-cp38-cp38-win_amd64.whl (42.2 MB)
Note: you may need to restart the kernel to use updated packages.


ERROR: Exception:
Traceback (most recent call last):
  File "E:\anaconda\lib\site-packages\pip\_vendor\urllib3\response.py", line 437, in _error_catcher
    yield
  File "E:\anaconda\lib\site-packages\pip\_vendor\urllib3\response.py", line 519, in read
    data = self._fp.read(amt) if not fp_closed else b""
  File "E:\anaconda\lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 62, in read
    data = self.__fp.read(amt)
  File "E:\anaconda\lib\http\client.py", line 458, in read
    n = self.readinto(b)
  File "E:\anaconda\lib\http\client.py", line 502, in readinto
    n = self.fp.readinto(b)
  File "E:\anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "E:\anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "E:\anaconda\lib\ssl.py", line 1099, in read
    return self._sslobj.read(len, buffer)
socket.timeout: The read operation timed out

During handling of the above exception, another exception occ