In [8]:
import torch
from torch import nn
import numpy as np
# In the BiLSTM+CRF model, the BiLSTM part may use a deep-learning framework such as PyTorch; the CRF part must be written by hand.
# https://github.com/phipleg/keras/blob/crf/keras/layers/crf.py


In [9]:
# Import the project-local CRF module and reload it, so the module object is
# re-executed even if it was already imported earlier in this kernel session.
import model.ChainCRF
import importlib
importlib.reload(model.ChainCRF)
# Bind the class name only after the reload so it refers to the reloaded module.
from model.ChainCRF import ChainCRF


In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BiLSTM_CRF(nn.Module):
    """Bidirectional LSTM encoder, linear tag projection and a ChainCRF layer.

    The model consumes already-embedded tokens: the embedding lookup happens
    outside the model, so ``forward`` receives float feature vectors rather
    than token ids.

    Args:
        num_classes: Number of tags; size of the last axis of the emission
            scores handed to the CRF.
        embedding_dim: Feature size of each input token vector.
        hidden_dim: Requested width of the concatenated BiLSTM output. Each
            direction gets ``hidden_dim // 2`` units, so for an odd value the
            real output width is ``hidden_dim - 1``.
    """

    def __init__(self, num_classes, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes

        # BiLSTM layer: two directions of hidden_dim // 2 units each.
        units_per_direction = hidden_dim // 2
        self.lstm = nn.LSTM(embedding_dim, units_per_direction,
                            num_layers=1, bidirectional=True, batch_first=True)

        # Linear projection into tag space. Its input width must equal what
        # the LSTM really emits (2 * units_per_direction); using hidden_dim
        # directly breaks for odd hidden_dim because of the floor division.
        self.hidden2tag = nn.Linear(2 * units_per_direction, num_classes)

        # CRF layer over the tag sequence.
        self.crf = ChainCRF(num_classes)

    def forward(self, sentence, mask, targets=None):
        """Score a batch and either compute the CRF loss or decode tags.

        Args:
            sentence: Embedded tokens laid out batch-first, i.e.
                ``(batch, seq_len, embedding_dim)``.
            mask: Passed unchanged to the CRF layer together with the
                emissions (presumably 1 for real tokens and 0 for padding --
                confirm against ChainCRF).
            targets: Gold tag indices. When given, the CRF loss is returned;
                when ``None``, Viterbi decoding is performed instead.

        Returns:
            ``self.crf(emissions, targets, mask)`` when ``targets`` is given,
            otherwise ``self.crf.viterbi_decode(emissions, mask)``.
        """
        # (batch, seq_len, embedding_dim) -> (batch, seq_len, lstm_width)
        lstm_out, _ = self.lstm(sentence)

        # (batch, seq_len, lstm_width) -> (batch, seq_len, num_classes)
        emissions = self.hidden2tag(lstm_out)

        if targets is not None:
            # Training: CRF loss for the gold tag sequence.
            return self.crf(emissions, targets, mask)

        # Inference: Viterbi decoding of the tag sequence.
        return self.crf.viterbi_decode(emissions, mask)





In [11]:
import torchtext
# NOTE(review): the wildcard imports in this cell presumably provide check,
# get_train_data, get_data_from_file, sorted_labels_chn and sorted_labels_eng,
# which are used further down -- confirm against check.py / get_data.py.
from check import *
# Dimensionality of the pre-trained GloVe vectors requested below.
embed_size = 100
# Directory handed to GloVe as its cache location.
glove_path="glove/"
vocab = torchtext.vocab.GloVe(name="6B",dim=embed_size,cache=glove_path)
# Sanity check: print the vectors looked up for a Chinese character and for an English word.
print(vocab['中'])
print(vocab['me'])
from get_data import *
# Corpus selector: 'Chinese' picks the Chinese label list later in this cell, any other value the English one.
language ='English'
train_data = get_train_data(language)
def sent2word(sentence):
    """Return the words of a sentence given as ``(word, label)`` pairs."""
    words = []
    for word, _label in sentence:
        words.append(word)
    return words
def sent2label(sentence):
    """Return the labels of a sentence given as ``(word, label)`` pairs."""
    labels = []
    for _word, label in sentence:
        labels.append(label)
    return labels
# Sequence length used for padding/truncation: the longest training sentence,
# but never fewer than 128 positions.
max_length = max(max(len(sent) for sent in train_data), 128)
# Label inventory matching the selected language.
if language == 'Chinese':
    sorted_labels = sorted_labels_chn
else:
    sorted_labels = sorted_labels_eng
def label2index(label):
    """Return the position of ``label`` in the module-level ``sorted_labels`` list.

    Raises:
        ValueError: If ``label`` is not present in ``sorted_labels``.
    """
    position = sorted_labels.index(label)
    return position
# Next step: embed each sentence's words, convert tags to indices, truncate or pad every sentence to max_length, build the matching mask, and finally produce the dataset.
print(len(train_data))

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])
tensor([ 5.6719e-02,  1.3333e-01,  7.2690e-01, -4.6336e-01, -5.9334e-01,
         7.1746e-01, -1.1795e-01,  2.1614e-01,  4.3036e-01, -6.7053e-01,
         5.7480e-01,  2.6827e-01,  2.4659e-02,  1.6066e-01,  2.0400e-01,
        -3.9246e-01, -6.3294e-01,  6.2915e-01, -7.6340e-01,  1.1581e+00,
         3.6218e-01,  3.1932e-01, -6.5613e-01, -4.7797e-01,  2.9885e-01,
         6.2435e-01, -4.6060e-01, -9.6276e-01,  1.2214e+00, -2.3152e-01,
        -6.8889e-02,  6.3519e-01,  7.7546e-01,  3.3128e-01, -3.5220e-01,
         7.4236e-01, -6.6703e-01,  3.2260e-01,  

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

# Assuming train_data is a list of sentences where each sentence is a list of tuples (word, label)
# Example: [('This', 'O'), ('is', 'O'), ('a', 'O'), ('sentence', 'B')]

class CustomDataset(Dataset):
    """Map-style dataset turning ``(word, label)`` sentences into fixed-length tensors.

    Args:
        data: Indexable collection of sentences; each sentence is a sequence
            of ``(word, label)`` pairs.
        vocab: Object indexed as ``vocab[word]`` that returns the embedding
            tensor of a word. If it exposes a ``dim`` attribute (as
            torchtext's pre-trained vectors do), that size is used to pad
            sentences that contain no tokens at all.
        label2index: Callable mapping a label to its integer index.
        max_length: Number of positions every returned sample has; shorter
            sentences are padded, longer ones truncated.
    """

    def __init__(self, data, vocab, label2index, max_length):
        self.data = data
        self.vocab = vocab
        self.label2index = label2index
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return one padded/truncated sample.

        Returns:
            dict with keys ``'word_embeddings'`` (stacked embedding vectors,
            ``max_length`` rows), ``'label_indices'`` and ``'mask'`` (both
            integer tensors of length ``max_length``). The mask holds 1 for
            real tokens and 0 for padding.

        Raises:
            ValueError: If the sentence is empty and the embedding size
                cannot be read from ``vocab.dim``.
        """
        # Truncate first so embeddings/labels are only computed for tokens
        # that are actually kept.
        pairs = list(self.data[idx])[:self.max_length]
        n_tokens = len(pairs)
        pad_length = self.max_length - n_tokens

        word_embeddings = [self.vocab[word] for word, _ in pairs]
        label_indices = [self.label2index(label) for _, label in pairs]

        if pad_length > 0:
            if word_embeddings:
                pad_vector = torch.zeros_like(word_embeddings[0])
            else:
                # Empty sentence: there is no vector to copy the shape from.
                dim = getattr(self.vocab, "dim", None)
                if dim is None:
                    raise ValueError(
                        "cannot pad empty sentence at index {}: "
                        "embedding size is unknown".format(idx)
                    )
                pad_vector = torch.zeros(dim)
            word_embeddings = word_embeddings + [pad_vector] * pad_length
            # Padded label positions use index 0; they are marked 0 in the
            # mask so downstream code can ignore them.
            label_indices = label_indices + [0] * pad_length

        # Built from self.max_length (not a notebook-level global) so the
        # dataset works with whatever length it was constructed with.
        mask = [1] * n_tokens + [0] * max(0, pad_length)

        return {
            'word_embeddings': torch.stack(word_embeddings),
            'label_indices': torch.tensor(label_indices),
            'mask': torch.tensor(mask)
        }

# Mini-batch size used for training.
batch_size = 32
# Wrap the training sentences in the padded/masked dataset ...
custom_dataset = CustomDataset(
    data=train_data,
    vocab=vocab,
    label2index=label2index,
    max_length=max_length,
)
# ... and serve them in shuffled mini-batches.
dataloader = DataLoader(dataset=custom_dataset, batch_size=batch_size, shuffle=True)



In [13]:
import torch
import torch.optim as optim
from tqdm import tqdm

def train(model, train_loader, num_epochs, learning_rate, device):
    """Train ``model`` with Adam while reporting progress on a tqdm bar.

    Args:
        model: Module called as ``model(inputs, mask, labels)`` that returns
            a scalar loss tensor.
        train_loader: Sized iterable of batches; each batch is a mapping with
            the keys ``'word_embeddings'``, ``'label_indices'`` and ``'mask'``.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate given to Adam.
        device: Device the model and every batch are moved to.

    Returns:
        None. The model is updated in place.
    """
    # Move the model to the target device before building the optimizer.
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    batches_per_epoch = len(train_loader)

    # The context manager closes the bar even when training is interrupted
    # (e.g. KeyboardInterrupt) or a batch raises.
    with tqdm(total=num_epochs * batches_per_epoch) as progress_bar:
        for epoch in range(num_epochs):
            model.train()
            sum_loss = 0.0

            for batch in train_loader:
                inputs = batch['word_embeddings'].to(device)
                labels = batch['label_indices'].to(device)
                mask = batch['mask'].to(device)

                optimizer.zero_grad()

                # Forward pass: passing labels makes the model return its loss.
                loss = model(inputs, mask, labels)

                loss.backward()
                sum_loss += loss.item()

                optimizer.step()
                progress_bar.update(1)

            # Guard against an empty loader instead of dividing by zero.
            mean_loss = sum_loss / batches_per_epoch if batches_per_epoch else 0.0
            progress_bar.set_postfix_str("Epoch:{}, Loss:{}".format(epoch + 1, mean_loss))

# Example usage:
# Assuming you have a model, train_loader, num_epochs, and learning_rate



In [14]:
# Requested width of the concatenated BiLSTM output.
hidden_dim = 100
bilstm_crf = BiLSTM_CRF(len(sorted_labels),embed_size,hidden_dim)
# Checkpoint file, one per language.
file = "BILSTM_CRF_{}.bin".format(language)

# Pick the best available device: CUDA, then Apple MPS, otherwise CPU.
# Falling back to "mps" unconditionally fails on machines that have neither.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# True: train from scratch and save the weights; False: load saved weights.
mode = True
if mode:
    train(bilstm_crf,dataloader,10,1e-2,device)
    torch.save(bilstm_crf.state_dict(),file)
else:
    # map_location lets a checkpoint saved from a GPU run load on a machine
    # without that GPU; the model itself is still on the CPU at this point.
    bilstm_crf.load_state_dict(torch.load(file, map_location="cpu"))



KeyboardInterrupt: 

In [None]:
def mycheck(language,vocab,res_file,model,max_length,train_or_valid):
    """Decode every sentence of ``res_file``, write the predictions and score them.

    The predictions are written as one ``"<word> <label>"`` line per token,
    with an empty line after every sentence, and the resulting file is then
    passed to ``check`` together with the gold file.

    Args:
        language: Corpus name; used in the prediction file name and the gold
            file path.
        vocab: Word-vector lookup handed to ``CustomDataset``.
        res_file: Path read with ``get_data_from_file``.
        model: Model called as ``model(word_embeddings, mask)`` that returns
            one predicted tag-index sequence per sentence of the batch.
        max_length: Padding/truncation length handed to ``CustomDataset``.
        train_or_valid: Truthy selects the "train" names, falsy the
            "dev"/"validation" names.
    """
    valid = get_data_from_file(res_file)
    pred_path = "example_data/BILSTM_CRF_{}_{}.txt".format(language,"train" if train_or_valid else "dev")
    valid_data = CustomDataset(valid, vocab, label2index, max_length)
    valdataloader = DataLoader(valid_data, batch_size=64, shuffle=False)

    # Run the batches on whatever device the model currently lives on;
    # after training it may be on a GPU while the batches are CPU tensors.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    sentence_idx = 0  # index into `valid`; the loader is not shuffled
    with open(pred_path, "w") as f, torch.no_grad():
        for val in valdataloader:
            masks = val['mask']
            preds = model(val['word_embeddings'].to(device), masks.to(device))
            for pred, mask in zip(preds, masks):
                sentence = valid[sentence_idx]
                # Real tokens come first in the mask, so its sum is their count.
                n_tokens = min(int(mask.sum()), len(pred), len(sentence))
                for i in range(n_tokens):
                    f.write(sentence[i][0] + " " + sorted_labels[int(pred[i])] + '\n')
                # Always close the sentence and advance, including when it
                # fills max_length exactly and no padded position exists.
                f.write('\n')
                sentence_idx += 1
    # Score only after the prediction file has been closed and flushed.
    check(language,"{}/{}.txt".format(language,"train" if train_or_valid else "validation"),pred_path)

In [None]:

# Score the model on the training split, then on the validation split.
mycheck(language, vocab, "{}/train.txt".format(language), bilstm_crf, max_length, 1)
mycheck(language, vocab, "{}/validation.txt".format(language), bilstm_crf, max_length, 0)

              precision    recall  f1-score   support

       B-PER     0.5397    0.5052    0.5218      6600
       I-PER     0.5383    0.7365    0.6220      4528
       B-ORG     0.2627    0.4700    0.3371      6321
       I-ORG     0.3380    0.5715    0.4248      3704
       B-LOC     0.4574    0.2083    0.2862      7140
       I-LOC     0.0790    0.0199    0.0318      1157
      B-MISC     0.1072    0.0492    0.0674      3438
      I-MISC     0.1613    0.0130    0.0240      1155

   micro avg     0.3826    0.3951    0.3888     34043
   macro avg     0.3105    0.3217    0.2894     34043
weighted avg     0.3767    0.3951    0.3614     34043

              precision    recall  f1-score   support

       B-PER     0.6063    0.5510    0.5774      1842
       I-PER     0.6033    0.7796    0.6802      1307
       B-ORG     0.2156    0.4452    0.2905      1341
       I-ORG     0.2832    0.5087    0.3638       751
       B-LOC     0.4542    0.1943    0.2722      1837
       I-LOC     0.1429 