In [1]:
import sys
sys.path.append("D:/Experiment")
from MyKu import processing
from MyKu import training
from MyKu import MHeadAttention
import pandas as pd
from tqdm import tqdm
import pandas as pd
import torchtext
from torchtext.vocab import Vectors
from torchtext.legacy import data
import torch
from torch import nn
import torch.nn.functional as F
from d2l import torch as d2l
from torch.autograd import Variable
from spacy.lang.en import English
from sklearn import metrics

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)




cuda:0


In [2]:
processing.create_Sem2018()

In [3]:
def tokenizer(text):  # create a tokenizer function
    """
    定义分词操作
    """
    return processing.Pre_processing_tweets().tokenize_process(text)


def DataLoader():
    def tokenize(x): return x.split()

    TEXT = data.Field(sequential=True, tokenize=tokenizer,
                      lower=True, include_lengths=True, fix_length=60)
    LABEL = data.Field(sequential=False, use_vocab=False)
    # 假如train.csv文件并不是只有两列，比如1、3列是review和polarity，2列是我们不需要的数据，
    # 那么就要添加一个全是None的元组， fields列表存储的Field的顺序必须和csv文件中每一列的顺序对应，

    train_fields = [(None, None), ('tweet', TEXT), ('subtask_a', LABEL)]
    # train_fields = [(None, None), (None, None), (None, None), (None, None), ('text', TEXT), ('task1', LABEL)]
    train_data = data.TabularDataset(
        path='D:/Experiment/datasets/OLID/train.tsv',
        # path='D:/Experiment/datasets/EXIST2021/train.tsv',
        format='tsv',
        fields=train_fields,
        skip_header=True  # 是否跳过文件的第一行
    )
    test_fields = [(None, None), ('tweet', TEXT), ('label', LABEL)]
    # test_fields = [(None, None), (None, None), (None, None), (None, None), ('text', TEXT), ('task1', LABEL)]
    test_data = data.TabularDataset(
        path='D:/Experiment/datasets/OLID/testA.tsv',
        # path='D:/Experiment/datasets/EXIST2021/test.tsv',
        format='tsv',
        fields=test_fields,
        skip_header=True  # 是否跳过文件的第一行
    )
    return train_data, test_data, TEXT, LABEL


In [5]:
train_data, test_data, TEXT, LABEL = DataLoader()

vectors = Vectors(name='glove.6B.300d.txt', cache=processing.EMBEDDING_PATH)

TEXT.build_vocab(train_data,  # 建词表是用训练集建，不要用验证集和测试集
                  max_size=400000, # 单词表容量
                  vectors=vectors, # 还有'glove.840B.300d'已经很多可以选
                  unk_init=torch.Tensor.normal_ # 初始化train_data中不存在预训练词向量词表中的单词
)

In [6]:

class Model(nn.Module):
    def __init__(self,vocab_size, embed_size, num_hiddens, output_dim, max_length, num_layers, dropout, **kwargs):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.LSTM = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers, bidirectional=True, dropout=dropout, batch_first=True)
        self.n_class = output_dim
        self.decoder1 = nn.Linear(num_hiddens * 4, max_length)
        self.decoder2 = nn.Linear(num_hiddens, self.n_class)
        self.weight_W = nn.Parameter(torch.Tensor(embed_size, embed_size))
        self.weight_proj = nn.Parameter(torch.Tensor(embed_size, max_length))
        self.U = nn.Parameter(torch.Tensor(max_length, output_dim))
        self.V = nn.Parameter(torch.Tensor(max_length, output_dim))
        self.g = nn.Parameter(torch.Tensor(output_dim))
        self.W_f = nn.Parameter(torch.Tensor(output_dim, output_dim))
        self.bias = nn.Parameter(torch.Tensor(output_dim))
        nn.init.uniform_(self.weight_W, -0.1, 0.1)
        nn.init.uniform_(self.weight_proj, -0.1, 0.1)
        nn.init.uniform_(self.U, -0.1, 0.1)
        nn.init.uniform_(self.V, -0.1, 0.1)
        nn.init.uniform_(self.g, -0.1, 0.1)
        nn.init.uniform_(self.W_f, -0.1, 0.1)
        nn.init.uniform_(self.bias, -0.1, 0.1)
    
    def forward(self, inputs): #inputs torch.Size([64, 40])
        input = self.embedding(inputs.permute(1, 0)) #input torch.Size([64, 40, 300])
        w = torch.tanh(torch.matmul(input, self.weight_W))  # w torch.Size([64, 40, 300])
        self_matching = torch.matmul(w, self.weight_proj)   # w torch.Size([64, 40, 40])
        att_score, idxs = torch.max(self_matching, dim=1)   # att_score torch.Size([64, 40])
        self.LSTM.flatten_parameters()
        outputs, _ = self.LSTM(input)    #outputs torch.Size([64, 11, 400])
        output = torch.cat((outputs[:,0,:], outputs[:,-1,:]), dim=1)    # output torch.Size([64, 800])
        output = self.decoder1(output)      # output torch.Size([64, max])
        # outs = self.decoder2(output)
        self_matching_out = att_score.mul(output) # self_matching_out torch.Size([64, max])
        # outs = torch.cat((output, self_matching_out), dim=-1)
        # output torch.Size([64, 2])
        f_a = torch.matmul(self_matching_out, self.U)
        f_b = torch.matmul(output, self.V)  # output torch.Size([64, 2])
        f = f_a.mul(f_b) + self.g   # output torch.Size([64, 2])
        outs = torch.softmax(torch.matmul(f, self.W_f) + self.bias, dim=1)
        return outs


In [7]:
def train(model, train_iter, optimizer, loss, epoch):
    model.train()
    epoch_loss = 0
    num_sample = 0
    correct = 0
    for batch in tqdm(train_iter, desc=f"Training Epoch {epoch}", colour='red'):
        optimizer.zero_grad()
        text, text_len = batch.tweet
        label = batch.subtask_a
        # text, text_len = batch.text
        # label = batch.task1
        output = model(text)
        pred_y = torch.argmax(output, dim=1)
        correct += torch.sum(pred_y == label)
        l = loss(output, label)
        l.backward()
        epoch_loss += l.item()
        num_sample += len(batch)
        optimizer.step()
    print(
        f'\tTrain Loss: {epoch_loss / num_sample:.3f} | Train Acc: {correct.float() / num_sample* 100:.2f}%')


def test(model, test_iter):
    true_y, pred_y = [], []
    for batch in tqdm(test_iter, desc=f"Testing", colour='green'):
        text, text_len = batch.tweet
        label = batch.label
        # text, text_len = batch.text
        # label = batch.task1
        with torch.no_grad():
            output = model(text)
            pred_y.extend(output.argmax(dim=1).tolist())
            true_y.extend(label.tolist())
    print(metrics.confusion_matrix(true_y, pred_y))
    print(metrics.classification_report(true_y, pred_y))
    print(f'Acc : {metrics.accuracy_score(true_y, pred_y)}\t F1: {metrics.f1_score(true_y, pred_y, average="macro")}')


In [8]:

num_hiddens, output_dim, max_length, num_layers, dropout = 100, 2, 60, 1, 0.5
model = Model(len(TEXT.vocab), 300, num_hiddens, output_dim, max_length, num_layers, dropout)
model.to(DEVICE)
pretrained_embeddings = TEXT.vocab.vectors
# print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(300)
model.embedding.weight.data[PAD_IDX] = torch.zeros(300)


train_iter, test_iter = data.BucketIterator.splits((train_data, test_data), batch_size=64, sort_within_batch=True, sort_key=lambda x: len(x.tweet), device=DEVICE)




In [9]:
import time

In [10]:
lr, num_epochs = 0.0001, 20

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
begin = time.time()
for epoch in range(1, 2):
    train(model, train_iter, optimizer, loss, epoch)
    test(model, test_iter)
end = time.time()
print(end-begin)


Training Epoch 1: 100%|[31m██████████[0m| 207/207 [00:03<00:00, 67.83it/s]


	Train Loss: 0.010 | Train Acc: 59.53%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 259.02it/s]

[[592  28]
 [138 102]]
              precision    recall  f1-score   support

           0       0.81      0.95      0.88       620
           1       0.78      0.42      0.55       240

    accuracy                           0.81       860
   macro avg       0.80      0.69      0.71       860
weighted avg       0.80      0.81      0.79       860

Acc : 0.8069767441860465	 F1: 0.7141941941941942
3.1168291568756104



