In [1]:
import sys
sys.path.append("D:/Experiment")
from MyKu import processing
from MyKu import training
from MyKu import MHeadAttention
import pandas as pd
from tqdm import tqdm
import pandas as pd
import torchtext
from torchtext.vocab import Vectors
from torchtext.legacy import data
import torch
from torch import nn
import torch.nn.functional as F
from d2l import torch as d2l
from torch.autograd import Variable
from spacy.lang.en import English
from sklearn import metrics

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)




cuda:0


In [2]:
processing.create_Sem2018()

In [3]:
def tokenizer(text):  # create a tokenizer function
    """
    定义分词操作
    """
    return processing.Pre_processing_tweets().tokenize_process(text)


def DataLoader():

    TEXT = data.Field(sequential=True, tokenize=tokenizer,
                      lower=True, include_lengths=True, fix_length=20)
    LABEL = data.Field(sequential=False, use_vocab=False)
    # 假如train.csv文件并不是只有两列，比如1、3列是review和polarity，2列是我们不需要的数据，
    # 那么就要添加一个全是None的元组， fields列表存储的Field的顺序必须和csv文件中每一列的顺序对应，

    train_fields = [(None, None), ('label', LABEL),  ('tweet', TEXT)]
    # train_fields = [(None, None), (None, None), (None, None), (None, None), ('text', TEXT), ('task1', LABEL)]
    train_data = data.TabularDataset(
        path='D:/Experiment/datasets/SEM2018/train.tsv',
        # path='D:/Experiment/datasets/EXIST2021/train.tsv',
        format='tsv',
        fields=train_fields,
        skip_header=True  # 是否跳过文件的第一行
    )
    test_fields = [(None, None), ('label', LABEL),  ('tweet', TEXT)]
    # test_fields = [(None, None), (None, None), (None, None), (None, None), ('text', TEXT), ('task1', LABEL)]
    test_data = data.TabularDataset(
        path='D:/Experiment/datasets/SEM2018/test.tsv',
        # path='D:/Experiment/datasets/EXIST2021/test.tsv',
        format='tsv',
        fields=test_fields,
        skip_header=True  # 是否跳过文件的第一行
    )
    return train_data, test_data, TEXT, LABEL


In [4]:
train_data, test_data, TEXT, LABEL = DataLoader()

vectors = Vectors(name='glove.6B.300d.txt', cache=processing.EMBEDDING_PATH)

TEXT.build_vocab(train_data,  # 建词表是用训练集建，不要用验证集和测试集
                  max_size=400000, # 单词表容量
                  vectors=vectors, # 还有'glove.840B.300d'已经很多可以选
                  unk_init=torch.Tensor.normal_ # 初始化train_data中不存在预训练词向量词表中的单词
)

In [5]:
train_data, test_data, TEXT, LABEL = DataLoader()

vectors = Vectors(name='glove.6B.300d.txt', cache=processing.EMBEDDING_PATH)

TEXT.build_vocab(train_data,  # 建词表是用训练集建，不要用验证集和测试集
                  max_size=400000, # 单词表容量
                  vectors=vectors, # 还有'glove.840B.300d'已经很多可以选
                  unk_init=torch.Tensor.normal_ # 初始化train_data中不存在预训练词向量词表中的单词
)

In [6]:

class Model(nn.Module):
    def __init__(self,vocab_size, embed_size, num_hiddens, output_dim, max_length, num_layers, dropout, **kwargs):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.LSTM = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers, bidirectional=True, dropout=dropout, batch_first=True)
        self.n_class = output_dim
        self.decoder1 = nn.Linear(num_hiddens * 4, max_length)
        self.decoder2 = nn.Linear(num_hiddens, self.n_class)
        self.weight_W = nn.Parameter(torch.Tensor(embed_size, embed_size))
        self.weight_proj = nn.Parameter(torch.Tensor(embed_size, max_length))
        self.U = nn.Parameter(torch.Tensor(max_length, output_dim))
        self.V = nn.Parameter(torch.Tensor(max_length, output_dim))
        self.g = nn.Parameter(torch.Tensor(max_length))
        self.W_f = nn.Parameter(torch.Tensor(max_length, output_dim))
        self.bias = nn.Parameter(torch.Tensor(output_dim))
        nn.init.uniform_(self.weight_W, -0.1, 0.1)
        nn.init.uniform_(self.weight_proj, -0.1, 0.1)
        nn.init.uniform_(self.U, -0.1, 0.1)
        nn.init.uniform_(self.V, -0.1, 0.1)
        nn.init.uniform_(self.g, -0.1, 0.1)
        nn.init.uniform_(self.W_f, -0.1, 0.1)
        nn.init.uniform_(self.bias, -0.1, 0.1)
    
    def forward(self, inputs): #inputs torch.Size([64, 40])
        input = self.embedding(inputs.permute(1, 0)) #input torch.Size([64, 40, 300])
        w = torch.tanh(torch.matmul(input, self.weight_W))  # w torch.Size([64, 40, 300])
        self_matching = torch.matmul(w, self.weight_proj)   # w torch.Size([64, 40, 40])
        att_score, idxs = torch.max(self_matching, dim=1)   # att_score torch.Size([64, 40])
        self.LSTM.flatten_parameters()
        outputs, _ = self.LSTM(input)    #outputs torch.Size([64, 11, 400])
        output = torch.cat((outputs[:,0,:], outputs[:,-1,:]), dim=1)    # output torch.Size([64, 800])
        output = self.decoder1(output)      # output torch.Size([64, 100])
        # outs = self.decoder2(output)
        outs = att_score.mul(output) + self.g
        # f_a = torch.matmul(att_score, self.U)   # output torch.Size([64, 2])
        # f_b = torch.matmul(output, self.V)  # output torch.Size([64, 2])
        # f = f_a.mul(f_b) + self.g   # output torch.Size([64, 2])
        outs = torch.softmax(torch.matmul(outs, self.W_f) + self.bias, dim=1)
        return outs


In [8]:
def train(model, train_iter, optimizer, loss, epoch):
    model.train()
    epoch_loss = 0
    num_sample = 0
    correct = 0
    for batch in tqdm(train_iter, desc=f"Training Epoch {epoch}", colour='red'):
        optimizer.zero_grad()
        text, text_len = batch.tweet
        label = batch.label
        # text, text_len = batch.text
        # label = batch.task1
        output = model(text)
        pred_y = torch.argmax(output, dim=1)
        correct += torch.sum(pred_y == label)
        l = loss(output, label)
        l.backward()
        epoch_loss += l.item()
        num_sample += len(batch)
        optimizer.step()
    print(
        f'\tTrain Loss: {epoch_loss / num_sample:.3f} | Train Acc: {correct.float() / num_sample* 100:.2f}%')


def test(model, test_iter):
    true_y, pred_y = [], []
    for batch in tqdm(test_iter, desc=f"Testing", colour='green'):
        text, text_len = batch.tweet
        label = batch.label
        # text, text_len = batch.text
        # label = batch.task1
        with torch.no_grad():
            output = model(text)
            pred_y.extend(output.argmax(dim=1).tolist())
            true_y.extend(label.tolist())
    print(metrics.confusion_matrix(true_y, pred_y))
    print(metrics.classification_report(true_y, pred_y))
    print(f'Acc : {metrics.accuracy_score(true_y, pred_y)}\t F1: {metrics.f1_score(true_y, pred_y, average="macro")}')


In [12]:

num_hiddens, output_dim, max_length, num_layers, dropout = 100, 2, 20, 2, 0.3
model = Model(len(TEXT.vocab), 300, num_hiddens, output_dim, max_length, num_layers, dropout)
model.to(DEVICE)
pretrained_embeddings = TEXT.vocab.vectors
# print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(300)
model.embedding.weight.data[PAD_IDX] = torch.zeros(300)


train_iter, test_iter = data.BucketIterator.splits((train_data, test_data), batch_size=64, sort_within_batch=True, sort_key=lambda x: len(x.tweet), device=DEVICE)


In [14]:
lr, num_epochs = 0.0001, 20

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
for epoch in range(1, num_epochs+1):
    train(model, train_iter, optimizer, loss, epoch)
    test(model, test_iter)


Training Epoch 1: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 88.34it/s]


	Train Loss: 0.006 | Train Acc: 96.33%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 302.04it/s]


[[297 176]
 [ 92 219]]
              precision    recall  f1-score   support

           0       0.76      0.63      0.69       473
           1       0.55      0.70      0.62       311

    accuracy                           0.66       784
   macro avg       0.66      0.67      0.65       784
weighted avg       0.68      0.66      0.66       784

Acc : 0.6581632653061225	 F1: 0.6547458640883905


Training Epoch 2: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 87.90it/s]


	Train Loss: 0.006 | Train Acc: 96.75%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 295.18it/s]


[[347 126]
 [129 182]]
              precision    recall  f1-score   support

           0       0.73      0.73      0.73       473
           1       0.59      0.59      0.59       311

    accuracy                           0.67       784
   macro avg       0.66      0.66      0.66       784
weighted avg       0.67      0.67      0.67       784

Acc : 0.6747448979591837	 F1: 0.6596706677039517


Training Epoch 3: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 87.12it/s]


	Train Loss: 0.005 | Train Acc: 97.04%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 300.94it/s]


[[349 124]
 [128 183]]
              precision    recall  f1-score   support

           0       0.73      0.74      0.73       473
           1       0.60      0.59      0.59       311

    accuracy                           0.68       784
   macro avg       0.66      0.66      0.66       784
weighted avg       0.68      0.68      0.68       784

Acc : 0.6785714285714286	 F1: 0.6634849259070006


Training Epoch 4: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 88.47it/s]


	Train Loss: 0.005 | Train Acc: 97.28%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 302.05it/s]


[[342 131]
 [127 184]]
              precision    recall  f1-score   support

           0       0.73      0.72      0.73       473
           1       0.58      0.59      0.59       311

    accuracy                           0.67       784
   macro avg       0.66      0.66      0.66       784
weighted avg       0.67      0.67      0.67       784

Acc : 0.6709183673469388	 F1: 0.6569870373008282


Training Epoch 5: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 88.98it/s]


	Train Loss: 0.005 | Train Acc: 97.30%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 292.40it/s]


[[301 172]
 [102 209]]
              precision    recall  f1-score   support

           0       0.75      0.64      0.69       473
           1       0.55      0.67      0.60       311

    accuracy                           0.65       784
   macro avg       0.65      0.65      0.65       784
weighted avg       0.67      0.65      0.65       784

Acc : 0.6505102040816326	 F1: 0.6456304273233564


Training Epoch 6: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 88.41it/s]


	Train Loss: 0.005 | Train Acc: 97.46%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 309.25it/s]


[[330 143]
 [124 187]]
              precision    recall  f1-score   support

           0       0.73      0.70      0.71       473
           1       0.57      0.60      0.58       311

    accuracy                           0.66       784
   macro avg       0.65      0.65      0.65       784
weighted avg       0.66      0.66      0.66       784

Acc : 0.6594387755102041	 F1: 0.647718724282952


Training Epoch 7: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 90.47it/s]


	Train Loss: 0.005 | Train Acc: 97.56%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 302.05it/s]


[[339 134]
 [131 180]]
              precision    recall  f1-score   support

           0       0.72      0.72      0.72       473
           1       0.57      0.58      0.58       311

    accuracy                           0.66       784
   macro avg       0.65      0.65      0.65       784
weighted avg       0.66      0.66      0.66       784

Acc : 0.6619897959183674	 F1: 0.64749098621421


Training Epoch 8: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 89.16it/s]


	Train Loss: 0.005 | Train Acc: 97.72%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 316.79it/s]


[[331 142]
 [124 187]]
              precision    recall  f1-score   support

           0       0.73      0.70      0.71       473
           1       0.57      0.60      0.58       311

    accuracy                           0.66       784
   macro avg       0.65      0.65      0.65       784
weighted avg       0.66      0.66      0.66       784

Acc : 0.6607142857142857	 F1: 0.6488685344827586


Training Epoch 9: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 91.80it/s]


	Train Loss: 0.005 | Train Acc: 97.77%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 288.63it/s]


[[328 145]
 [130 181]]
              precision    recall  f1-score   support

           0       0.72      0.69      0.70       473
           1       0.56      0.58      0.57       311

    accuracy                           0.65       784
   macro avg       0.64      0.64      0.64       784
weighted avg       0.65      0.65      0.65       784

Acc : 0.6492346938775511	 F1: 0.6364537717921177


Training Epoch 10: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 91.56it/s]


	Train Loss: 0.005 | Train Acc: 97.93%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 302.05it/s]


[[338 135]
 [132 179]]
              precision    recall  f1-score   support

           0       0.72      0.71      0.72       473
           1       0.57      0.58      0.57       311

    accuracy                           0.66       784
   macro avg       0.64      0.65      0.64       784
weighted avg       0.66      0.66      0.66       784

Acc : 0.6594387755102041	 F1: 0.6448305408271475


Training Epoch 11: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 92.23it/s]


	Train Loss: 0.005 | Train Acc: 97.83%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 316.79it/s]


[[303 170]
 [ 96 215]]
              precision    recall  f1-score   support

           0       0.76      0.64      0.69       473
           1       0.56      0.69      0.62       311

    accuracy                           0.66       784
   macro avg       0.66      0.67      0.66       784
weighted avg       0.68      0.66      0.66       784

Acc : 0.6607142857142857	 F1: 0.6563851101971949


Training Epoch 12: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 90.37it/s]


	Train Loss: 0.005 | Train Acc: 98.11%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 309.25it/s]


[[332 141]
 [129 182]]
              precision    recall  f1-score   support

           0       0.72      0.70      0.71       473
           1       0.56      0.59      0.57       311

    accuracy                           0.66       784
   macro avg       0.64      0.64      0.64       784
weighted avg       0.66      0.66      0.66       784

Acc : 0.6556122448979592	 F1: 0.6425266314957545


Training Epoch 13: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 91.38it/s]


	Train Loss: 0.005 | Train Acc: 98.19%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 324.71it/s]


[[310 163]
 [111 200]]
              precision    recall  f1-score   support

           0       0.74      0.66      0.69       473
           1       0.55      0.64      0.59       311

    accuracy                           0.65       784
   macro avg       0.64      0.65      0.64       784
weighted avg       0.66      0.65      0.65       784

Acc : 0.6505102040816326	 F1: 0.64349205716979


Training Epoch 14: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 90.88it/s]


	Train Loss: 0.005 | Train Acc: 98.30%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 309.25it/s]


[[315 158]
 [117 194]]
              precision    recall  f1-score   support

           0       0.73      0.67      0.70       473
           1       0.55      0.62      0.59       311

    accuracy                           0.65       784
   macro avg       0.64      0.64      0.64       784
weighted avg       0.66      0.65      0.65       784

Acc : 0.6492346938775511	 F1: 0.6406756497754222


Training Epoch 15: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 92.86it/s]


	Train Loss: 0.005 | Train Acc: 98.32%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 316.79it/s]


[[310 163]
 [114 197]]
              precision    recall  f1-score   support

           0       0.73      0.66      0.69       473
           1       0.55      0.63      0.59       311

    accuracy                           0.65       784
   macro avg       0.64      0.64      0.64       784
weighted avg       0.66      0.65      0.65       784

Acc : 0.6466836734693877	 F1: 0.6391880868003463


Training Epoch 16: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 90.32it/s]


	Train Loss: 0.005 | Train Acc: 98.32%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 309.24it/s]


[[308 165]
 [109 202]]
              precision    recall  f1-score   support

           0       0.74      0.65      0.69       473
           1       0.55      0.65      0.60       311

    accuracy                           0.65       784
   macro avg       0.64      0.65      0.64       784
weighted avg       0.66      0.65      0.65       784

Acc : 0.6505102040816326	 F1: 0.6440025189751748


Training Epoch 17: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 81.61it/s]


	Train Loss: 0.005 | Train Acc: 98.30%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 302.06it/s]


[[314 159]
 [114 197]]
              precision    recall  f1-score   support

           0       0.73      0.66      0.70       473
           1       0.55      0.63      0.59       311

    accuracy                           0.65       784
   macro avg       0.64      0.65      0.64       784
weighted avg       0.66      0.65      0.65       784

Acc : 0.6517857142857143	 F1: 0.643853988654951


Training Epoch 18: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 81.60it/s]


	Train Loss: 0.005 | Train Acc: 98.32%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 288.63it/s]


[[318 155]
 [122 189]]
              precision    recall  f1-score   support

           0       0.72      0.67      0.70       473
           1       0.55      0.61      0.58       311

    accuracy                           0.65       784
   macro avg       0.64      0.64      0.64       784
weighted avg       0.65      0.65      0.65       784

Acc : 0.6466836734693877	 F1: 0.6368519184301398


Training Epoch 19: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 81.60it/s]


	Train Loss: 0.005 | Train Acc: 98.38%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 282.36it/s]


[[321 152]
 [124 187]]
              precision    recall  f1-score   support

           0       0.72      0.68      0.70       473
           1       0.55      0.60      0.58       311

    accuracy                           0.65       784
   macro avg       0.64      0.64      0.64       784
weighted avg       0.65      0.65      0.65       784

Acc : 0.6479591836734694	 F1: 0.6373655103066869


Training Epoch 20: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 81.27it/s]


	Train Loss: 0.005 | Train Acc: 98.38%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 302.05it/s]

[[321 152]
 [128 183]]
              precision    recall  f1-score   support

           0       0.71      0.68      0.70       473
           1       0.55      0.59      0.57       311

    accuracy                           0.64       784
   macro avg       0.63      0.63      0.63       784
weighted avg       0.65      0.64      0.64       784

Acc : 0.6428571428571429	 F1: 0.6314379159587113



