In [1]:
import sys
sys.path.append("D:/Experiment")
from MyKu import processing
from MyKu import training
from MyKu import MHeadAttention
import pandas as pd
from tqdm import tqdm
import pandas as pd
import torchtext
from torchtext.vocab import Vectors
from torchtext.legacy import data
import torch
from torch import nn
import torch.nn.functional as F
from d2l import torch as d2l
from torch.autograd import Variable
from spacy.lang.en import English
from sklearn import metrics

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)


cuda:0


In [2]:
processing.create_Sem2018()

In [2]:
def tokenizer(text):  # create a tokenizer function
    """
    定义分词操作
    """
    return processing.Pre_processing_tweets().tokenize_process(text)


def DataLoader():

    TEXT = data.Field(sequential=True, tokenize=tokenizer,
                      lower=True, include_lengths=True, fix_length=20)
    LABEL = data.Field(sequential=False, use_vocab=False)
    # 假如train.csv文件并不是只有两列，比如1、3列是review和polarity，2列是我们不需要的数据，
    # 那么就要添加一个全是None的元组， fields列表存储的Field的顺序必须和csv文件中每一列的顺序对应，

    train_fields = [(None, None), ('label', LABEL),  ('tweet', TEXT)]
    # train_fields = [(None, None), (None, None), (None, None), (None, None), ('text', TEXT), ('task1', LABEL)]
    train_data = data.TabularDataset(
        path='D:/Experiment/datasets/SEM2018/train.tsv',
        # path='D:/Experiment/datasets/EXIST2021/train.tsv',
        format='tsv',
        fields=train_fields,
        skip_header=True  # 是否跳过文件的第一行
    )
    test_fields = [(None, None), ('label', LABEL),  ('tweet', TEXT)]
    # test_fields = [(None, None), (None, None), (None, None), (None, None), ('text', TEXT), ('task1', LABEL)]
    test_data = data.TabularDataset(
        path='D:/Experiment/datasets/SEM2018/test.tsv',
        # path='D:/Experiment/datasets/EXIST2021/test.tsv',
        format='tsv',
        fields=test_fields,
        skip_header=True  # 是否跳过文件的第一行
    )
    return train_data, test_data, TEXT, LABEL


In [3]:
train_data, test_data, TEXT, LABEL = DataLoader()

vectors = Vectors(name='glove.6B.300d.txt', cache=processing.EMBEDDING_PATH)

TEXT.build_vocab(train_data,  # 建词表是用训练集建，不要用验证集和测试集
                  max_size=400000, # 单词表容量
                  vectors=vectors, # 还有'glove.840B.300d'已经很多可以选
                  unk_init=torch.Tensor.normal_ # 初始化train_data中不存在预训练词向量词表中的单词
)

In [4]:
train_data, test_data, TEXT, LABEL = DataLoader()

vectors = Vectors(name='glove.6B.300d.txt', cache=processing.EMBEDDING_PATH)

TEXT.build_vocab(train_data,  # 建词表是用训练集建，不要用验证集和测试集
                  max_size=400000, # 单词表容量
                  vectors=vectors, # 还有'glove.840B.300d'已经很多可以选
                  unk_init=torch.Tensor.normal_ # 初始化train_data中不存在预训练词向量词表中的单词
)

In [9]:

class Model(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout, **kwargs):
        super(Model, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(
            embed_size, num_hiddens, num_layers=num_layers, bidirectional=True, dropout=dropout)
        self.decoder = nn.Linear(num_hiddens * 4, 2)

    def forward(self, inputs):
        # input torch.Size([64, 40, 300])
        input = self.embedding(inputs.permute(1, 0))
        self.encoder.flatten_parameters()
        outputs, _ = self.encoder(input)
        output = torch.cat((outputs[:,0,:], outputs[:,-1,:]), dim=1)    # output torch.Size([64, 800])
        outs = self.decoder(output)
        return outs


In [16]:

num_hiddens, num_layers, dropout = 100, 2, 0.7
model = Model(len(TEXT.vocab), 300, num_hiddens, num_layers, dropout)
model.to(DEVICE)
pretrained_embeddings = TEXT.vocab.vectors
# print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(300)
model.embedding.weight.data[PAD_IDX] = torch.zeros(300)


train_iter, test_iter = data.BucketIterator.splits((train_data, test_data), batch_size=64, sort_within_batch=True, sort_key=lambda x: len(x.tweet), device=DEVICE)


In [7]:
def train(model, train_iter, optimizer, loss, epoch):
    model.train()
    epoch_loss = 0
    num_sample = 0
    correct = 0
    for batch in tqdm(train_iter, desc=f"Training Epoch {epoch}", colour='red'):
        optimizer.zero_grad()
        text, text_len = batch.tweet
        label = batch.label
        # text, text_len = batch.text
        # label = batch.task1
        output = model(text)
        pred_y = torch.argmax(output, dim=1)
        correct += torch.sum(pred_y == label)
        l = loss(output, label)
        l.backward()
        epoch_loss += l.item()
        num_sample += len(batch)
        optimizer.step()
    print(
        f'\tTrain Loss: {epoch_loss / num_sample:.3f} | Train Acc: {correct.float() / num_sample* 100:.2f}%')


def test(model, test_iter):
    true_y, pred_y = [], []
    for batch in tqdm(test_iter, desc=f"Testing", colour='green'):
        text, text_len = batch.tweet
        label = batch.label
        # text, text_len = batch.text
        # label = batch.task1
        with torch.no_grad():
            output = model(text)
            pred_y.extend(output.argmax(dim=1).tolist())
            true_y.extend(label.tolist())
    print(metrics.confusion_matrix(true_y, pred_y))
    print(metrics.classification_report(true_y, pred_y))
    print(f'Acc : {metrics.accuracy_score(true_y, pred_y)}\t F1: {metrics.f1_score(true_y, pred_y, average="macro")}')


In [20]:
lr, num_epochs = 0.0001, 20

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
for epoch in range(1, num_epochs+1):
    train(model, train_iter, optimizer, loss, epoch)
    test(model, test_iter)


Training Epoch 1: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 89.47it/s]


	Train Loss: 0.007 | Train Acc: 74.48%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 295.19it/s]


[[292 181]
 [148 163]]
              precision    recall  f1-score   support

           0       0.66      0.62      0.64       473
           1       0.47      0.52      0.50       311

    accuracy                           0.58       784
   macro avg       0.57      0.57      0.57       784
weighted avg       0.59      0.58      0.58       784

Acc : 0.5803571428571429	 F1: 0.5686797153917543


Training Epoch 2: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 90.83it/s]


	Train Loss: 0.007 | Train Acc: 75.11%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 288.63it/s]


[[297 176]
 [154 157]]
              precision    recall  f1-score   support

           0       0.66      0.63      0.64       473
           1       0.47      0.50      0.49       311

    accuracy                           0.58       784
   macro avg       0.57      0.57      0.57       784
weighted avg       0.58      0.58      0.58       784

Acc : 0.5790816326530612	 F1: 0.5652173913043478


Training Epoch 3: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 89.92it/s]


	Train Loss: 0.007 | Train Acc: 74.06%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 276.35it/s]


[[304 169]
 [158 153]]
              precision    recall  f1-score   support

           0       0.66      0.64      0.65       473
           1       0.48      0.49      0.48       311

    accuracy                           0.58       784
   macro avg       0.57      0.57      0.57       784
weighted avg       0.59      0.58      0.58       784

Acc : 0.5829081632653061	 F1: 0.566839850977013


Training Epoch 4: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 90.60it/s]


	Train Loss: 0.007 | Train Acc: 74.27%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 282.35it/s]


[[260 213]
 [132 179]]
              precision    recall  f1-score   support

           0       0.66      0.55      0.60       473
           1       0.46      0.58      0.51       311

    accuracy                           0.56       784
   macro avg       0.56      0.56      0.56       784
weighted avg       0.58      0.56      0.56       784

Acc : 0.5599489795918368	 F1: 0.5552010787788092


Training Epoch 5: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 92.94it/s]


	Train Loss: 0.007 | Train Acc: 74.40%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 282.35it/s]


[[271 202]
 [139 172]]
              precision    recall  f1-score   support

           0       0.66      0.57      0.61       473
           1       0.46      0.55      0.50       311

    accuracy                           0.57       784
   macro avg       0.56      0.56      0.56       784
weighted avg       0.58      0.57      0.57       784

Acc : 0.5650510204081632	 F1: 0.558003157781617


Training Epoch 6: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 91.87it/s]


	Train Loss: 0.007 | Train Acc: 75.27%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 276.34it/s]


[[301 172]
 [159 152]]
              precision    recall  f1-score   support

           0       0.65      0.64      0.65       473
           1       0.47      0.49      0.48       311

    accuracy                           0.58       784
   macro avg       0.56      0.56      0.56       784
weighted avg       0.58      0.58      0.58       784

Acc : 0.5778061224489796	 F1: 0.5619852984614865


Training Epoch 7: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 92.93it/s]


	Train Loss: 0.007 | Train Acc: 74.80%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 254.67it/s]


[[282 191]
 [144 167]]
              precision    recall  f1-score   support

           0       0.66      0.60      0.63       473
           1       0.47      0.54      0.50       311

    accuracy                           0.57       784
   macro avg       0.56      0.57      0.56       784
weighted avg       0.58      0.57      0.58       784

Acc : 0.5727040816326531	 F1: 0.5633081766653198


Training Epoch 8: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 92.08it/s]


	Train Loss: 0.007 | Train Acc: 75.74%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 282.35it/s]


[[288 185]
 [163 148]]
              precision    recall  f1-score   support

           0       0.64      0.61      0.62       473
           1       0.44      0.48      0.46       311

    accuracy                           0.56       784
   macro avg       0.54      0.54      0.54       784
weighted avg       0.56      0.56      0.56       784

Acc : 0.5561224489795918	 F1: 0.541501976284585


Training Epoch 9: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 93.29it/s]


	Train Loss: 0.007 | Train Acc: 75.84%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 302.06it/s]


[[264 209]
 [115 196]]
              precision    recall  f1-score   support

           0       0.70      0.56      0.62       473
           1       0.48      0.63      0.55       311

    accuracy                           0.59       784
   macro avg       0.59      0.59      0.58       784
weighted avg       0.61      0.59      0.59       784

Acc : 0.5867346938775511	 F1: 0.583602171689354


Training Epoch 10: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 94.93it/s]


	Train Loss: 0.007 | Train Acc: 75.71%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 276.35it/s]


[[303 170]
 [159 152]]
              precision    recall  f1-score   support

           0       0.66      0.64      0.65       473
           1       0.47      0.49      0.48       311

    accuracy                           0.58       784
   macro avg       0.56      0.56      0.56       784
weighted avg       0.58      0.58      0.58       784

Acc : 0.5803571428571429	 F1: 0.5641905534294718


Training Epoch 11: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 93.43it/s]


	Train Loss: 0.007 | Train Acc: 73.93%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 282.35it/s]


[[270 203]
 [139 172]]
              precision    recall  f1-score   support

           0       0.66      0.57      0.61       473
           1       0.46      0.55      0.50       311

    accuracy                           0.56       784
   macro avg       0.56      0.56      0.56       784
weighted avg       0.58      0.56      0.57       784

Acc : 0.5637755102040817	 F1: 0.5568513119533528


Training Epoch 12: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 93.37it/s]


	Train Loss: 0.007 | Train Acc: 75.29%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 276.35it/s]


[[274 199]
 [132 179]]
              precision    recall  f1-score   support

           0       0.67      0.58      0.62       473
           1       0.47      0.58      0.52       311

    accuracy                           0.58       784
   macro avg       0.57      0.58      0.57       784
weighted avg       0.60      0.58      0.58       784

Acc : 0.5778061224489796	 F1: 0.571514668172534


Training Epoch 13: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 92.85it/s]


	Train Loss: 0.007 | Train Acc: 75.27%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 288.63it/s]


[[280 193]
 [141 170]]
              precision    recall  f1-score   support

           0       0.67      0.59      0.63       473
           1       0.47      0.55      0.50       311

    accuracy                           0.57       784
   macro avg       0.57      0.57      0.57       784
weighted avg       0.59      0.57      0.58       784

Acc : 0.5739795918367347	 F1: 0.5654246244332477


Training Epoch 14: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 96.45it/s]


	Train Loss: 0.007 | Train Acc: 75.77%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 282.35it/s]


[[265 208]
 [131 180]]
              precision    recall  f1-score   support

           0       0.67      0.56      0.61       473
           1       0.46      0.58      0.52       311

    accuracy                           0.57       784
   macro avg       0.57      0.57      0.56       784
weighted avg       0.59      0.57      0.57       784

Acc : 0.5676020408163265	 F1: 0.5624589459543553


Training Epoch 15: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 95.28it/s]


	Train Loss: 0.007 | Train Acc: 74.51%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 295.18it/s]


[[271 202]
 [150 161]]
              precision    recall  f1-score   support

           0       0.64      0.57      0.61       473
           1       0.44      0.52      0.48       311

    accuracy                           0.55       784
   macro avg       0.54      0.55      0.54       784
weighted avg       0.56      0.55      0.56       784

Acc : 0.5510204081632653	 F1: 0.542004394612285


Training Epoch 16: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 94.16it/s]


	Train Loss: 0.007 | Train Acc: 74.82%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 295.18it/s]


[[290 183]
 [137 174]]
              precision    recall  f1-score   support

           0       0.68      0.61      0.64       473
           1       0.49      0.56      0.52       311

    accuracy                           0.59       784
   macro avg       0.58      0.59      0.58       784
weighted avg       0.60      0.59      0.60       784

Acc : 0.5918367346938775	 F1: 0.5827012641383897


Training Epoch 17: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 94.58it/s]


	Train Loss: 0.007 | Train Acc: 74.82%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 295.18it/s]


[[303 170]
 [152 159]]
              precision    recall  f1-score   support

           0       0.67      0.64      0.65       473
           1       0.48      0.51      0.50       311

    accuracy                           0.59       784
   macro avg       0.57      0.58      0.57       784
weighted avg       0.59      0.59      0.59       784

Acc : 0.5892857142857143	 F1: 0.5749461206896551


Training Epoch 18: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 94.94it/s]


	Train Loss: 0.007 | Train Acc: 75.29%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 288.63it/s]


[[287 186]
 [144 167]]
              precision    recall  f1-score   support

           0       0.67      0.61      0.63       473
           1       0.47      0.54      0.50       311

    accuracy                           0.58       784
   macro avg       0.57      0.57      0.57       784
weighted avg       0.59      0.58      0.58       784

Acc : 0.5790816326530612	 F1: 0.5689839002025803


Training Epoch 19: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 94.18it/s]


	Train Loss: 0.007 | Train Acc: 76.32%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 282.36it/s]


[[346 127]
 [179 132]]
              precision    recall  f1-score   support

           0       0.66      0.73      0.69       473
           1       0.51      0.42      0.46       311

    accuracy                           0.61       784
   macro avg       0.58      0.58      0.58       784
weighted avg       0.60      0.61      0.60       784

Acc : 0.6096938775510204	 F1: 0.5782723341419681


Training Epoch 20: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 94.82it/s]


	Train Loss: 0.007 | Train Acc: 75.66%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 309.25it/s]

[[279 194]
 [133 178]]
              precision    recall  f1-score   support

           0       0.68      0.59      0.63       473
           1       0.48      0.57      0.52       311

    accuracy                           0.58       784
   macro avg       0.58      0.58      0.58       784
weighted avg       0.60      0.58      0.59       784

Acc : 0.5829081632653061	 F1: 0.5758691714023376



