In [1]:
import pandas as pd
import torchtext
from torchtext.vocab import Vectors
from torchtext.legacy import data
import torch
from torch import nn
import torch.nn.functional as F
from d2l import torch as d2l
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from sklearn import metrics
import sys
sys.path.append("D:/Experiment")
from tqdm import tqdm
from MyKu import training
from MyKu import processing
from torchtext.vocab import Vectors
from spacy.lang.en import English

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)


cuda:0


In [2]:
processing.create_OLID()
# processing.set_olid_train_data(processing.ORIGIN_DATASET_PATH + '/OLID')
# processing.set_olid_testA_data(processing.ORIGIN_DATASET_PATH + '/OLID')


In [2]:
def tokenizer(text):  # create a tokenizer function
    """
    定义分词操作
    """
    return processing.Pre_processing_tweets().tokenize_process(text)


def DataLoader():

    TEXT = data.Field(sequential=True, tokenize=tokenizer,
                      lower=True, include_lengths=True, fix_length=20)
    LABEL = data.Field(sequential=False, use_vocab=False)
    # 假如train.csv文件并不是只有两列，比如1、3列是review和polarity，2列是我们不需要的数据，
    # 那么就要添加一个全是None的元组， fields列表存储的Field的顺序必须和csv文件中每一列的顺序对应，

    train_fields = [(None, None), ('label', LABEL),  ('tweet', TEXT)]
    # train_fields = [(None, None), (None, None), (None, None), (None, None), ('text', TEXT), ('task1', LABEL)]
    train_data = data.TabularDataset(
        path='D:/Experiment/datasets/SEM2018/train.tsv',
        # path='D:/Experiment/datasets/EXIST2021/train.tsv',
        format='tsv',
        fields=train_fields,
        skip_header=True  # 是否跳过文件的第一行
    )
    test_fields = [(None, None), ('label', LABEL),  ('tweet', TEXT)]
    # test_fields = [(None, None), (None, None), (None, None), (None, None), ('text', TEXT), ('task1', LABEL)]
    test_data = data.TabularDataset(
        path='D:/Experiment/datasets/SEM2018/test.tsv',
        # path='D:/Experiment/datasets/EXIST2021/test.tsv',
        format='tsv',
        fields=test_fields,
        skip_header=True  # 是否跳过文件的第一行
    )
    return train_data, test_data, TEXT, LABEL


In [3]:
train_data, test_data, TEXT, LABEL = DataLoader()


vectors = Vectors(name='glove.6B.300d.txt', cache=processing.EMBEDDING_PATH)

TEXT.build_vocab(train_data,  # 建词表是用训练集建，不要用验证集和测试集
                  max_size=400000, # 单词表容量
                  vectors=vectors, # 还有'glove.840B.300d'已经很多可以选
                  unk_init=torch.Tensor.normal_ # 初始化train_data中不存在预训练词向量词表中的单词
)

In [4]:
from operator import truediv

# BiLSTM + Attention 
class BiRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, n_class, bidirectional, dropout, **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(
            embed_size, num_hiddens, num_layers=num_layers, bidirectional=True, dropout=dropout, batch_first=True)
        self.n_class = n_class  # 类别数
        self.bidirectional = bidirectional  # 控制是否双向LSTM
        if self.bidirectional:
            self.decoder1 = nn.Linear(num_hiddens * 2, num_hiddens)
            self.decoder2 = nn.Linear(num_hiddens, n_class)
        else:
            self.decoder1 = nn.Linear(num_hiddens * 2, num_hiddens)
            self.decoder2 = nn.Linear(num_hiddens, n_class)
        self.weight_W = nn.Parameter(torch.Tensor(2 * num_hiddens, 2 * num_hiddens))
        self.weight_proj = nn.Parameter(torch.Tensor(2 * num_hiddens, 1))
        self.t = nn.Parameter(torch.Tensor(num_hiddens, 2))
        nn.init.uniform_(self.weight_W, -0.1, 0.1)
        nn.init.uniform_(self.weight_proj, -0.1, 0.1)


    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        # print(embeddings.shape)
        self.encoder.flatten_parameters()
        outputs, _ = self.encoder(embeddings.permute(1, 0, 2))
        # print(outputs.shape)
        u = torch.tanh(torch.matmul(outputs, self.weight_W))
        # print(u.shape)
        att = torch.matmul(u, self.weight_proj)
        att_score = F.softmax(att, dim=1)
        # print(att_score.shape)
        score_x = outputs * att_score
        encoding = torch.sum(score_x, dim=1)
        # encoding = torch.cat((outputs[0], outputs[-1]), dim=1)
        outs = self.decoder1(encoding)
        outs = self.decoder2(outs)
        # print(outs.shape)
        return outs


In [24]:
# X = torch.ones((4, 6), dtype=torch.long).to(device=DEVICE)
model = BiRNN(len(TEXT.vocab), 300, 200, 1, 2, True, 0.7)
model.to(DEVICE)



BiRNN(
  (embedding): Embedding(9199, 300)
  (encoder): LSTM(300, 200, batch_first=True, dropout=0.7, bidirectional=True)
  (decoder1): Linear(in_features=400, out_features=200, bias=True)
  (decoder2): Linear(in_features=200, out_features=2, bias=True)
)

In [25]:
pretrained_embeddings = TEXT.vocab.vectors
# print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(300)
model.embedding.weight.data[PAD_IDX] = torch.zeros(300)

In [7]:
train_iter, test_iter = data.BucketIterator.splits(
    (train_data, test_data), batch_size=64, sort_within_batch=True, sort_key=lambda x : len(x.tweet), device=DEVICE
)

In [8]:
def train(model, train_iter, optimizer, loss, epoch):
    model.train()
    epoch_loss = 0
    num_sample = 0
    correct = 0
    for batch in tqdm(train_iter, desc=f"Training Epoch {epoch}", colour='red'):
        optimizer.zero_grad()
        text, text_len = batch.tweet
        label = batch.label
        # text, text_len = batch.text
        # label = batch.task1
        output = model(text)
        pred_y = torch.argmax(output, dim=1)
        correct += torch.sum(pred_y == label)
        l = loss(output, label)
        l.backward()
        epoch_loss += l.item()
        num_sample += len(batch)
        optimizer.step()
    print(
        f'\tTrain Loss: {epoch_loss / num_sample:.3f} | Train Acc: {correct.float() / num_sample* 100:.2f}%')


def test(model, test_iter):
    true_y, pred_y = [], []
    for batch in tqdm(test_iter, desc=f"Testing", colour='green'):
        text, text_len = batch.tweet
        label = batch.label
        # text, text_len = batch.text
        # label = batch.task1
        with torch.no_grad():
            output = model(text)
            pred_y.extend(output.argmax(dim=1).tolist())
            true_y.extend(label.tolist())
    print(metrics.confusion_matrix(true_y, pred_y))
    print(metrics.classification_report(true_y, pred_y))
    print(f'Acc : {metrics.accuracy_score(true_y, pred_y)}\t F1: {metrics.f1_score(true_y, pred_y, average="macro")}')


In [26]:
lr, num_epochs = 0.0001, 20
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
for epoch in range(1, num_epochs+1):
    train(model, train_iter, optimizer, loss, epoch)
    test(model, test_iter)

Training Epoch 1: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 107.12it/s]


	Train Loss: 0.011 | Train Acc: 51.30%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 282.35it/s]


[[ 27 446]
 [  7 304]]
              precision    recall  f1-score   support

           0       0.79      0.06      0.11       473
           1       0.41      0.98      0.57       311

    accuracy                           0.42       784
   macro avg       0.60      0.52      0.34       784
weighted avg       0.64      0.42      0.29       784

Acc : 0.4221938775510204	 F1: 0.33977658678593936


Training Epoch 2: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 104.34it/s]


	Train Loss: 0.011 | Train Acc: 57.09%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 371.09it/s]


[[150 323]
 [ 49 262]]
              precision    recall  f1-score   support

           0       0.75      0.32      0.45       473
           1       0.45      0.84      0.58       311

    accuracy                           0.53       784
   macro avg       0.60      0.58      0.52       784
weighted avg       0.63      0.53      0.50       784

Acc : 0.5255102040816326	 F1: 0.515625


Training Epoch 3: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 110.29it/s]


	Train Loss: 0.010 | Train Acc: 60.10%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 360.78it/s]


[[196 277]
 [ 67 244]]
              precision    recall  f1-score   support

           0       0.75      0.41      0.53       473
           1       0.47      0.78      0.59       311

    accuracy                           0.56       784
   macro avg       0.61      0.60      0.56       784
weighted avg       0.64      0.56      0.55       784

Acc : 0.5612244897959183	 F1: 0.5595735785953178


Training Epoch 4: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 104.62it/s]


	Train Loss: 0.010 | Train Acc: 65.23%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 319.91it/s]


[[296 177]
 [107 204]]
              precision    recall  f1-score   support

           0       0.73      0.63      0.68       473
           1       0.54      0.66      0.59       311

    accuracy                           0.64       784
   macro avg       0.63      0.64      0.63       784
weighted avg       0.66      0.64      0.64       784

Acc : 0.6377551020408163	 F1: 0.6326972312402671


Training Epoch 5: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 103.13it/s]


	Train Loss: 0.009 | Train Acc: 69.48%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 371.09it/s]


[[341 132]
 [137 174]]
              precision    recall  f1-score   support

           0       0.71      0.72      0.72       473
           1       0.57      0.56      0.56       311

    accuracy                           0.66       784
   macro avg       0.64      0.64      0.64       784
weighted avg       0.66      0.66      0.66       784

Acc : 0.6568877551020408	 F1: 0.6405796508665279


Training Epoch 6: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 111.98it/s]


	Train Loss: 0.009 | Train Acc: 72.75%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 360.78it/s]


[[350 123]
 [143 168]]
              precision    recall  f1-score   support

           0       0.71      0.74      0.72       473
           1       0.58      0.54      0.56       311

    accuracy                           0.66       784
   macro avg       0.64      0.64      0.64       784
weighted avg       0.66      0.66      0.66       784

Acc : 0.6607142857142857	 F1: 0.6413886080215706


Training Epoch 7: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 112.26it/s]


	Train Loss: 0.008 | Train Acc: 76.19%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 371.09it/s]


[[370 103]
 [149 162]]
              precision    recall  f1-score   support

           0       0.71      0.78      0.75       473
           1       0.61      0.52      0.56       311

    accuracy                           0.68       784
   macro avg       0.66      0.65      0.65       784
weighted avg       0.67      0.68      0.67       784

Acc : 0.6785714285714286	 F1: 0.6542338709677419


Training Epoch 8: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 111.49it/s]


	Train Loss: 0.007 | Train Acc: 79.49%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 341.79it/s]


[[319 154]
 [122 189]]
              precision    recall  f1-score   support

           0       0.72      0.67      0.70       473
           1       0.55      0.61      0.58       311

    accuracy                           0.65       784
   macro avg       0.64      0.64      0.64       784
weighted avg       0.65      0.65      0.65       784

Acc : 0.6479591836734694	 F1: 0.6380061429747255


Training Epoch 9: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 111.36it/s]


	Train Loss: 0.006 | Train Acc: 82.60%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 341.79it/s]


[[306 167]
 [104 207]]
              precision    recall  f1-score   support

           0       0.75      0.65      0.69       473
           1       0.55      0.67      0.60       311

    accuracy                           0.65       784
   macro avg       0.65      0.66      0.65       784
weighted avg       0.67      0.65      0.66       784

Acc : 0.6543367346938775	 F1: 0.6487356473865638


Training Epoch 10: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 111.95it/s]


	Train Loss: 0.005 | Train Acc: 85.12%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 371.09it/s]


[[331 142]
 [124 187]]
              precision    recall  f1-score   support

           0       0.73      0.70      0.71       473
           1       0.57      0.60      0.58       311

    accuracy                           0.66       784
   macro avg       0.65      0.65      0.65       784
weighted avg       0.66      0.66      0.66       784

Acc : 0.6607142857142857	 F1: 0.6488685344827586


Training Epoch 11: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 112.77it/s]


	Train Loss: 0.005 | Train Acc: 87.69%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 282.35it/s]


[[280 193]
 [ 92 219]]
              precision    recall  f1-score   support

           0       0.75      0.59      0.66       473
           1       0.53      0.70      0.61       311

    accuracy                           0.64       784
   macro avg       0.64      0.65      0.63       784
weighted avg       0.66      0.64      0.64       784

Acc : 0.6364795918367347	 F1: 0.6342655110609148


Training Epoch 12: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 111.42it/s]


	Train Loss: 0.004 | Train Acc: 88.71%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 371.09it/s]


[[318 155]
 [116 195]]
              precision    recall  f1-score   support

           0       0.73      0.67      0.70       473
           1       0.56      0.63      0.59       311

    accuracy                           0.65       784
   macro avg       0.64      0.65      0.65       784
weighted avg       0.66      0.65      0.66       784

Acc : 0.6543367346938775	 F1: 0.6456139590043484


Training Epoch 13: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 108.68it/s]


	Train Loss: 0.003 | Train Acc: 92.32%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 371.09it/s]


[[289 184]
 [ 95 216]]
              precision    recall  f1-score   support

           0       0.75      0.61      0.67       473
           1       0.54      0.69      0.61       311

    accuracy                           0.64       784
   macro avg       0.65      0.65      0.64       784
weighted avg       0.67      0.64      0.65       784

Acc : 0.6441326530612245	 F1: 0.6410203388328435


Training Epoch 14: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 114.18it/s]


	Train Loss: 0.003 | Train Acc: 93.63%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 360.78it/s]


[[277 196]
 [ 74 237]]
              precision    recall  f1-score   support

           0       0.79      0.59      0.67       473
           1       0.55      0.76      0.64       311

    accuracy                           0.66       784
   macro avg       0.67      0.67      0.65       784
weighted avg       0.69      0.66      0.66       784

Acc : 0.6556122448979592	 F1: 0.6547134356404636


Training Epoch 15: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 109.87it/s]


	Train Loss: 0.002 | Train Acc: 94.76%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 309.24it/s]


[[292 181]
 [102 209]]
              precision    recall  f1-score   support

           0       0.74      0.62      0.67       473
           1       0.54      0.67      0.60       311

    accuracy                           0.64       784
   macro avg       0.64      0.64      0.63       784
weighted avg       0.66      0.64      0.64       784

Acc : 0.639030612244898	 F1: 0.6349390473651909


Training Epoch 16: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 107.40it/s]


	Train Loss: 0.001 | Train Acc: 96.73%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 309.24it/s]


[[335 138]
 [125 186]]
              precision    recall  f1-score   support

           0       0.73      0.71      0.72       473
           1       0.57      0.60      0.59       311

    accuracy                           0.66       784
   macro avg       0.65      0.65      0.65       784
weighted avg       0.67      0.66      0.67       784

Acc : 0.6645408163265306	 F1: 0.6519701918289154


Training Epoch 17: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 93.52it/s]


	Train Loss: 0.001 | Train Acc: 97.17%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 302.05it/s]


[[317 156]
 [114 197]]
              precision    recall  f1-score   support

           0       0.74      0.67      0.70       473
           1       0.56      0.63      0.59       311

    accuracy                           0.66       784
   macro avg       0.65      0.65      0.65       784
weighted avg       0.67      0.66      0.66       784

Acc : 0.6556122448979592	 F1: 0.647350463802111


Training Epoch 18: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 93.86it/s]


	Train Loss: 0.001 | Train Acc: 98.09%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 276.34it/s]


[[291 182]
 [ 98 213]]
              precision    recall  f1-score   support

           0       0.75      0.62      0.68       473
           1       0.54      0.68      0.60       311

    accuracy                           0.64       784
   macro avg       0.64      0.65      0.64       784
weighted avg       0.67      0.64      0.65       784

Acc : 0.6428571428571429	 F1: 0.6392867236744378


Training Epoch 19: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 89.67it/s]


	Train Loss: 0.001 | Train Acc: 97.41%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 324.70it/s]


[[323 150]
 [120 191]]
              precision    recall  f1-score   support

           0       0.73      0.68      0.71       473
           1       0.56      0.61      0.59       311

    accuracy                           0.66       784
   macro avg       0.64      0.65      0.65       784
weighted avg       0.66      0.66      0.66       784

Acc : 0.6556122448979592	 F1: 0.6455648726123182


Training Epoch 20: 100%|[31m██████████[0m| 60/60 [00:00<00:00, 100.43it/s]


	Train Loss: 0.001 | Train Acc: 98.72%


Testing: 100%|[32m██████████[0m| 13/13 [00:00<00:00, 382.00it/s]

[[318 155]
 [114 197]]
              precision    recall  f1-score   support

           0       0.74      0.67      0.70       473
           1       0.56      0.63      0.59       311

    accuracy                           0.66       784
   macro avg       0.65      0.65      0.65       784
weighted avg       0.67      0.66      0.66       784

Acc : 0.6568877551020408	 F1: 0.6485154537803222



