In [1]:
import pandas as pd
import torchtext
from torchtext.vocab import Vectors
from torchtext.legacy import data
import torch
from torch import nn
import torch.nn.functional as F
from d2l import torch as d2l
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from sklearn import metrics
import sys
sys.path.append("D:/Experiment")
from tqdm import tqdm
from MyKu import training
from MyKu import processing
from torchtext.vocab import Vectors
from spacy.lang.en import English

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda:0


In [2]:
class textCNN(nn.Module):
    def __init__(self, vocab_size, embed_size, kernel_sizes, num_channels, dropout, **kwargs):
        super(textCNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Linear(sum(num_channels), 2)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.relu = nn.ReLU()
        self.convs = nn.ModuleList()
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(embed_size, c, k))
    
    def forward(self, inputs):
        inputs = inputs.permute(1, 0)
        embeddings = self.embedding(inputs)
        embeddings = embeddings.permute(0, 2, 1)
        encoding = torch.cat([torch.squeeze(self.relu(self.pool(conv(embeddings))), dim=-1) for conv in self.convs], dim=1)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [3]:
spacy_en = English()


def tokenizer(text):  # create a tokenizer function
    """
    定义分词操作
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]


def DataLoader():
    def tokenize(x): return x.split()

    TEXT = data.Field(sequential=True, tokenize=tokenizer,
                      lower=True, include_lengths=True, fix_length=40)
    LABEL = data.Field(sequential=False, use_vocab=False)
    # 假如train.csv文件并不是只有两列，比如1、3列是review和polarity，2列是我们不需要的数据，
    # 那么就要添加一个全是None的元组， fields列表存储的Field的顺序必须和csv文件中每一列的顺序对应，

    train_fields = [(None, None), ('tweet', TEXT), ('subtask_a', LABEL)]
    # train_fields = [(None, None), (None, None), (None, None), (None, None), ('text', TEXT), ('task1', LABEL)]
    train_data = data.TabularDataset(
        path='D:/Experiment/datasets/OLID/train.tsv',
        # path='D:/Experiment/datasets/EXIST2021/train.tsv',
        format='tsv',
        fields=train_fields,
        skip_header=True  # 是否跳过文件的第一行
    )
    test_fields = [(None, None), ('tweet', TEXT), ('label', LABEL)]
    # test_fields = [(None, None), (None, None), (None, None), (None, None), ('text', TEXT), ('task1', LABEL)]
    test_data = data.TabularDataset(
        path='D:/Experiment/datasets/OLID/testA.tsv',
        # path='D:/Experiment/datasets/EXIST2021/test.tsv',
        format='tsv',
        fields=test_fields,
        skip_header=True  # 是否跳过文件的第一行
    )
    return train_data, test_data, TEXT, LABEL

train_data, test_data, TEXT, LABEL = DataLoader()

vectors = Vectors(name='glove.6B.300d.txt', cache=processing.EMBEDDING_PATH)

TEXT.build_vocab(train_data,  # 建词表是用训练集建，不要用验证集和测试集
                 max_size=400000,  # 单词表容量
                 vectors=vectors,  # 还有'glove.840B.300d'已经很多可以选
                 unk_init=torch.Tensor.normal_  # 初始化train_data中不存在预训练词向量词表中的单词
                 )


In [4]:

embed_size, kernel_sizes, nums_channels, dropout = 300, [3, 4, 5], [100, 100, 100], 0.5
model = textCNN(len(TEXT.vocab), embed_size, kernel_sizes, nums_channels, dropout)
model.to(DEVICE)

def init_weights(m):
    if type(m) in (nn.Linear, nn.Conv1d):
        nn.init.xavier_uniform_(m.weight)

model.apply(init_weights);

pretrained_embeddings = TEXT.vocab.vectors
# print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(300)
model.embedding.weight.data[PAD_IDX] = torch.zeros(300)

In [5]:
train_iter, test_iter = data.BucketIterator.splits(
    (train_data, test_data), batch_size=64, sort_within_batch=True, sort_key=lambda x: len(x.tweet), device=DEVICE
)

In [6]:
def train(model, train_iter, optimizer, loss, epoch):
    model.train()
    epoch_loss = 0
    num_sample = 0
    correct = 0
    for batch in tqdm(train_iter, desc=f"Training Epoch {epoch}", colour='red'):
        optimizer.zero_grad()
        text, text_len = batch.tweet
        label = batch.subtask_a
        # text, text_len = batch.text
        # label = batch.task1
        output = model(text)
        pred_y = torch.argmax(output, dim=1)
        correct += torch.sum(pred_y == label)
        l = loss(output, label)
        l.backward()
        epoch_loss += l.item()
        num_sample += len(batch)
        optimizer.step()
    print(
        f'\tTrain Loss: {epoch_loss / num_sample:.3f} | Train Acc: {correct.float() / num_sample* 100:.2f}%')


def test(model, test_iter):
    true_y, pred_y = [], []
    for batch in tqdm(test_iter, desc=f"Testing", colour='green'):
        text, text_len = batch.tweet
        label = batch.label
        # text, text_len = batch.text
        # label = batch.task1
        with torch.no_grad():
            output = model(text)
            pred_y.extend(output.argmax(dim=1).tolist())
            true_y.extend(label.tolist())
    print(metrics.confusion_matrix(true_y, pred_y))
    print(metrics.classification_report(true_y, pred_y))
    print(f'Acc : {metrics.accuracy_score(true_y, pred_y)}\t F1: {metrics.f1_score(true_y, pred_y, average="macro")}')

In [7]:
lr, num_epochs = 0.0001, 20
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
for epoch in range(1, num_epochs+1):
    train(model, train_iter, optimizer, loss, epoch)
    test(model, test_iter)

Training Epoch 1: 100%|[31m██████████[0m| 207/207 [00:02<00:00, 77.98it/s] 


	Train Loss: 0.011 | Train Acc: 63.29%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 482.32it/s]


[[540  80]
 [161  79]]
              precision    recall  f1-score   support

           0       0.77      0.87      0.82       620
           1       0.50      0.33      0.40       240

    accuracy                           0.72       860
   macro avg       0.63      0.60      0.61       860
weighted avg       0.69      0.72      0.70       860

Acc : 0.7197674418604652	 F1: 0.6067762138123507


Training Epoch 2: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 116.36it/s]


	Train Loss: 0.009 | Train Acc: 71.15%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 466.23it/s]


[[547  73]
 [147  93]]
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       620
           1       0.56      0.39      0.46       240

    accuracy                           0.74       860
   macro avg       0.67      0.63      0.65       860
weighted avg       0.72      0.74      0.73       860

Acc : 0.7441860465116279	 F1: 0.6453501885717284


Training Epoch 3: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 118.05it/s]


	Train Loss: 0.008 | Train Acc: 74.62%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 482.32it/s]


[[577  43]
 [149  91]]
              precision    recall  f1-score   support

           0       0.79      0.93      0.86       620
           1       0.68      0.38      0.49       240

    accuracy                           0.78       860
   macro avg       0.74      0.65      0.67       860
weighted avg       0.76      0.78      0.75       860

Acc : 0.7767441860465116	 F1: 0.6719930711714647


Training Epoch 4: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 118.11it/s]


	Train Loss: 0.008 | Train Acc: 77.45%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 482.32it/s]


[[553  67]
 [128 112]]
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       620
           1       0.63      0.47      0.53       240

    accuracy                           0.77       860
   macro avg       0.72      0.68      0.69       860
weighted avg       0.76      0.77      0.76       860

Acc : 0.7732558139534884	 F1: 0.6923607505884037


Training Epoch 5: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 118.11it/s]


	Train Loss: 0.007 | Train Acc: 79.74%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 466.23it/s]


[[578  42]
 [136 104]]
              precision    recall  f1-score   support

           0       0.81      0.93      0.87       620
           1       0.71      0.43      0.54       240

    accuracy                           0.79       860
   macro avg       0.76      0.68      0.70       860
weighted avg       0.78      0.79      0.78       860

Acc : 0.7930232558139535	 F1: 0.7027134101343111


Training Epoch 6: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 118.20it/s]


	Train Loss: 0.007 | Train Acc: 81.10%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 482.31it/s]


[[586  34]
 [129 111]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88       620
           1       0.77      0.46      0.58       240

    accuracy                           0.81       860
   macro avg       0.79      0.70      0.73       860
weighted avg       0.80      0.81      0.79       860

Acc : 0.8104651162790698	 F1: 0.7272629991731117


Training Epoch 7: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 118.51it/s]


	Train Loss: 0.006 | Train Acc: 82.03%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 482.31it/s]


[[572  48]
 [122 118]]
              precision    recall  f1-score   support

           0       0.82      0.92      0.87       620
           1       0.71      0.49      0.58       240

    accuracy                           0.80       860
   macro avg       0.77      0.71      0.73       860
weighted avg       0.79      0.80      0.79       860

Acc : 0.8023255813953488	 F1: 0.7259524184417903


Training Epoch 8: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 118.99it/s]


	Train Loss: 0.006 | Train Acc: 83.86%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 482.32it/s]


[[577  43]
 [121 119]]
              precision    recall  f1-score   support

           0       0.83      0.93      0.88       620
           1       0.73      0.50      0.59       240

    accuracy                           0.81       860
   macro avg       0.78      0.71      0.73       860
weighted avg       0.80      0.81      0.80       860

Acc : 0.8093023255813954	 F1: 0.7338044225005473


Training Epoch 9: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 119.64it/s]


	Train Loss: 0.006 | Train Acc: 84.89%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 466.24it/s]


[[574  46]
 [116 124]]
              precision    recall  f1-score   support

           0       0.83      0.93      0.88       620
           1       0.73      0.52      0.60       240

    accuracy                           0.81       860
   macro avg       0.78      0.72      0.74       860
weighted avg       0.80      0.81      0.80       860

Acc : 0.8116279069767441	 F1: 0.7406069633215415


Training Epoch 10: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 119.88it/s]


	Train Loss: 0.005 | Train Acc: 86.19%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 466.24it/s]


[[582  38]
 [115 125]]
              precision    recall  f1-score   support

           0       0.84      0.94      0.88       620
           1       0.77      0.52      0.62       240

    accuracy                           0.82       860
   macro avg       0.80      0.73      0.75       860
weighted avg       0.82      0.82      0.81       860

Acc : 0.8220930232558139	 F1: 0.7520871369060067


Training Epoch 11: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 119.88it/s]


	Train Loss: 0.005 | Train Acc: 86.99%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 466.25it/s]


[[569  51]
 [118 122]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87       620
           1       0.71      0.51      0.59       240

    accuracy                           0.80       860
   macro avg       0.77      0.71      0.73       860
weighted avg       0.79      0.80      0.79       860

Acc : 0.8034883720930233	 F1: 0.7307476412166931


Training Epoch 12: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 116.86it/s]


	Train Loss: 0.005 | Train Acc: 88.31%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 466.24it/s]


[[588  32]
 [134 106]]
              precision    recall  f1-score   support

           0       0.81      0.95      0.88       620
           1       0.77      0.44      0.56       240

    accuracy                           0.81       860
   macro avg       0.79      0.70      0.72       860
weighted avg       0.80      0.81      0.79       860

Acc : 0.8069767441860465	 F1: 0.7185752923457841


Training Epoch 13: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 115.18it/s]


	Train Loss: 0.004 | Train Acc: 88.82%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 499.54it/s]


[[573  47]
 [116 124]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.88       620
           1       0.73      0.52      0.60       240

    accuracy                           0.81       860
   macro avg       0.78      0.72      0.74       860
weighted avg       0.80      0.81      0.80       860

Acc : 0.8104651162790698	 F1: 0.7394418948734105


Training Epoch 14: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 115.42it/s]


	Train Loss: 0.004 | Train Acc: 90.43%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 466.24it/s]


[[580  40]
 [121 119]]
              precision    recall  f1-score   support

           0       0.83      0.94      0.88       620
           1       0.75      0.50      0.60       240

    accuracy                           0.81       860
   macro avg       0.79      0.72      0.74       860
weighted avg       0.81      0.81      0.80       860

Acc : 0.8127906976744186	 F1: 0.7373069312190393


Training Epoch 15: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 116.30it/s]


	Train Loss: 0.004 | Train Acc: 91.06%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 466.23it/s]


[[578  42]
 [118 122]]
              precision    recall  f1-score   support

           0       0.83      0.93      0.88       620
           1       0.74      0.51      0.60       240

    accuracy                           0.81       860
   macro avg       0.79      0.72      0.74       860
weighted avg       0.81      0.81      0.80       860

Acc : 0.813953488372093	 F1: 0.7411899244635709


Training Epoch 16: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 118.69it/s]


	Train Loss: 0.003 | Train Acc: 92.01%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 451.20it/s]


[[584  36]
 [111 129]]
              precision    recall  f1-score   support

           0       0.84      0.94      0.89       620
           1       0.78      0.54      0.64       240

    accuracy                           0.83       860
   macro avg       0.81      0.74      0.76       860
weighted avg       0.82      0.83      0.82       860

Acc : 0.8290697674418605	 F1: 0.7626249823968455


Training Epoch 17: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 118.78it/s]


	Train Loss: 0.003 | Train Acc: 92.15%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 482.32it/s]


[[570  50]
 [109 131]]
              precision    recall  f1-score   support

           0       0.84      0.92      0.88       620
           1       0.72      0.55      0.62       240

    accuracy                           0.82       860
   macro avg       0.78      0.73      0.75       860
weighted avg       0.81      0.82      0.81       860

Acc : 0.8151162790697675	 F1: 0.749962971699407


Training Epoch 18: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 118.06it/s]


	Train Loss: 0.003 | Train Acc: 93.06%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 473.11it/s]


[[586  34]
 [112 128]]
              precision    recall  f1-score   support

           0       0.84      0.95      0.89       620
           1       0.79      0.53      0.64       240

    accuracy                           0.83       860
   macro avg       0.81      0.74      0.76       860
weighted avg       0.83      0.83      0.82       860

Acc : 0.8302325581395349	 F1: 0.7630210102748776


Training Epoch 19: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 118.71it/s]


	Train Loss: 0.003 | Train Acc: 94.27%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 437.09it/s]


[[555  65]
 [113 127]]
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       620
           1       0.66      0.53      0.59       240

    accuracy                           0.79       860
   macro avg       0.75      0.71      0.72       860
weighted avg       0.78      0.79      0.79       860

Acc : 0.7930232558139535	 F1: 0.724882102599494


Training Epoch 20: 100%|[31m██████████[0m| 207/207 [00:01<00:00, 118.72it/s]


	Train Loss: 0.003 | Train Acc: 94.63%


Testing: 100%|[32m██████████[0m| 14/14 [00:00<00:00, 499.54it/s]

[[570  50]
 [117 123]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87       620
           1       0.71      0.51      0.60       240

    accuracy                           0.81       860
   macro avg       0.77      0.72      0.73       860
weighted avg       0.80      0.81      0.80       860

Acc : 0.8058139534883721	 F1: 0.7339340596638328



