In [2]:
import torch
import math
from d2l import torch as d2l
from torch import nn
import pandas as pd
import sys
sys.path.append("D:/Experiment")
from MyKu import training
from MyKu import processing
from tqdm import tqdm


In [3]:
from sklearn.model_selection import train_test_split

data = processing.get_HateXplain_train_data()
train_data, test_data = train_test_split(data, test_size=0.2)


In [4]:
glove_embedding = processing.TokenEmbedding('glove.6b.300d')


In [5]:
from torch.utils.data import Dataset, DataLoader
class BiLSTMDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.data_size = len(dataset)

    def __len__(self):
        return self.data_size

    def __getitem__(self, index):
        # 这里可以自行定义，Dataloader会使用__getitem__(self, index)获取数据
        # 这里我设置 self.dataset[index] 规定了数据是按序号取得，序号是多少DataLoader自己算，用户不用操心
        return self.dataset[index]


def coffate_fn(examples):
    inputs, targets = [], []
    for sent, polar in examples:
        inputs.append(sent)
        targets.append(polar)
    
    # inputs = tokenizer(inputs,
    #                    padding=True,
    #                    truncation=True,
    #                    return_tensors="pt",
    #                    max_length=80)
    inputs = d2l.tokenize(inputs, token='word')
    vocab = processing.Vocab(inputs, min_freq=3)
    inputs = [d2l.truncate_pad(vocab[line], 80, vocab['<pad>']) for line in inputs]
    inputs = torch.tensor(inputs)
    targets = torch.tensor(targets)
    return inputs, targets


In [6]:

class BiRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, output_dim, max_length, num_layers, dropout, **kwargs):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.LSTM = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers,
                            bidirectional=True, dropout=dropout, batch_first=True)
        self.n_class = output_dim
        self.decoder1 = nn.Linear(num_hiddens * 4, max_length)
        self.decoder2 = nn.Linear(num_hiddens, self.n_class)
        self.weight_W = nn.Parameter(torch.Tensor(embed_size, embed_size))
        self.weight_proj = nn.Parameter(torch.Tensor(embed_size, max_length))
        self.U = nn.Parameter(torch.Tensor(max_length, output_dim))
        self.V = nn.Parameter(torch.Tensor(max_length, output_dim))
        self.g = nn.Parameter(torch.Tensor(output_dim))
        self.W_f = nn.Parameter(torch.Tensor(output_dim, output_dim))
        self.bias = nn.Parameter(torch.Tensor(output_dim))
        nn.init.uniform_(self.weight_W, -0.1, 0.1)
        nn.init.uniform_(self.weight_proj, -0.1, 0.1)
        nn.init.uniform_(self.U, -0.1, 0.1)
        nn.init.uniform_(self.V, -0.1, 0.1)
        nn.init.uniform_(self.g, -0.1, 0.1)
        nn.init.uniform_(self.W_f, -0.1, 0.1)
        nn.init.uniform_(self.bias, -0.1, 0.1)

    def forward(self, inputs):  # inputs torch.Size([64, 40])
        # input torch.Size([64, 40, 300])
        input = self.embedding(inputs.permute(1, 0))
        # w torch.Size([64, 40, 300])
        w = torch.tanh(torch.matmul(input, self.weight_W))
        # w torch.Size([64, 40, 40])
        self_matching = torch.matmul(w, self.weight_proj)
        # att_score torch.Size([64, 40])
        att_score, idxs = torch.max(self_matching, dim=1)
        self.LSTM.flatten_parameters()
        outputs, _ = self.LSTM(input)  # outputs torch.Size([64, 11, 400])
        # output torch.Size([64, 800])
        output = torch.cat((outputs[:, 0, :], outputs[:, -1, :]), dim=1)
        output = self.decoder1(output)      # output torch.Size([64, max])
        # outs = self.decoder2(output)
        # self_matching_out torch.Size([64, max])
        self_matching_out = att_score.mul(output)
        # outs = torch.cat((output, self_matching_out), dim=-1)
        # output torch.Size([64, 2])
        f_a = torch.matmul(self_matching_out, self.U)
        f_b = torch.matmul(output, self.V)  # output torch.Size([64, 2])
        f = f_a.mul(f_b) + self.g   # output torch.Size([64, 2])
        outs = torch.softmax(torch.matmul(f, self.W_f) + self.bias, dim=1)
        return outs


In [7]:
batch_size = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataset = BiLSTMDataset(train_data)
test_dataset = BiLSTMDataset(test_data)
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, collate_fn=coffate_fn, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, collate_fn=coffate_fn)


inputs = []
for sent, polar in train_data:
    inputs.append(sent)
train_tokens = inputs
vocab = processing.Vocab(train_tokens, min_freq=3)
len(vocab)

num_hiddens, output_dim, max_length, num_layers, dropout = 100, 2, 60, 1, 0.5
net = BiRNN(len(vocab), 300, num_hiddens, output_dim, max_length, num_layers, dropout)
net.to(device)
def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_normal_(m.weight)
    if type(m) == nn.LSTM:
        for param in m._flat_weights_names:
            if "weight" in param:
                nn.init.xavier_normal_(m._parameters[param])


net.apply(init_weights)

embeds = glove_embedding[vocab.idx_to_token]
embeds.shape
net.embedding.weight.data.copy_(embeds)
net.embedding.weight.requires_grad = False




In [8]:
def train(model, train_iter, optimizer, loss, epoch):
    model.train()
    epoch_loss = 0
    num_sample = 0
    correct = 0
    for batch in tqdm(train_iter, desc=f"Training Epoch {epoch}", colour='red'):
        optimizer.zero_grad()
        text, text_len = batch.tweet
        label = batch.subtask_a
        # text, text_len = batch.text
        # label = batch.task1
        output = model(text)
        pred_y = torch.argmax(output, dim=1)
        correct += torch.sum(pred_y == label)
        l = loss(output, label)
        l.backward()
        epoch_loss += l.item()
        num_sample += len(batch)
        optimizer.step()
    print(
        f'\tTrain Loss: {epoch_loss / num_sample:.3f} | Train Acc: {correct.float() / num_sample* 100:.2f}%')


def test(model, test_iter):
    true_y, pred_y = [], []
    for batch in tqdm(test_iter, desc=f"Testing", colour='green'):
        text, text_len = batch.tweet
        label = batch.label
        # text, text_len = batch.text
        # label = batch.task1
        with torch.no_grad():
            output = model(text)
            pred_y.extend(output.argmax(dim=1).tolist())
            true_y.extend(label.tolist())
    print(metrics.confusion_matrix(true_y, pred_y))
    print(metrics.classification_report(true_y, pred_y))
    print(f'Acc : {metrics.accuracy_score(true_y, pred_y)}\t F1: {metrics.f1_score(true_y, pred_y, average="macro")}')


In [9]:
lr, num_epochs = 0.0001, 20
trainer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
for epoch in range(1, num_epochs + 1):
    train(net, train_dataloader, loss, trainer, epoch, device)
    test(net, test_dataloader, epoch, device)


Training Epoch 1:   0%|[31m          [0m| 0/252 [00:00<?, ?it/s]

: 

: 

In [None]:

temp = []
for index in train_dataloader:
    temp = index[0][0]
    break

print(temp.tolist())
print(vocab.to_tokens(temp.tolist()))

[0, 9, 2, 0, 0, 3, 34, 0, 11, 5, 35, 0, 3, 0, 0, 12, 0, 0, 47, 0, 13, 0, 0, 0, 0, 17, 0, 0, 0, 5, 0, 17, 3, 0, 36, 11, 18, 5, 0, 48, 17, 0, 6, 0, 3, 0, 3, 34, 0, 26, 37, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['<unk>', 'women', 'to', '<unk>', '<unk>', 'a', 'its', '<unk>', 'in', 'you', 'if', '<unk>', 'a', '<unk>', '<unk>', 'that', '<unk>', '<unk>', 'no', '<unk>', 'it', '<unk>', '<unk>', '<unk>', '<unk>', 'this', '<unk>', '<unk>', '<unk>', 'you', '<unk>', 'this', 'a', '<unk>', 'woman', 'in', 'me', 'you', '<unk>', 'what', 'this', '<unk>', 'i', '<unk>', 'a', '<unk>', 'a', 'its', '<unk>', 'dont', 'was', 'when', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']


In [None]:
import torch
X = torch.tensor([[0.9,0.1],[0.4, 0.6]])
import numpy as np

In [None]:


from d2l import torch as d2l
device = d2l.try_gpu()
tt = X.to(device)
tt = np.argmax(tt.cpu(), axis=1)
y =  torch.tensor([1, 1])
y = y.to(device)


In [None]:
from sklearn import metrics
metrics.f1_score(tt.cpu().numpy(), y.cpu().numpy())

0.6666666666666666

In [None]:
def sldd(tokens):
    return [12,12,12], [11,11,11]

ss = sldd(12)
ss + ([11,1,1],)

([12, 12, 12], [11, 11, 11], [11, 1, 1])