In [1]:
import torch
import math
from d2l import torch as d2l
from torch import nn
import pandas as pd
import sys
sys.path.append("D:/Experiment")
from MyKu import training
from MyKu import processing
from tqdm import tqdm

In [29]:
def get_train_data():
    training_data = pd.read_csv('data\\EXIST2021_training.tsv', sep='\t')
    train_data = []
    for index in range(3437):
        items = training_data.iloc[index]
        text = items['text']
        text = processing.Pre_processing_tweets().clean_unuseful(text)
        label = 1 if items['task1'] == 'sexist' else 0
        train_data.append((text, label))
    return train_data

def get_test_data():
    testing_data = pd.read_csv('data\\EXIST2021_test_labeled.tsv', sep='\t')
    test_data = []
    for index in range(1000):
        items = testing_data.iloc[index]
        text = items['text']
        text = processing.Pre_processing_tweets().clean_unuseful(text)
        label = 1 if items['task1'] == 'sexist' else 0
        test_data.append((text, label))
    return test_data

In [3]:
glove_embedding = processing.TokenEmbedding('glove.42b.300d')


In [30]:
from torch.utils.data import Dataset, DataLoader
tokenizer = processing.Pre_processing_tweets()
class BiLSTMDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.data_size = len(dataset)

    def __len__(self):
        return self.data_size

    def __getitem__(self, index):
        # 这里可以自行定义，Dataloader会使用__getitem__(self, index)获取数据
        # 这里我设置 self.dataset[index] 规定了数据是按序号取得，序号是多少DataLoader自己算，用户不用操心
        return self.dataset[index]


def coffate_fn(examples):
    inputs, targets = [], []
    for sent, polar in examples:
        inputs.append(sent)
        targets.append(polar)
    
    # inputs = tokenizer(inputs,
    #                    padding=True,
    #                    truncation=True,
    #                    return_tensors="pt",
    #                    max_length=80)
    inputs = d2l.tokenize(inputs, token='word')
    vocab = processing.Vocab(inputs, min_freq=3)
    inputs = [d2l.truncate_pad(vocab[line], 80, vocab['<pad>']) for line in inputs]
    inputs = torch.tensor(inputs)
    targets = torch.tensor(targets)
    return inputs, targets


def load_data():
    train_data = get_train_data()
    test_data = get_test_data()
    # train_data = tokenizer.tokenize_process(train_data[0])
    # train_dataset = BiLSTMDataset(train_data)
    # test_dataset = BiLSTMDataset(test_data)
    return train_data, test_data


In [37]:
class BiRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers, bidirectional=True, dropout=0.5)
        self.decoder = nn.Linear(num_hiddens * 4, 2)

    def forward(self, inputs):
        embeddings = self.embedding(inputs.T)
        outputs, _ = self.encoder(embeddings)
        encoding = torch.cat((outputs[0], outputs[-1]), dim=1)
        outs = self.decoder(encoding)
        return outs

In [65]:
batch_size = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_data, test_data = load_data()
train_dataset = BiLSTMDataset(train_data)
test_dataset = BiLSTMDataset(test_data)
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, collate_fn=coffate_fn, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=100, collate_fn=coffate_fn)

inputs = []
for sent, polar in train_data:
    inputs.append(sent)
train_tokens = tokenizer.tokenize_process(inputs)
vocab = processing.Vocab(train_tokens, min_freq=3)
len(vocab)

embed_size, num_hiddens, num_layers = 300, 100, 3

net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers)
net.to(device)
def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_normal_(m.weight)
    if type(m) == nn.LSTM:
        for param in m._flat_weights_names:
            if "weight" in param:
                nn.init.xavier_normal_(m._parameters[param])


net.apply(init_weights)

embeds = glove_embedding[vocab.idx_to_token]
embeds.shape
net.embedding.weight.data.copy_(embeds)
net.embedding.weight.requires_grad = False


In [69]:

from sklearn import metrics
def train(net, train_iter, loss, trainer, epoch, device):
    net.train()
    # 记录当前epoch的总loss
    # net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    for batch in tqdm(train_iter, desc=f"Training Epoch {epoch}", colour='red'):
        # tqdm(train_dataloader, desc=f"Training Epoch {epoch}") 会自动执行DataLoader的工作流程，
        # 想要知道内部如何工作可以在debug时将断点打在 coffate_fn 函数内部，查看数据的处理过程
        # 对batch中的每条tensor类型数据，都执行.to(device)，
        # 因为模型和数据要在同一个设备上才能运行
        inputs, targets = [x.to(device) for x in batch]
        # 清除现有的梯度
        trainer.zero_grad()
        # 模型前向传播，model(inputs)等同于model.forward(inputs)
        output = net(inputs)
        # 计算损失，交叉熵损失计算可参考：https://zhuanlan.zhihu.com/p/159477597
        l = loss(output, targets)
        # 梯度反向传播
        l.backward()
        # 根据反向传播的值更新模型的参数
        trainer.step()
        # 统计总的损失，.item()方法用于取出tensor中的值

def test(net, test_iter, epoch, device):
    true_y, pred_y = [], []
    for batch in tqdm(test_iter, desc=f"Testing", colour='green'):
        inputs, targets = [x.to(device) for x in batch]
        # with torch.no_grad(): 为固定写法，
        # 这个代码块中的全部有关tensor的操作都不产生梯度。目的是节省时间和空间，不加也没事
        with torch.no_grad():
            output = net(inputs)
            pred_y.extend(output.argmax(dim=1).tolist())
            true_y.extend(targets.tolist())
    # print(metrics.confusion_matrix(true_y, pred_y))
    print(f'epochs : {epoch}\n', metrics.confusion_matrix(
        true_y, pred_y))
    # print(f'epochs : {epoch}\n', metrics.classification_report(
    #     true_y, pred_y))
    # print(metrics.classification_report(true_y, pred_y))
    print(
        f'Acc : {metrics.accuracy_score(true_y, pred_y)}\t F1: {metrics.f1_score(true_y, pred_y, average="macro")}')
    # print(f'Acc : {metrics.accuracy_score(true_y, pred_y)}\t F1: {metrics.f1_score(true_y, pred_y)}\n',)


In [71]:
lr, num_epochs = 0.0001, 20
trainer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
for epoch in range(1, num_epochs + 1):
    train(net, train_dataloader, loss, trainer, epoch, device)
    test(net, test_dataloader, epoch, device)


Training Epoch 1: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 23.86it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 64.46it/s]


epochs : 1
 [[269 189]
 [275 267]]
Acc : 0.536	 F1: 0.5350701402805611


Training Epoch 2: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 26.42it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 68.47it/s]


epochs : 2
 [[318 140]
 [330 212]]
Acc : 0.53	 F1: 0.4742729306487695


Training Epoch 3: 100%|[31m██████████[0m| 27/27 [00:00<00:00, 27.06it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 59.48it/s]


epochs : 3
 [[287 171]
 [295 247]]
Acc : 0.534	 F1: 0.5145833333333333


Training Epoch 4: 100%|[31m██████████[0m| 27/27 [00:00<00:00, 27.26it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 68.43it/s]


epochs : 4
 [[325 133]
 [362 180]]
Acc : 0.505	 F1: 0.42105263157894735


Training Epoch 5: 100%|[31m██████████[0m| 27/27 [00:00<00:00, 27.34it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 65.06it/s]


epochs : 5
 [[311 147]
 [322 220]]
Acc : 0.531	 F1: 0.484048404840484


Training Epoch 6: 100%|[31m██████████[0m| 27/27 [00:00<00:00, 27.51it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 67.25it/s]


epochs : 6
 [[310 148]
 [322 220]]
Acc : 0.53	 F1: 0.4835164835164835


Training Epoch 7: 100%|[31m██████████[0m| 27/27 [00:00<00:00, 27.02it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 67.51it/s]


epochs : 7
 [[302 156]
 [321 221]]
Acc : 0.523	 F1: 0.48095756256800865


Training Epoch 8: 100%|[31m██████████[0m| 27/27 [00:00<00:00, 27.40it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 67.51it/s]


epochs : 8
 [[332 126]
 [360 182]]
Acc : 0.514	 F1: 0.42823529411764705


Training Epoch 9: 100%|[31m██████████[0m| 27/27 [00:00<00:00, 27.68it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 68.43it/s]


epochs : 9
 [[302 156]
 [326 216]]
Acc : 0.518	 F1: 0.47264770240700216


Training Epoch 10: 100%|[31m██████████[0m| 27/27 [00:00<00:00, 27.03it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 73.46it/s]


epochs : 10
 [[294 164]
 [319 223]]
Acc : 0.517	 F1: 0.480086114101184


Training Epoch 11: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 24.78it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 66.16it/s]


epochs : 11
 [[304 154]
 [317 225]]
Acc : 0.529	 F1: 0.48859934853420195


Training Epoch 12: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 24.61it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 65.73it/s]


epochs : 12
 [[318 140]
 [351 191]]
Acc : 0.509	 F1: 0.43757159221076747


Training Epoch 13: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 23.92it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 69.38it/s]


epochs : 13
 [[318 140]
 [346 196]]
Acc : 0.514	 F1: 0.44646924829157175


Training Epoch 14: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 24.84it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 70.36it/s]


epochs : 14
 [[259 199]
 [268 274]]
Acc : 0.533	 F1: 0.5399014778325124


Training Epoch 15: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 24.56it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 65.48it/s]


epochs : 15
 [[299 159]
 [314 228]]
Acc : 0.527	 F1: 0.4908503767491927


Training Epoch 16: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 25.41it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 71.36it/s]


epochs : 16
 [[320 138]
 [344 198]]
Acc : 0.518	 F1: 0.4510250569476082


Training Epoch 17: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 25.71it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 72.40it/s]


epochs : 17
 [[291 167]
 [308 234]]
Acc : 0.525	 F1: 0.496288441145281


Training Epoch 18: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 25.75it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 72.40it/s]


epochs : 18
 [[296 162]
 [303 239]]
Acc : 0.535	 F1: 0.5068928950159067


Training Epoch 19: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 24.56it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 71.88it/s]


epochs : 19
 [[292 166]
 [315 227]]
Acc : 0.519	 F1: 0.48556149732620324


Training Epoch 20: 100%|[31m██████████[0m| 27/27 [00:01<00:00, 25.56it/s]
Testing: 100%|[32m██████████[0m| 10/10 [00:00<00:00, 71.59it/s]

epochs : 20
 [[319 139]
 [352 190]]
Acc : 0.509	 F1: 0.4362801377726751





In [26]:

temp = []
for index in train_dataloader:
    temp = index[0][0]
    break

print(temp.tolist())
print(vocab.to_tokens(temp.tolist()))

[0, 9, 2, 0, 0, 3, 34, 0, 11, 5, 35, 0, 3, 0, 0, 12, 0, 0, 47, 0, 13, 0, 0, 0, 0, 17, 0, 0, 0, 5, 0, 17, 3, 0, 36, 11, 18, 5, 0, 48, 17, 0, 6, 0, 3, 0, 3, 34, 0, 26, 37, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['<unk>', 'women', 'to', '<unk>', '<unk>', 'a', 'its', '<unk>', 'in', 'you', 'if', '<unk>', 'a', '<unk>', '<unk>', 'that', '<unk>', '<unk>', 'no', '<unk>', 'it', '<unk>', '<unk>', '<unk>', '<unk>', 'this', '<unk>', '<unk>', '<unk>', 'you', '<unk>', 'this', 'a', '<unk>', 'woman', 'in', 'me', 'you', '<unk>', 'what', 'this', '<unk>', 'i', '<unk>', 'a', '<unk>', 'a', 'its', '<unk>', 'dont', 'was', 'when', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']


In [24]:
import torch
X = torch.tensor([[0.9,0.1],[0.4, 0.6]])
import numpy as np

In [26]:


from d2l import torch as d2l
device = d2l.try_gpu()
tt = X.to(device)
tt = np.argmax(tt.cpu(), axis=1)
y =  torch.tensor([1, 1])
y = y.to(device)


In [23]:
from sklearn import metrics
metrics.f1_score(tt.cpu().numpy(), y.cpu().numpy())

0.6666666666666666

In [6]:
def sldd(tokens):
    return [12,12,12], [11,11,11]

ss = sldd(12)
ss + ([11,1,1],)

([12, 12, 12], [11, 11, 11], [11, 1, 1])