In [1]:
!pip install transformers datasets;

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 29.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 78.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 70.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 82.3 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 108.4 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json
import os
import pandas as pd
import torch
from datasets import load_dataset

In [4]:
data = load_dataset("json", data_files={split : f'/content/drive/MyDrive/advNLP/twibot20/{split}_bertweet_emb.json' 
                                        for split in ['train', 'test', 'valid']})



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-e6ed7220dcbc9308/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

   

Extracting data files #2:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-e6ed7220dcbc9308/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
data = data.remove_columns(["input_ids", "token_type_ids", "attention_mask"])

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, rep_list = [], []
    for example in batch:
      label_list.append(example['label'])
      rep_list.append(example['features'])

    label_list = torch.tensor(label_list, dtype=torch.float)
    rep_list = torch.nn.utils.rnn.pad_sequence(rep_list, batch_first=True, padding_value=0.0)
    return label_list.to(device), rep_list.to(device)

In [7]:
train_dataloader = torch.utils.data.DataLoader(data['train'].with_format("torch"), batch_size=64, collate_fn=collate_batch)
valid_dataloader = torch.utils.data.DataLoader(data['valid'].with_format("torch"), batch_size=64, collate_fn=collate_batch)
test_dataloader = torch.utils.data.DataLoader(data['test'].with_format("torch"), batch_size=64, collate_fn=collate_batch)

In [8]:
from torch import nn

class AllAttentionClassifier(nn.Module):

    def __init__(self, embed_dim, num_heads=6):
        super(AllAttentionClassifier, self).__init__()

        self.q_layer = nn.Linear(embed_dim, embed_dim)
        self.v_layer = nn.Linear(embed_dim, embed_dim)
        self.k_layer = nn.Linear(embed_dim, embed_dim)
        self.mha = nn.MultiheadAttention(embed_dim, num_heads)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, inputs):
        query = self.q_layer(inputs)
        value = self.v_layer(inputs)
        key = self.k_layer(inputs)

        attn_output, _ = self.mha(query, key, value)
        attn_output = self.dropout(attn_output)
        pooled_output = torch.mean(attn_output, dim=1)

        return torch.squeeze(self.fc(pooled_output))


class DescriptionClassifier(nn.Module):

    def __init__(self, embed_dim):
        super(DescriptionClassifier, self).__init__()

        self.pre1 = nn.Linear(embed_dim, embed_dim // 2)
        self.pre2 = nn.Linear(embed_dim // 2, embed_dim // 2)
        self.a1 = nn.ReLU()
        self.a2 = nn.ReLU()

        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(embed_dim // 2, 1)

    def forward(self, inputs):

        des, tweets = inputs[:, 0, :], inputs[:, 1:, 0]

        des = self.a1(self.pre1(des))
        des = self.a2(self.pre2(des))
        des = self.dropout(des)

        return torch.squeeze(self.fc(des))

In [12]:
LR = 4e-5
EPOCHS = 10

model = AllAttentionClassifier(embed_dim=768).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = torch.nn.BCEWithLogitsLoss()

In [13]:
import time
from tqdm import tqdm

def train(dataloader):
    model.train()
    total_loss, total_acc, total_count = 0, 0, 0
    log_interval = len(dataloader) // 3
    start_time = time.time()

    for idx, (label, inputs) in enumerate(tqdm(dataloader)):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        total_loss += label.size(0) * loss.item()
        total_acc += ((torch.sigmoid(outputs)>0.5).float() == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f} | loss {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count, total_loss/total_count))
            total_loss, total_acc, total_count = 0, 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_loss, total_acc, total_count = 0, 0, 0

    with torch.no_grad():
        for idx, (label, inputs) in enumerate(tqdm(dataloader)):
            predicted_label = model(inputs)
            loss = criterion(predicted_label, label)
            total_loss += label.size(0) * loss.item()
            total_acc += ((torch.sigmoid(predicted_label)>0.5).float() == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count, total_loss/total_count

In [14]:
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val, loss_val = evaluate(valid_dataloader)
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} | valid loss {:8.3f}'.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val, loss_val))
    print('-' * 59)

 34%|███▍      | 44/130 [00:13<00:25,  3.43it/s]

| epoch   1 |    43/  130 batches | accuracy    0.700 | loss    0.622


 67%|██████▋   | 87/130 [00:26<00:12,  3.33it/s]

| epoch   1 |    86/  130 batches | accuracy    0.573 | loss    0.687


100%|██████████| 130/130 [00:38<00:00,  3.36it/s]


| epoch   1 |   129/  130 batches | accuracy    0.536 | loss    0.683


100%|██████████| 37/37 [00:09<00:00,  3.82it/s]


-----------------------------------------------------------
| end of epoch   1 | time: 48.44s | valid accuracy    0.449 | valid loss    0.801
-----------------------------------------------------------


 34%|███▍      | 44/130 [00:13<00:24,  3.44it/s]

| epoch   2 |    43/  130 batches | accuracy    0.543 | loss    0.716


 67%|██████▋   | 87/130 [00:26<00:13,  3.25it/s]

| epoch   2 |    86/  130 batches | accuracy    0.573 | loss    0.685


100%|██████████| 130/130 [00:39<00:00,  3.32it/s]


| epoch   2 |   129/  130 batches | accuracy    0.547 | loss    0.691


100%|██████████| 37/37 [00:09<00:00,  3.84it/s]


-----------------------------------------------------------
| end of epoch   2 | time: 48.77s | valid accuracy    0.512 | valid loss    0.691
-----------------------------------------------------------


 34%|███▍      | 44/130 [00:13<00:24,  3.45it/s]

| epoch   3 |    43/  130 batches | accuracy    0.488 | loss    0.685


 67%|██████▋   | 87/130 [00:25<00:12,  3.31it/s]

| epoch   3 |    86/  130 batches | accuracy    0.573 | loss    0.676


100%|██████████| 130/130 [00:38<00:00,  3.37it/s]


| epoch   3 |   129/  130 batches | accuracy    0.409 | loss    0.698


100%|██████████| 37/37 [00:09<00:00,  3.85it/s]


-----------------------------------------------------------
| end of epoch   3 | time: 48.25s | valid accuracy    0.565 | valid loss    0.684
-----------------------------------------------------------


 34%|███▍      | 44/130 [00:13<00:25,  3.37it/s]

| epoch   4 |    43/  130 batches | accuracy    0.593 | loss    0.667


 67%|██████▋   | 87/130 [00:26<00:13,  3.18it/s]

| epoch   4 |    86/  130 batches | accuracy    0.573 | loss    0.677


100%|██████████| 130/130 [00:38<00:00,  3.36it/s]


| epoch   4 |   129/  130 batches | accuracy    0.397 | loss    0.698


100%|██████████| 37/37 [00:09<00:00,  3.86it/s]


-----------------------------------------------------------
| end of epoch   4 | time: 48.30s | valid accuracy    0.551 | valid loss    0.684
-----------------------------------------------------------


 34%|███▍      | 44/130 [00:13<00:25,  3.43it/s]

| epoch   5 |    43/  130 batches | accuracy    0.707 | loss    0.665


 67%|██████▋   | 87/130 [00:26<00:12,  3.37it/s]

| epoch   5 |    86/  130 batches | accuracy    0.573 | loss    0.676


100%|██████████| 130/130 [00:38<00:00,  3.36it/s]


| epoch   5 |   129/  130 batches | accuracy    0.397 | loss    0.702


100%|██████████| 37/37 [00:09<00:00,  3.81it/s]


-----------------------------------------------------------
| end of epoch   5 | time: 48.43s | valid accuracy    0.551 | valid loss    0.684
-----------------------------------------------------------


 34%|███▍      | 44/130 [00:13<00:25,  3.39it/s]

| epoch   6 |    43/  130 batches | accuracy    0.707 | loss    0.662


 67%|██████▋   | 87/130 [00:26<00:13,  3.30it/s]

| epoch   6 |    86/  130 batches | accuracy    0.573 | loss    0.675


100%|██████████| 130/130 [00:38<00:00,  3.36it/s]


| epoch   6 |   129/  130 batches | accuracy    0.397 | loss    0.697


100%|██████████| 37/37 [00:09<00:00,  3.89it/s]


-----------------------------------------------------------
| end of epoch   6 | time: 48.22s | valid accuracy    0.551 | valid loss    0.684
-----------------------------------------------------------


 34%|███▍      | 44/130 [00:13<00:25,  3.39it/s]

| epoch   7 |    43/  130 batches | accuracy    0.707 | loss    0.664


 67%|██████▋   | 87/130 [00:26<00:13,  3.17it/s]

| epoch   7 |    86/  130 batches | accuracy    0.573 | loss    0.673


100%|██████████| 130/130 [00:38<00:00,  3.35it/s]


| epoch   7 |   129/  130 batches | accuracy    0.397 | loss    0.697


100%|██████████| 37/37 [00:09<00:00,  3.82it/s]


-----------------------------------------------------------
| end of epoch   7 | time: 48.48s | valid accuracy    0.551 | valid loss    0.684
-----------------------------------------------------------


 18%|█▊        | 24/130 [00:07<00:33,  3.20it/s]


KeyboardInterrupt: ignored