In [1]:
import torch
import torch.nn as nn

import transformers

from transformers import pipeline
from datasets import load_dataset

import random

In [46]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

# Load the balanced sexism dataset (all of its splits).
raw_datasets = load_dataset("yangezheng/tum-nlp-sexism-socialmedia-balanced")

# Keep only rows whose text is present and at most 500 characters long.
# The None check comes first so len() is never applied to a missing text,
# and a single predicate means the dataset is scanned once instead of twice.
raw_datasets = raw_datasets.filter(
    lambda x: x['text'] is not None and len(x['text']) <= 500
)

# Tokenizer for the encoder checkpoint used throughout the notebook.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(row):
    """Run the module-level ``tokenizer`` over the ``"text"`` entry of ``row``.

    Args:
        row: mapping with a ``"text"`` entry (a batch of texts when used
            with ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns when called with ``truncation=True``.
    """
    texts = row["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize the filtered data in batches, so `tokenize_function` receives
# many rows per call.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

# Padding collator built from the same tokenizer; the DataLoaders use it
# as their `collate_fn`.
data_collator = DataCollatorWithPadding(tokenizer)

In [47]:
# Drop the raw string column and expose the target under the name "labels",
# which is the keyword argument the model's forward() accepts.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label_sexist", "labels")
)
# Have the datasets hand back torch tensors.
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [48]:
from torch.utils.data import DataLoader

# Training batches are shuffled; validation batches keep the dataset order.
# Both loaders use batches of 8 and the shared padding collator.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    collate_fn=data_collator,
    batch_size=8,
    shuffle=True,
)
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    collate_fn=data_collator,
    batch_size=8,
)

In [50]:
# Pull a single batch from the training loader. `batch` is reused further
# down to exercise the model; `b` maps each field name to its tensor shape.
# next(iter(...)) fails right here if the loader is empty, instead of leaving
# `batch` undefined for the following line.
batch = next(iter(train_dataloader))
b = {k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


### Model Head

In [59]:
import torch.nn as nn
from transformers import AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss

class MyBERTModel(nn.Module):
    """``bert-base-uncased`` encoder followed by a dropout + linear classifier.

    The classifier consumes the encoder's ``pooler_output`` and produces
    ``num_labels`` logits per example.

    Args:
        is_frozen: when True (default), gradients are disabled for every
            encoder parameter so only the classification head is trainable.
        num_labels: number of output classes (default 2).
        dropout: dropout probability applied before the linear layer
            (default 0.5).
    """

    def __init__(self, is_frozen=True, num_labels=2, dropout=0.5):
        super().__init__()

        self.num_labels = num_labels
        checkpoint = 'bert-base-uncased'
        self.base_model = AutoModel.from_pretrained(checkpoint)

        if is_frozen:
            self.freeze()

        self.classify = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.base_model.config.hidden_size, self.num_labels)
        )

    def freeze(self):
        """Set ``requires_grad = False`` on every parameter of the encoder."""
        for param in self.base_model.parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Encode the inputs and classify the pooled representation.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded unchanged
                to the encoder as keyword arguments.
            labels: optional class indices. When given, a cross-entropy loss
                is computed against the logits; when omitted, no loss is
                computed (plain inference).

        Returns:
            ``SequenceClassifierOutput`` with ``logits`` always set and
            ``loss`` set only when ``labels`` was provided (otherwise None).
        """
        model_outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooled_output = model_outputs['pooler_output']

        logits = self.classify(pooled_output)

        # `labels` defaults to None, so it must not be dereferenced
        # unconditionally: only build the loss when targets are supplied.
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits
        )
        
# Build the model with its defaults (frozen encoder) and run one forward
# pass on the sample batch as a smoke test; the result is the cell output.
model = MyBERTModel()
model(
    input_ids=batch['input_ids'],
    attention_mask=batch['attention_mask'],
    token_type_ids=batch['token_type_ids'],
    labels=batch['labels'],
)


SequenceClassifierOutput(loss=tensor(0.6889, grad_fn=<NllLossBackward0>), logits=tensor([[-0.3483, -0.6822],
        [ 0.2709, -0.1900],
        [ 0.1747, -0.0435],
        [ 0.3491, -0.4302],
        [ 0.4201, -0.3137],
        [ 0.6399,  0.0159],
        [ 0.4149, -0.1814],
        [ 0.4447, -0.2600]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### Optimizer & Scheduler

In [78]:
import torch
from torch.optim import AdamW
from transformers import get_scheduler

# Use PyTorch's AdamW: the AdamW class exported by `transformers` is
# deprecated and no longer available in recent releases.
# NOTE(review): eps and weight_decay are pinned to what the deprecated class
# used by default (1e-6 / 0.0) so the optimisation matches the old cell;
# confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# One scheduler step is taken per optimizer step, so the schedule length is
# the number of batches per epoch times the number of epochs.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

device


6105




device(type='cuda')

### Evaluator

In [81]:
import evaluate

def eval(model, loader):
    """Score ``model``'s argmax predictions over ``loader`` with the ``precision`` metric.

    NOTE(review): the name shadows Python's built-in ``eval``; it is kept
    because other cells call the function by this name.

    Side effects: the model is switched to eval mode and is NOT switched
    back, so callers that continue training must call ``model.train()``.
    Batches are moved to the module-level ``device``.

    Args:
        model: callable module accepting a batch as keyword arguments and
            returning an object with a ``logits`` attribute.
        loader: iterable of dict batches of tensors; each batch must
            contain a ``"labels"`` entry.

    Returns:
        The value of ``metric.compute()`` for the loaded metric.
    """
    precision_metric = evaluate.load("precision")
    model.eval()
    with torch.no_grad():
        for raw_batch in loader:
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            predicted = model(**on_device).logits.argmax(dim=-1)
            precision_metric.add_batch(
                predictions=predicted,
                references=on_device["labels"],
            )

    return precision_metric.compute()

# Precision on the validation loader before the training loop below has run.
eval(model, eval_dataloader)
        
            
    

{'precision': 0.625}

### Training

In [82]:
from tqdm import tqdm

# Fine-tune for `num_epochs` passes over the training loader, taking one
# optimizer step and one scheduler step per batch.
# The bar is a context manager so it is flushed and closed before the final
# evaluation prints its result; otherwise a stale, unfinished bar is left in
# the output.
with tqdm(total=num_training_steps) as progress_bar:
    for epoch in range(num_epochs):
        # Re-assert train mode each epoch: eval() leaves the model in eval mode.
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            # Clear gradients so they do not accumulate into the next batch.
            optimizer.zero_grad()

            progress_bar.update(1)

# Precision on the validation loader after training; shown as the cell output.
eval(model, eval_dataloader)


100%|█████████▉| 6101/6105 [00:57<00:00, 101.23it/s]

{'precision': 0.6432506887052342}

100%|██████████| 6105/6105 [01:09<00:00, 101.23it/s]