In [1]:
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
import warnings
import sys
sys.path.insert(0, '..')

# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides deprecation and numerical warnings from the training stack.
warnings.filterwarnings("ignore")
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks, each of which is converted into a text-to-text format. T5 works well on a variety of tasks out of the box by prepending a task-specific prefix to the input, e.g., for translation: `translate English to German: …`, for summarization: `summarize: …`.


In [2]:
# Checkpoint identifier shared by the tokenizer, the model and the output directory name below.
model_name = 't5-small'

In [3]:
from willbedeleted.make_dataset import FilteredDataset
from torch.utils.data import DataLoader

# Batch size used only for iterating the raw datasets in the dataloaders below.
batch_size = 32
# Two instances of the project dataset, selected by the `test` flag
# (presumably the train and test partitions — confirm in make_dataset).
train_dataset = FilteredDataset(test=False)
test_dataset = FilteredDataset(test=True)

# No shuffling is requested, so batches come out in dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

Now we can get rid of the toxicity classifier, since our future model will handle this itself.

In [4]:
def collect_pairs(dataloader):
    """Flatten a dataloader of (texts, labels) batches into two parallel lists.

    Args:
        dataloader: iterable yielding ``(batch, labels)`` pairs, where each
            element is itself an iterable of per-example values.

    Returns:
        dict with keys ``'text'`` and ``'label'``, each a flat list holding
        every example in iteration order.
    """
    collected = {'text': [], 'label': []}
    for batch, labels in tqdm(dataloader):
        # list.extend accepts any iterable, so no intermediate list copy is needed.
        collected['text'].extend(batch)
        collected['label'].extend(labels)
    return collected


# Same flattening for both splits; one helper replaces the two copy-pasted loops.
train_data = collect_pairs(train_dataloader)
test_data = collect_pairs(test_dataloader)


100%|██████████| 9829/9829 [00:00<00:00, 26908.18it/s]
100%|██████████| 518/518 [00:00<00:00, 28004.20it/s]


Split the training data into train and validation sets

In [5]:
from datasets import Dataset

# Rebinds `train_dataset`: from here on it is the result of train_test_split with
# 'train' and 'test' keys, where 'test' is the 20% hold-out later used as the
# evaluation set during training. The fixed seed keeps the split reproducible.
train_dataset = Dataset.from_dict(train_data).train_test_split(test_size=0.2, seed=42)
# Rebinds `test_dataset` as well; the dataloaders built earlier still wrap the original objects.
test_dataset = Dataset.from_dict(test_data)
# Display both objects to check features and row counts.
test_dataset, train_dataset

(Dataset({
     features: ['text', 'label'],
     num_rows: 16554
 }),
 DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 251612
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 62903
     })
 }))

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the `model_name` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
def tokenize_function(example):
    """Tokenize source texts and attach the tokenized targets as ``labels``.

    Args:
        example: mapping with ``'text'`` (model input) and ``'label'``
            (target text) entries.

    Returns:
        The tokenizer output for ``example['text']`` with an extra
        ``"labels"`` entry holding the ``input_ids`` of ``example['label']``.

    NOTE(review): targets are padded to ``max_length`` and their ids are used
    as labels unchanged, so padded positions presumably count toward the
    training loss unless they are masked elsewhere — confirm.
    """
    # Inputs and targets share one setting: pad/truncate everything to 128 tokens.
    shared_kwargs = dict(padding='max_length', truncation=True, max_length=128)

    encoded = tokenizer(example['text'], **shared_kwargs)
    encoded["labels"] = tokenizer(example['label'], **shared_kwargs)["input_ids"]
    return encoded


In [9]:
# Apply tokenize_function in batched mode; for the split object this runs once
# per split ('train' and 'test'), then once more for the separate test set.
tokenized_traindataset = train_dataset.map(tokenize_function, batched=True)
tokenized_testdataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 251612/251612 [00:40<00:00, 6232.33 examples/s]
Map: 100%|██████████| 62903/62903 [00:09<00:00, 6904.04 examples/s]
Map: 100%|██████████| 16554/16554 [00:02<00:00, 8118.01 examples/s]


In [10]:
# Drop the raw string columns so only the tokenizer outputs remain,
# and have both datasets return torch tensors when indexed.
raw_columns = ['text', 'label']

tokenized_traindataset = tokenized_traindataset.remove_columns(raw_columns).with_format("torch")
tokenized_testdataset = tokenized_testdataset.remove_columns(raw_columns).with_format("torch")

In [11]:
# Sanity check: decode the first validation example back to text (input, target).
# `temperature` is a sampling parameter for generation and has no meaning for
# tokenizer.decode, so it is no longer passed.
sample = tokenized_traindataset['test'][0]
(tokenizer.decode(sample['input_ids'], skip_special_tokens=True),
 tokenizer.decode(sample['labels'], skip_special_tokens=True))

("Why didn't you destroy all of them?", "why didn't you destroy everyone?")

In [12]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pre-trained sequence-to-sequence checkpoint named by `model_name`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Collator that assembles batches for the trainer; it is given the model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [13]:
# Per-device batch size for training and evaluation. This rebinds `batch_size`;
# the dataloaders created earlier keep the value they were built with.
batch_size = 160

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",          # output directory for checkpoints and logs
    evaluation_strategy="epoch",        # evaluate once at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=1,                 # keep only the most recent checkpoint on disk
    num_train_epochs=5,
    predict_with_generate=True,         # evaluate on generated text so compute_metrics gets token ids
    # `device` falls back to 'cpu' when no GPU is visible; requesting fp16 mixed
    # precision there is not supported, so only enable it on CUDA.
    fp16=(device == 'cuda'),
    report_to='tensorboard',
)



In [14]:
from src.metric.metric import calculate_metric
def compute_metric(eval_preds):
    """Decode predicted and reference token ids and score them with `calculate_metric`.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays supplied by the
            trainer at evaluation time.

    Returns:
        The result of ``calculate_metric(decoded_labels, decoded_preds)``.
    """
    preds, labels = eval_preds
    # Some models hand back a tuple whose first element holds the generated ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore-index convention for padded label positions (and may be
    # used as filler when predictions of different lengths are gathered). It is
    # not a vocabulary id, so replace it with the pad token before decoding.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return calculate_metric(decoded_labels, decoded_preds)

In [15]:
# Assemble the trainer: fit on the 'train' split and evaluate on the held-out
# 'test' split of the train/validation split object, scoring with compute_metric.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_traindataset['train'],
    eval_dataset=tokenized_traindataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metric,
)

# Run fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Acc,Sim,Fl,J,Blue
1,0.2277,0.202,0.626425,0.706339,0.822377,0.369215,0.499427
2,0.2148,0.193629,0.65884,0.715029,0.829025,0.39743,0.509289
3,0.2084,0.189763,0.678553,0.71872,0.831062,0.412158,0.514119
4,0.2059,0.187898,0.685357,0.720615,0.831987,0.418187,0.516174
5,0.2043,0.187309,0.689156,0.72098,0.832544,0.420949,0.516778


running on cuda
Calculating style of predictions


Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1966/1966 [01:19<00:00, 24.84it/s]


Calculating BLEU similarity
Calculating similarity by Wieting subword-embedding SIM model


100%|██████████| 1966/1966 [00:12<00:00, 156.54it/s]


Calculating CoLA acceptability stats


100%|██████████| 1966/1966 [03:24<00:00,  9.62it/s]


| ACC | SIM |  FL  |   J   | BLEU |

| --- | --- | ---- |  ---  | ---- |

|0.6264|0.7063|0.8224|0.3692|0.4994|

running on cuda
Calculating style of predictions


Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1966/1966 [01:22<00:00, 23.86it/s]


Calculating BLEU similarity
Calculating similarity by Wieting subword-embedding SIM model


100%|██████████| 1966/1966 [00:13<00:00, 146.79it/s]


Calculating CoLA acceptability stats


100%|██████████| 1966/1966 [03:25<00:00,  9.59it/s]


| ACC | SIM |  FL  |   J   | BLEU |

| --- | --- | ---- |  ---  | ---- |

|0.6588|0.7150|0.8290|0.3974|0.5093|

running on cuda
Calculating style of predictions


Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1966/1966 [01:20<00:00, 24.34it/s]


Calculating BLEU similarity
Calculating similarity by Wieting subword-embedding SIM model


100%|██████████| 1966/1966 [00:12<00:00, 162.70it/s]


Calculating CoLA acceptability stats


100%|██████████| 1966/1966 [03:16<00:00, 10.00it/s]


| ACC | SIM |  FL  |   J   | BLEU |

| --- | --- | ---- |  ---  | ---- |

|0.6786|0.7187|0.8311|0.4122|0.5141|

running on cuda
Calculating style of predictions


Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1966/1966 [01:22<00:00, 23.88it/s]


Calculating BLEU similarity
Calculating similarity by Wieting subword-embedding SIM model


100%|██████████| 1966/1966 [00:12<00:00, 151.70it/s]


Calculating CoLA acceptability stats


100%|██████████| 1966/1966 [03:23<00:00,  9.64it/s]


| ACC | SIM |  FL  |   J   | BLEU |

| --- | --- | ---- |  ---  | ---- |

|0.6854|0.7206|0.8320|0.4182|0.5162|

running on cuda
Calculating style of predictions


Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1966/1966 [01:22<00:00, 23.90it/s]


Calculating BLEU similarity
Calculating similarity by Wieting subword-embedding SIM model


100%|██████████| 1966/1966 [00:12<00:00, 157.63it/s]


Calculating CoLA acceptability stats


100%|██████████| 1966/1966 [03:25<00:00,  9.59it/s]


| ACC | SIM |  FL  |   J   | BLEU |

| --- | --- | ---- |  ---  | ---- |

|0.6892|0.7210|0.8325|0.4209|0.5168|



TrainOutput(global_step=7865, training_loss=0.2952364815471283, metrics={'train_runtime': 7020.8548, 'train_samples_per_second': 179.189, 'train_steps_per_second': 1.12, 'total_flos': 4.256702668996608e+16, 'train_loss': 0.2952364815471283, 'epoch': 5.0})

In [16]:
# Save the trainer's current model to disk so it can be reloaded for inference below.
trainer.save_model('../models/t5-small-best')

In [17]:
# Reload the fine-tuned checkpoint from disk and prepare it for inference.
model = AutoModelForSeq2SeqLM.from_pretrained('../models/t5-small-best')
# A freshly loaded model sits on the CPU; move it to the device selected at the
# top of the notebook so it matches where the inference inputs are placed.
model.to(device)
# Switch to evaluation mode (disables dropout).
model.eval()
# NOTE(review): disabling the cache is kept from the original; it is not needed
# for correctness at inference time and may slow generation — confirm intent.
model.config.use_cache = False

In [50]:
def inference(model, inference_request, tokenizer=tokenizer, **generate_kwargs):
    """Generate output text for one batch of input strings.

    Args:
        model: sequence-to-sequence model exposing ``generate`` and ``device``.
        inference_request: a string or a sequence of strings to rewrite.
        tokenizer: tokenizer used for both encoding and decoding; defaults to
            the notebook-level tokenizer bound when this function is defined.
        **generate_kwargs: optional extra arguments forwarded to
            ``model.generate`` (e.g. ``max_new_tokens``). With none given the
            model's own generation defaults apply, as before.

    Returns:
        list of decoded strings, one per input, with special tokens removed.
    """
    # Put the inputs on whatever device the model lives on, so the call works
    # regardless of where the model was loaded.
    encoded = tokenizer(inference_request, return_tensors="pt", padding=True).to(model.device)
    outputs = model.generate(
        input_ids=encoded.input_ids,
        # The batch is padded, so pass the mask explicitly instead of leaving
        # it to be inferred from the pad token id.
        attention_mask=encoded.attention_mask,
        **generate_kwargs,
    )
    # `temperature` is a sampling option for generation, not for decoding, so it is not passed here.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [52]:
# Run the reloaded model over the whole test dataloader, keeping the generated
# texts and their reference targets in two parallel lists.
outputs, target = [], []
for data, label in tqdm(test_dataloader):
    out = inference(model, data, tokenizer)
    outputs += out
    target += label

100%|██████████| 518/518 [03:16<00:00,  2.64it/s]


In [53]:
# Persist the generated texts as a single-column table; the CSV is written
# relative to the notebook's working directory and includes the row index.
predictions_hypothesis2 = pd.DataFrame({'Predictions': outputs})
predictions_hypothesis2.to_csv('hypothesis2_predictions.csv')

In [55]:
# Score the test-set generations; argument order (references, predictions) matches compute_metric.
calculate_metric(target, outputs)

running on cuda
Calculating style of predictions


Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 518/518 [00:21<00:00, 23.71it/s]


Calculating BLEU similarity
Calculating similarity by Wieting subword-embedding SIM model


100%|██████████| 518/518 [00:03<00:00, 159.88it/s]


Calculating CoLA acceptability stats


100%|██████████| 518/518 [00:55<00:00,  9.40it/s]


| ACC | SIM |  FL  |   J   | BLEU |

| --- | --- | ---- |  ---  | ---- |

|0.6887|0.7233|0.8324|0.4203|0.5192|



{'ACC': 0.6887157182554066,
 'SIM': 0.723305512939735,
 'FL': 0.8324346852929216,
 'J': 0.4203377930078891,
 'BLUE': 0.519224402891123}