In [None]:
# RUN THIS CELL TO INSTALL THE REQUIRED PACKAGES
!pip install transformers
!pip install datasets
!pip install torch torchvision
!pip install nltk
!pip install sentencepiece
!pip install evaluate
!pip install pandas
!pip install numpy
!pip install scikit-learn

In [None]:
from datasets import load_dataset

# Load the clickbait-spoiling dataset by its identifier; the training cell
# later tokenizes it and reads its "train" and "validation" splits.
clickbait_dataset = load_dataset(path="Tugay/clickbait-spoiling")

In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation
import pytorch_lightning as pl
import torch
import pandas as pd
import numpy as np
import nltk
nltk.download("punkt")

from transformers import T5Tokenizer
from transformers import LongT5ForConditionalGeneration, T5ForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, TrainingArguments
from transformers import Seq2SeqTrainer, Trainer
from transformers import DataCollatorForSeq2Seq, DataCollator
from nltk.tokenize import sent_tokenize
import evaluate
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from transformers import AutoTokenizer, EarlyStoppingCallback
import gc
import re


[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Checkpoint identifier shared by the tokenizer and the model loaded below;
# `model_init` further down in this cell reads it too.
model_checkpoint = "t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

# Run a garbage-collection pass and ask PyTorch to release its cached CUDA
# memory before the trainer is built and started later in this cell.
gc.collect()
torch.cuda.empty_cache()

def preprocess_function(examples):
    """Turn one dataset record into tokenized model inputs and labels.

    The input text is the task prefix ``"multiclass classification: "``
    followed by the first entry of ``postText`` and all ``targetParagraphs``
    joined by single spaces. The label is the first entry of ``tags``.

    Args:
        examples: A single record (the function is mapped without
            ``batched=True``) exposing ``postText``, ``targetParagraphs`` and
            ``tags``; presumably each is a list of strings -- confirm against
            the dataset schema.

    Returns:
        The tokenizer output for the input text (truncated to 512 tokens),
        extended with ``labels`` (token ids of the tag) and ``labels_mask``
        (attention mask of the tag).
    """
    post = examples["postText"][0]
    paragraphs = " ".join(examples["targetParagraphs"])
    # No literal "</s>" here: the T5 tokenizer appends the end-of-sequence
    # token itself, so spelling it out in the text would leave inputs that are
    # not truncated ending in two EOS tokens.
    text = f"multiclass classification: {post} {paragraphs}"

    model_inputs = tokenizer(
        text,
        max_length=512,
        truncation=True,
    )
    labels = tokenizer(
        examples["tags"][0],
    )

    model_inputs["labels"] = labels["input_ids"]
    # Kept so the returned columns stay the same as before.
    model_inputs["labels_mask"] = labels["attention_mask"]
    return model_inputs

# Tokenize every split. `map` is called without `batched=True`, so
# `preprocess_function` receives one record at a time.
tokenized_datasets = clickbait_dataset.map(preprocess_function)

# Log roughly once per epoch. `len(tokenized_datasets)` would count the splits
# of the dataset dictionary (it is indexed by split name below), not the
# training examples, so the step count is derived from the "train" split.
per_device_batch_size = 8
logging_steps = max(1, len(tokenized_datasets["train"]) // per_device_batch_size)

args = Seq2SeqTrainingArguments(
    output_dir="t5-results-2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    # Evaluate on generated sequences so `compute_metrics` receives token ids.
    predict_with_generate=True,
    logging_steps=logging_steps,
    num_train_epochs=10,
    weight_decay=0.01,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Sanity check: show the token ids of the first tokenized training example.
print(tokenized_datasets["train"][0]["input_ids"])

# NOTE(review): `metric` is not referenced by `compute_metrics` in this cell;
# it is kept in case a later cell relies on it.
metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    """Score the generated tags against the reference tags.

    Predictions and labels are decoded back to text and compared as whole
    strings. This replaces the earlier comparison of a single hard-coded token
    position per sequence.

    Args:
        eval_preds: The ``(predictions, labels)`` pair handed over by the
            trainer. With ``predict_with_generate=True`` the predictions are
            generated token ids; label padding is expected to be ``-100``.

    Returns:
        A dict with ``accuracy`` (balanced accuracy) and macro-averaged
        ``f1``, ``precision`` and ``recall``.
    """
    predictions, labels = eval_preds
    if isinstance(predictions, tuple):
        # Some model outputs arrive as a tuple whose first item holds the ids.
        predictions = predictions[0]

    # -100 marks ignored positions and is not a valid token id, so swap it for
    # the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    predicted_tags = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    reference_tags = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # zero_division=0 yields the same scores as the default but without a
    # warning when a tag is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        reference_tags, predicted_tags, average='macro', zero_division=0
    )
    acc = balanced_accuracy_score(reference_tags, predicted_tags)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def model_init(trial):
    """Load and return a new model from ``model_checkpoint``.

    Args:
        trial: Accepted but not used by this function.

    Returns:
        The ``T5ForConditionalGeneration`` instance loaded from the checkpoint.
    """
    fresh_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
    return fresh_model

# Splits handed to the trainer: "train" for fitting and "validation" for the
# evaluation configured in `args`.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

# Wire the model, arguments, data and metric function together.
trainer2 = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer2.train()