In [115]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import re
import copy
from tqdm.notebook import tqdm
import gc

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    classification_report
)

from transformers import (
    T5Tokenizer, 
    T5Model,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)

In [48]:
# Tag vocabulary used when turning tag strings into multi-hot vectors.
labels = [
    'love', 'life', 'inspirational', 'philosophy', 'humor',
    'god', 'truth', 'wisdom', 'happiness', 'people',
    'hope', 'time', 'faith', 'quotes', 'inspirational-quotes',
]

# Read both CSV splits from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Replace the underscore separators inside each tag string with single spaces.
train_df["tags"] = train_df["tags"].apply(lambda tag_string: tag_string.replace("_", " "))

Unnamed: 0,quote,tags
0,i liked being a person. i wanted to keep at it.,life
1,"my wishes before i die, to fulfill my mission ...",inspirational inspirational-quotes
2,i was just thinking that he might be willing. ...,humor
3,joyful peace pervades my being for all time.,inspirational-quotes
4,"don't let what other people think, stop you fr...",philosophy
...,...,...
129029,"just smile, breath, and give thanks (and there...",life
129030,"the dreamers, those who misread the actual sta...",truth
129031,i will gladly spend the rest of my days learni...,happiness faith love
129032,for it is beautiful only to do the thing we ar...,life


In [72]:

# Experiment configuration.
seed, val_split = 9, 0.1
batch_size, epochs = 20, 10

# Seed NumPy's global RNG so the shuffle below is reproducible.
np.random.seed(seed)

# Shuffle the row positions once, then carve the leading `val_split` fraction
# off as the validation set; everything after it is used for training.
dataset_size = len(train_df)
split = int(np.floor(val_split * dataset_size))
indices = list(range(dataset_size))
np.random.shuffle(indices)

val_indices = indices[:split]
train_indices = indices[split:]

In [None]:
def generate_one_hot_encoding(labels, text):
    """Encode the tags found in ``text`` as a multi-hot vector over ``labels``.

    A label counts as present only when it appears as a whole tag, i.e. it is
    not immediately preceded or followed by a word character or a hyphen.
    Plain substring matching would wrongly mark ``inspirational`` and
    ``quotes`` for the tag ``inspirational-quotes`` and ``life`` for
    ``lifestyle``. Adjacent markup such as ``</s>`` or ``<pad>`` does not
    prevent a match.

    Args:
        labels: Sequence of label strings; position ``i`` of the result
            corresponds to ``labels[i]``.
        text: String containing zero or more tags.

    Returns:
        ``np.ndarray`` of shape ``(len(labels),)`` and integer dtype holding
        1 where the label is present and 0 elsewhere.
    """
    one_hot_encoding = np.zeros(len(labels), dtype=int)

    for i, label in enumerate(labels):
        # Look-arounds enforce tag boundaries without consuming characters.
        whole_tag = r"(?<![\w-])" + re.escape(label) + r"(?![\w-])"
        if re.search(whole_tag, text):
            one_hot_encoding[i] = 1

    return one_hot_encoding

In [None]:
# Dataset class that tokenizes each quote and its tag string for classification.

class ClassificationDataset(Dataset):
    """Dataset yielding tokenized (quote, tag-string) pairs.

    Rows are selected from ``dataframe`` by position; the ``quote`` column is
    the source text and the ``tags`` column is the target text. Both are
    tokenized lazily in ``__getitem__`` and padded/truncated to fixed lengths.

    Args:
        dataframe: DataFrame with ``quote`` and ``tags`` columns.
        indices: Positional row indices (passed to ``DataFrame.iloc``).
        tokenizer: Object exposing ``encode_plus(text, max_length=...,
            padding=..., truncation=..., return_tensors=...)``.
        src_max_length: Padded/truncated length of the source encoding.
            Defaults to 512 (the original code notes this is based on the
            longest quote).
        tgt_max_length: Padded/truncated length of the target encoding.
            Defaults to 20.
    """

    def __init__(self, dataframe, indices, tokenizer,
                 src_max_length=512, tgt_max_length=20):
        super(ClassificationDataset, self).__init__()

        df = dataframe.iloc[indices]
        self.texts = df['quote'].tolist()
        self.targets = df['tags'].tolist()
        self.tokenizer = tokenizer
        self.src_max_length = src_max_length
        self.tgt_max_length = tgt_max_length

    def __len__(self):
        """Return the number of selected rows."""
        return len(self.texts)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` positions."""
        return self.tokenizer.encode_plus(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return the encoded source and target for row ``idx``.

        Returns:
            Dict with ``input_ids``, ``attention_mask``, ``target_ids`` and
            ``target_attn_mask``; each is the tokenizer output with its
            leading axis removed.
        """
        source_enc = self._encode(str(self.texts[idx]), self.src_max_length)
        target_enc = self._encode(str(self.targets[idx]), self.tgt_max_length)

        # squeeze(0) drops only the leading axis added by return_tensors="pt";
        # a bare squeeze() would also collapse a length-1 sequence axis.
        return {
            "input_ids": source_enc["input_ids"].squeeze(0),
            "attention_mask": source_enc["attention_mask"].squeeze(0),
            "target_ids": target_enc["input_ids"].squeeze(0),
            "target_attn_mask": target_enc["attention_mask"].squeeze(0)
        }

In [116]:
# Load the pretrained "t5-small" checkpoint as a T5ForConditionalGeneration model;
# this module-level `model` is the object that run() trains.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:

# Load the tokenizer from the same checkpoint name as the model ("t5-small")
# so the two stay consistent if the checkpoint is ever changed.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [None]:
# Build the train/validation datasets from the shuffled index split.
training_data = ClassificationDataset(train_df, train_indices, tokenizer)
val_data = ClassificationDataset(train_df, val_indices, tokenizer)

# Reshuffle the training batches every epoch; the validation order stays fixed.
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(val_data, batch_size=batch_size)

#batch = next(iter(train_dataloader))

In [None]:
def train(model, train_dl, val_dl, criterion, optimizer, scheduler, epochs):
    """Train ``model`` for one pass over ``train_dl`` with periodic validation.

    Args:
        model: Seq2seq model called as ``model(input_ids=..., attention_mask=...,
            labels=..., decoder_attention_mask=...)`` whose first output is the loss.
        train_dl: Iterable of batches with keys ``input_ids``, ``attention_mask``,
            ``target_ids`` and ``target_attn_mask``; must support ``len()``.
        val_dl: Validation batches, forwarded to ``val()``.
        criterion: Not used here (the loss comes from the model output); only
            forwarded to ``val()``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        epochs: Value shown in the progress-bar label; ``run()`` passes the
            current epoch index.

    Returns:
        The mean training loss over the batches of this pass (0.0 when
        ``train_dl`` is empty).
    """
    # Validate up to `n_validations` times during the pass. The interval is
    # rounded down to a multiple of 100 when that leaves it positive; short
    # loaders keep the unrounded interval, and loaders with fewer batches than
    # `n_validations` skip mid-epoch validation entirely.
    n_validations = 10
    n_batches = len(train_dl)
    interval = n_batches // n_validations
    rounded_interval = interval - (interval % 100)
    if rounded_interval > 0:
        interval = rounded_interval
    if interval > 0:
        validate_at_steps = {interval * k for k in range(1, n_validations + 1)}
    else:
        validate_at_steps = set()

    # Run on whichever device the model's parameters already live on.
    device = next(model.parameters()).device
    pad_token_id = tokenizer.pad_token_id

    train_loss = 0.0
    for step, batch in enumerate(tqdm(train_dl, desc='Epoch ' + str(epochs))):
        # val() switches the model to eval mode, so re-enable training mode
        # at the start of every step.
        model.train()

        # unpack the batch contents and move them to the model's device
        b_src_input_ids = batch['input_ids'].to(device)
        b_src_attention_mask = batch['attention_mask'].to(device)
        b_tgt_input_ids = batch['target_ids'].to(device)
        b_tgt_attention_mask = batch['target_attn_mask'].to(device)

        # Replace padding positions with -100 in a copy, leaving the batch's
        # own tensor untouched.
        lm_labels = b_tgt_input_ids.masked_fill(b_tgt_input_ids == pad_token_id, -100)

        # clear accumulated gradients
        optimizer.zero_grad()

        # forward pass
        outputs = model(input_ids=b_src_input_ids,
                        attention_mask=b_src_attention_mask,
                        labels=lm_labels,
                        decoder_attention_mask=b_tgt_attention_mask)
        loss = outputs[0]
        train_loss += loss.item()

        # backward pass
        loss.backward()

        # update weights
        optimizer.step()

        # update scheduler
        scheduler.step()

        if step in validate_at_steps:
            print(f'-- Step: {step}')
            _ = val(model, val_dl, criterion)

    avg_train_loss = train_loss / max(n_batches, 1)
    print('Training loss:', avg_train_loss)
    return avg_train_loss

In [None]:
def val(model, val_dataloader, criterion):
    """Evaluate ``model`` on ``val_dataloader`` and report loss, accuracy and micro-F1.

    For every batch the model loss is accumulated, the reference target ids
    are decoded to text, and predictions are produced with ``model.generate``
    and decoded the same way. Decoded strings are converted to multi-hot
    vectors over the module-level ``labels`` list before scoring.

    Args:
        model: Seq2seq model supporting both the training-style forward call
            (first output is the loss) and ``generate``.
        val_dataloader: Iterable of batches with keys ``input_ids``,
            ``attention_mask``, ``target_ids`` and ``target_attn_mask``.
        criterion: Unused; kept so existing callers keep working.

    Returns:
        The micro-averaged F1 score over the multi-hot label vectors.
    """
    val_loss = 0
    true, pred = [], []

    # evaluation mode for the whole pass (disables dropout etc.)
    model.eval()

    # Run on whichever device the model's parameters already live on.
    device = next(model.parameters()).device
    pad_token_id = tokenizer.pad_token_id

    for batch in val_dataloader:
        # unpack the batch contents and move them to the model's device
        b_src_input_ids = batch['input_ids'].to(device)
        b_src_attention_mask = batch['attention_mask'].to(device)
        b_tgt_input_ids = batch['target_ids'].to(device)
        b_tgt_attention_mask = batch['target_attn_mask'].to(device)

        # Mask padding with -100 in a COPY: the unmasked ids are decoded
        # below, and -100 is not a valid token id for the tokenizer.
        lm_labels = b_tgt_input_ids.masked_fill(b_tgt_input_ids == pad_token_id, -100)

        # no gradients are needed for validation/inference
        with torch.no_grad():
            # forward pass
            outputs = model(
                input_ids=b_src_input_ids,
                attention_mask=b_src_attention_mask,
                labels=lm_labels,
                decoder_attention_mask=b_tgt_attention_mask)
            val_loss += outputs[0].item()

            # reference tag strings, decoded without pad/eos markers
            true.extend(
                tokenizer.decode(true_id, skip_special_tokens=True)
                for true_id in b_tgt_input_ids.cpu()
            )

            # predicted tag strings (decoder-generated label ids)
            pred_ids = model.generate(
                input_ids=b_src_input_ids,
                attention_mask=b_src_attention_mask
            )
            pred.extend(
                tokenizer.decode(pred_id, skip_special_tokens=True)
                for pred_id in pred_ids.cpu()
            )

    # one multi-hot row per example, columns ordered like `labels`
    true_ohe = np.array([generate_one_hot_encoding(labels, t) for t in true])
    pred_ohe = np.array([generate_one_hot_encoding(labels, p) for p in pred])

    avg_val_loss = val_loss / len(val_dataloader)
    print('Val loss:', avg_val_loss)
    print('Val accuracy:', accuracy_score(true_ohe, pred_ohe))

    val_micro_f1_score = f1_score(true_ohe, pred_ohe, average='micro')
    print('Val micro f1 score:', val_micro_f1_score)
    return val_micro_f1_score


In [None]:
def run():
    """Fine-tune the module-level ``model`` and keep the best-scoring copy.

    Builds an AdamW optimizer with two parameter groups (with and without
    weight decay) and a linear schedule without warmup, then alternates
    ``train()`` and ``val()`` for ``epochs`` epochs. After each epoch the
    model is deep-copied whenever its validation micro-F1 improves.

    Relies on the module-level ``seed``, ``epochs``, ``model``,
    ``train_dataloader`` and ``validation_dataloader``.

    Returns:
        Tuple ``(best_model, best_val_micro_f1_score)``. When ``epochs`` is 0
        this is ``(None, float('-inf'))``.
    """
    # seed torch so runs are repeatable (the seed can also affect performance)
    torch.manual_seed(seed)

    # Only forwarded to train()/val(), which take the loss from the model output.
    criterion = nn.BCEWithLogitsLoss()

    # Biases and layer-norm weights are excluded from weight decay.
    # "layer_norm.weight" is the spelling T5 uses for its layer-norm
    # parameters; the "LayerNorm.*" patterns alone never match them.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight", "layer_norm.weight"]
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    optimizer_parameters = [
        {"params": decay_params, "weight_decay": 0.001},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    optimizer = optim.AdamW(optimizer_parameters, lr=2e-5)

    # The scheduler is stepped once per batch, so it spans every batch of every epoch.
    num_training_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    best_model = None
    best_val_micro_f1_score = float('-inf')
    for epoch in range(epochs):
        train(model, train_dataloader, validation_dataloader, criterion, optimizer, scheduler, epoch)
        val_micro_f1_score = val(model, validation_dataloader, criterion)

        if val_micro_f1_score > best_val_micro_f1_score:
            print(f'--- Best Model. Val micro f1: {best_val_micro_f1_score} -> {val_micro_f1_score}')
            # Snapshot the weights so later epochs cannot overwrite the best model.
            best_model = copy.deepcopy(model)
            best_val_micro_f1_score = val_micro_f1_score
            # To persist: torch.save(best_model.state_dict(), 't5_best_model.pt')

    return best_model, best_val_micro_f1_score

In [None]:
# Run the full training loop; keeps the best model copy and its validation micro-F1.
best_model, best_val_micro_f1_score = run()
