In [3]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import re
import copy
from tqdm.notebook import tqdm
import gc

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import GPT2Tokenizer, GPT2Model,GPT2Config
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn as nn

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    jaccard_score
)


In [4]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [5]:
# hyperparameters
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32


EPOCHS = 10
LEARNING_RATE = 2e-05
tags_to_keep = ['love', 'life', 'inspirational', 'philosophy', 'humor']
target_labels = tags_to_keep

In [6]:
class CustomData(Dataset):
    def __init__(self, dataframe, tokenizer):
        super(CustomData, self).__init__()

        self.texts = dataframe['quote'].tolist()
        self.targets = dataframe.iloc[:, 1:].values.tolist()
        self.tokenizer = tokenizer
        self.src_max_length = 256 # based on longest quote
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        quote = str(self.texts[idx])
        quote = " ".join(quote.split())

        inputs = self.tokenizer.encode_plus(
            quote,
            None,
            add_special_tokens=True,
            max_length=self.src_max_length,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )


        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[idx]),
            'quote' : quote
        }


In [8]:
class MultilabelClassifier(nn.Module):
    def __init__(self,padding_id):
        super(MultilabelClassifier,self).__init__()

        self.gpt2 = AutoModelForCausalLM.from_pretrained('distilgpt2',pad_token_id=padding_id)
        self.pre_classifier = nn.Linear(50257, 768)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768,5)



    def forward(self, input_ids, mask):
        """
        Args:
                input_id: encoded inputs ids of sent.
        """
        output = self.gpt2(input_ids=input_ids, attention_mask=mask)
        hidden_states = output[0]
        pooler = hidden_states.mean(dim=1)
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [9]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [None]:
val_targets=[]
val_outputs=[]

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss
    return model, optimizer#, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [10]:
import tqdm.notebook as tq
from sklearn.metrics import jaccard_score
def train_model(train_dataloader, model, optimizer):

    losses = []
    correct_predictions = 0
    num_samples = 0

    model.train()

    loop = tq.tqdm(enumerate(train_dataloader), total=len(train_dataloader),
                      leave=True,colour='steelblue')

    count = 0
    accuracy = 0
    for batch_idx, data in loop:
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        target_labels = data['targets'].to(device, dtype = torch.float)

        # forward
        outputs = model(ids, mask) # (batch,predict)=(32,8)
        loss = loss_fn(outputs, target_labels)
        losses.append(loss.item())

        # training accuracy
        sigmoid_output = torch.sigmoid(outputs)
        label_threshold = 0.5
        predicted_labels = torch.where(sigmoid_output > label_threshold, torch.tensor(1), torch.tensor(0))

        count+=1
        accuracy += jaccard_score(target_labels.cpu(), predicted_labels.cpu(), average='samples')
        # backward
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # grad descent step
        optimizer.step()

        # Update progress bar
        loop.set_description(f"")
        loop.set_postfix(batch_loss=loss)

    # returning: trained model, model accuracy, mean loss
    return model, float(accuracy)/count, np.mean(losses)

In [11]:
def eval_model(validation_loader, model, optimizer):
    losses = []
    correct_predictions = 0
    num_samples = 0
    # set model to eval mode (turn off dropout, fix batch norm)
    model.eval()

    count = 0
    accuracy = 0
    with torch.no_grad():
        for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            target_labels = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask)

            loss = loss_fn(outputs, target_labels)
            losses.append(loss.item())

            # validation accuracy
            sigmoid_output = torch.sigmoid(outputs)
            label_threshold = 0.5
            predicted_labels = torch.where(sigmoid_output > label_threshold, torch.tensor(1), torch.tensor(0))

            count+=1
            accuracy += jaccard_score(target_labels.cpu(), predicted_labels.cpu(), average='samples')

    return float(accuracy)/count, np.mean(losses)

In [12]:
from sklearn.metrics import confusion_matrix, classification_report

def get_predictions(model, data_loader):
    model = model.eval()

    quotes = []
    predictions = []
    prediction_probs = []
    target_values = []

    with torch.no_grad():
      for data in data_loader:
        # quote = data["quote"]
        ids = data["input_ids"].to(device, dtype = torch.long)
        mask = data["attention_mask"].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        target_labels = data["targets"].to(device, dtype = torch.float)

        outputs = model(ids, mask)
        sigmoid_output = torch.sigmoid(outputs)
        label_threshold = 0.5
        predicted_labels = torch.where(sigmoid_output > label_threshold, torch.tensor(1), torch.tensor(0))

        # quotes.extend(quote)
        predictions.extend(predicted_labels)
        prediction_probs.extend(sigmoid_output)
        target_values.extend(target_labels)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    target_values = torch.stack(target_values).cpu()

    return quotes, predictions, prediction_probs, target_values

In [None]:



def create_multiple_runs(epochs, seeds, batch_size,final_df):
  for seed in seeds:
    # init models and datasets
    print(seed)
    cols = [f'train_acc_{seed}', f'train_loss_{seed}',f'valid_acc_{seed}',f'valid_loss_{seed}']
    result_df = pd.DataFrame(columns=cols)
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
    tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
    tokenizer.pad_token = tokenizer.eos_token
    padding_id = tokenizer.pad_token_id
    model =  MultilabelClassifier(padding_id)
    model = model.to(device)
    train_data, val_data, test_data = pd.read_csv(f"train_{seed}_1.csv"),pd.read_csv(f"valid_{seed}_1.csv"),pd.read_csv(f"test_{seed}_1.csv")
    train = CustomDataset(train_data, tokenizer)
    val = CustomDataset(val_data, tokenizer)
    train_dataloader = DataLoader(train, batch_size=batch_size)
    val_dataloader = DataLoader(val, batch_size=batch_size)
    test = CustomDataset(test_data, tokenizer)
    test_dataloader = DataLoader(test, batch_size=batch_size)

    # train
    for epoch in range(1, epochs+1):
      print(f'Epoch {epoch}/{EPOCHS}')
      model, train_acc, train_loss = train_model(train_dataloader, model, optimizer)
      val_acc, val_loss = eval_model(val_dataloader, model, optimizer)
      print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')
      new_row = {f'train_acc_{seed}':train_acc,f'train_loss_{seed}':train_loss,f'valid_acc_{seed}':val_acc,f'valid_loss_{seed}':val_loss}
      result_df = result_df._append(new_row,ignore_index = True)
    final_df = pd.concat([final_df,result_df],axis=1)
    test_acc,test_loss=  eval_model(test_dataloader, model, optimizer)
    titles, predictions, prediction_probs, target_values = get_predictions(model, test_dataloader)
    print("Test_statistics:")
    print(test_acc,test_loss)
    print(classification_report(target_values, predictions, target_names=tags_to_keep))


In [None]:
final_df = pd.DataFrame()
create_multiple_runs(EPOCHS,[9,17,204],TRAIN_BATCH_SIZE,final_df)

9


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Epoch 1/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.4777, val_loss=0.4425 train_acc=0.4805, val_acc=0.5731
Epoch 2/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.3995, val_loss=0.4208 train_acc=0.6148, val_acc=0.5983
Epoch 3/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.3346, val_loss=0.4218 train_acc=0.6912, val_acc=0.6157
Epoch 4/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.2720, val_loss=0.4362 train_acc=0.7561, val_acc=0.6223
Epoch 5/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.2149, val_loss=0.4766 train_acc=0.8119, val_acc=0.6274
Epoch 6/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.1658, val_loss=0.5561 train_acc=0.8623, val_acc=0.6275
Epoch 7/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.1265, val_loss=0.6048 train_acc=0.8969, val_acc=0.6294
Epoch 8/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.0991, val_loss=0.6591 train_acc=0.9233, val_acc=0.6294
Epoch 9/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.0785, val_loss=0.7241 train_acc=0.9384, val_acc=0.6226
Epoch 10/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.0637, val_loss=0.7916 train_acc=0.9496, val_acc=0.6088
Test_statistics:
0.6061259717675942 0.7999870193765518
               precision    recall  f1-score   support

         love       0.80      0.73      0.76      2131
         life       0.66      0.67      0.66      2310
inspirational       0.68      0.79      0.73      2440
   philosophy       0.71      0.78      0.74      1973
        humor       0.84      0.59      0.69      1414

    micro avg       0.72      0.72      0.72     10268
    macro avg       0.74      0.71      0.72     10268
 weighted avg       0.73      0.72      0.72     10268
  samples avg       0.68      0.69      0.66     10268

17


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.4775, val_loss=0.4440 train_acc=0.4848, val_acc=0.5486
Epoch 2/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.4005, val_loss=0.4237 train_acc=0.6187, val_acc=0.5869
Epoch 3/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.3347, val_loss=0.4315 train_acc=0.6941, val_acc=0.6088
Epoch 4/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.2683, val_loss=0.4575 train_acc=0.7594, val_acc=0.6170
Epoch 5/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.2132, val_loss=0.5098 train_acc=0.8167, val_acc=0.6138
Epoch 6/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.1661, val_loss=0.5513 train_acc=0.8611, val_acc=0.6222
Epoch 7/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.1303, val_loss=0.6093 train_acc=0.8956, val_acc=0.6137
Epoch 8/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.0983, val_loss=0.6709 train_acc=0.9222, val_acc=0.6082
Epoch 9/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.0774, val_loss=0.7080 train_acc=0.9389, val_acc=0.6205
Epoch 10/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.0634, val_loss=0.7822 train_acc=0.9508, val_acc=0.6223
Test_statistics:
0.6268293098745225 0.7661105911148355
               precision    recall  f1-score   support

         love       0.78      0.76      0.77      2125
         life       0.74      0.58      0.65      2278
inspirational       0.68      0.78      0.73      2365
   philosophy       0.72      0.79      0.75      1908
        humor       0.71      0.77      0.74      1362

    micro avg       0.72      0.73      0.73     10038
    macro avg       0.72      0.74      0.73     10038
 weighted avg       0.73      0.73      0.72     10038
  samples avg       0.69      0.71      0.68     10038

204


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.4754, val_loss=0.4450 train_acc=0.4919, val_acc=0.5576
Epoch 2/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.4013, val_loss=0.4389 train_acc=0.6174, val_acc=0.5813
Epoch 3/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.3383, val_loss=0.4340 train_acc=0.6926, val_acc=0.5879
Epoch 4/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.2708, val_loss=0.4545 train_acc=0.7592, val_acc=0.5957
Epoch 5/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.2148, val_loss=0.4925 train_acc=0.8145, val_acc=0.6018
Epoch 6/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.1693, val_loss=0.5406 train_acc=0.8571, val_acc=0.6041
Epoch 7/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.1323, val_loss=0.6538 train_acc=0.8911, val_acc=0.5915
Epoch 8/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.1037, val_loss=0.7135 train_acc=0.9169, val_acc=0.5956
Epoch 9/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.0827, val_loss=0.7592 train_acc=0.9342, val_acc=0.6031
Epoch 10/10


  0%|          | 0/638 [00:00<?, ?it/s]

train_loss=0.0653, val_loss=0.8168 train_acc=0.9503, val_acc=0.6035
Test_statistics:
0.6116637684124391 0.8024790724541279
               precision    recall  f1-score   support

         love       0.76      0.75      0.76      2169
         life       0.63      0.72      0.67      2253
inspirational       0.63      0.83      0.71      2370
   philosophy       0.73      0.76      0.75      1960
        humor       0.78      0.68      0.72      1404

    micro avg       0.69      0.75      0.72     10156
    macro avg       0.71      0.75      0.72     10156
 weighted avg       0.70      0.75      0.72     10156
  samples avg       0.67      0.73      0.67     10156



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
final_df.to_csv(f'result.csv', index=False)