In [6]:
import torch
from torch import cuda
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

from src import data
from src.DialoGPT.prepare_dataset import create_context, ConversationDataset
from src.DialoGPT.conversation import chat_with_me

import time
import torch
import transformers

import numpy as np
import pandas as pd

from tqdm import tqdm
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration, TrainingArguments, Trainer



In [7]:
# Notebook-wide configuration read by the cells below.
SEED = 42  # value handed to torch.manual_seed / np.random.seed in the next cell
MODEL = "facebook/blenderbot_small-90M"  # checkpoint identifier for the BlenderbotSmall model

# Passed as `name=` to create_context — presumably the speaker whose replies are modelled; TODO confirm.
NAME = "Iroh"
# NOTE(review): N is not referenced in the visible cells (create_context is called with n=1) — confirm it is still needed.
N = 7

# Disabled settings kept for reference. Some of these names (TRAIN_SIZE, LEARNING_RATE)
# are re-defined further down in the data-preparation cell.
# TRAIN_SIZE = 0.8
#
# TRAIN_BATCH = 32
# EVAL_BATCH = 32
# EPOCHS = 100
# OUTPUT_DIR = "../outputs/blenderbot"
# OVERWRITE_OUTPUT_DIR = True
# EVAL_STRATEGY = "epoch"
# LEARNING_RATE = 1e-4
# LOAD_BEST_MODEL_AT_THE_END = True
# PREDICTION_LOSS_ONLY = True

In [8]:
# Seed NumPy's and PyTorch's global random generators from the shared SEED constant.
np.random.seed(SEED)
torch.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [9]:
class CustomDataset(Dataset):
    """Map-style dataset turning (context, response) rows into seq2seq training tensors.

    Args:
        dataframe: object exposing ``context`` and ``response`` attributes that
            behave like pandas Series (only ``len()`` and ``.iloc`` are used).
        tokenizer: object providing ``batch_encode_plus`` and ``pad_token_id``.
        source_len: ``max_length`` used when encoding the context.
        summ_len: ``max_length`` used when encoding the response.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.response
        self.ctext = self.data.context

    def __len__(self):
        """Return the number of (context, response) pairs."""
        return len(self.text)

    def _encode(self, text, max_length):
        """Collapse whitespace runs in ``text`` and encode it padded/truncated to ``max_length``."""
        normalised = ' '.join(str(text).split())
        return self.tokenizer.batch_encode_plus([normalised], max_length=max_length, padding='max_length',
                                                return_tensors='pt', truncation=True)

    def __getitem__(self, index):
        """Return the model inputs for the row at *position* ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (encoded context),
            ``decoder_input_ids`` (encoded response without its last token) and
            ``labels`` (encoded response without its first token, with padding
            positions replaced by -100).
        """
        # Positional lookup: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame has a fresh RangeIndex.
        source = self._encode(self.ctext.iloc[index], self.source_len)
        target = self._encode(self.text.iloc[index], self.summ_len)

        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length is 1.
        source_ids = source['input_ids'].squeeze(0).to(dtype=torch.long)
        source_mask = source['attention_mask'].squeeze(0).to(dtype=torch.long)
        target_ids = target['input_ids'].squeeze(0).to(dtype=torch.long)

        # Teacher forcing: the decoder sees tokens [0, n-1) and predicts tokens [1, n).
        # NOTE(review): this manual shift presumes the encoded response starts with a
        # decoder start token — confirm that holds for the tokenizer in use.
        y_ids = target_ids[:-1].contiguous()
        lm_labels = target_ids[1:].clone().detach()
        # -100 marks padding positions in the labels — presumably so the loss skips them; confirm.
        lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': source_ids,
            'attention_mask': source_mask,
            'decoder_input_ids': y_ids,
            'labels': lm_labels
        }

In [10]:
# Hyper-parameters for this run.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 2
TRAIN_EPOCHS = 10
VAL_EPOCHS = 1
LEARNING_RATE = 1e-4
SEED = 42
MAX_LEN = 256       # max_length used when encoding the context
SUMMARY_LEN = 64    # max_length used when encoding the response
TRAIN_SIZE = 0.9    # fraction of rows sampled into the training split

train_size = TRAIN_SIZE

# Load the raw dialogue frame and derive the context/response pairs.
df = data.read_dataframe()
df = create_context(df, name=NAME, n=1)

# Longest cell, in characters, across the whole frame.
# (Column-wise .str.len() replaces DataFrame.applymap, which is deprecated in recent pandas.)
max_len = max(df.astype('str').apply(lambda col: col.str.len()).max())

# Reproducible random split; both halves get a fresh 0..n-1 index.
train_dataset = df.sample(frac=train_size, random_state=SEED)
eval_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Tokenizer and model are loaded from the same checkpoint constant so they cannot drift apart.
tokenizer = BlenderbotSmallTokenizer.from_pretrained(MODEL)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(eval_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(eval_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
}

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
}

training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

model = BlenderbotSmallForConditionalGeneration.from_pretrained(MODEL)
model = model.to(device)

# NOTE(review): in the visible cells this optimizer is never handed to the Trainer,
# which is constructed without an `optimizers=` argument — confirm whether it is still needed.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

FULL Dataset: (337, 2)
TRAIN Dataset: (303, 2)
TEST Dataset: (34, 2)


In [11]:
# Trainer configuration. Seed, epoch count and learning rate come from the
# notebook constants instead of being repeated as literals. LEARNING_RATE was
# previously never passed on, so the Trainer fell back to its own default rate.
args = TrainingArguments(output_dir="blenderbot_small-news",
                         seed=SEED,
                         num_train_epochs=TRAIN_EPOCHS,
                         learning_rate=LEARNING_RATE,
                         per_device_train_batch_size=1,
                         # max batch size without OOM exception, because of the large max token length
                         per_device_eval_batch_size=1,
                         logging_steps=2500,
                         # NOTE(review): presumably 0 is meant to switch off periodic checkpoints — confirm.
                         save_steps=0,
                         )

In [12]:
# Bundle the model, the training arguments and the two datasets into a Trainer.
trainer = Trainer(model=model, args=args, train_dataset=training_set, eval_dataset=val_set)

In [13]:
# Start fine-tuning with the Trainer built in the previous cell.
# NOTE(review): the recorded run aborted with "CUDA out of memory" on a 3.95 GiB GPU that had
# only ~22 MiB free while PyTorch itself had reserved about 1 GiB — presumably other processes
# were occupying the device; confirm before re-running.
trainer.train()

***** Running training *****
  Num examples = 303
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3030


RuntimeError: CUDA out of memory. Tried to allocate 108.00 MiB (GPU 0; 3.95 GiB total capacity; 885.23 MiB already allocated; 21.69 MiB free; 1020.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
def validate(epoch, tokenizer, model, device, loader):
    """Generate a reply for every batch in ``loader`` and decode it next to its reference.

    Args:
        epoch: accepted for call-site compatibility; not used by the function.
        tokenizer: object whose ``decode`` turns a row of token ids into text.
        model: object providing ``eval()`` and ``generate(...)``.
        device: target passed to ``Tensor.to`` for each batch tensor.
        loader: iterable of dicts holding ``input_ids``, ``attention_mask`` and
            ``decoder_input_ids`` tensors.

    Returns:
        Tuple ``(predictions, actuals, texts)`` of equally long string lists:
        decoded generations, decoded ``decoder_input_ids`` and decoded inputs.
    """
    model.eval()
    predictions = []
    actuals = []
    texts = []

    def _decode_rows(rows):
        # Same decoding options for predictions, references and inputs.
        return [tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for row in rows]

    with torch.no_grad():
        # tqdm wraps the loader itself (not the enumerate object) so it can read
        # the loader's length and display a total.
        for step, batch in enumerate(tqdm(loader)):
            y = batch['decoder_input_ids'].to(device, dtype=torch.long)
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=100,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )
            preds = _decode_rows(generated_ids)
            target = _decode_rows(y)
            text = _decode_rows(ids)
            if step % 2500 == 0:
                print(f'Completed {step}')

            predictions.extend(preds)
            actuals.extend(target)
            texts.extend(text)
    return predictions, actuals, texts

# Write the trainer's model to the run's output folder.
trainer.save_model(output_dir="blenderbot_small-news/")

In [None]:
# Time the validation pass(es) and collect generations, references and inputs in one table.
start_time = time.time()
for epoch in range(VAL_EPOCHS):
    predictions, actuals, text = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame(
        {
            'Generated Text': predictions,
            'Actual Text': actuals,
            'Text': text,
        }
    )
print(f"Validation took {time.time() - start_time} seconds")