In [1]:
import pandas as pd
import numpy as np
import os

import torch
from torch.utils.data import DataLoader
from torch import cuda

from transformers import T5Tokenizer, T5ForConditionalGeneration

from src import data
from src.DialoGPT import create_context, chat_with_me
from src.GoogleT5 import DatasetT5, train, validate

## Set parameters

In [2]:
# --- Reproducibility and base checkpoint ---
SEED = 23  # shared seed for torch, numpy and the train/validation split
MODEL = "t5-base"  # checkpoint name handed to `from_pretrained`

# --- Tokenisation limits (in tokens) ---
MAX_SOURCE_TEXT_LENGTH = 256  # also used as the tokenizer's `model_max_length`
MAX_TARGET_TEXT_LENGTH = 128

# --- Optimisation ---
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 2  # same batch size for both loaders
LEARNING_RATE = 5e-5
TRAIN_EPOCHS = 50

# --- Outputs (path is relative to the notebook's working directory) ---
OUTPUT_DIR = "../outputs/GoogleT5"

# --- Dialogue data selection, forwarded to `create_context` ---
CHAR_NAME = "Iroh"
CONTEXT_LENGTH = 1  # presumably the number of preceding lines kept as context -- TODO confirm in create_context

## Loading and fine-tuning model

In [None]:
# Seed the RNGs and pin cuDNN to deterministic behaviour for reproducibility
torch.manual_seed(SEED)  # pytorch random seed
np.random.seed(SEED)  # numpy random seed
torch.backends.cudnn.deterministic = True  # only select deterministic cuDNN algorithms
torch.backends.cudnn.benchmark = False  # autotuning may pick different kernels between runs
device = 'cuda' if cuda.is_available() else 'cpu'

# logging
print(f"[Model]: Loading {MODEL}...\n")

# tokenizer for encoding the text
tokenizer = T5Tokenizer.from_pretrained(MODEL, model_max_length=MAX_SOURCE_TEXT_LENGTH)

# Load the pretrained seq2seq checkpoint (with its language-modelling head) that will be
# fine-tuned to generate replies, then move it to the selected device.
model = T5ForConditionalGeneration.from_pretrained(MODEL)
model = model.to(device)

# logging
print("[Data]: Reading data...\n")

# Importing the raw dataset and building the (context, reply) frame for the chosen character
dialogue_df = data.read_dataframe()
context_df = create_context(dialogue_df, CHAR_NAME, CONTEXT_LENGTH)

# Train/validation split: 80% of the rows are sampled for training, the remainder
# (selected by index label) is used for validation.
train_size = 0.8
train_dataset = context_df.sample(frac=train_size, random_state=SEED)
val_dataset = context_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# `drop` works on index labels, so a non-unique index would silently remove extra rows;
# fail loudly instead of training on a corrupted split.
assert len(train_dataset) + len(val_dataset) == len(context_df), \
    "train/validation split lost or duplicated rows"

print(f"FULL Dataset: {context_df.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {val_dataset.shape}\n")

# Wrap both splits in the project's Dataset class so they can be fed to DataLoaders
training_set = DatasetT5(
    train_dataset,
    tokenizer,
    MAX_SOURCE_TEXT_LENGTH,
    MAX_TARGET_TEXT_LENGTH
)
val_set = DatasetT5(
    val_dataset,
    tokenizer,
    MAX_SOURCE_TEXT_LENGTH,
    MAX_TARGET_TEXT_LENGTH
)

# DataLoader parameters: only the training loader shuffles
train_params = {
    "batch_size": TRAIN_BATCH_SIZE,
    "shuffle": True,
    "num_workers": 0,
}

val_params = {
    "batch_size": VALID_BATCH_SIZE,
    "shuffle": False,
    "num_workers": 0,
}

# Dataloaders for the training and validation stages
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

# Optimizer used to tune the weights of the network during fine-tuning
optimizer = torch.optim.Adam(
    params=model.parameters(), lr=LEARNING_RATE
)

# Make sure the output directory exists before anything is written into it
os.makedirs(OUTPUT_DIR, exist_ok=True)
predictions_path = os.path.join(OUTPUT_DIR, "predictions.csv")

# Training loop
print("[Initiating Fine Tuning]...\n")

for epoch in range(TRAIN_EPOCHS):
    train(tokenizer, model, device, training_loader, optimizer)

    print("[Saving Model]...\n")
    # NOTE: a full model + tokenizer checkpoint is written for every epoch
    path = os.path.join(OUTPUT_DIR, "model_files", f"epoch-{epoch}")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

    # Evaluate on the validation split; the CSV is overwritten each epoch, so it
    # always holds the predictions of the most recently finished epoch.
    contexts, predictions, actuals = validate(tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({"Context": contexts, "Generated Text": predictions, "Actual Text": actuals})
    final_df.to_csv(predictions_path)

[Model]: Loading t5-base...

[Data]: Reading data...

FULL Dataset: (337, 2)
TRAIN Dataset: (270, 2)
TEST Dataset: (67, 2)

[Initiating Fine Tuning]...

Training loss: 1.1339337825775146
[Saving Model]...

Training loss: 0.7070589661598206
[Saving Model]...

Training loss: 0.6169018149375916
[Saving Model]...

Training loss: 0.552310585975647
[Saving Model]...

Training loss: 0.507283091545105
[Saving Model]...

Training loss: 0.46700868010520935
[Saving Model]...



In [None]:
def chat_with_me(model, tokenizer, steps: int = 5) -> None:
    """
    Chat interactively with a trained generative model.

    NOTE: this definition shadows the `chat_with_me` imported from `src.DialoGPT`
    at the top of the notebook.

    :param model: trained model exposing `generate`; the notebook passes the
        fine-tuned T5ForConditionalGeneration
    :param tokenizer: tokenizer matching the given model (needs `encode` and `decode`)
    :param steps: the length of the talk (number of phrases we wish to write)
    """
    # Take the device from the model itself rather than from a notebook-level global,
    # so the function also works when the training cell has not been run in this kernel.
    target_device = getattr(model, "device", None)
    if target_device is None:
        # Plain torch modules have no `device` attribute; use the one of their weights
        target_device = next(model.parameters()).device

    for _ in range(steps):
        # encode the new user input and return it as a PyTorch tensor
        prompt_ids = tokenizer.encode(input(">> User:"), return_tensors='pt')

        # beam-search decoding of the reply, capped at 150 tokens
        generated_ids = model.generate(
            input_ids=prompt_ids.to(target_device),
            max_length=150,
            num_beams=10
        )

        # print the first (only) generated sequence without special tokens
        print(tokenizer.decode(generated_ids.cpu().detach()[0], skip_special_tokens=True))

In [None]:
# Single interactive exchange with the model and tokenizer loaded above
chat_with_me(model, tokenizer, steps=1)