In [1]:
!nvidia-smi

Sun Apr  4 16:44:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    32W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# !pip install sentencepiece
# !pip install transformers
# !pip install torch
# !pip install rich[jupyter]

In [3]:

import math
import os
import pickle
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from tqdm import tqdm_notebook, tnrange
from tqdm.auto import tqdm, trange

def save_pickle(path, obj):
  """Pickle ``obj`` into the file at ``path``.

  The file is opened in binary write mode, so any existing content at
  ``path`` is replaced. Returns ``None``.
  """
  with open(path, mode='wb') as sink:
    pickle.dump(obj, sink)

def load_pickle(path):
  """Open ``path`` in binary read mode and return the unpickled object.

  NOTE(review): unpickling can execute code embedded in the file, so this
  should only be pointed at pickles from a trusted source.
  """
  with open(path, mode='rb') as source:
    restored = pickle.load(source)
  return restored

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive; the pickled dataset loaded further down
# is read from a path under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load the pickled training data from Drive.
# Presumably a sequence of dicts with 'source' and 'target' keys (that is how
# T5Dataset.__getitem__ indexes it) — TODO confirm against the pickle's producer.
data = load_pickle('/content/drive/MyDrive/T5MovieWikiTraining2_0/t5-source-target-data-2.pkl')

In [6]:
# from google.colab import drive
# drive.mount('/content/drive')

In [7]:
# Sanity check: number of records loaded from the pickle.
len(data)

138122

In [8]:
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# rich: for a better display on terminal
from rich.table import Column, Table
from rich import box
from rich.console import Console

# Recording rich console: everything logged through it is retained so the
# session text can be exported afterwards.
console = Console(record=True)

# Table of training losses; columns are all centre-justified.
training_logger = Table(
    *(
        Column(heading, justify="center")
        for heading in ("Random Selection", "Epoch", "Loss")
    ),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

# Table of validation losses; same look as the training table.
valid_loggger = Table(
    *(
        Column(heading, justify="center")
        for heading in ("Random Selection", "Loss")
    ),
    title="Validation Status",
    pad_edge=False,
    box=box.ASCII,
)

In [9]:
from torch import cuda
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [10]:
# from google.colab import drive
# drive.mount('/content/drive')

In [11]:
class T5Dataset(Dataset):
  """Dataset that tokenizes source/target text pairs on access.

  Each element of ``data`` must be indexable by the keys ``'source'`` and
  ``'target'``. Both texts are tokenized when an item is requested, padded or
  truncated to ``source_len`` / ``target_len`` tokens respectively.

  Args:
    tokenizer: Object exposing ``batch_encode_plus`` that returns a mapping
      with ``"input_ids"`` and ``"attention_mask"`` tensors.
    data: Indexable, sized collection of source/target records.
    source_len: Fixed token length for the encoded source text.
    target_len: Fixed token length for the encoded target text.
  """

  def __init__(self, tokenizer, data, source_len, target_len):

    super(T5Dataset, self).__init__()
    self.tokenizer = tokenizer
    self.source_len = source_len
    self.target_len = target_len
    self.data = data

  def __len__(self):
    """Return the number of source/target records."""
    return len(self.data)

  def _encode(self, text, max_length):
    """Tokenize one string, padded/truncated to exactly ``max_length``."""
    # Only `padding="max_length"` is passed: the legacy `pad_to_max_length`
    # flag requested the same thing and is deprecated.
    return self.tokenizer.batch_encode_plus(
        [text],
        max_length = max_length,
        truncation = True,
        padding = "max_length",
        return_tensors = "pt"
    )

  def __getitem__(self, index):
    """Return the encoded pair at ``index`` as a dict of 1-D tensors.

    Keys: ``"source_ids"``, ``"source_mask"``, ``"target_ids"``,
    ``"target_mask"``.
    """
    record = self.data[index]
    source = self._encode(record['source'], self.source_len)
    target = self._encode(record['target'], self.target_len)

    # squeeze(0) drops only the leading batch axis added by encoding a
    # one-element list. A bare squeeze() would also collapse the sequence
    # axis when the configured length is 1, yielding a 0-d tensor.
    return {
        "source_ids": source["input_ids"].squeeze(0),
        "source_mask": source["attention_mask"].squeeze(0),
        "target_ids": target["input_ids"].squeeze(0),
        "target_mask": target["attention_mask"].squeeze(0)
    }

In [12]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one training pass over ``loader`` and return the mean batch loss.

    Args:
        epoch: Index of the current epoch; not used inside this function.
        tokenizer: Tokenizer whose ``pad_token_id`` identifies padding in the
            target ids.
        model: Model called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; the first element of its
            output is taken as the loss.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches, each a mapping holding ``"source_ids"``,
            ``"source_mask"`` and ``"target_ids"`` tensors.
        optimizer: Optimizer that is zeroed and stepped once per batch.

    Returns:
        The sum of the per-batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    model.train()

    total_loss = 0
    total_counts = 0

    for batch in tqdm(loader, desc = "Train DL"):

        y = batch["target_ids"].to(device, dtype = torch.long)

        # Manual shift: the decoder is fed every target token except the last
        # and is scored against every target token except the first.
        # NOTE(review): this means the decoder input starts with the first
        # target token rather than a start token; T5 can derive decoder inputs
        # from `labels` on its own — confirm the manual shift is intended.
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        # Padding positions get label -100, presumably so the loss skips them
        # (-100 is the conventional ignore index) — confirm.
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100

        ids = batch["source_ids"].to(device, dtype = torch.long)
        mask = batch["source_mask"].to(device, dtype = torch.long)

        optimizer.zero_grad()

        outputs = model(
            input_ids = ids, attention_mask = mask, decoder_input_ids = y_ids, labels = lm_labels
        )

        loss = outputs[0]

        total_counts += 1
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss/total_counts

In [13]:
def validate(epoch, tokenizer, model, device, loader):
    """Score ``model`` on ``loader`` without updating it; return the mean batch loss.

    The model is put in eval mode and the whole pass runs under
    ``torch.no_grad()``. Batches are prepared exactly as in training: the
    decoder is fed the target ids without the last token and scored against
    the target ids without the first token, with padding labels set to -100.

    Args:
        epoch: Index of the current validation pass; not used inside this
            function.
        tokenizer: Tokenizer whose ``pad_token_id`` identifies padding in the
            target ids.
        model: Model called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; the first element of its
            output is taken as the loss.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches, each a mapping holding ``"source_ids"``,
            ``"source_mask"`` and ``"target_ids"`` tensors.

    Returns:
        The sum of the per-batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    model.eval()
    total_loss = 0
    total_counts = 0

    with torch.no_grad():

        for batch in tqdm(loader, desc = "Valid DL"):

            y = batch["target_ids"].to(device, dtype = torch.long)
            y_ids = y[:, :-1].contiguous()
            lm_labels = y[:, 1:].clone().detach()
            # Padding positions get label -100, presumably so the loss skips
            # them (-100 is the conventional ignore index) — confirm.
            lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100

            ids = batch["source_ids"].to(device, dtype = torch.long)
            mask = batch["source_mask"].to(device, dtype = torch.long)

            outputs = model(
                input_ids = ids, attention_mask = mask, decoder_input_ids = y_ids, labels = lm_labels
            )

            loss = outputs[0]

            total_loss += loss.item()
            total_counts += 1

    return total_loss / total_counts

In [14]:
from IPython.display import clear_output

In [15]:
def trainer(
    data, model_params, output_dir = "./outputs/"
):
    """Fine-tune a T5 checkpoint over repeated random train/validation splits.

    For each of ``model_params["RANDOM_TRAIN_STEPS"]`` rounds the data is
    reshuffled and split, the model is trained for
    ``model_params["TRAIN_EPOCHS"]`` epochs on the training part, and then
    scored ``model_params["VAL_EPOCHS"]`` times on the held-out part. The same
    model and optimizer carry over from one round to the next.

    Files written under ``output_dir``:
      * ``model_files_initial``: model + tokenizer after epoch 0 of every
        round (written again each round).
      * ``model_files``: model + tokenizer after any later epoch whose mean
        training loss is lower than every loss recorded so far, across all
        rounds.
      * ``logs-random-<round>.txt``: recorded console text, saved after each
        round's validation.

    Notes:
      * Checkpoint selection is driven by the training loss; the validation
        loss is only logged.
      * Epoch 0 of a round is never compared against earlier losses, but its
        loss is still added to the history later epochs must beat.
      * Splits are redrawn every round while the model persists, so examples
        held out in one round may have been trained on in an earlier one.

    Args:
        data: Collection of records supporting ``.copy()``, ``len()`` and
            integer indexing; records are handed to ``T5Dataset``.
        model_params: Dict with the keys ``"SEED"``, ``"MODEL"``,
            ``"LEARNING_RATE"``, ``"RANDOM_TRAIN_STEPS"``, ``"TRAIN_EPOCHS"``,
            ``"VAL_EPOCHS"``, ``"TRAIN_BATCH_SIZE"``, ``"VALID_BATCH_SIZE"``,
            ``"MAX_SOURCE_TEXT_LENGTH"`` and ``"MAX_TARGET_TEXT_LENGTH"``.
            An optional ``"TRAIN_SIZE"`` key sets the fraction of records used
            for training in each round (default ``0.75``).
        output_dir: Directory that receives checkpoints and logs; created if
            it does not exist.

    Returns:
        None.
    """
    # Seed every RNG this function draws from.
    torch.manual_seed(model_params["SEED"])
    torch.cuda.manual_seed(model_params["SEED"])
    np.random.seed(model_params["SEED"])
    torch.backends.cudnn.deterministic = True

    console.log(f'''Model: Loading {model_params['MODEL']}.....''')

    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    console.log("[DATA]: READING DATA.......")

    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    console.log("[Initiating Fine Tuning]...\n")

    # Make sure the output directory exists before anything is written to it,
    # so exporting the log cannot fail on a missing folder.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, "model_files")
    initial_path = os.path.join(output_dir, "model_files_initial")

    # Settings that do not change between rounds.
    train_size = model_params.get("TRAIN_SIZE", 0.75)
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0
    }
    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0
    }

    console.log("Starting with Random Selection")
    # Training-loss history shared by all rounds; drives "best so far" saves.
    prev_loss = []
    for randomSelection in trange(model_params["RANDOM_TRAIN_STEPS"], desc = 'Random Selection'):

        copyData = data.copy()
        copyData = shuffle(copyData)

        # Draw a fresh random split for this round.
        random_permuts = np.random.permutation(len(copyData))
        train_nums = round(len(random_permuts) * train_size)
        train_dataset = [copyData[i] for i in random_permuts[:train_nums]]
        valid_dataset = [copyData[i] for i in random_permuts[train_nums:]]

        training_set = T5Dataset(
            tokenizer, train_dataset, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"]
        )

        val_set = T5Dataset(
            tokenizer, valid_dataset, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"]
        )

        train_dl = DataLoader(training_set, **train_params)

        ## training
        console.log("[MODEL TRAINING]")
        clear_output(wait = True)
        for epoch in trange(model_params["TRAIN_EPOCHS"], desc = "Training"):

            total_loss = train(epoch, tokenizer, model, device, train_dl, optimizer)

            training_logger.add_row(str(randomSelection), str(epoch), str(total_loss))
            console.log(training_logger)

            if epoch == 0:
                # First epoch of a round: always snapshot to the "initial" folder.
                console.log(f"Saving Model at epoch: {epoch} with total loss: {total_loss}")
                model.save_pretrained(initial_path)
                tokenizer.save_pretrained(initial_path)
            elif min(prev_loss) > total_loss:
                # Later epochs: snapshot only when this beats every earlier loss.
                console.log(f"Saving Model at epoch: {epoch} with total loss: {total_loss}")
                model.save_pretrained(path)
                tokenizer.save_pretrained(path)

            prev_loss.append(total_loss)

        # Drop the training loader/dataset before validation starts.
        del train_dl, training_set

        ## validation
        valid_dl = DataLoader(val_set, **val_params)
        console.log("[MODEL VALIDATION]")
        for epoch in trange(model_params["VAL_EPOCHS"], desc = "Validation"):

            val_loss = validate(epoch, tokenizer, model, device, valid_dl)

            valid_loggger.add_row(str(randomSelection), str(val_loss))
            console.log(valid_loggger)

        console.save_text(os.path.join(output_dir, f"logs-random-{randomSelection}.txt"))

        console.log("[VALIDATAION DONE]")
        del valid_dl, val_set

In [16]:
# Run configuration consumed by trainer().
model_params = dict(
    # Checkpoint directory the tokenizer and model are loaded from.
    MODEL="drive/MyDrive/T5MovieWikiTraining2_0/outputs-itr-3/model_files/",
    # DataLoader batch sizes for the training and validation passes.
    TRAIN_BATCH_SIZE=2,
    VALID_BATCH_SIZE=2,
    # Passes over the training / validation split within each random selection.
    TRAIN_EPOCHS=10,
    VAL_EPOCHS=1,
    # Learning rate handed to the Adam optimizer.
    LEARNING_RATE=1e-4,
    # Token lengths the source and target texts are padded/truncated to.
    MAX_SOURCE_TEXT_LENGTH=128,
    MAX_TARGET_TEXT_LENGTH=786,
    # Seed applied to the torch and numpy RNGs.
    SEED=3007,
    # Number of random train/validation re-splits to run.
    RANDOM_TRAIN_STEPS=50,
)

In [None]:
# Launch fine-tuning with the configuration above; checkpoints and logs for
# this run are written under the outputs-itr-4 folder.
trainer(
    data,
    model_params,
    output_dir = "drive/MyDrive/T5MovieWikiTraining2_0/outputs-itr-4",
)

HBox(children=(FloatProgress(value=0.0, description='Training', max=10.0, style=ProgressStyle(description_widt…

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, description='Train DL', max=51796.0, style=ProgressStyle(description_w…

# New Section