In [1]:
import torch
import random
import pandas as pd
from torch.utils.data import DataLoader,Dataset
from models.model import Model

  from .autonotebook import tqdm as notebook_tqdm


In [38]:
max_input_length = 512
max_target_length = 128
batch = 8
print_every = 50
prefix = "summarize:"

In [45]:
class wikiData(Dataset):
    def __init__(self, df, tokenizer, max_length=128):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attention_mask = []
        self.labels = []
        inputs = [prefix + text for text in df["body_text"]]
        input_tokenize = tokenizer( 
                                inputs,
                                add_special_tokens=True,        #Add Special tokens like [CLS] and [SEP]
                                max_length=max_length,
                                padding = 'max_length',         #for padding to max_length for equal sequence length
                                truncation = True,              #truncate the text if it is greater than max_length
                                return_attention_mask=True,     #will return attention mask
                                return_tensors="pt"             #return tensor formate
                                )

        self.input_ids = torch.tensor(input_tokenize['input_ids'])
        self.attention_mask = torch.tensor(input_tokenize['attention_mask'])
        
        with tokenizer.as_target_tokenizer():
            label_tokenize = tokenizer(
                                    list(df["title"]), 
                                    add_special_tokens=True,        #Add Special tokens like [CLS] and [SEP]
                                    max_length=max_length,
                                    padding = 'max_length',         #for padding to max_length for equal sequence length
                                    truncation = True,              #truncate the text if it is greater than max_length
                                    return_attention_mask=True,     #will return attention mask
                                    return_tensors="pt"
                                    )
                
            self.labels = torch.tensor(label_tokenize['input_ids'])
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_mask[idx], self.labels[idx] 
    

In [80]:

def valid(model, valid_dataloader,tokenizer):
    model.eval()
    
    running_loss = 0
    for batch in valid_dataloader :
        input_ids = batch[0]
        masks = batch[1]
        labels = batch[2]
        # Turn off gradients for validation, will speed up inference
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=masks, labels=labels)
            loss = outputs.loss
        running_loss += loss.item()

    random_batch = random.choice(list(valid_dataloader))

    
    original_text = tokenizer.decode(random_batch[2][0], skip_special_tokens=True) 
    print("Original Title:", original_text)
    outputs = model.generate(random_batch[0])
    print("Generate title:",outputs)

    return(running_loss/len(valid_dataloader))
    

In [79]:
def train():
    # Hyperparameters
    lr = 5e-5
    epochs = 15
    batch = 8
    print_every = 50
    seed = 123
    
    train_losses = []
    valid_losses = []

    model = Model()

    # Optimizer and tokenizer 
    tokenizer = model.tokenizer
    optimizer = model.configure_optimizers()

    
    # Readfile and make to dataloader
    filepath = "../data/processed/"
    df_train = pd.read_csv(filepath+'train.csv')
    df_valid = pd.read_csv(filepath+'valid.csv')    
    train_data = wikiData(df_train.head(2000) ,tokenizer,max_target_length)
    valid_data = wikiData(df_valid.head(200) ,tokenizer,max_target_length)
    train_dataloader = DataLoader(train_data, batch_size =batch,shuffle= True)
    valid_dataloader = DataLoader(valid_data, batch_size =batch,shuffle= True)

    
    for e in range(epochs):
        train_loss = 0
        running_loss = 0
        model.train()
        print("Epoch: {}/{}.. ".format(e + 1, epochs))
        for steps, batch in enumerate(train_dataloader):
            # load data and labels in the batch
            input_ids = batch[0]
            masks = batch[1]
            labels = batch[2]

            # Training
            model.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=masks, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            
            running_loss += loss.item()
            train_loss += loss.item()
            if steps % print_every == 0 and not steps == 0:
                # original_text = tokenizer.decode(labels[0], skip_special_tokens=True)
                # print("Original Title:", original_text)
                # outputs = model.generate(input_ids)
                # print("Generate title:",outputs)
                print(
                    "Batch: {}/{}.. ".format(steps, len(train_dataloader)),
                    "Training Loss: {:.3f}.. ".format(running_loss / print_every))
                running_loss = 0
                
            loss.backward()
            optimizer.step()
        valid_loss = valid(model, valid_dataloader,tokenizer)
        print(
            "Training Loss: {:.3f}.. ".format(train_loss / len(train_dataloader)),
            "Valid Loss: {:.3f} ".format(valid_loss),)
        valid_losses.append(valid_loss)
        train_losses.append(train_loss / len(train_dataloader))

In [81]:
train()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  self.input_ids = torch.tensor(input_tokenize['input_ids'])
  self.attention_mask = torch.tensor(input_tokenize['attention_mask'])
  self.labels = torch.tensor(label_tokenize['input_ids'])


Epoch: 1/15.. 
Batch: 50/250..  Training Loss: 4.359.. 
Batch: 100/250..  Training Loss: 1.109.. 
Batch: 150/250..  Training Loss: 0.762.. 
Batch: 200/250..  Training Loss: 0.692.. 
Original Title: Wikibooks: World of Warcraft/Professions




Generate title: 
Training Loss: 1.504..  Valid Loss: 0.504 
Epoch: 2/15.. 
Batch: 50/250..  Training Loss: 0.566.. 
Batch: 100/250..  Training Loss: 0.507.. 
Batch: 150/250..  Training Loss: 0.520.. 
Batch: 200/250..  Training Loss: 0.516.. 
Original Title: Wikibooks: The Rowers of Vanity Fair/Peel AW
Generate title: 
Training Loss: 0.512..  Valid Loss: 0.386 
Epoch: 3/15.. 
Batch: 50/250..  Training Loss: 0.461.. 
Batch: 100/250..  Training Loss: 0.418.. 
Batch: 150/250..  Training Loss: 0.403.. 
Batch: 200/250..  Training Loss: 0.404.. 
Original Title: Wikibooks: Handbook of Management Scales/Learning
Generate title: Wikibooks: Enterprise Resource Planning/Episode
Training Loss: 0.419..  Valid Loss: 0.340 
Epoch: 4/15.. 
Batch: 50/250..  Training Loss: 0.387.. 
Batch: 100/250..  Training Loss: 0.390.. 
Batch: 150/250..  Training Loss: 0.388.. 
Batch: 200/250..  Training Loss: 0.370.. 
Original Title: Wikibooks: Past LSAT Explained/PrepTest 48
Generate title: Wikibooks: Games/Sections