In [25]:
import pandas as pd
import os
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers.optimization import Adafactor
import time
import warnings
from IPython.display import HTML, display
warnings.filterwarnings('ignore')

In [26]:
torch.cuda.empty_cache()

In [27]:
class ContentGenerator():
    def __init__(self):
        self.batch_size = 8
        self.num_of_epochs = 10
        self.checkpoint_path = 'checkpoint/reviews-model.bin'
        self.config_path = 'data/t5-base-config.json'
    
    def getData(self):
        self.train_df = pd.read_csv('data/review_keywords_All_Beauty_5.csv')
        self.train_df = self.train_df[:500]
        self.num_of_batches = int(len(self.train_df)/self.batch_size)
        # print(self.train_df)
        
    def getDevice(self):
        if torch.cuda.is_available():
            self.dev = torch.device("cuda:0")
            print("Running on the GPU")
        else:
            self.dev = torch.device("cpu")
            print("Running on the CPU")
    
    def getTokenizer(self):
        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
    
    def getModel(self):
        self.getData()
        self.getDevice()
        self.getTokenizer()
        self.model = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True)
        self.model.to(self.dev)
        self.optimizer = Adafactor(
            self.model.parameters(),
            lr=1e-3,
            eps=(1e-30, 1e-3),
            clip_threshold=1.0,
            decay_rate=-0.8,
            beta1=None,
            weight_decay=0.0,
            relative_step=False,
            scale_parameter=False,
            warmup_init=False
        )
        
    def progress(self, loss,value, max=100):
        return HTML(""" Batch loss :{loss}
            <progress
                value='{value}'
                max='{max}',
                style='width: 100%'
            >
                {value}
            </progress>
        """.format(loss=loss,value=value, max=max))
    
    def fit(self):
        self.getModel()
        self.model.train()
        self.train_df = self.train_df.dropna()
        loss_per_10_steps=[]
        for epoch in range(1, self.num_of_epochs+1):
            print('Running epoch: {}'.format(epoch))
            running_loss=0

            out = display(self.progress(1, self.num_of_batches+1), display_id=True)
            for i in range(self.num_of_batches):
                inputbatch = []
                labelbatch = []
                new_df = self.train_df[i*self.batch_size:i*self.batch_size+self.batch_size]
                for indx,row in new_df.iterrows():
                    input = 'WebNLG: '+row['input_text']+'</s>' 
                    labels = row['target_text']+'</s>'   
                    inputbatch.append(input)
                    labelbatch.append(labels)

                if len(inputbatch) < 1:
                    continue
                inputbatch = self.tokenizer.batch_encode_plus(inputbatch,padding=True,max_length=400,return_tensors='pt')["input_ids"]
                labelbatch = self.tokenizer.batch_encode_plus(labelbatch,padding=True,max_length=400,return_tensors="pt") ["input_ids"]
                inputbatch = inputbatch.to(self.dev)
                labelbatch = labelbatch.to(self.dev)

                self.optimizer.zero_grad()

                outputs = self.model(input_ids=inputbatch, labels=labelbatch)
                loss = outputs.loss
                loss_num = loss.item()
                logits = outputs.logits
                running_loss += loss_num
                if i%10 ==0:
                    loss_per_10_steps.append(loss_num)
                out.update(self.progress(loss_num, i, self.num_of_batches+1))
                loss.backward()
                self.optimizer.step()

            running_loss = running_loss/int(self.num_of_batches)
            print('Epoch: {} , Running loss: {}'.format(epoch,running_loss))
        self.saveModel()
        self.emptyCudaCache()

    def saveModel(self):
        torch.save(self.model.state_dict(), self.checkpoint_path)
    
    def loadModel(self):
        return T5ForConditionalGeneration.from_pretrained(self.checkpoint_path, return_dict=True, config=self.config_path)
    
    def emptyCudaCache(self):
        torch.cuda.empty_cache()
    
    def generate(self, text):
        torch.manual_seed(0)
        model = self.loadModel()
        model.eval()
        input_ids = self.tokenizer.encode("WebNLG:{} </s>".format(text), return_tensors="pt")
        sample_outputs = model.generate(
            input_ids,
            do_sample=True,
            max_length=50, 
            top_k=4, 
            top_p=0.99,
            num_return_sequences=10
        )

        print("Output:\n" + 100 * '-')
        for i, sample_output in enumerate(sample_outputs):
          print("{}: {}".format(i, self.tokenizer.decode(sample_output, skip_special_tokens=True)))

In [28]:
obj = ContentGenerator()

In [29]:
obj.fit()

Running on the CPU
Running epoch: 1


Epoch: 1 , Running loss: 1.8760927684845463
Running epoch: 2


Epoch: 2 , Running loss: 1.0149331294721173
Running epoch: 3


Epoch: 3 , Running loss: 0.6511879577752082
Running epoch: 4


Epoch: 4 , Running loss: 0.44876767178216287
Running epoch: 5


Epoch: 5 , Running loss: 0.3281198222069971
Running epoch: 6


Epoch: 6 , Running loss: 0.25544375127121327
Running epoch: 7


Epoch: 7 , Running loss: 0.21032490617325228
Running epoch: 8


Epoch: 8 , Running loss: 0.1724404438789333
Running epoch: 9


Epoch: 9 , Running loss: 0.13932882042060937
Running epoch: 10


Epoch: 10 , Running loss: 0.11538848540775719


In [21]:
obj = ContentGenerator()
obj.getTokenizer()

In [22]:
key = 'shampoo | change | skin | harmful'
obj.generate(key)

Output:
----------------------------------------------------------------------------------------------------
0: I just LOVE this shampoo! No more chemicals, no added chemicals, no added chemicals, no added chemicals, no added chemicals, no added chemicals, no added chemicals, no added chemicals, no added chemicals, no added chemicals,
1: Shampoo is a wonderful shampoo. It makes my skin feel really clean and soft. No chemicals, no pesticides, or preservatives and not tested on animals.
2: Shampoo uses only the purest of ingredients and not chemically based on any other product. This enables me to use only the best of the best and most effective shampoo ever made. No added chemicals nor preservative
3: Shampoo uses nothing more than a simple change of the skin (except for the shampoo itself). This is not a great shampoo, it doesn't make your skin red or dry, but it does produce
4: Shampoo can cause skin changes that can cause skin cancer. Useful shampoo can be helpful but not harmful.
5: