In [15]:
import pandas as pd
import os
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers.optimization import Adafactor
import time
import warnings
from IPython.display import HTML, display
warnings.filterwarnings('ignore')

In [16]:
torch.cuda.empty_cache()

In [17]:
class ContentGenerator():
    def __init__(self):
        self.batch_size = 8
        self.num_of_epochs = 10
        self.checkpoint_path = 'checkpoint/all-10epoch-reviews-model.bin'
        self.config_path = 'data/t5-base-config.json'
    
    def getData(self):
        self.train_df = pd.read_csv('data/review_keywords_All_Beauty_5.csv')
        self.train_df = self.train_df[:500]
        self.num_of_batches = int(len(self.train_df)/self.batch_size)
        # print(self.train_df)
        
    def getDevice(self):
        if torch.cuda.is_available():
            self.dev = torch.device("cuda:0")
            print("Running on the GPU")
        else:
            self.dev = torch.device("cpu")
            print("Running on the CPU")
    
    def getTokenizer(self):
        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
    
    def getModel(self):
        self.getData()
        self.getDevice()
        self.getTokenizer()
        self.model = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True)
        self.model.to(self.dev)
        self.optimizer = Adafactor(
            self.model.parameters(),
            lr=1e-3,
            eps=(1e-30, 1e-3),
            clip_threshold=1.0,
            decay_rate=-0.8,
            beta1=None,
            weight_decay=0.0,
            relative_step=False,
            scale_parameter=False,
            warmup_init=False
        )
        
    def progress(self, loss,value, max=100):
        return HTML(""" Batch loss :{loss}
            <progress
                value='{value}'
                max='{max}',
                style='width: 100%'
            >
                {value}
            </progress>
        """.format(loss=loss,value=value, max=max))
    
    def fit(self):
        self.getModel()
        self.model.train()
        self.train_df = self.train_df.dropna()
        loss_per_10_steps=[]
        for epoch in range(1, self.num_of_epochs+1):
            print('Running epoch: {}'.format(epoch))
            running_loss=0

            out = display(self.progress(1, self.num_of_batches+1), display_id=True)
            for i in range(self.num_of_batches):
                inputbatch = []
                labelbatch = []
                new_df = self.train_df[i*self.batch_size:i*self.batch_size+self.batch_size]
                for indx,row in new_df.iterrows():
                    input = 'WebNLG: '+row['input_text']+'</s>' 
                    labels = row['target_text']+'</s>'   
                    inputbatch.append(input)
                    labelbatch.append(labels)

                if len(inputbatch) < 1:
                    continue
                inputbatch = self.tokenizer.batch_encode_plus(inputbatch,padding=True,max_length=400,return_tensors='pt')["input_ids"]
                labelbatch = self.tokenizer.batch_encode_plus(labelbatch,padding=True,max_length=400,return_tensors="pt") ["input_ids"]
                inputbatch = inputbatch.to(self.dev)
                labelbatch = labelbatch.to(self.dev)

                self.optimizer.zero_grad()

                outputs = self.model(input_ids=inputbatch, labels=labelbatch)
                loss = outputs.loss
                loss_num = loss.item()
                logits = outputs.logits
                running_loss += loss_num
                if i%10 ==0:
                    loss_per_10_steps.append(loss_num)
                out.update(self.progress(loss_num, i, self.num_of_batches+1))
                loss.backward()
                self.optimizer.step()

            running_loss = running_loss/int(self.num_of_batches)
            print('Epoch: {} , Running loss: {}'.format(epoch,running_loss))
        self.saveModel()
        self.emptyCudaCache()

    def saveModel(self):
        torch.save(self.model.state_dict(), self.checkpoint_path)
    
    def loadModel(self):
        return T5ForConditionalGeneration.from_pretrained(self.checkpoint_path, return_dict=True, config=self.config_path)
    
    def emptyCudaCache(self):
        torch.cuda.empty_cache()
    
    def generate(self, text):
        torch.manual_seed(0)
        model = self.loadModel()
        model.eval()
        input_ids = self.tokenizer.encode("WebNLG:{} </s>".format(text), return_tensors="pt")
        sample_outputs = model.generate(
            input_ids,
            do_sample=True,
            max_length=50, 
            top_k=4, 
            top_p=0.99,
            num_return_sequences=1
        )

        print("Output:\n" + 100 * '-')
        ans = []
        for i, sample_output in enumerate(sample_outputs):
            ans.append(self.tokenizer.decode(sample_output, skip_special_tokens=True))
        return ans
        # for i, sample_output in enumerate(sample_outputs):
        #   print("{}: {}".format(i, self.tokenizer.decode(sample_output, skip_special_tokens=True)))

In [18]:
obj = ContentGenerator()

In [19]:
obj.getTokenizer()

In [20]:
csv_data = pd.read_csv('data/review_keywords_Software_5.csv')
csv_data[:5]

Unnamed: 0,input_text,target_text
0,version | software | pay | middle | update | f...,I just recently converted to this version from...
1,tech | support | fact | method | product | res...,If you have any problems you will not be able ...
2,look | hood | information | engine | user | in...,"Because, while I'm not about to go looking und..."
3,desktop | sound | video | editing | software |...,Corel VideoStudio Ultimate X8 installed on my ...
4,experience | accounting | product | version | ...,My last experience with a Peachtree accounting...


In [24]:
generated_data_arr = {'input_text': [], 'target_text': [], 'generated_text': []}
data = csv_data.copy()
for index, row in data.iterrows():
    generated_data_arr = {'input_text': [], 'target_text': [], 'generated_text': []}
    generated_data_arr['input_text'].append(row['input_text'])
    generated_data_arr['target_text'].append(row['target_text'])
    generated_data_arr['generated_text'].append(obj.generate(row['input_text'])[0])
    
    df = pd.DataFrame(generated_data_arr)
    df.to_csv('data/generated_Software_5.csv', columns=['input_text', 'target_text', 'generated_text'], mode='a', header=False, index=False)

len(generated_data_arr['input_text'])

Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
-----------

Token indices sequence length is longer than the specified maximum sequence length for this model (783 > 512). Running this sequence through the model will result in indexing errors


Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
----------------------------------------------------------------------------------------------------
Output:
-----------

KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(generated_data_arr)
df

In [12]:
df.to_csv('data/generated_Software_5.csv', columns=['input_text', 'target_text', 'generated_text'], mode='a', index=False)

In [51]:
# random inference on model
key = 'shampoo | change | skin | harmful'
ans = obj.generate(key)
ans[0]

Output:
----------------------------------------------------------------------------------------------------


"I have been using this shampoo for years and found helpful. It helps a lot of people with dry skin, but it is mostly just a shampoo that doesn't cause any damage to my skin or"