In [None]:
import os
import pandas as pd
import numpy as np
# Mount Google Drive into the Colab filesystem; the dataset path used below lives under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os
# Switch the working directory to the scraped-stories folder on the mounted drive,
# so that relative file names used afterwards resolve inside it.
path = '/content/drive/MyDrive/news dataset/children stories/scraped/'
os.chdir(path)

In [None]:
# Load the story table; the file name is relative to the current working directory.
DATASET_CSV = 'dataset_with_keys_15b.csv'
train_df = pd.read_csv(DATASET_CSV)

In [None]:
# Peek at the first row to check the available columns.
train_df.head(1)

Unnamed: 0,title,story,class,url,keywords,key_scores
0,On the go with Zibo,zibo is a little zebra. you know what a zebra ...,adventure,https://www.bedtime.com/on-the-go-with-zibo/,"['zibo', 'animal', 'give', 'zebra', 'cow', 'st...","[0.025387348146749516, 0.09251316835653439, 0...."


In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration,Adafactor
# Define your custom dataset
class MyDataset(Dataset):
    """Turns story rows into (prompt, story) tensors for a seq2seq model.

    Each item builds the prompt
    ``"<title> {title} <keywords> {k1,k2,...}<class> {class}"`` and tokenizes it
    together with the ``story`` text.

    Args:
        dataframe: table with ``title``, ``story``, ``class`` columns and a
            keyword column named ``keywords`` (or the legacy name ``key``).
            The keyword cell may be a real list or the string form of a list,
            which is what a CSV round-trip produces.
        tokenizer: object exposing ``encode_plus`` that returns ``input_ids``
            and ``attention_mask`` tensors.
        max_input_length: padded/truncated length of the prompt.
        max_target_length: padded/truncated length of the story.
    """

    def __init__(self, dataframe, tokenizer, max_input_length=100, max_target_length=2000):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.data)

    @staticmethod
    def _keywords_to_list(raw):
        """Normalize a keyword cell to a list of strings.

        Handles a list/tuple, the string form of a list (e.g. ``"['a', 'b']"``),
        a plain comma-separated string, and missing values (returns ``[]``).
        """
        import ast  # local import: only needed for parsing stringified lists

        if isinstance(raw, str):
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, SyntaxError):
                parsed = None
            if isinstance(parsed, (list, tuple)):
                return [str(item) for item in parsed]
            # Not a Python literal list: treat it as comma-separated text.
            return [part.strip() for part in raw.split(',') if part.strip()]
        try:
            return [str(item) for item in raw]
        except TypeError:
            # Non-iterable cell (None, NaN): no keywords.
            return []

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row.

        Padding positions in ``labels`` are set to -100 so that they are
        ignored by the model's loss instead of being trained on.
        """
        row = self.data.iloc[index]

        # The keyword column is called `keywords` in the loaded CSV; keep
        # accepting `key` for frames that use the older name.
        keyword_column = 'keywords' if 'keywords' in row.index else 'key'
        keywords = self._keywords_to_list(row[keyword_column])

        # NOTE: the prompt layout (including no space before '<class>') is kept
        # exactly as before so it stays in sync with the generation cell.
        input_str = "<title> "+ row['title'] + " <keywords> " + ",".join(keywords) +'<class> ' + row['class']
        labels = row['story']

        # tokenize the input string and label
        input_encoding = self.tokenizer.encode_plus(
            input_str,
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        label_encoding = self.tokenizer.encode_plus(
            labels,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Mask padded label positions with -100 (the ignore index of the loss).
        label_ids = label_encoding['input_ids'].flatten()
        label_padding = label_encoding['attention_mask'].flatten() == 0
        label_ids = label_ids.masked_fill(label_padding, -100)

        # return a dictionary of input and label tensors
        return {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': label_ids
        }

In [None]:
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
# Both the tokenizer and the seq2seq model are loaded from the same pretrained checkpoint.
PRETRAINED_NAME = 't5-small'

tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_NAME)
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_NAME)
model = model.to(device)  # move the weights to the selected device

In [None]:
# Build the training dataset and its DataLoader.
def stack_examples(examples):
    """Stack the per-example tensors of a batch into one dict of batched tensors."""
    return {
        field: torch.stack([example[field] for example in examples])
        for field in ('input_ids', 'attention_mask', 'labels')
    }


# Only the first 280 rows of the table are used for training.
dataset = MyDataset(train_df[:280], tokenizer)

# Batch size and number of loader worker processes.
batch_size = 4
num_workers = 2

dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
    collate_fn=stack_examples,
)

In [None]:
# Number of passes over the training data.
num_of_epochs = 100

# Adafactor with an explicit fixed learning rate: relative-step scheduling,
# parameter scaling and warmup initialisation are all switched off.
adafactor_kwargs = dict(
    lr=1e-3,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)
optimizer = Adafactor(model.parameters(), **adafactor_kwargs)

In [None]:
# Train the model
# Models returned by from_pretrained() start in evaluation mode, so switch to
# training mode explicitly; otherwise dropout stays disabled during fine-tuning.
model.train()
for epoch in range(1, num_of_epochs + 1):
    print('Running epoch: {}'.format(epoch))
    running_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        # Inputs are padded to a fixed length, so the mask must be passed along
        # to keep the encoder from attending to padding tokens.
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear out the gradients of all Variables
        optimizer.zero_grad()

        # Forward propagation
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward propagation
        loss.backward()

        # Update the parameters
        optimizer.step()

        running_loss += loss.item()

    # Average over the number of batches; guard against an empty loader.
    running_loss /= max(len(dataloader), 1)
    print('Epoch: {}, Running loss: {}'.format(epoch, running_loss))

# Leave the model in evaluation mode so later saving/generation cells run
# without dropout.
model.eval()

In [None]:
# Persist the fine-tuned weights (state dict only) in the current working directory.
CHECKPOINT_FILE = 'T5_story_gen_key.bin'
torch.save(model.state_dict(), CHECKPOINT_FILE)

In [None]:
# Title of the second row, shown as a reference example.
train_df['title'].iloc[1]

In [None]:
# Hand-written prompt fields for a generation example:
# story class, story title, and a comma-separated keyword string.
cls = 'adventure'
title = 'A good friend'
key = 'tulsi,winner, played games, forest,winter,tulsi knows,mobile, animals, sky, books, star and moon, morning sunshine,soil on floor'

In [None]:
# Set the input string
# `key` is already one comma-separated string. Calling ",".join() on a string
# would put a comma between every single character, so only join when a
# list/tuple of keywords is supplied.
keywords_str = key if isinstance(key, str) else ",".join(key)
input_str = "<title> "+ title + " <keywords> " + keywords_str +'<class> ' + cls
max_length=200
# Tokenize the input string
input_ids = tokenizer.encode(input_str, return_tensors='pt')
input_ids = input_ids.to(device)
# Generate the output (evaluation mode so dropout does not perturb decoding)
model.eval()
output_ids = model.generate(input_ids,max_length=max_length)

# Decode the output
output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Show the generated text as the cell result
output_str

'one good friend tulsi,winner, played games, forest,winter,tulsi knows,mobile, sports, tennis, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, hockey, snowboarding, snowboarding etc. tulsi is a good friend tulsi,winner,winner,winner,winner,winner was a good friend tulsi, who loves to play games, watched snow falling from the stars sky, sky, sky, sky, sky, mountains, and even visited boulders., there are many many varieties of games, stone to choose between,winner and the prizes will be presented for the evening., as a good friend tulsi watched the video games on the floor adventure.,'