In [1]:
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPTNeoForCausalLM, TrainingArguments, Trainer
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import os
from tqdm import tqdm

In [2]:
# Set the random seed to a fixed value to get reproducible results 
torch.manual_seed(42)
# Download the pre-trained GPT-Neo model's tokenizer
# Add the custom tokens denoting the beginning and the end 
# of the sequence and a special token for padding
# tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B",    
#                             bos_token="<|startoftext|>",
#                             eos_token="<|endoftext|>",
#                             pad_token="<|pad|>")
tokenizer = GPT2Tokenizer.from_pretrained("models/tokenizer/neo")
# special_tokens_dict = {
#         "bos_token": "<BOS>",
#         "eos_token": "<EOS>",
#         "pad_token": "<PAD>",
#         "additional_special_tokens": [
#             "<endprompt>",
#         ],
#     }

# num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# Download the pre-trained GPT-Neo model and transfer it to the GPU
model = GPTNeoForCausalLM.from_pretrained("models/checkpoint-40000").cuda()
# Resize the token embeddings because we've just added 3 new tokens 
# model.resize_token_embeddings(len(tokenizer))

In [3]:
with open('data/Fairy_tales_combined (1).txt', "r", encoding='utf-8-sig') as file:
    data = file.readlines()

In [4]:
class NetflixDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            # Encode the descriptions using the GPT-Neo tokenizer
            encodings_dict = tokenizer(txt,
                                        truncation=True,
                                        max_length=max_length, 
                                        padding="max_length")
            input_ids = torch.tensor(encodings_dict['input_ids'])    
            self.input_ids.append(input_ids)
            mask = torch.tensor(encodings_dict['attention_mask'])
            self.attn_masks.append(mask)
    
    def __len__(self):
     return len(self.input_ids)

    def __getitem__(self, idx):
     return self.input_ids[idx], self.attn_masks[idx]

In [5]:
# max_length = max([len(tokenizer.encode(row)) for row in data])

dataset = NetflixDataset(data, tokenizer, 512)


In [6]:
len(dataset)

9183

In [7]:
dataset[0]

(tensor([50257, 10970,   367, 24805,    56,  4810,  1268,  5222,   220,  3334,
          2029,   262,  1748,   837,   319,   257,  7331,  5721,   837,  6204,
           262, 15207,   286,   262, 14628,   220,  9005,   764, 50260,  1544,
           373,   308, 46158,   477,   625,   351,  7888,  5667,   286,  3734,
          3869,   837,   329,  2951,   220,   339,   550,   734,  6016,   473,
           381,    71,  2387,   837,   290,   257,  1588,  2266, 43506,  1278,
          6972,   319,   465,   220,  8429,   532,   289,  2326,   764,   220,
           679,   373,   845,   881, 29382,  5600,   764,   366,   679,   318,
           355,  4950,   355,   257,  6193, 21517,   837,   366,   220, 24998,
           530,   286,   262,  8329,  3545, 20346,   669,   508, 16555,   284,
          4461,   257,  8507,   329,   220,  1719, 17290, 18221,  2162,   366,
           691,   407,  2407,   523,  4465,   837,   366,   339,  2087,   837,
         33188,   220, 27380,   661,   815,   892,  

In [8]:
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, 
                            [train_size, len(dataset) - train_size])

In [9]:
# Here I will pass the output directory where 
# the model predictions and checkpoints will be stored, 
# batch sizes for the training and validation steps, 
# and warmup_steps to gradually increase the learning rate
training_args = TrainingArguments(output_dir="./models",
                                  num_train_epochs=10,
                                  logging_steps=5000,
                                  save_steps=5000,                                   
                                  per_device_train_batch_size=2,
                                  per_device_eval_batch_size=2,
                                  warmup_steps=100,
                                  weight_decay=0.01,  
                                  logging_dir="./logs")

In [None]:
trainer = Trainer(model=model, args=training_args,  
                  train_dataset=train_dataset,
                  eval_dataset=val_dataset, 
                  # This custom collate function is necessary 
                  # to built batches of data
                  data_collator=lambda data: 
              {"input_ids": torch.stack([f[0] for f in data]),       
               "attention_mask": torch.stack([f[1] for f in data]),
               "labels": torch.stack([f[0] for f in data])})
# Start training process!
trainer.train()

***** Running training *****
  Num examples = 8264
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 41320


Step,Training Loss
5000,1.4635
10000,1.4403
15000,1.277
20000,1.1177
25000,0.9553
30000,0.7905
35000,0.6642


Saving model checkpoint to ./models\checkpoint-5000
Configuration saved in ./models\checkpoint-5000\config.json
Model weights saved in ./models\checkpoint-5000\pytorch_model.bin
Saving model checkpoint to ./models\checkpoint-10000
Configuration saved in ./models\checkpoint-10000\config.json
Model weights saved in ./models\checkpoint-10000\pytorch_model.bin
Saving model checkpoint to ./models\checkpoint-15000
Configuration saved in ./models\checkpoint-15000\config.json
Model weights saved in ./models\checkpoint-15000\pytorch_model.bin
Saving model checkpoint to ./models\checkpoint-20000
Configuration saved in ./models\checkpoint-20000\config.json
Model weights saved in ./models\checkpoint-20000\pytorch_model.bin
Saving model checkpoint to ./models\checkpoint-25000
Configuration saved in ./models\checkpoint-25000\config.json
Model weights saved in ./models\checkpoint-25000\pytorch_model.bin
Saving model checkpoint to ./models\checkpoint-30000
Configuration saved in ./models\checkpoint-30

In [None]:
generated = tokenizer.encode(
    f" <BOS> MY FATHER MEETS THE CAT  <newline>  <newline>  <newline>  One cold rainy day when my father was a little boy , he met an old  <newline>  alley cat on his street . <endprompt> <EOS>",
    return_tensors="pt").cuda()
model = model.cuda()

sample_outputs = model.generate(generated, do_sample=False, top_k=50, max_length=1024, top_p=0.95,
                                temperature=0, num_return_sequences=0, repetition_penalty=1.1)
# sample_outputs = model.generate(generated, max_length=50)
predicted_text2 = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
print(predicted_text2)


In [None]:
tokenizer.save_pretrained('models/tokenizer/neo/')