In [1]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from functools import partial

import gc

In [2]:
import json
import numpy as np
import os

# Root folder of the project. NOTE(review): this is an absolute path on the
# author's machine; it has to be adapted before running the notebook elsewhere.
base_dir = '/mnt/d/Study/Thesis/thesis-implementations/'
# Name of the pretrained checkpoint passed to `from_pretrained` below.
model_name = 'gpt2'

# Folder that holds the quest data and the names of the two JSONL splits.
dataset_path = os.path.join(base_dir, 'quest_generation/llama2/data')
train_file = 'train.jsonl'
val_file = 'val.jsonl'
# Mapping of split name -> file name, handed to `load_dataset` as `data_files`.
data_files = {
	"train": train_file, 
	"val": val_file
}

In [3]:
# Load both splits in one call and keep a separate handle on each of them.
dataset = load_dataset(dataset_path, data_files=data_files)
train_dataset = dataset['train']
val_dataset = dataset['val']

# The combined object is no longer needed once the splits are bound to their
# own names; drop the reference and run a garbage-collection pass.
del dataset
gc.collect()

Found cached dataset json (/home/manish/.cache/huggingface/datasets/json/data-1eab3bb49fa34974/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/2 [00:00<?, ?it/s]

57

In [7]:
# Load the pretrained language model and its tokenizer from `model_name`.
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# End-of-sequence marker; the prompt formatters append it to every sample.
EOS_TOKEN = tokenizer.eos_token
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [8]:
def create_prompt_formats_with_kg(input):
    """
    Build the complete training prompt for one quest sample.

    The knowledge-graph entries ('kbs'), the plot points ('plots') and the
    quest ('quest') are rendered as text sections, concatenated using two
    newline characters and stored under ``input['text']``, followed by a
    newline and the module-level ``EOS_TOKEN``.

    :param input: input dictionary with the keys 'kbs' (entries providing
        'name', 'description', 'type' and 'relations'), 'plots' (list of
        strings) and 'quest' (mapping of field name to value)
    :return: the same dictionary, with the 'text' key added
    """

    BACKGROUND = "### Background:"
    PLOTS_KEY = "### Plots:"
    INTRO_BLURB = "The quest related to the above information is as follows."
    QUEST = "### Quest:"
    END_KEY = "### End"

    blurb = f"{INTRO_BLURB}"  # intro blurb - model system instruction

    # Background - knowledge graph as text, one line per entity.
    kb_lines = []
    for kb in input['kbs']:
        entity = kb['name']
        desc = kb['description']
        e_type = kb['type']
        relations = kb['relations']
        line = f'{entity} is a {e_type}. '
        if entity != desc:  # skip descriptions that merely repeat the name
            line += f'{entity} is a {desc}. '
        line += ''.join(f' {entity} is {rel[0]} {rel[1]}.' for rel in relations)
        kb_lines.append(line + '\n')
    kb_text = ''.join(kb_lines)
    background = f"{BACKGROUND}\n{kb_text}"

    # Plots - key plot points, one per line.
    plots_str = '\n'.join(input['plots'])
    plots = f"{PLOTS_KEY}\n{plots_str}"

    # Quest - every field except 'description', rendered as "Key: value".
    quest_lines = []
    for k, v in input['quest'].items():
        if k == 'description':
            continue
        if k == 'tasks':
            # NOTE(review): the last task entry is left out, exactly as before;
            # confirm against the data format that this is intended.
            # A plain comprehension is used instead of np.char.capitalize so
            # that an empty slice (quest with 0 or 1 tasks) does not raise.
            tasks = [task.capitalize() for task in v[:-1]]
            value = '\n ' + '\n '.join(tasks)
        else:
            value = v.capitalize()
        quest_lines.append(f'{k.capitalize()}: {value}\n')
    quest_str = ''.join(quest_lines)
    quest = f"{QUEST}\n{quest_str}"

    end = f"{END_KEY}"  # end key

    parts = [part for part in [background, plots, blurb, quest, end] if part]

    formatted_prompt = "\n\n".join(parts)
    input['text'] = formatted_prompt + f'\n{EOS_TOKEN}'

    return input

In [9]:
def create_prompt_formats_val_with_kg(input):
    """
    Build the validation prompt and the expected completion for one sample.

    The knowledge-graph entries ('kbs') and the plot points ('plots') are
    rendered together with the intro blurb and stored under ``input['text']``.
    The quest section and the end key are stored separately under
    ``input['output']``, followed by a newline and the module-level
    ``EOS_TOKEN``. Sections are concatenated using two newline characters.

    :param input: input dictionary with the keys 'kbs' (entries providing
        'name', 'description', 'type' and 'relations'), 'plots' (list of
        strings) and 'quest' (mapping of field name to value)
    :return: the same dictionary, with the 'text' and 'output' keys added
    """

    BACKGROUND = "### Background:"
    PLOTS_KEY = "### Plots:"
    INTRO_BLURB = "The quest related to the above information is as follows."
    QUEST = "### Quest:"
    END_KEY = "### End"

    blurb = f"{INTRO_BLURB}"  # intro blurb - model system instruction

    # Background - knowledge graph as text, one line per entity.
    kb_lines = []
    for kb in input['kbs']:
        entity = kb['name']
        desc = kb['description']
        e_type = kb['type']
        relations = kb['relations']
        line = f'{entity} is a {e_type}. '
        if entity != desc:  # skip descriptions that merely repeat the name
            line += f'{entity} is a {desc}. '
        line += ''.join(f' {entity} is {rel[0]} {rel[1]}.' for rel in relations)
        kb_lines.append(line + '\n')
    kb_text = ''.join(kb_lines)
    background = f"{BACKGROUND}\n{kb_text}"

    # Plots - key plot points, one per line.
    plots_str = '\n'.join(input['plots'])
    plots = f"{PLOTS_KEY}\n{plots_str}"

    # Quest - every field except 'description', rendered as "Key: value".
    quest_lines = []
    for k, v in input['quest'].items():
        if k == 'description':
            continue
        if k == 'tasks':
            # NOTE(review): the last task entry is left out, exactly as before;
            # confirm against the data format that this is intended.
            # A plain comprehension is used instead of np.char.capitalize so
            # that an empty slice (quest with 0 or 1 tasks) does not raise.
            tasks = [task.capitalize() for task in v[:-1]]
            value = '\n ' + '\n '.join(tasks)
        else:
            value = v.capitalize()
        quest_lines.append(f'{k.capitalize()}: {value}\n')
    quest_str = ''.join(quest_lines)
    quest = f"{QUEST}\n{quest_str}"

    end = f"{END_KEY}"  # end key

    # Prompt side (model input) and output side (reference completion).
    parts_p = [part for part in [background, plots, blurb] if part]
    parts_o = [part for part in [quest, end] if part]

    formatted_prompt = "\n\n".join(parts_p)
    formatted_output = "\n\n".join(parts_o)
    input['text'] = formatted_prompt
    input['output'] = formatted_output + f'\n{EOS_TOKEN}'

    return input

In [10]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """
    Look up the maximum sequence length in the model configuration.

    The config attributes "n_positions", "max_position_embeddings" and
    "seq_length" are probed in that order and the first truthy value wins.
    When none of them yields a value, 1024 is used. The choice is printed.

    :param model: object exposing a ``config`` attribute
    :return: the detected maximum length, or 1024 as the default
    """
    config_fields = ("n_positions", "max_position_embeddings", "seq_length")
    configured = (getattr(model.config, field, None) for field in config_fields)
    detected = next((value for value in configured if value), None)

    if detected:
        print(f"Found max lenth: {detected}")
        return detected

    fallback = 1024
    print(f"Using default max length: {fallback}")
    return fallback


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the "text" entry of a batch, truncating to ``max_length``.

    :param batch: mapping whose "text" entry holds the text to tokenize
    :param tokenizer: callable accepting the text plus the ``max_length`` and
        ``truncation`` keyword arguments
    :param max_length: token limit forwarded to the tokenizer
    :return: whatever the tokenizer call returns
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded


# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer, max_length: int, dataset: str, include_kg: bool = True):
    """Turn the raw training split into tokenized prompts.

    Steps: build the prompt text per sample, tokenize in batches while removing
    the raw columns, then keep only samples with fewer than ``max_length``
    tokens.

    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param dataset: object providing ``map`` and ``filter`` (annotated as
        ``str`` in the signature, but used as a dataset object)
    :param include_kg (bool): Accepted but not used by this function; the
        knowledge-graph prompt format is always applied
    :return: the formatted, tokenized and filtered dataset
    """
    print("Preprocessing dataset...")

    # Raw fields that are dropped once the prompt text has been tokenized.
    raw_columns = ["id", "game", "kbs", "plots", "quest"]
    tokenize = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)

    return (
        dataset
        # 1) add the 'text' prompt to each sample
        .map(create_prompt_formats_with_kg)
        # 2) tokenize batch-wise and drop the raw columns
        .map(tokenize, batched=True, remove_columns=raw_columns)
        # 3) keep only samples whose token count is below max_length
        .filter(lambda sample: len(sample["input_ids"]) < max_length)
    )

def preprocess_val_dataset(tokenizer, max_length: int, dataset: str, include_kg: bool = True):
    """Turn the raw validation split into tokenized prompts.

    Steps: build the prompt text per sample, tokenize in batches while removing
    the raw columns, then keep only samples with fewer than ``max_length``
    tokens.

    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param dataset: object providing ``map`` and ``filter`` (annotated as
        ``str`` in the signature, but used as a dataset object)
    :param include_kg (bool): Whether to include knowledge graph in the prompt.
        NOTE(review): the ``False`` branch refers to
        ``create_prompt_formats_val_without_kg``, which is not defined in the
        part of the notebook visible here - confirm it exists before using it.
    :return: the formatted, tokenized and filtered dataset
    """
    print("Preprocessing dataset...")

    # Pick the prompt formatter; only the selected name is looked up.
    formatter = create_prompt_formats_val_with_kg if include_kg else create_prompt_formats_val_without_kg

    # Raw fields that are dropped once the prompt text has been tokenized.
    raw_columns = ["id", "game", "kbs", "plots", "quest"]
    tokenize = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)

    return (
        dataset
        # 1) add the prompt fields to each sample
        .map(formatter)
        # 2) tokenize batch-wise and drop the raw columns
        .map(tokenize, batched=True, remove_columns=raw_columns)
        # 3) keep only samples whose token count is below max_length
        .filter(lambda sample: len(sample["input_ids"]) < max_length)
    )

In [11]:
# ## Preprocess dataset
# Determine the token limit from the model config, then format and tokenize
# both splits.
# NOTE(review): `train_dataset` and `val_dataset` are rebound to their
# processed versions, so this cell is presumably not safe to re-run without
# re-running the loading cell first - confirm.
max_length = get_max_length(model)
train_dataset = preprocess_dataset(tokenizer, max_length, train_dataset)
val_dataset = preprocess_val_dataset(tokenizer, max_length, val_dataset)

Loading cached processed dataset at /home/manish/.cache/huggingface/datasets/json/data-1eab3bb49fa34974/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-3c21b8ebdb9d8807.arrow


Found max lenth: 1024
Preprocessing dataset...


Map:   0%|          | 0/692 [00:00<?, ? examples/s]

Filter:   0%|          | 0/692 [00:00<?, ? examples/s]

Loading cached processed dataset at /home/manish/.cache/huggingface/datasets/json/data-1eab3bb49fa34974/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-f240f2ec28fb2639.arrow


Preprocessing dataset...


Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Filter:   0%|          | 0/77 [00:00<?, ? examples/s]

In [12]:
def load_data_collator(tokenizer, mlm = False):
    """
    Create a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    :param tokenizer: tokenizer handed to the collator
    :param mlm: forwarded as the collator's ``mlm`` flag (default ``False``)
    :return: the configured data collator
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """
    Fine-tune the module-level ``model`` on the module-level ``train_dataset``.

    The tokenizer and the current model weights are written to ``output_dir``
    before training starts; after training, the trainer saves the model again.

    :param output_dir: directory for the tokenizer, checkpoints and final model
    :param overwrite_output_dir: forwarded to ``TrainingArguments``
    :param per_device_train_batch_size: forwarded to ``TrainingArguments``
    :param num_train_epochs: forwarded to ``TrainingArguments``
    :param save_steps: checkpoint interval, forwarded to ``TrainingArguments``
        (previously accepted but silently ignored)
    """
    data_collator = load_data_collator(tokenizer)

    # Persist the tokenizer (and the starting weights) next to the checkpoints.
    tokenizer.save_pretrained(output_dir)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [13]:
# Training settings - adjust these before starting a run.
# Directory that receives the tokenizer, the checkpoints and the final model.
output_dir = os.path.join(base_dir, 'models_local', 'gpt2', 'results', model_name, 'final_checkpoint')
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
# Checkpoint interval handed to train().
save_steps = 500

In [None]:
# Start fine-tuning with the settings defined in the previous cell.
train(
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
def load_model(model_path):
    """
    Load a GPT-2 language-model head model.

    :param model_path: value passed to ``GPT2LMHeadModel.from_pretrained``
    :return: the loaded model
    """
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """
    Load a GPT-2 tokenizer.

    :param tokenizer_path: value passed to ``GPT2Tokenizer.from_pretrained``
    :return: the loaded tokenizer
    """
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_path="gpt2"):
    """
    Sample a continuation of ``sequence`` and print the decoded text.

    :param sequence: prompt to continue
    :param max_length: forwarded to ``model.generate`` as ``max_length``
    :param model_path: model and tokenizer location passed to ``load_model``
        and ``load_tokenizer``. Defaults to "gpt2", the value that used to be
        hard-coded; pass the fine-tuned output directory to sample from the
        trained model instead.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        # the EOS token id is used as the padding id
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [None]:
# Read the prompt and the length limit interactively, then generate and print.
sequence = input() # prompt text, e.g.: oil price
max_len = int(input()) # length limit, e.g.: 20
generate_text(sequence, max_len) # example output: oil price for July June which had been low at as low as was originally stated Prices have since resumed