<a href="https://colab.research.google.com/github/trgscott/LELA70502_Coursework/blob/main/Can_language_models_generate_more_original_ideas%3F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Can language models generate more original ideas?**

The following code tests whether fine-tuning GPT-2 on fictional content and adjusting the decoding parameters can steer a model towards generating more original ideas. Originality is measured with self-METEOR (how similar the generated texts are to one another) and ROUGE-L (how much the generated texts overlap with a held-out set of baseline plot summaries). Answers to brainteaser puzzles are also generated to check the coherence and value of the generated ideas.

To compare decoding settings, change the temperature and top-k values by hand in the generation cells, then re-run the generation and evaluation cells.

Source for book plots:

https://www.cs.cmu.edu/~dbamman/booksummaries.html

Source for ratings of books:

https://cseweb.ucsd.edu/~jmcauley/datasets/goodreads.html

Source for puzzles:

https://huggingface.co/datasets/ErfanMoosaviMonazzah/brain-teasers

Source for underlying training code framework:

https://colab.research.google.com/drive/13dZVYEOMhXhkXWfvSMVM1TTtUDrT6Aeh?usp=sharing

In [None]:
!pip install transformers -U
!pip install evaluate
!pip install rouge_score

In [None]:
import pandas as pd
import re
import gzip
import json
import numpy as np
import torch
import time
import datetime
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
import random
from tqdm import tqdm
from random import randint
import evaluate
import nltk
from nltk.translate import meteor
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.corpus import WordNetCorpusReader, wordnet
from sklearn.model_selection import train_test_split

In [None]:
# Seed every random number generator used in this notebook (Python, NumPy, PyTorch on
# CPU and on all GPUs) with one value for reproducibility
seed_val = 42

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

# **Importing book plots, ratings and puzzles into pandas dataframes:**

In [None]:
#import plot summaries data
!wget https://www.cs.cmu.edu/~dbamman/data/booksummaries.tar.gz
!gunzip booksummaries.tar.gz
!tar -xvf booksummaries.tar
plots_df=pd.read_table("booksummaries/booksummaries.txt", header=None, names=["Wikipedia_ID", "Freebase_ID", "title", "Author", "Publication_Date", "Book_Genres", "Plot_Summary"])

In [None]:
#plots_df.loc[:,["Plot_Summary","title"]]

In [None]:
#import book reviews data
!wget https://mcauleylab.ucsd.edu/public_datasets/gdrive/goodreads/byGenre/goodreads_books_fantasy_paranormal.json.gz

In [None]:
# Read the downloaded .json.gz file, which holds one JSON record per line, into a dataframe
reviews_df = pd.read_json(path_or_buf='goodreads_books_fantasy_paranormal.json.gz', lines=True)

In [None]:
#reviews_df.loc[:,['title','average_rating']]

In [None]:
#attach the Goodreads average rating to each plot summary (the frames are joined on
#their shared 'title' column), then drop books without a rating and keep a single
#row per Wikipedia ID
plots_reviews = (
    plots_df
    .merge(reviews_df[['title', 'average_rating']], how='left')
    .dropna(subset=['average_rating'])
    .drop_duplicates(subset=['Wikipedia_ID'])
)

In [None]:
#keep only books rated strictly above 3.7 (a rating of exactly 3.7 is dropped)
# .assign builds a new dataframe rather than writing a column into a filtered one,
# which avoids pandas' SettingWithCopyWarning
plots_reviews = plots_reviews.assign(average_rating=pd.to_numeric(plots_reviews['average_rating']))
plots_reviews = plots_reviews[plots_reviews['average_rating'] > 3.7]

In [None]:
#from here on only the plot summary text is needed; keep an independent copy of that column
plots_reviews = plots_reviews['Plot_Summary'].copy()

In [None]:
#shuffle the plot summaries
# sample(frac=1) returns every row in a random order. Shuffling the result of
# .to_list() would only reorder a temporary list and leave the Series unchanged.
plots_reviews = plots_reviews.sample(frac=1, random_state=seed_val)

In [None]:
#Make sure the summaries are held in a pandas Series before they are split for training
# (pd.Series applied to an existing Series keeps its values and index)
plots_reviews = pd.Series(plots_reviews)

In [None]:
#plots_reviews.head()

In [None]:
#split the plot summaries into 95% training, 5% test (test_size=0.05)
# the split uses the notebook-wide seed so that it is reproducible
plots_reviews_train, plots_reviews_test = train_test_split(plots_reviews, test_size=0.05, random_state=seed_val)

In [None]:
#print(len(plots_reviews_train))
#print(len(plots_reviews_test))

In [None]:
#Load the brainteaser puzzles from the Hugging Face hub
# the dataset has two parquet files, keyed 'sp' and 'wp'; only the 'sp' file is read here
splits = {
    'sp': 'data/sp-00000-of-00001.parquet',
    'wp': 'data/wp-00000-of-00001.parquet',
}
puzzles_path = "hf://datasets/ErfanMoosaviMonazzah/brain-teasers/" + splits["sp"]
puzzles = pd.read_parquet(puzzles_path)

In [None]:
#shuffle the puzzles
# a fixed random_state gives the same order (and so the same "first 10" puzzles)
# every time this cell is run
puzzles = puzzles.sample(frac=1, random_state=seed_val).reset_index(drop=True)

In [None]:
#puzzles.head()

In [None]:
#Split the puzzles into separate copies of the question and answer columns
puzzlesQ, puzzlesA = (puzzles[column].copy() for column in ('question', 'answer'))

# **Testing the Wikipedia plot summaries**

In [None]:
# Self-METEOR on the baseline test set of fictional content - only needs doing once
# Each summary is scored as the hypothesis against all the other summaries as
# references (nltk returns the score of the best-matching reference); the cell
# displays the mean score over all summaries.

# Tokenise every summary once rather than re-tokenising the whole set per hypothesis
tokenised_test = [word_tokenize(txt) for txt in plots_reviews_test]

Total_METEOR = 0

for idx, hypothesis in enumerate(tokenised_test):

  # Leave the hypothesis out by position, not by value, so that an identical text
  # elsewhere in the set still counts as a reference
  references = tokenised_test[:idx] + tokenised_test[idx + 1:]
  METEOR = nltk.translate.meteor(references,hypothesis)
  Total_METEOR += METEOR

Total_METEOR / len (plots_reviews_test)

# **Testing the base model**

In [None]:
#Load the model and tokeniser
# Use the GPU when one is available and fall back to the CPU otherwise
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): '<|startoftext|>' and '<|pad|>' look like additions to GPT-2's stock
# vocabulary; the base model below is not resized for them, so they should not appear
# in prompts passed to it - confirm
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
# The end-of-text token doubles as the model's padding id
model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id).to(device)

**Testing the base model on plot summary generations**

In [None]:
# turn the prompt that every generated plot summary is conditioned on into a tensor of token ids
plot_prompt = 'Here is the plot summary to a new and original science fiction novel:'
input_ids = tokenizer.encode(plot_prompt, return_tensors='pt')

In [None]:
# Generating plot summaries with the base model
# One summary is sampled per summary in the test set, so the two sets are always the
# same size without a hand-maintained count.
predictions_vanilla=[]

sample_outputs = model.generate(
    input_ids=input_ids.to(device),
    no_repeat_ngram_size=2,
    do_sample=True,
    max_length=768,
    top_k=50,
    top_p=0.95,
    temperature=5.0, # VARY TEMPERATURE BETWEEN 1.0-3.0-5.0
    num_return_sequences=len(plots_reviews_test) # same as the test set size
)

# Decode the sampled token ids back to text, dropping special tokens
sample_outputs = tokenizer.batch_decode(sample_outputs, skip_special_tokens=True)
predictions_vanilla.extend([sample.replace("<n>", "\n") for sample in sample_outputs])

In [None]:
# Self-METEOR for the base model generations
# Each generation is scored as the hypothesis against all the other generations as
# references (nltk returns the score of the best-matching reference); the cell
# displays the mean score over all generations.

# Tokenise every generation once rather than re-tokenising the whole set per hypothesis
tokenised_vanilla = [word_tokenize(txt) for txt in predictions_vanilla]

Total_METEOR = 0

for idx, hypothesis in enumerate(tokenised_vanilla):

  # Leave the hypothesis out by position, not by value, so that an identical generation
  # elsewhere in the set still counts as a reference
  references = tokenised_vanilla[:idx] + tokenised_vanilla[idx + 1:]
  METEOR = nltk.translate.meteor(references,hypothesis)
  Total_METEOR += METEOR

Total_METEOR / len (predictions_vanilla)

In [None]:
#ROUGE-L for the base model generations vs. the baseline fictional content test set
# (the printed result includes 'rougeL' alongside the other ROUGE variants)

rouge = evaluate.load("rouge")

# Every generated text is scored against the complete test set as its references.
# Sizes are taken from the data instead of a fixed 93, so a test set of a different
# size cannot cause an out-of-range lookup.
test_refs = plots_reviews_test.tolist()
predictionz = list(predictions_vanilla)
referencez = [list(test_refs) for _ in predictionz]

resultz = rouge.compute(predictions=predictionz, references=referencez)
print(resultz)

**Testing the base model on brainteaser puzzles**

In [None]:
#Generating one answer to each of the first 10 shuffled brainteaser puzzles

torch.cuda.empty_cache()
predictions_puzzles=[]
for i in range(10):
  # Encode one question at a time, passed as a one-item list. A single text needs no
  # padding, so the deprecated pad_to_max_length flag (which contradicted
  # padding='longest') is not passed.
  input_ = tokenizer([puzzlesQ.iloc[i]], max_length=768, truncation=True, padding='longest', return_tensors="pt")
  input_ids = input_['input_ids']
  input_mask = input_['attention_mask']
  answers = model.generate(input_ids=input_ids.to(device),
                         attention_mask=input_mask.to(device),
                         no_repeat_ngram_size=2,
                         do_sample=True,
                         top_k=50,
                         top_p=0.95,
                         temperature=5.0, # VARY TEMPERATURE BETWEEN 1.0-5.0
                         num_return_sequences=1,
                         max_length=128, # counts the question's tokens as well as the generated ones
                          )
  # The decoded text holds the question followed by the model's continuation
  answers = tokenizer.batch_decode(answers, skip_special_tokens=True)
  predictions_puzzles.extend([answer.replace("<n>", "\n") for answer in answers])

In [None]:
# Display the base model's puzzle answers so they can be read and judged by hand
predictions_puzzles

# **Fine-tuning of GPT2**

In [None]:
class GPT2Dataset(Dataset):
  """Tokenised texts for fine-tuning GPT-2.

  Each text is wrapped in '<|startoftext|>' and '<|endoftext|>', then truncated or
  padded to exactly ``max_length`` tokens.

  Args:
    txt_list: iterable of strings to tokenise.
    tokenizer: callable that returns a mapping with 'input_ids' and 'attention_mask'.
    gpt2_type: unused; kept so that existing calls keep working.
    max_length: number of tokens every example is truncated/padded to.

  Indexing returns an ``(input_ids, attention_mask)`` pair of 1-D tensors of length
  ``max_length``.
  """

  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for txt in txt_list:

      # No return_tensors here: the tokenizer then returns plain Python lists, and
      # torch.tensor turns each one into a 1-D tensor. (Asking for 'pt' tensors and
      # wrapping them in torch.tensor again gave (1, max_length) items, batches with a
      # stray middle dimension and a copy-construct warning.)
      encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")

      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

  def __len__(self):
    """Number of tokenised texts."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input_ids, attention_mask) tensors of example ``idx``."""
    return self.input_ids[idx], self.attn_masks[idx]

In [None]:
# Tokenise the training summaries and serve them to the training loop in batches
batch_size = 8

dataset = GPT2Dataset(plots_reviews_train, tokenizer, max_length=768)

# RandomSampler makes the loader draw the training examples in a random order
train_sampler = RandomSampler(dataset)
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=batch_size)

In [None]:
# Choose the device first, so this cell does not depend on a `device` left over from an
# earlier cell; fall back to the CPU when no GPU is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# The end-of-text token doubles as the model's padding id
configuration.pad_token_id = tokenizer.eos_token_id

# instantiate a fresh copy of the pretrained model to fine-tune
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because of the added tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Move the resized model to the chosen device once
model = model.to(device)

In [None]:
# hyperparameters

epochs = 5 # change depending on how fitted to the fictional content want the fine-tuned generations to be
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# The scheduler is stepped once per batch, so the schedule covers
# [number of batches] x [number of epochs] steps in total.
total_steps = epochs * len(train_dataloader)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate, eps=epsilon)

# Learning-rate schedule with a warm-up phase of `warmup_steps` steps
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
def format_time(elapsed):
    """Format a duration given in seconds as a string such as ``h:mm:ss``.

    The duration is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))

In [None]:
#Training
# Fine-tunes `model` on the batches from `train_dataloader` for `epochs` passes and
# records the mean loss and duration of every pass in `training_stats`.

total_t0 = time.time()

training_stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)

        # The labels are the inputs themselves (the model shifts them internally), but
        # padding positions are set to -100 so the loss ignores them. Otherwise the
        # model would be trained to predict padding and the reported loss would be
        # averaged over pad tokens as well.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        model.zero_grad()

        outputs = model(  b_input_ids,
                          labels=b_labels,
                          attention_mask = b_masks,
                          token_type_ids=None
                        )

        # With labels supplied, the first output is the language-modelling loss
        loss = outputs[0]

        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Training Time': training_time,
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

# **Evaluation**

In [None]:
#Generating sample plot summaries from the fine-tuned model for the appendix of the report
model.eval()

prompt = "<|startoftext|> Here is the plot summary to a new and original science fiction novel:"

# Encode the prompt as a batch of one sequence of token ids and move it to `device`
prompt_ids = tokenizer.encode(prompt)
generated = torch.tensor([prompt_ids]).to(device)

print(generated)

# Sample three continuations of the prompt
generation_settings = dict(
    no_repeat_ngram_size=2,
    do_sample=True,
    top_k=50,
    max_length=768,
    top_p=0.95,
    temperature=3.0,
    num_return_sequences=3,
)
sample_outputs = model.generate(generated, **generation_settings)

# Print each sample, numbered from 0, with special tokens removed
for i, sample_output in enumerate(sample_outputs):
  decoded_sample = tokenizer.decode(sample_output, skip_special_tokens=True)
  print("{}: {}\n\n".format(i, decoded_sample))

In [None]:
#Generating the sample plot summaries with the fine-tuned model to be used for testing
# One summary is sampled per summary in the test set, so the two sets are always the
# same size without a hand-maintained count.
model.eval()

prompt = "<|startoftext|> Here is the plot summary to a new and original science fiction novel:"

# Encode the prompt as a batch of one sequence of token ids and move it to `device`
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

predictions_finetune=[]

sample_outputz = model.generate(
    generated,
    no_repeat_ngram_size=2,
    do_sample=True,
    max_length=768,
    top_k=25, # ONCE TEMP SET TO 3.0 VARY K FROM 25-50-100
    top_p=0.95,
    temperature=3.0, # FIRST VARY TEMPERATURE BETWEEN 1.0-1.5-5.0, THEN SET TO 3.0 AND VARY TOP K
    num_return_sequences=len(plots_reviews_test) # same as the test set size
)

# Decode the sampled token ids back to text, dropping special tokens
sample_outputz = tokenizer.batch_decode(sample_outputz, skip_special_tokens=True)
predictions_finetune.extend([sample.replace("<n>", "\n") for sample in sample_outputz])

In [None]:
#Self-METEOR for the generated examples post finetune
# Each generation is scored as the hypothesis against all the other generations as
# references (nltk returns the score of the best-matching reference); the cell
# displays the mean score over all generations.

# Tokenise every generation once rather than re-tokenising the whole set per hypothesis
tokenised_finetune = [word_tokenize(txt) for txt in predictions_finetune]

Total_METEOR = 0

for idx, hypothesis in enumerate(tokenised_finetune):

  # Leave the hypothesis out by position, not by value, so that an identical generation
  # elsewhere in the set still counts as a reference
  references = tokenised_finetune[:idx] + tokenised_finetune[idx + 1:]
  METEOR = nltk.translate.meteor(references,hypothesis)
  Total_METEOR += METEOR

Total_METEOR / len (predictions_finetune)

In [None]:
#ROUGE-L for the fine-tuned generations vs. the baseline fictional content test set
# (the printed result includes 'rougeL' alongside the other ROUGE variants)
rouge = evaluate.load("rouge")

# Every generated text is scored against the complete test set as its references.
# Sizes are taken from the data instead of a fixed 93, so a test set of a different
# size cannot cause an out-of-range lookup.
test_refs = plots_reviews_test.tolist()
predictions = list(predictions_finetune)
references = [list(test_refs) for _ in predictions]

results = rouge.compute(predictions=predictions, references=references)
print(results)

In [None]:
#Answering the first 10 shuffled brainteaser puzzles with the fine-tuned model (one answer each)

torch.cuda.empty_cache()
predictions_puzzlez=[]
for i in range(10):
  # Encode one question at a time, passed as a one-item list. A single text needs no
  # padding, so the deprecated pad_to_max_length flag (which contradicted
  # padding='longest') is not passed.
  input_ = tokenizer([puzzlesQ.iloc[i]], max_length=768, truncation=True, padding='longest', return_tensors="pt")
  input_ids = input_['input_ids']
  input_mask = input_['attention_mask']
  answerz = model.generate(input_ids=input_ids.to(device),
                         attention_mask=input_mask.to(device),
                         no_repeat_ngram_size=2,
                         do_sample=True,
                         top_k=25, # ONCE TEMP SET TO 3.0 VARY K FROM 25-50-100
                         top_p=0.95,
                         temperature=3.0, # FIRST VARY TEMPERATURE BETWEEN 1.0-5.0, THEN SET TO 3.0 AND VARY TOP K
                         num_return_sequences=1,
                         max_length=128 # counts the question's tokens as well as the generated ones
                         )
  # The decoded text holds the question followed by the model's continuation
  answerz = tokenizer.batch_decode(answerz, skip_special_tokens=True)
  predictions_puzzlez.extend([answer.replace("<n>", "\n") for answer in answerz])

In [None]:
# Display the fine-tuned model's puzzle answers so they can be read and judged by hand
predictions_puzzlez