<a href="https://colab.research.google.com/github/mshumer/gpt-prompt-engineer/blob/main/gpt_prompt_engineer_Classification_Version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# gpt-prompt-engineer -- Classification Version
Original repo by Matt Shumer (https://twitter.com/mattshumer_)

Original Github repo: https://github.com/mshumer/gpt-prompt-engineer

Google PaLM fork by Tom Pakeman (https://github.com/tpakeman/palm-prompt-engineer)

Generate an optimal prompt for a given classification task that can be evaluated with 'true'/'false' outputs.

You just need to describe the task clearly, and provide some test cases (for example, if we're classifying statements as 'happy' or not, a 'true' test case could be "I had a great day!", and a 'false' test case could be "I am feeling gloomy.").

To generate a prompt:
1. In the setup cell (the one right after the `pip install` cell), set your Vertex AI project (`VERTEX_API_PROJECT`) and location (`VERTEX_API_LOCATION`)
2. In the last cell, fill in the description of your task, up to 15 test cases, and the number of prompts to generate.
3. Run all the cells! The AI will generate a number of candidate prompts, and test them all to find the best one!

🪄🐝 To log your LLM configs and the generated prompt outputs with [Weights & Biases](https://wandb.ai/site/prompts), just set `use_wandb = True`.

In [None]:
!pip install google-cloud-aiplatform prettytable tqdm tenacity wandb -qq

In [None]:
import itertools
from prettytable import PrettyTable
from tenacity import retry, stop_after_attempt, wait_exponential
import time
from tqdm import tqdm
from typing import Dict, Optional, Any
import vertexai
from vertexai.preview.language_models import TextGenerationModel
import wandb

# Google Cloud project and location handed to vertexai.init() below
VERTEX_API_PROJECT = 'CHANGEME'                 # Update this
VERTEX_API_LOCATION = 'us-central1'             # Update this

vertexai.init(project=VERTEX_API_PROJECT, location=VERTEX_API_LOCATION)

use_wandb = False # set to True if you want to use wandb to log your config and results

In [None]:
# Instruction text placed at the top of the prompt that asks the candidate model to write
# classifier prompts. It is also logged to the Weights & Biases run config.
# NOTE(review): "Be creative in with prompts" below looks like a typo; it is left untouched
# because this text is sent to the model verbatim and rewording it may change the results.
candidate_gen_system_prompt = """Your job is to generate system prompts for a Large Language Model, given a description of the use-case and some test cases.

The prompts you will be generating will be for classifiers, with 'true' and 'false' being the only possible outputs.

In your generated prompt, you should describe how the AI should behave in plain English. Include what it will see, and what it's allowed to output. Be creative in with prompts to get the best possible results. The AI knows it's an AI -- you don't need to tell it this.

You will be graded based on the performance of your prompt... but don't cheat! You cannot include specifics about the test cases in your prompt. Any prompts with examples will be disqualified.

Most importantly, output NOTHING but the prompt. Do not include anything else in your message."""

In [None]:
# Model that writes the candidate prompts, and the sampling temperature requested for it
CANDIDATE_MODEL = 'text-bison@001'
CANDIDATE_MODEL_TEMPERATURE = 0.9

# Model that runs each candidate prompt against the test cases, with its requested settings
EVAL_MODEL = 'text-bison@001'
EVAL_MODEL_TEMPERATURE = 0
EVAL_MODEL_MAX_TOKENS = 1

NUMBER_OF_PROMPTS = 10 # this determines how many candidate prompts to generate... the higher, the more expensive

N_RETRIES = 3  # passed to tenacity's stop_after_attempt for candidate prompt generation

WANDB_PROJECT_NAME = "palm-prompt-eng" # used if use_wandb is True, Weights & Biases project name
WANDB_RUN_NAME = None # used if use_wandb is True, optionally set the Weights & Biases run name to identify this run

In [None]:
def start_wandb_run():
  """Start a new Weights & Biases run and record the notebook configuration.

  The run is created in WANDB_PROJECT_NAME under WANDB_RUN_NAME. Its config holds the
  candidate-generation system prompt, the candidate and evaluation model settings, the
  retry count and the number of prompts to generate.
  """
  wandb.init(
    project=WANDB_PROJECT_NAME,
    name=WANDB_RUN_NAME,
    config={
      "candidate_gen_system_prompt": candidate_gen_system_prompt,
      # This key was previously misspelled as "candiate_model"
      "candidate_model": CANDIDATE_MODEL,
      "candidate_model_temperature": CANDIDATE_MODEL_TEMPERATURE,
      # The evaluation model's settings are logged under the "generation_*" keys
      "generation_model": EVAL_MODEL,
      "generation_model_temperature": EVAL_MODEL_TEMPERATURE,
      "generation_model_max_tokens": EVAL_MODEL_MAX_TOKENS,
      "n_retries": N_RETRIES,
      "number_of_prompts": NUMBER_OF_PROMPTS,
    })

In [None]:
# Optional logging to Weights & Biases to record the configs, prompts and results
if use_wandb:
  start_wandb_run()

In [None]:
"""Helper Class to initialise and Call Vertex models"""

class VertexModel():
    """Callable wrapper around a Vertex AI `TextGenerationModel`.

    An instance loads the named model once and can then be called with a prompt
    string; it returns the `.text` of the model's prediction.

    Generation parameters start from `DEFAULT_PARAMS`; any key supplied by the
    caller replaces the corresponding default.
    """

    DEFAULT_PARAMS = {
        "temperature": 0.2,
        "top_p": 0.8,
        "top_k": 40,
        "max_output_tokens": 200
        }

    def __init__(self, model_name: str, parameters: Optional[Dict[str, Any]] = None):
        """Load `model_name` and fix the generation parameters for later calls.

        Args:
            model_name: Name passed to `TextGenerationModel.from_pretrained`.
            parameters: Optional overrides for `DEFAULT_PARAMS`. `None` (the
                default) or an empty dict means "use the defaults unchanged".
        """
        self._client = TextGenerationModel.from_pretrained(model_name)
        self._parameters = self._merge_parameters(parameters)

    @classmethod
    def _merge_parameters(cls, parameters: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """Return a new dict of the defaults overlaid with the caller's overrides."""
        # Later entries win in a dict display, so the overrides must come last.
        # (The previous `parameters | DEFAULT_PARAMS` let the defaults clobber them.)
        return {**cls.DEFAULT_PARAMS, **(parameters or {})}

    def __call__(self, prompt: str) -> str:
        """Send `prompt` to the model and return the text of its prediction."""
        return self._client.predict(prompt, **self._parameters).text

In [None]:
# Client used to generate candidate prompts, built with the candidate temperature
candidate_client = VertexModel(
    CANDIDATE_MODEL,
    dict(temperature=CANDIDATE_MODEL_TEMPERATURE),
)

# Client used to evaluate candidate prompts, built with the evaluation temperature and token limit
eval_client = VertexModel(
    EVAL_MODEL,
    dict(
        temperature=EVAL_MODEL_TEMPERATURE,
        max_output_tokens=EVAL_MODEL_MAX_TOKENS,
    ),
)

In [None]:
# Generate candidate prompts - the whole call is attempted up to N_RETRIES times,
# waiting exponentially between attempts.
@retry(stop=stop_after_attempt(N_RETRIES), wait=wait_exponential(multiplier=1, min=4, max=70))
def generate_candidate_prompts(description, test_cases, number_of_prompts):
  """Ask the candidate model for `number_of_prompts` classifier prompts.

  One request text is built from the system prompt, the test cases and the
  stripped task description, then sent to `candidate_client` once per prompt.

  Args:
    description: Plain-text description of the classification task.
    test_cases: Test cases; their string form is embedded in the request.
    number_of_prompts: How many candidate prompts to generate.

  Returns:
    A list with one generated prompt string per request.
  """
  cases_section = f"Here are some test cases:`{test_cases}`"
  use_case_section = f"Here is the description of the use-case: `{description.strip()}`"
  closing_section = "Respond with your prompt, and nothing else. Be creative."

  candidate_prompt = "\n\n".join((
      candidate_gen_system_prompt,
      cases_section,
      use_case_section,
      closing_section,
  ))

  generated = []
  for _ in range(number_of_prompts):
    generated.append(candidate_client(candidate_prompt))
  return generated

In [None]:
# NOTE - GPT's API allows for a `logit_bias` option which allows you to modify the likelihood of a given token appearing
# This is useful for constraining the model output to strings such as 'true' or 'false'.
# We cannot do this with PaLM, so the raw completion is not guaranteed to be exactly 'true' or 'false'.
# To limit spurious mismatches, the output is stripped and lower-cased before it is compared to the expected answer.

def test_candidate_prompts(test_cases, prompts):
  """Run every candidate prompt against every test case and report the results.

  For each (test case, prompt) pair, the prompt and the test input are joined with a blank
  line and sent to `eval_client`. The reply counts as correct when it equals the expected
  answer after both are stripped and lower-cased.

  A results table and a per-prompt percentage are printed, followed by the best prompt.
  When `use_wandb` is set, the same data is logged to Weights & Biases and the run is finished.

  Args:
    test_cases: Sequence of dicts with a 'prompt' (input text) and an 'answer' (expected label).
    prompts: Sequence of candidate prompt strings. Duplicates are scored independently.
  """
  # Track results by position rather than by prompt text, so duplicate prompts do not
  # share (and double-count) a single entry.
  prompt_results = [{'correct': 0, 'total': 0} for _ in prompts]

  # Initialize the table: one column per candidate so every row matches the header width
  table = PrettyTable()
  table_field_names = ["Prompt", "Expected"] + [f"Prompt {i+1}" for i in range(len(prompts))]
  table.field_names = table_field_names

  # Wrap the text in the "Prompt" column
  table.max_width["Prompt"] = 100

  if use_wandb:
    if wandb.run is None:
      start_wandb_run()
    wandb_table = wandb.Table(columns=table_field_names)

  for test_case in test_cases:
    expected = test_case['answer']
    expected_label = str(expected).strip().lower()
    row = [test_case['prompt'], expected]

    for i, prompt in enumerate(prompts):
      eval_prompt = "\n\n".join([prompt, test_case['prompt']])

      # The model is not constrained to 'true'/'false', so normalise before comparing
      x = eval_client(eval_prompt)
      is_correct = str(x).strip().lower() == expected_label

      row.append("✅" if is_correct else "❌")

      # Update this prompt's tally
      if is_correct:
        prompt_results[i]['correct'] += 1
      prompt_results[i]['total'] += 1

    table.add_row(row)
    if use_wandb:
      wandb_table.add_data(*row)

  print(table)

  # Calculate and print the percentage of correct answers for each prompt
  best_prompt = None
  best_percentage = 0
  if use_wandb:
    prompts_results_table = wandb.Table(columns=["Prompt Number", "Prompt", "Percentage", "Correct", "Total"])

  for i, prompt in enumerate(prompts):
    correct = prompt_results[i]['correct']
    total = prompt_results[i]['total']
    # With no test cases there is nothing to divide by; report 0% instead of crashing
    percentage = (correct / total) * 100 if total else 0.0
    print(f"Prompt {i+1} got {percentage:.2f}% correct.")
    if use_wandb:
      # The logged prompt number is the 0-based index
      prompts_results_table.add_data(i, prompt, percentage, correct, total)
    # The first prompt is always accepted, so a best prompt exists even if all score 0%
    if best_prompt is None or percentage > best_percentage:
      best_percentage = percentage
      best_prompt = prompt

  if use_wandb: # log the results to a Weights & Biases table and finish the run
    wandb.log({"prompt_results": prompts_results_table})
    best_prompt_table = wandb.Table(columns=["Best Prompt", "Best Percentage"])
    best_prompt_table.add_data(best_prompt, best_percentage)
    wandb.log({"best_prompt": best_prompt_table})
    wandb.log({"prompt_ratings": wandb_table})
    wandb.finish()

  if best_prompt is None:
    print("No candidate prompts were provided, so there is no best prompt.")
  else:
    print(f"The best prompt was '{best_prompt}' with a correctness of {best_percentage:.2f}%.")

In [None]:
# Labelled examples for the classification task: 'prompt' is the input text and
# 'answer' is the expected label ('true' or 'false').
test_cases = [
    dict(prompt='Find the best contact email on this site.', answer='true'),
    dict(prompt='who is the current president?', answer='true'),
    dict(prompt='order me a pizza', answer='false'),
    dict(prompt='what are some ways a doctor could use an assistant?', answer='true'),
    dict(prompt='write a speech on the danger of cults', answer='false'),
    dict(prompt='Make a reservation at The Accent for 9pm', answer='false'),
    dict(prompt='organize my google drive', answer='false'),
    dict(prompt='Find the highest-rated Italian restaurant near me.', answer='true'),
    dict(prompt='Explain the theory of relativity.', answer='true'),
    dict(prompt='What are the main differences between Python and Java programming languages?', answer='true'),
    dict(prompt='Translate the following English sentence to Spanish: "The weather today is great."', answer='false'),
    dict(prompt='Create a new event on my calendar for tomorrow at 2 pm.', answer='false'),
    dict(prompt='Write a short story about a lonely cowboy.', answer='false'),
    dict(prompt='Design a logo for a startup.', answer='false'),
    dict(prompt='Compose a catchy jingle for a new soda brand.', answer='false'),
    dict(prompt='Calculate the square root of 1999.', answer='false'),
    dict(prompt='What are the health benefits of yoga?', answer='true'),
    dict(prompt='find me a source of meat that can be shipped to canada', answer='true'),
    dict(prompt='Find the best-selling book of all time.', answer='true'),
    dict(prompt='What are the top 5 tourist attractions in Brazil?', answer='true'),
    dict(prompt='List the main ingredients in a traditional lasagna recipe.', answer='true'),
    dict(prompt='How does photosynthesis work in plants?', answer='true'),
    dict(prompt='Write a Python program to reverse a string.', answer='false'),
    dict(prompt='Create a workout routine for a beginner.', answer='false'),
    dict(prompt='Edit my resume to highlight my project management skills.', answer='false'),
    dict(prompt='Draft an email to a client to discuss a new proposal.', answer='false'),
    dict(prompt='Plan a surprise birthday party for my best friend.', answer='false'),
]

In [None]:
description = "Decide if a task is research-heavy." # describe the classification task clearly

# If Weights & Biases is enabled, log the description and test cases too
if use_wandb:
    if wandb.run is None:
      start_wandb_run()
    # Update the config whether the run was just started here or was already active;
    # previously this only ran when no run existed, so an existing run never got these values.
    wandb.config.update({"description": description,
                         "test_cases": test_cases})

# Generate the candidate prompts, then score them against the test cases and print the results
candidate_prompts = generate_candidate_prompts(description, test_cases, NUMBER_OF_PROMPTS)
test_candidate_prompts(test_cases, candidate_prompts)