# Claude 3.5 Sonnet to GPT-4o-mini - part of the `gpt-prompt-engineer` repo

This notebook gives you the ability to go from Claude 3.5 Sonnet to GPT-4o-mini -- reducing costs massively while keeping quality high.

By Matt Shumer (https://twitter.com/mattshumer_)

Github repo: https://github.com/mshumer/gpt-prompt-engineer

In [23]:
# !pip install openai anthropic weave

In [24]:
import weave

weave.init("claude2mini")

<weave.weave_client.WeaveClient at 0x15a277450>

In [131]:
import os
import re
import json
import requests
from openai import OpenAI
from anthropic import Anthropic


# Shared API clients used by the functions below. The keys are read from the
# environment, so a missing variable raises KeyError right here.
openai_client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
anthropic_client = Anthropic(api_key=os.environ['ANTHROPIC_API_KEY'])

def _parse_generated_examples(response_text):
    """Extract ``{'prompt', 'response'}`` pairs from tagged model output.

    Every ``<example_*>...</example_*>`` section is inspected; a section that
    lacks either a ``<prompt>`` or a ``<response>`` tag is skipped instead of
    aborting the whole parse.
    """
    parsed = []
    for section in re.findall(r'<example_\w+>(.*?)</example_\w+>', response_text, re.DOTALL):
        prompt_match = re.search(r'<prompt>(.*?)</prompt>', section, re.DOTALL)
        response_match = re.search(r'<response>(.*?)</response>', section, re.DOTALL)
        if prompt_match is None or response_match is None:
            # Malformed example (missing tag): drop it, keep the usable ones.
            continue
        parsed.append({
            'prompt': prompt_match.group(1).strip(),
            'response': response_match.group(1).strip(),
        })
    return parsed


@weave.op
def generate_candidate_prompts(task, prompt_example, response_example):
    """Ask Claude for additional prompt/response examples for a task.

    Args:
        task: Description of the task the examples should illustrate.
        prompt_example: One sample input for the task.
        response_example: The desired output for ``prompt_example``; it is
            interpolated into the request with normal string formatting.

    Returns:
        A list of ``{'prompt': str, 'response': str}`` dicts parsed from the
        model's tagged reply. Examples that are missing a ``<prompt>`` or
        ``<response>`` tag are skipped, so the list may be shorter than the
        number requested (and may be empty).
    """
    system_content = """<task>Given an example training sample, create seven additional samples for the same task that are even better. Each example should contain a <prompt> and a <response>.</task>

<rules>
1. Ensure the new examples are diverse and unique from one another.
2. They should all be perfect. If you make a mistake, this system won't work.
</rules>

Respond in this format:
<response_format>
<example_one>
<prompt>
PUT_PROMPT_HERE
</prompt>
<response>
PUT_RESPONSE_HERE
</response>
</example_one>

<example_two>
<prompt>
PUT_PROMPT_HERE
</prompt>
<response>
PUT_RESPONSE_HERE
</response>
</example_two>

...
</response_format>"""

    user_content = f"""<training_task>{task}</training_task>

<prompt_example>
{prompt_example}
</prompt_example>

<response_example>
{response_example}
</response_example>"""

    response = anthropic_client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=4000,
        temperature=0.5,
        system=system_content,
        messages=[
            {"role": "user", "content": user_content}
        ]
    )

    response_text = response.content[0].text

    # Parse out the prompts and responses
    return _parse_generated_examples(response_text)

@weave.op
def generate_system_prompt(task, prompt_examples):
    """Ask Claude to write a system prompt for a task from example pairs.

    Args:
        task: User-written description of the task.
        prompt_examples: Prompt/response pairs illustrating the desired
            behaviour. They are sent as JSON, which is what the instruction
            text below tells the model to expect; if they cannot be
            serialized as JSON their ``str()`` form is sent instead.

    Returns:
        The text between ``<system_prompt>`` tags in the reply, stripped.
        When the model omits the opening tag, the whole reply (stripped) is
        returned rather than raising IndexError.
    """
    system_content = """<your_role>Given a user-description of their <task> a set of prompt / response pairs (it'll be in JSON for easy reading) for the types of outputs we want to generate given inputs, write a fantastic system prompt that describes the task to be done perfectly.</your_role>

<rules>
1. Do this perfectly.
2. Respond only with the system prompt, and nothing else. No other text will be allowed.
</rules>

Respond in this format:
<system_prompt>
WRITE_SYSTEM_PROMPT_HERE
</system_prompt>"""

    try:
        examples_text = json.dumps(prompt_examples, ensure_ascii=False, indent=2)
    except (TypeError, ValueError):
        # Not JSON-serializable: fall back to the plain string form.
        examples_text = str(prompt_examples)

    user_content = f"""<task>{task}</task>

<prompt_response_examples>
{examples_text}
</prompt_response_examples>"""

    response = anthropic_client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=1000,
        temperature=0.5,
        system=system_content,
        messages=[
            {"role": "user", "content": user_content}
        ]
    )

    response_text = response.content[0].text

    # Parse out the prompt; tolerate a reply without the opening tag, and a
    # reply that has no closing tag.
    pieces = response_text.split('<system_prompt>')
    body = pieces[1] if len(pieces) > 1 else response_text
    system_prompt = body.split('</system_prompt>')[0].strip()

    return system_prompt

@weave.op
def test_mini(generated_examples, prompt_example, system_prompt, **kwargs):
    """Run GPT-4o-mini on one prompt with few-shot examples.

    Args:
        generated_examples: Iterable of ``{'prompt', 'response'}`` dicts that
            are replayed as alternating user/assistant turns.
        prompt_example: The input to answer; surrounding whitespace is
            stripped before sending.
        system_prompt: Content of the system message.
        **kwargs: Extra options forwarded to ``chat.completions.create``.
            They take precedence over the defaults used here (``model``,
            ``max_tokens``, ``temperature``), so callers can override them
            without a duplicate-keyword TypeError.

    Returns:
        The message content of the first completion choice.
    """
    messages = [{"role": "system", "content": system_prompt}]

    for example in generated_examples:
        answer = example['response']
        if not isinstance(answer, str):
            # Render structured answers as real JSON (double quotes) rather
            # than a Python repr, so few-shot turns match JSON-mode output.
            try:
                answer = json.dumps(answer, ensure_ascii=False)
            except (TypeError, ValueError):
                answer = str(answer)
        messages.append({"role": "user", "content": example['prompt']})
        messages.append({"role": "assistant", "content": answer})

    messages.append({"role": "user", "content": prompt_example.strip()})

    # Defaults first, caller-supplied options last so they win on conflict.
    request_options = {
        "model": "gpt-4o-mini",
        "max_tokens": 2000,
        "temperature": 0.5,
        **kwargs,
    }
    response = openai_client.chat.completions.create(
        messages=messages,
        **request_options,
    )

    response_text = response.choices[0].message.content

    return response_text

@weave.op
def run_mini_conversion_process(task, prompt_example, response_example):
    """Run the full pipeline: generate examples, write a prompt, test it.

    Progress is printed along the way. The returned dict bundles the inputs
    together with the generated examples, the generated system prompt and
    GPT-4o-mini's answer to ``prompt_example``.
    """
    # Step 1: synthesize extra prompt/response examples.
    print('Generating the prompts / responses...')
    examples = generate_candidate_prompts(task, prompt_example, response_example)
    print('Prompts / responses generated. Now generating system prompt...')

    # Step 2: derive a system prompt from the task and those examples.
    prompt_for_mini = generate_system_prompt(task, examples)
    print('System prompt generated:', prompt_for_mini)

    # Step 3: try the examples + system prompt on the original input.
    print('\n\nTesting the new prompt on GPT-4o-mini, using your input example...')
    answer = test_mini(examples, prompt_example, prompt_for_mini)
    print('GPT-4o-mini responded with:')
    print(answer)

    # Everything a caller needs to inspect or reuse the run.
    return dict(
        task=task,
        initial_prompt_example=prompt_example,
        initial_response_example=response_example,
        generated_examples=examples,
        system_prompt=prompt_for_mini,
        mini_response=answer,
    )

## Reduce Hallucination task...

In [135]:
from pathlib import Path

# Directory holding the dataset files, relative to the working directory.
DATA_PATH = Path("./data")
# Upper bound on how many validation records are loaded.
NUM_SAMPLES = 100

def read_jsonl(path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Path of the file to read (anything ``open`` accepts).

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(path, 'r', encoding='utf-8') as file:
        # Blank or whitespace-only lines (e.g. a trailing newline) carry no
        # record and would make json.loads fail, so they are skipped.
        return [json.loads(line) for line in file if line.strip()]

In [136]:
# Full training split.
train_ds = read_jsonl(DATA_PATH / "fib-train.jsonl")
# Validation split, truncated to the first NUM_SAMPLES records.
val_ds = read_jsonl(DATA_PATH / "fib-val.jsonl")[0:NUM_SAMPLES]

In [185]:
import random

def format_sample(sample: dict) -> dict:
    """Convert a raw dataset record into a prompt/response pair.

    Args:
        sample: Record with ``premise``, ``hypothesis`` and ``target`` keys,
            e.g. ``{'premise': '...', 'hypothesis': '...', 'target': 0}``.

    Returns:
        ``{'prompt': prompt, 'response': {'consistency': target}}`` where the
        prompt contains the document followed by the summary.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    prompt = "Document: {premise}\nSummary: {hypothesis}\n".format(**sample)

    # 0 is factual inconsistency, 1 is factual consistency
    response = {'consistency': sample['target']}
    return {"prompt": prompt, "response": response}


def sample_examples(ds: list, n=5, seed=42):
    """Draw ``n`` records from ``ds`` without replacement and format them.

    Args:
        ds: Sequence of raw records accepted by ``format_sample``.
        n: Number of records to draw.
        seed: Seed for the sampling. The same seed always yields the same
            examples; pass ``None`` for a non-reproducible draw.

    Returns:
        A list of ``n`` formatted ``{'prompt', 'response'}`` dicts.

    Raises:
        ValueError: If ``n`` is larger than ``len(ds)``.
    """
    # A private generator honours ``seed`` without touching (or depending
    # on) the state of the global ``random`` module.
    rng = random.Random(seed)
    return [format_sample(sample) for sample in rng.sample(ds, n)]


In [186]:
# Ten formatted examples drawn from the training split.
prompt_examples = sample_examples(train_ds, n=10)

In [187]:
# Plain-language task description. It is passed to the prompt generators and
# is also used verbatim as the system prompt of the baseline model.
task = """Detect factual inconsistencies and hallucinations. 
Factually Inconsistent: If any statement in the summary is not supported by or contradicts the document, label it as 0
Factually Consistent: If all statements in the summary are supported by the document, label it as 1
Return in JSON format with `consistency` the given choice."""

This looks nice 😎, but... I want an eval! Let's define some metrics:

In [191]:
def accuracy(model_output, response):
    """Score one prediction by exact match on the ``consistency`` label.

    A falsy ``model_output`` (``None`` or an empty dict) is treated as a
    prediction of ``None``. Returns ``{"accuracy": bool}``.
    """
    predicted = None
    if model_output:
        predicted = model_output.get('consistency')
    expected = response.get('consistency')
    return {"accuracy": predicted == expected}

class BinaryMetrics(weave.Scorer):
    """Precision, recall and F1 for a binary (0/1) label.

    ``score`` labels each row as correct/incorrect and positive/negative;
    ``summarize`` aggregates those rows. Rows for which the model produced
    no usable prediction are marked with ``None`` and left out of the
    aggregate, instead of being counted as negative predictions.
    """

    # Key holding the 0/1 label in both the target and the model output.
    class_name: str
    # Small constant added to every denominator to avoid division by zero.
    eps: float = 1e-8

    @weave.op()
    def summarize(self, score_rows) -> dict:
        """Aggregate per-row scores into ``{"f1", "precision", "recall"}``."""
        # filter out None rows, model may error out sometimes...
        rows = [
            row for row in score_rows
            if row is not None and row["correct"] is not None
        ]
        # "negative" refers to the *predicted* class, so:
        #   predicted positive & correct   -> true positive
        #   predicted positive & incorrect -> false positive
        #   predicted negative & incorrect -> false negative
        tp = sum(1 for row in rows if not row["negative"] and row["correct"])
        fp = sum(1 for row in rows if not row["negative"] and not row["correct"])
        fn = sum(1 for row in rows if row["negative"] and not row["correct"])
        precision = tp / (tp + fp + self.eps)
        recall = tp / (tp + fn + self.eps)
        f1 = 2 * precision * recall / (precision + recall + self.eps)
        return {"f1": f1, "precision": precision, "recall": recall}

    @weave.op()
    def score(self, response: dict, model_output: dict) -> dict:
        """Score a single prediction against its target.

        Returns ``{"correct": bool, "negative": bool}``, or both values as
        ``None`` when ``model_output`` is empty/``None`` or lacks the label,
        so that ``summarize`` can skip the row.
        """
        prediction = model_output.get(self.class_name) if model_output else None  # 0 or 1
        if prediction is None:
            # No usable prediction: flag the row rather than silently
            # treating it as a prediction of the negative class.
            return {"correct": None, "negative": None}
        return {
            "correct": prediction == response.get(self.class_name),
            "negative": not prediction,
        }

# Precision/recall/F1 scorer reading the `consistency` label.
F1 = BinaryMetrics(class_name="consistency")

Use Claude to generate a better system prompt...

In [None]:
# Have Claude write a system prompt from the task text and the sampled examples.
system_prompt = generate_system_prompt(task, prompt_examples)
print(system_prompt)

🍩 https://wandb.ai/capecape/claude2mini/r/call/d63e59d9-7334-4208-930f-e5379d411980
You are a highly accurate fact-checking assistant specialized in detecting factual inconsistencies and hallucinations in summaries based on given documents. Your task is to carefully analyze the provided document and summary, then determine if the summary is factually consistent or inconsistent with the information presented in the document.

Follow these guidelines:

1. Read the document thoroughly to understand all the facts and details presented.
2. Examine the summary carefully, comparing each statement to the information in the document.
3. Label the summary as factually consistent (1) if all statements in the summary are supported by or can be reasonably inferred from the document.
4. Label the summary as factually inconsistent (0) if any statement in the summary contradicts the document or includes information not supported by the document.
5. Pay close attention to names, dates, numbers, and spe

and a Model wrapper...

In [192]:
class Mini(weave.Model):
    """GPT-4o-mini wrapper pairing a system prompt with few-shot examples."""

    # System message sent with every request.
    system_prompt: str
    # Few-shot ``{'prompt', 'response'}`` pairs replayed before the input.
    prompt_examples: list[dict]

    def predict(self, prompt: str) -> "dict | None":
        """Answer ``prompt`` in JSON mode and return the parsed object.

        Returns ``None`` when the model's reply is empty or not valid JSON;
        the scorers in this notebook already treat a falsy output as a
        missing prediction. Errors from the API call itself still propagate.
        """
        output = test_mini(
            self.prompt_examples,
            prompt,
            self.system_prompt,
            response_format={"type": "json_object"},
        )
        try:
            return json.loads(output)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a reply whose content is None.
            return None

## Baseline: Simple System prompt, no examples:

In [193]:
# Baseline: the raw task text as system prompt, no few-shot examples.
mini = Mini(system_prompt=task, prompt_examples=[])

In [194]:
# One evaluation over the formatted validation records, reused for every model below.
evaluation = weave.Evaluation(dataset=[format_sample(sample) for sample in val_ds], scorers=[accuracy, F1])

In [195]:
await evaluation.evaluate(mini)

{'model_output': {'consistency': {'mean': 0.43}},
 'accuracy': {'accuracy': {'true_count': 67, 'true_fraction': 0.67}},
 'BinaryMetrics': {'f1': 0.6451612852121633,
  'precision': 0.6976744184424013,
  'recall': 0.59999999988},
 'model_latency': {'mean': 2.4275572729110717}}

## Model 1: Better System prompt, no examples:

In [196]:
# Claude-written system prompt, still without few-shot examples.
claude_induced_mini = Mini(system_prompt=system_prompt, prompt_examples=[])
await evaluation.evaluate(claude_induced_mini)

{'model_output': {'consistency': {'mean': 0.42}},
 'accuracy': {'accuracy': {'true_count': 72, 'true_fraction': 0.72}},
 'BinaryMetrics': {'f1': 0.695652168799622,
  'precision': 0.761904761723356,
  'recall': 0.639999999872},
 'model_latency': {'mean': 2.421030819416046}}

## Model 2: Better System prompt + examples:

In [197]:
# Claude-written system prompt plus the examples sampled from the training split.
claude_induced_mini = Mini(system_prompt=system_prompt, prompt_examples=prompt_examples)
await evaluation.evaluate(claude_induced_mini)

{'model_output': {'consistency': {'mean': 0.66}},
 'accuracy': {'accuracy': {'true_count': 76, 'true_fraction': 0.76}},
 'BinaryMetrics': {'f1': 0.7931034432342451,
  'precision': 0.6969696968640956,
  'recall': 0.919999999816},
 'model_latency': {'mean': 2.6480085039138794}}

## Bonus: Let's create some examples:

In [199]:
# Have Claude synthesize new examples, seeded with the first sampled training example.
generated_prompt_examples = generate_candidate_prompts(task, prompt_examples[0]["prompt"], response_example=prompt_examples[0]["response"])

🍩 https://wandb.ai/capecape/claude2mini/r/call/b20408f8-7c8d-41ab-ae7c-5348826a8634


In [200]:
# Claude-written system prompt plus the Claude-generated examples.
claude_gen_mini = Mini(system_prompt=system_prompt, prompt_examples=generated_prompt_examples)
await evaluation.evaluate(claude_gen_mini)

{'model_output': {'consistency': {'mean': 0.3}},
 'accuracy': {'accuracy': {'true_count': 68, 'true_fraction': 0.68}},
 'BinaryMetrics': {'f1': 0.5999999951625,
  'precision': 0.7999999997333334,
  'recall': 0.479999999904},
 'model_latency': {'mean': 2.5735393524169923}}