<a href="https://colab.research.google.com/github/weezymatt/tree-of-thoughts_demo/blob/main/Tree_of_Thoughts_baselines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tree of Thoughts

Implementation: https://github.com/princeton-nlp/tree-of-thought-llm

In [1]:
import os
import pdb
from google.colab import userdata
from pprint import pprint

# Copy the secret stored in Colab's user data into the process environment
# under the same name, without hardcoding its value in the notebook.
API_KEY_NAME = 'OPENAI_API_KEY'
os.environ[API_KEY_NAME] = userdata.get(API_KEY_NAME)

In [3]:
%%capture
!pip install asteval
!pip install langchain-openai

In [8]:
from langchain_openai import ChatOpenAI
import pandas as pd
from asteval import Interpreter

### Game24 Baselines — Standard Prompt

In [9]:
# Chat model used by the baseline cells below. 0.7 is the sampling
# temperature, so repeated runs can return different answers.
model_name = 'gpt-4'
llm = ChatOpenAI(
    model_name=model_name,
    temperature=0.7,
)

In [10]:
# 5-shot standard (input -> answer) prompt template for Game of 24.
# Five worked examples are followed by an `{input}` placeholder, which
# `Game24Task.standard_prompt` fills in with `str.format`.
# NOTE(review): keep this name bound to the unformatted template. The class
# method looks the name up at call time, so rebinding it to a formatted prompt
# makes later `format` calls return the old puzzle unchanged.
standard_prompt = '''Use numbers and basic arithmetic operations (+ - * /) to obtain 24.
Input: 4 4 6 8
Answer: (4 + 8) * (6 - 4) = 24
Input: 2 9 10 12
Answer: 2 * 12 * (10 - 9) = 24
Input: 4 9 10 13
Answer: (13 - 9) * (10 - 4) = 24
Input: 1 4 8 8
Answer: (8 / 4 + 1) * 8 = 24
Input: 5 5 5 9
Answer: 5 + 5 + 5 + 9 = 24
Input: {input}
'''

In [11]:
class Game24Task:
  """
  Game24 Class for experimenting with the various baselines in the paper.

  Baselines     : standard prompting, chain-of-thought, and self-consistency (value-based)

  The class loads the puzzle table, formats prompts from the module-level
  `standard_prompt` / `cot_prompt` templates, extracts the proposed expression
  from a model reply, and scores that expression with an asteval interpreter.
  """
  TARGET = 24        # value every accepted expression must evaluate to
  TOLERANCE = 1e-6   # slack for float round-off introduced by `/`

  def __init__(self, path="https://hub.oxen.ai/api/repos/datasets/Game-of-24/file/main/24.csv"):
    """Read the puzzle CSV at `path` and create the expression interpreter."""
    self.data = pd.read_csv(path)
    self.interpreter = Interpreter()

  def __len__(self):
    """Return the number of rows in the puzzle table."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the row at integer position `idx` (so `task[idx]` works)."""
    return self.data.iloc[idx]

  def standard_prompt(self, x):
    """Fill the module-level `standard_prompt` template with puzzle `x`."""
    # The global is looked up at call time, so it must still be the template.
    return standard_prompt.format(input=x)

  def cot_prompt(self, x):
    """Fill the module-level `cot_prompt` template with puzzle `x`."""
    # The global is looked up at call time, so it must still be the template.
    return cot_prompt.format(input=x)

  def parse_prompt(self, candidate_answer, prompt_type="standard_prompt"):
    """
    Extract the candidate expression from a model reply.

    candidate_answer : object exposing the reply text as `.content`
    prompt_type      : "standard_prompt" or "chain_of_thought"; any other
                       value returns the lowercased reply unchanged

    Returns the text between the last "answer:" marker and the following "=",
    stripped of surrounding whitespace. A reply without the marker no longer
    raises; the text before the first "=" (or the whole reply) is returned,
    so the caller can score it as an invalid attempt.
    """
    answer = candidate_answer.content.lower()

    if prompt_type in ("standard_prompt", "chain_of_thought"):
      # rpartition never raises: without a marker the whole reply is the tail.
      # Taking the last marker skips the "a op b = c" lines of a CoT trace.
      _, _, tail = answer.rpartition("answer:")
      answer = tail.split("=")[0].strip()

    return answer

  @staticmethod
  def _numbers_in(text):
    """Return the sorted integer tokens (runs of digits 0-9) found in `text`."""
    spaced = "".join(ch if ch in "0123456789" else " " for ch in str(text))
    return sorted(int(token) for token in spaced.split())

  def evaluate_expression(self, expression, puzzle=None):
    """
    Evaluate `expression` and report whether it solves the puzzle.

    expression : arithmetic expression string to hand to the interpreter
    puzzle     : optional puzzle text (assumed to be the input numbers
                 separated by spaces -- confirm against the dataset). When
                 given, the expression must use exactly those numbers.

    Returns a dict with keys "expression", "answer" (whatever the interpreter
    returned) and "valid" (bool).
    """
    response = self.interpreter.eval(expression)

    # A failed evaluation may yield a non-numeric result; never count that.
    is_number = isinstance(response, (int, float)) and not isinstance(response, bool)
    # Compare with a tolerance: true division makes exact solutions such as
    # 8 / (3 - 8 / 3) evaluate to 23.99999999999999 rather than 24.
    valid = is_number and abs(response - self.TARGET) <= self.TOLERANCE

    if valid and puzzle is not None:
      valid = self._numbers_in(expression) == self._numbers_in(puzzle)

    return {
        "expression": expression,
        "answer": response,
        "valid": valid,
    }

In [12]:
game24 = Game24Task()
x = game24.__getitem__(10)['Puzzles']
standard_prompt = game24.standard_prompt(x)

In [14]:
output = llm.invoke(standard_prompt)
expression = game24.parse_prompt(output)

In [16]:
# Score the parsed expression, then pretty-print the result dict in key order.
standard_result = game24.evaluate_expression(expression)
pprint(standard_result, width=50, sort_dicts=False)

{'expression': '(8 - 1) * (2 + 1)',
 'answer': 21,
 'valid': False}


### Game24 Baselines — Chain-of-Thought Prompt

In [105]:
# 5-shot chain-of-thought prompt template for Game of 24.
# Each example lists the intermediate steps (with the numbers left after each
# one) before the final answer. The trailing `{input}` placeholder is filled
# in by `Game24Task.cot_prompt` with `str.format`.
# NOTE(review): keep this name bound to the unformatted template. The class
# method looks the name up at call time, so this cell must run before that
# method is called, and rebinding the name to a formatted prompt makes later
# `format` calls return the old puzzle unchanged.
cot_prompt = '''Use numbers and basic arithmetic operations (+ - * /) to obtain 24. Each step, you are only allowed to choose two of the remaining numbers to obtain a new number.
Input: 4 4 6 8
Steps:
4 + 8 = 12 (left: 4 6 12)
6 - 4 = 2 (left: 2 12)
2 * 12 = 24 (left: 24)
Answer: (6 - 4) * (4 + 8) = 24
Input: 2 9 10 12
Steps:
12 * 2 = 24 (left: 9 10 24)
10 - 9 = 1 (left: 1 24)
24 * 1 = 24 (left: 24)
Answer: (12 * 2) * (10 - 9) = 24
Input: 4 9 10 13
Steps:
13 - 10 = 3 (left: 3 4 9)
9 - 3 = 6 (left: 4 6)
4 * 6 = 24 (left: 24)
Answer: 4 * (9 - (13 - 10)) = 24
Input: 1 4 8 8
Steps:
8 / 4 = 2 (left: 1 2 8)
1 + 2 = 3 (left: 3 8)
3 * 8 = 24 (left: 24)
Answer: (1 + 8 / 4) * 8 = 24
Input: 5 5 5 9
Steps:
5 + 5 = 10 (left: 5 9 10)
10 + 5 = 15 (left: 9 15)
15 + 9 = 24 (left: 24)
Answer: ((5 + 5) + 5) + 9 = 24
Input: {input}
'''

In [148]:
x = game24.__getitem__(10)['Puzzles']
cot_prompt = game24.cot_prompt(x)

In [155]:
output = llm.invoke(cot_prompt)

In [156]:
# Pull the final expression out of the chain-of-thought reply, score it, and
# pretty-print the result dict in key order.
expression = game24.parse_prompt(output, prompt_type="chain_of_thought")
cot_result = game24.evaluate_expression(expression)
pprint(cot_result, width=50, sort_dicts=False)

{'expression': '(1 + 1 + 1) * 8',
 'answer': 24,
 'valid': True}


In [159]:
# Show the raw text of the model reply that the expression was parsed from.
print(output.content)

Steps:
1 + 1 = 2 (left: 1 2 8)
2 + 1 = 3 (left: 3 8)
3 * 8 = 24 (left: 24)
Answer: (1 + 1 + 1) * 8 = 24


### Game24 Baselines — Chain-of-Thought + Self-Consistency

> "We also consider a CoT self-consistency baseline, which takes the majority output from 100 CoT samples, and an iterative-refine approach on top of an IO sample for at most 10 iterations. At each iteration, the LM is conditioned on all previous history to “reflect on your mistakes and generate a refined answer” if the output is incorrect. Note that it uses groundtruth feedback signals about equation correctness."
> — Yao et al. (2023), *Tree of Thoughts: Deliberate Problem Solving with Large Language Models*