In [1]:
import os
# os.environ["LD_LIBRARY_PATH"] = f"{os.environ['LD_LIBRARY_PATH']}:/cvmfs/ai.mila.quebec/apps/arch/common/cuda/11.7/lib64"
# !export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/cvmfs/ai.mila.quebec/apps/arch/common/cuda/11.7/lib64

In [2]:
import guidance
import transformers
import bitsandbytes
from torch import cuda, bfloat16
from bazaar.schema import Quote

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
hf_auth = "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"
# model_id = 'meta-llama/Llama-2-70b-chat-hf'
model_id = '/Tmp/slurm.3479725.0/hf_home/hub/models--meta-llama--Llama-2-70b-chat-hf/snapshots/36d9a7388cc80e5f4b3e9701ca2f250d21a96c30/'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)


model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
# model.save_model('/save_path/')

model.eval()
print(f"Model loaded on {device}")


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [01:12<00:00,  4.80s/it]


Model loaded on cuda:0


In [7]:
name = "meta-llama/Llama-2-70b-chat-hf"
tokenizer = transformers.AutoTokenizer.from_pretrained(name, use_auth_token=hf_auth)
tokenizer.pad_token_id = tokenizer.eos_token_id    # for open-ended generation




In [8]:
generation_pipe = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device_map="auto",    # finds GPU
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
program_string = """
    {{#system~}}
    You are a Question Answering Agent operating inside an information market. You will be given a question, and a bunch of passages that might have an answer to that question in them. 

    But beware that each passage has a cost. You want to minimize the amount you spend, while maximizing the quality of your answer. You will now be presented with several options, and you will be asked how much you would want to pay for those passages, conditioned on your balance and the average price over all presented passages. 
    {{~/system}}
    
    {{#user~}}
    The question is "{{question}}?"
    
    Here are your options.
    ---{{#each options}}
    Option {{add @index 1}}: {{this.answer_block}}
    {{/each}}---
    
    Please discuss each option briefly in the context of the question that is asked. Lay out the argument for buying vs. passing. 

    After you're done laying out the arguments, you will consider that your balance is ${{balance}} and the average price of a passage is $20.0. Please respond with how much you would be willing to pay to buy each passage, conditioned on the question. The schema for this is: 
    
    OPTION 1: <minimum price you would be willing to pay> - <maximum price you would be willing to pay>
    OPTION 2: <minimum price you would be willing to pay> - <maximum price you would be willing to pay>
    ... (and so on)
    
    Let's go.
    {{~/user}}
    
    {{#assistant~}}
    {{gen "answer" temperature=0.0}}
    {{~/assistant}}
    """
    program_string = clean_program_string(program_string)
    # Run the program
    program = guidance(program_string, llm=guidance.llms.OpenAI(model_name))  # noqa
    program_output = program(
        question=question,
        options=options,
        balance=budget,
        average_quote_price=average_quote_price,
    )
    answer = program_output["answer"]

In [34]:
prompt = "Who is George Washington?"
text = f"""SYSTEM: Bobby William and Michael Burry work for a firm specializing in information acquisition. They seek answers by buying data from an information marketplace where vendors sell insights.

Bobby wants to do a really good job at answering the question. 

Michael is thrifty and financially responsible. Michael wants to make sure ensures that they don't waste money buying unnecessary or duplicate information. 

USER: The question is "How can LLMs be used for planning?"

Here are your options.
---
Option 1: More specifically, we adopt hierarchical planning models (e.g., [38, 33]), which consist of a high-level planner and a low-level planner. We use LLMs to generate high-level plans (HLPs), i.e., a sequence of subgoals (e.g., [Navigation potato, Pickup potato, Navigation microwave, ...]) that the agent needs to achieve, in the specified order, to accomplish the final goal specified by the language instruction. The lowlevel planner then maps each subgoal into a sequence of primitive actions for achieving that subgoal in the current environment and state. An important observation is that, given a high-level plan, low-level planning becomes conditionally independent of the natural language instruction. It becomes the classic object localization and navigation problem [6] (for navigation subgoals) or simply executing the specified interaction action with the right objects (for interaction subgoals). The low-level planner can be trained with data synthesized from the simulator (see, e.g., [26, 3]).

Option 2: Classical planning methods have been widely adopted in robots and embodied environments [9, 42, 8, 61, 26]. Recently, prompting LLMs to do planning direcly has gained attention and shown potential [24, 23, 53, 13, 35]. SayCan [1], for instance, combines LLMs with affordance functions to generate feasible plans. Moreover, based on LLMs' powerful programming ability [37, 29, 36], some recent works first translate natural language instructions into the executable programming languages, such as Planning Domain Description Language (PDDL), and runs classical planning algorithms, such as LLM+P [36]. However, codebased planning is constrained by its narrow domains and the predefined environment, while RAP can handle open domain problems, including numerical and logical reasoning (see Section 4.2 and 4.3).

Option 3: Robots and embodied environments have extensively utilized classical planning methods [9, 42, 8, 61, 26]. Lately, there's been a growing interest in using LLMs for direct planning, which has demonstrated promise [24, 23, 53, 13, 35]. As an example, SayCan [1] merges LLMs with affordance functions to produce viable plans. Leveraging the robust programming capabilities of LLMs [37, 29, 36], some recent studies convert natural language directives into executable programming languages like PDDL, then implement traditional planning algorithms, such as LLM+P [36]. Nonetheless, planning based on code is limited to specific domains and set environments, whereas RAP can address open domain challenges, encompassing both numerical and logical reasoning (refer to Section 4.2 and 4.3).

---

Option 1 costs $10.
Option 2 costs $30.
Option 3 costs $40.

Simulate a constructive argument between Bobby and Michael. Together, they must decide which options to buy and which ones to not buy.

Note that Bobby and Michael may choose to buy any number of options, or none at all. At the end of the argument, they must arrive at a verdict. This verdict must be printed as: 

VERDICT:

Option 1: <Buy or Pass>
Option 2: <Buy or Pass>
Option 3: <Buy or Pass>
---
"""    

sequences = generation_pipe(
    text,
    max_length=2048,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    do_sample=True,
    top_k=10,
    temperature=0.4,
    top_p=0.9
)

print(sequences[0]["generated_text"])


SYSTEM: Bobby William and Michael Burry work for a firm specializing in information acquisition. They seek answers by buying data from an information marketplace where vendors sell insights.

Bobby wants to do a really good job at answering the question. 

Michael is thrifty and financially responsible. Michael wants to make sure ensures that they don't waste money buying unnecessary or duplicate information. 

USER: The question is "How can LLMs be used for planning?"

Here are your options.
---
Option 1: More specifically, we adopt hierarchical planning models (e.g., [38, 33]), which consist of a high-level planner and a low-level planner. We use LLMs to generate high-level plans (HLPs), i.e., a sequence of subgoals (e.g., [Navigation potato, Pickup potato, Navigation microwave, ...]) that the agent needs to achieve, in the specified order, to accomplish the final goal specified by the language instruction. The lowlevel planner then maps each subgoal into a sequence of primitive acti

In [11]:
guidance.llm = guidance.llms.Transformers(model=model, tokenizer=tokenizer)

In [16]:
from bazaar.lem_utils import clean_program_string
def get_closed_book_answer(question: str, model_name="gpt-3.5-turbo") -> str:
    program_string = """
    {{#system~}}
    You are an intelligent AI assistant. You will be given a question. Your task is to answer it to the best of your ability. 
    {{~/system}}
    
    {{#user~}}
    {{question}}
    {{~/user}}
    
    {{#assistant~}}
    {{gen "answer" temperature=0.0 max_tokens=512}}
    {{~/assistant}}
    """
    program_string = clean_program_string(program_string)
    # Run the program
    program = guidance(program_string, llm=guidance.llms.Transformers(model=model, tokenizer=tokenizer))  # noqa
    program_output = program(question=question)
    print(program_output)
    # answer = program_output["answer"]
    # Done
    # return answer

get_closed_book_answer(question="how much wood cood a woodchuck chuck?")

{{#system~}}
You are an intelligent AI assistant. You will be given a question. Your task is to answer it to the best of your ability. 
{{~/system}}

{{#user~}}
{{question}}
{{~/user}}

{{#assistant~}}
{{gen "answer" temperature=0.0 max_tokens=512}}
{{~/assistant}}



In [32]:
program = guidance("{{gen 'henlo' max_tokens=32}}", caching=False)
output = program()

In [33]:
output.variables()

{'llm': <guidance.llms._transformers.Transformers at 0x7fad202783a0>,
 'logging': False,
 '@raw_prefix': "{{!--GMARKER_START_gen$&#123;&#123;gen 'henlo' max_tokens=32&#125;&#125;$--}}"}

In [37]:
llama = guidance.llms.Transformers(model=model, tokenizer=tokenizer, caching=False)
llama_cash = guidance.llms.Transformers(model=model, tokenizer=tokenizer, caching=True)

In [45]:

# we can pre-define valid option sets
valid_weapons = ["sword", "axe", "mace", "spear", "bow", "crossbow"]

# define the prompt
character_maker = guidance("""The following is a character profile for an RPG game in JSON format.
```json
{
    "id": "{{id}}",
    "description": "{{description stop=','}}",
    "name": "{{gen 'name' stop=','}}",
    "age": {{gen 'age' pattern='[0-9]+' stop=','}},
    "armor": "{{#select 'armor'}}leather{{or}}chainmail{{or}}plate{{/select}}",
}```""")

# generate a character
character_maker(
    id="e1f491f7-7ab8-4dac-8c20-c92b5e7d883d",
    description="A quick and nimble fighter.",
    valid_weapons=valid_weapons, llm=llama_cash
)
