In [6]:
from rosemary import jpt_parse_args, jpt_setup; jpt_setup()

import sys
sys.path.append('/dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import argparse
import os
import tqdm
import re
import json
import random
import evaluate
from eval.utils import generate_completions, load_hf_lm_and_tokenizer, query_openai_chat_model, KeyWordsCriteria
from eval.gsm.examplars import EXAMPLARS as GSM_EXAMPLARS

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [54]:

parser = argparse.ArgumentParser()
parser.add_argument("--data_dir", type=str, default="data/mgsm")
parser.add_argument("--max_num_examples", type=int, default=None, help="maximum number of examples to evaluate.")
parser.add_argument("--save_dir", type=str, default="results/mgsm")
parser.add_argument("--model_name_or_path", type=str, default=None, help="if specified, we will load the model to generate the predictions.")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None, help="if specified, we will load the tokenizer from here.")
parser.add_argument("--openai_engine", type=str, default=None, help="if specified, we will use the OpenAI API to generate the predictions.")
parser.add_argument("--n_shot", type=int, default=8, help="max number of examples to use for demonstration.")
parser.add_argument("--no_cot", action="store_true", help="If given, we're evaluating a model without chain-of-thought.")
parser.add_argument("--eval_batch_size", type=int, default=1, help="batch size for evaluation.")
parser.add_argument("--load_in_8bit", action="store_true", help="load model in 8bit mode, which will reduce memory and speed up inference.")
parser.add_argument("--gptq", action="store_true", help="If given, we're evaluating a 4-bit quantized GPTQ model.")
parser.add_argument("--use_chat_format", action="store_true", help="If given, the prompt will be encoded as a chat format with the roles in prompt.")

model_name_or_path = '../results/gpt2-Large_human_mix'
# model_name_or_path = 't5-large'

cmd = f"""
    --data_dir ../data/eval/gsm/ \
    --max_num_examples 200 \
    --model_name_or_path {model_name_or_path} \
    --save_dir {model_name_or_path}/eval/gsm/ \
    --eval_batch_size 20 \
    --n_shot 8
"""

args = jpt_parse_args(parser, cmd)

# model_name_or_path and openai_engine cannot be both None or both not None.
assert (args.model_name_or_path is None) != (args.openai_engine is None), "Either model_name_or_path or openai_engine should be specified."
args


Namespace(data_dir='../data/eval/gsm/', max_num_examples=200, save_dir='../results/gpt2-Large_human_mix/eval/gsm/', model_name_or_path='../results/gpt2-Large_human_mix', tokenizer_name_or_path=None, openai_engine=None, n_shot=8, no_cot=False, eval_batch_size=20, load_in_8bit=False, gptq=False, use_chat_format=False)

In [4]:

exact_match = evaluate.load("exact_match")

random.seed(42)

print("Loading data...")
test_data = []
with open(os.path.join(args.data_dir, f"test.jsonl")) as fin:
    for line in fin:
        example = json.loads(line)
        test_data.append({
            "question": example["question"],
            "answer": example["answer"].split("####")[1].strip()
        })

# some numbers are in the `x,xxx` format, and we want to remove the comma
for example in test_data:
    example["answer"] = re.sub(r"(\d),(\d)", r"\1\2", example["answer"])
    assert float(example["answer"]), f"answer is not a valid number: {example['answer']}"
    

if args.max_num_examples and len(test_data) > args.max_num_examples:
    test_data = random.sample(test_data, args.max_num_examples)


Loading data...


In [5]:
if not os.path.exists(args.save_dir):
    os.makedirs(args.save_dir, exist_ok=True)
    
global GSM_EXAMPLARS
if args.n_shot:
    if len(GSM_EXAMPLARS) > args.n_shot:
        GSM_EXAMPLARS = random.sample(GSM_EXAMPLARS, args.n_shot)
    demonstrations = []
    for example in GSM_EXAMPLARS:
        if args.no_cot:
            demonstrations.append(
                "Quesion: " + example["question"] + "\n" + "Answer: " + example["short_answer"]
            )
        else:
            demonstrations.append(
                "Question: " + example["question"] + "\n" + "Answer: " + example["cot_answer"]
            )
    prompt_prefix = "Answer the following questions.\n\n" + "\n\n".join(demonstrations) + "\n\n"
else:
    prompt_prefix = "Answer the following question.\n\n"

prompts = []
for example in test_data:
    if args.use_chat_format:
        prompt = "<|user|>\n" + prompt_prefix + "Question: " + example["question"].strip() + "\n<|assistant|>\n" + "Answer:"
    else:
        prompt = prompt_prefix + "Question: " + example["question"].strip() + "\nAnswer:"
    prompts.append(prompt)

In [None]:

def load_hf_lm_and_tokenizer(
        model_name_or_path, 
        tokenizer_name_or_path=None, 
        device_map="auto", 
        gptq_model=False,
        load_in_8bit=False,
        dtype=torch.float16,
        use_fast_tokenizer=False,
        padding_side="left",
    ):
    
    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

    if not tokenizer_name_or_path:
        tokenizer_name_or_path = model_name_or_path
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name_or_path, use_fast=use_fast_tokenizer)
    # set padding side to left for batch generation
    tokenizer.padding_side = padding_side
    # set pad token to eos token if pad token is not set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    if gptq_model:
        from auto_gptq import AutoGPTQForCausalLM
        model_wrapper = AutoGPTQForCausalLM.from_quantized(
            model_name_or_path, 
            device="cuda:0", 
            use_triton=True,
        )
        model = model_wrapper.model
    else:       
        from_pretrained_kwargs = {
            'device_map': device_map,
            'torch_dtype': dtype,
            'load_in_8bit': load_in_8bit,
        }
        if  't5' in model_name_or_path:
            from transformers import T5ForConditionalGeneration
            model = T5ForConditionalGeneration.from_pretrained(
                model_name_or_path, **from_pretrained_kwargs)
        elif 'mpt' in model_name_or_path: 
            config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
            config.attn_config['attn_impl'] = 'triton'
            config.init_device = 'cuda' # For fast initialization directly on GPU!
            model = AutoModelForCausalLM.from_pretrained(
                model_name_or_path, trust_remote_code=True, config=config, **from_pretrained_kwargs)
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_name_or_path,
                **from_pretrained_kwargs,
            )
    model.eval()
    return model, tokenizer



args.load_in_8bit = False
args.model_name_or_path = 't5-Large' # 't5-large'
model, tokenizer = load_hf_lm_and_tokenizer(
    model_name_or_path=args.model_name_or_path, 
    tokenizer_name_or_path=args.tokenizer_name_or_path, 
    load_in_8bit=args.load_in_8bit, 
    dtype=torch.bfloat16,
    gptq_model=args.gptq,
    device_map='auto',
)
new_line_token = tokenizer.encode("\n")[-1] 

model.device, model.dtype

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-11b automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/45.2G [00:00<?, ?B/s]

In [70]:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained('gpt2-Large')
tok.encode('\n', add_special_tokens=False), tok.encode('\n', add_special_tokens=True)

([198], [198])

In [75]:
from eval.utils import generate_completions

# get the last token because the tokenizer may add space tokens at the start.
t0 = time.time()
outputs = generate_completions(
    model=model,
    tokenizer=tokenizer,
    prompts=prompts[:3],
    batch_size=args.eval_batch_size,
    stop_id_sequences=[[new_line_token]],
#     max_new_tokens=512,
    max_length=model.config.max_position_embeddings,
)
t = time.time()-t0
print(f'Time = {t:.2f}')


# float16: 170s
# +inference mode: 162s
# 

AttributeError: 'T5Config' object has no attribute 'max_position_embeddings'

In [10]:
generation_kwargs = {'max_length': model.config.max_position_embeddings}

print(batch_input_ids.shape, batch_input_ids.device)
print(attention_mask.shape, attention_mask.device)
model = model.to('cuda')
batch_input_ids = batch_input_ids.cuda()
attention_mask = attention_mask.cuda()
print(model.device)


from transformers import StoppingCriteriaList
import time
start = time.time()

stopping_criteria = StoppingCriteriaList([KeyWordsCriteria(stop_id_sequences)]) if stop_id_sequences else None

batch_outputs = model.generate(
    input_ids=batch_input_ids,
    attention_mask=attention_mask,
    stopping_criteria=stopping_criteria,
    **generation_kwargs
)
end = time.time()
end-start
print(end-start)

if stop_id_sequences:
    for output_idx in range(batch_outputs.shape[0]):
        for token_idx in range(batch_input_ids.shape[1], batch_outputs.shape[1]):
            if any(batch_outputs[output_idx, token_idx: token_idx+len(stop_sequence)].tolist() == stop_sequence for stop_sequence in stop_id_sequences):
                batch_outputs[output_idx, token_idx:] = tokenizer.pad_token_id
                break


# 
# 15s
# 18.369792938232422


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


torch.Size([20, 783]) cuda:0
torch.Size([20, 783]) cuda:0
cuda:0


In [13]:
num_return_sequences = 1
# remove the prompt from the output
# we need to re-encode the prompt because we need to make sure the special tokens are treated the same way as in the outputs.
# we changed our previous way of truncating the output token ids dicrectly because some tokenizer (e.g., llama) won't add space token before the first token.
# space is important for some tasks (e.g., code completion).
batch_outputs = tokenizer.batch_decode(batch_outputs, skip_special_tokens=True)
batch_prompts = tokenizer.batch_decode(batch_input_ids, skip_special_tokens=True)
# duplicate the prompts to match the number of return sequences
batch_prompts = [prompt for prompt in batch_prompts for _ in range(num_return_sequences)]
batch_generations = [
    output[len(prompt):] for prompt, output in zip(batch_prompts, batch_outputs)
]

30.835227012634277

In [19]:
print([len(x) for x in batch_outputs])
print([len(x) for x in batch_prompts])

[2620, 2793, 2655, 2549, 2641, 2821, 2523, 2578, 2665, 2572, 3704, 2733, 2636, 2658, 2624, 2573, 2578, 2524, 2619, 2780]
[2531, 2542, 2468, 2488, 2519, 2642, 2407, 2453, 2495, 2459, 3008, 2577, 2490, 2523, 2456, 2471, 2488, 2435, 2471, 2668]


In [None]:
generation_kwargs = {'max_new_tokens': 512}
disable_tqdm = False
stop_id_sequences = [[new_line_token]]
batch_size=args.eval_batch_size



generations = []
if not disable_tqdm:
    progress = tqdm.tqdm(total=len(prompts), desc="Generating Completions")

num_return_sequences = generation_kwargs.get("num_return_sequences", 1)
for i in range(0, len(prompts), batch_size):
    batch_prompts = prompts[i:i+batch_size]
    tokenized_prompts = tokenizer(batch_prompts, padding="longest", return_tensors="pt", add_special_tokens=False)
    batch_input_ids = tokenized_prompts.input_ids
    attention_mask = tokenized_prompts.attention_mask

    if model.device.type == "cuda":
        print(torch.cuda.is_available())
        print(batch_input_ids.device, batch_input_ids.shape)
        batch_input_ids = batch_input_ids.cuda()
        attention_mask = attention_mask.cuda()

    try:
        batch_outputs = model.generate(
            input_ids=batch_input_ids,
            attention_mask=attention_mask,
            stopping_criteria=[KeyWordsCriteria(stop_id_sequences)] if stop_id_sequences else None,
            **generation_kwargs
        )

        # the stopping criteria is applied at batch level, so if other examples are not stopped, the entire batch will continue to generate.
        # so some outputs still have the stop sequence, which we need to remove.
        if stop_id_sequences:
            for output_idx in range(batch_outputs.shape[0]):
                for token_idx in range(batch_input_ids.shape[1], batch_outputs.shape[1]):
                    if any(batch_outputs[output_idx, token_idx: token_idx+len(stop_sequence)].tolist() == stop_sequence for stop_sequence in stop_id_sequences):
                        batch_outputs[output_idx, token_idx:] = tokenizer.pad_token_id
                        break

        # remove the prompt from the output
        # we need to re-encode the prompt because we need to make sure the special tokens are treated the same way as in the outputs.
        # we changed our previous way of truncating the output token ids dicrectly because some tokenizer (e.g., llama) won't add space token before the first token.
        # space is important for some tasks (e.g., code completion).
        batch_outputs = tokenizer.batch_decode(batch_outputs, skip_special_tokens=True)
        batch_prompts = tokenizer.batch_decode(batch_input_ids, skip_special_tokens=True)
        # duplicate the prompts to match the number of return sequences
        batch_prompts = [prompt for prompt in batch_prompts for _ in range(num_return_sequences)]
        batch_generations = [
            output[len(prompt):] for prompt, output in zip(batch_prompts, batch_outputs)
        ]
    except Exception as e:
        print("Error when generating completions for batch:")
        print(batch_prompts)
        print("Error message:")
        print(e)
        print("Use empty string as the completion.")
        batch_generations = [""] * len(batch_prompts) * num_return_sequences

    generations += batch_generations

    # for prompt, generation in zip(batch_prompts, batch_generations):
    #     print("========")
    #     print(prompt)
    #     print("--------")
    #     print(generation)

    if not disable_tqdm:
        progress.update(len(batch_prompts)//num_return_sequences)

In [None]:

if args.model_name_or_path:
    print("Loading model and tokenizer...")
    model, tokenizer = load_hf_lm_and_tokenizer(
        model_name_or_path=args.model_name_or_path, 
        tokenizer_name_or_path=args.tokenizer_name_or_path, 
        load_in_8bit=args.load_in_8bit, 
        load_in_half=True,
        gptq_model=args.gptq
    )
    new_line_token = tokenizer.encode("\n", add_special_tokens=False)[-1] 
    # get the last token because the tokenizer may add space tokens at the start.
    outputs = generate_completions(
        model=model,
        tokenizer=tokenizer,
        prompts=prompts,
        max_new_tokens=512,
        batch_size=args.eval_batch_size,
        stop_id_sequences=[[new_line_token]]
    )
else:
    instances = [{"id": prompt, "prompt": prompt} for _, prompt in enumerate(prompts)]
    results = query_openai_chat_model(
        engine=args.openai_engine,
        instances=instances,
        batch_size=args.eval_batch_size if args.eval_batch_size else 10,
        output_path=os.path.join(args.save_dir, f"openai_results.jsonl"),
    )
    outputs = [result["output"] for result in results]


In [None]:
# get the last token because the tokenizer may add space tokens at the start.
outputs = generate_completions(
    model=model,
    tokenizer=tokenizer,
    prompts=prompts,
    max_new_tokens=512,
    batch_size=args.eval_batch_size,
    stop_id_sequences=[[new_line_token]]
)
outputs

In [51]:

predictions = []
for output in outputs:
    # replace numbers like `x,xxx` with `xxxx`
    output = re.sub(r"(\d),(\d)", r"\1\2", output)
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", output)
    if numbers:
        predictions.append(numbers[-1])
    else:
        predictions.append(output)

print("Calculating accuracy...")
targets = [example["answer"] for example in test_data]

em_score = exact_match.compute(predictions=predictions, references=targets, ignore_case=True, ignore_punctuation=True)["exact_match"]
print(f"Exact match : {em_score}")

predictions = [{
    "question": example["question"],
    "answer": example["answer"],
    "model_output": output,
    "prediction": pred
} for example, output, pred in zip(test_data, outputs, predictions)]

with open(os.path.join(args.save_dir, f"predictions.jsonl"), "w") as fout:
    for prediction in predictions:
        fout.write(json.dumps(prediction) + "\n") 

with open(os.path.join(args.save_dir, "metrics.json"), "w") as fout:
    json.dump({
        "exact_match": em_score
    }, fout, indent=4)


Calculating accuracy...
Exact match : 0.025


[{'question': 'The girls are trying to raise money for a carnival. Kim raises $320 more than Alexandra, who raises $430, and Maryam raises $400 more than Sarah, who raises $300. How much money, in dollars, did they all raise in total?',
  'answer': '2280',
  'model_output': ' Kim raised 320 + 430 + 400 = $680. Maryam raised 300 + 400 = $600. So the answer is 600.',
  'prediction': '600'},
 {'question': 'Kalinda is working on a 360 piece puzzle with her mom. Kalinda can normally add 4 pieces per minute. Her mom can typically place half as many pieces per minute as Kalinda.  How many hours will it take them to complete this puzzle?',
  'answer': '1',
  'model_output': ' Kalinda can add 4 pieces per minute because 360 / 4 = 4 pieces per minute. Her mom can place half as many pieces per minute as Kalinda because 4 / 2 = 2 pieces per minute. So it will take them 2 * 2 = 4 hours to complete this puzzle. The answer is 4.',
  'prediction': '4'},
 {'question': "Tom's ship can travel at 10 miles