In [None]:
%pip install transformers SentencePiece accelerate

In [4]:
import os
import re
from tqdm import tqdm
import transformers, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, LlamaTokenizer, LlamaForCausalLM, AutoModelForCausalLM

In [5]:
# Half-open window [start_index, end_index) applied to the sorted task ids.
start_index, end_index = 0, 164

# Token budget: used for tokenizer truncation and as the generation length limit.
max_len = 600

# A completion is cut at the earliest occurrence of any of these markers.
STOP_SEQS = [
    '\nclass',
    '\ndef',
    '\n#',
    '\nif',
    '\nprint',
]

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [6]:
## HumanEval+ problems, loaded through evalplus
from evalplus.data import get_human_eval_plus, write_jsonl

plus_problems = get_human_eval_plus()

# Sorting the ids fixes the task order; the configured window then selects which tasks to run.
# NOTE(review): if the ids are strings this ordering is lexicographic, not numeric — confirm
# that is acceptable whenever the window does not cover every task.
plus_task_ids = sorted(plus_problems)[start_index:end_index]
plus_prompts = [
    plus_problems[tid]['prompt']
    for tid in plus_task_ids
]
num_samples = len(plus_prompts)
print("Number of samples: {}".format(num_samples))

Number of samples: 164


In [4]:
def generate_completion_samples_codeT5(model,temp, output_file,loop):
  """Sample completions for the loaded prompts with a seq2seq checkpoint and save them.

  Each prompt in the module-level ``plus_prompts`` is tokenized once and passed to
  ``generate`` both as encoder input and as the decoder prefix. ``loop`` samples are
  drawn per prompt, each one is cut at the earliest stop sequence from ``STOP_SEQS``,
  and every ``{'task_id', 'completion'}`` record is written to ``output_file`` with
  ``write_jsonl`` once all prompts are done.

  Args:
    model: Checkpoint name or path handed to ``from_pretrained``.
    temp: Sampling temperature.
    output_file: Path of the JSONL file to write.
    loop: Number of samples to draw per prompt.

  Returns:
    None. Results are written to ``output_file``.
  """
  tokenizer = AutoTokenizer.from_pretrained(model)

  # Half precision is only requested on CUDA. On the CPU fallback, float16 inference is
  # commonly unsupported or extremely slow, so load in float32 there.
  dtype = torch.float16 if DEVICE == "cuda" else torch.float32

  # trust_remote_code=True lets the checkpoint repository run its own Python code;
  # only use it with repositories you trust.
  lm = AutoModelForSeq2SeqLM.from_pretrained(model,
                                             trust_remote_code=True,
                                             torch_dtype=dtype,
                                             low_cpu_mem_usage=True)
  lm.eval()
  lm.to(DEVICE)

  completion_seqs = []

  for task_id, raw_prompt in tqdm(zip(plus_task_ids, plus_prompts), ncols=0, total=num_samples):
    # The model is prompted with tab-indented code; tabs are converted back after decoding.
    prompt = raw_prompt.replace('    ', '\t')

    encoding = tokenizer([prompt], return_tensors="pt", truncation=True, max_length=max_len).to(DEVICE)
    prompt_len = encoding['input_ids'].shape[-1]

    for _ in tqdm(range(loop), total=loop, leave=False, ncols=0):
      with torch.no_grad():
        # NOTE(review): max_length appears to bound prompt + generated tokens here, so a
        # prompt close to max_len leaves little room for a completion — confirm.
        gen_tokens = lm.generate(**encoding,
                                 decoder_input_ids=encoding['input_ids'],
                                 do_sample=True,
                                 temperature=temp,
                                 max_length=max_len,
                                 decoder_start_token_id=tokenizer.pad_token_id,
                                 eos_token_id=tokenizer.eos_token_id,
                                 top_p=0.95)

      # Keep only the tokens that follow the decoder prefix (the prompt).
      gen_tokens = gen_tokens[:, prompt_len:]

      for gen_seq in tokenizer.batch_decode(gen_tokens, skip_special_tokens=True):
        completion_seq = gen_seq
        # Successive truncation leaves the text cut at the earliest stop marker.
        for stop_seq in STOP_SEQS:
          index = completion_seq.find(stop_seq)
          if index != -1:
            completion_seq = completion_seq[:index]

        completion_seqs.append(
            {'task_id': task_id,
             'completion': completion_seq.replace('\t', '    ')
             }
        )

  print("Saving results to {}".format(output_file))

  write_jsonl(output_file, completion_seqs)



In [10]:
def generate_completion_samples_phi1(model,temp, output_file,loop):
  """Sample completions for the loaded prompts with a causal LM checkpoint and save them.

  Each prompt in the module-level ``plus_prompts`` is tokenized once, ``loop`` samples of
  up to ``max_len`` new tokens are drawn, each one is cut at the earliest stop sequence
  from ``STOP_SEQS``, and every ``{'task_id', 'completion'}`` record is written to
  ``output_file`` with ``write_jsonl`` once all prompts are done.

  Args:
    model: Checkpoint name or path handed to ``from_pretrained``.
    temp: Sampling temperature.
    output_file: Path of the JSONL file to write.
    loop: Number of samples to draw per prompt.

  Returns:
    None. Results are written to ``output_file``.
  """
  tokenizer = AutoTokenizer.from_pretrained(model)

  # Half precision is only requested on CUDA. On the CPU fallback, float16 inference is
  # commonly unsupported or extremely slow, so load in float32 there.
  dtype = torch.float16 if DEVICE == "cuda" else torch.float32

  # trust_remote_code=True lets the checkpoint repository run its own Python code;
  # only use it with repositories you trust.
  lm = AutoModelForCausalLM.from_pretrained(model,
                                            trust_remote_code=True,
                                            torch_dtype=dtype,
                                            low_cpu_mem_usage=True)
  lm.eval()
  lm.to(DEVICE)

  completion_seqs = []

  generation_config = transformers.GenerationConfig(
    do_sample=True,
    temperature=temp,
    top_p=0.95,
    max_new_tokens=max_len,
  )

  for task_id, raw_prompt in tqdm(zip(plus_task_ids, plus_prompts), ncols=0, total=num_samples):
    # The model is prompted with tab-indented code; tabs are converted back after decoding.
    prompt = raw_prompt.replace('    ', '\t')

    encoding = tokenizer([prompt], return_tensors="pt", truncation=True, max_length=max_len).to(DEVICE)
    input_ids = encoding['input_ids']
    prompt_len = input_ids.shape[-1]
    # A single unpadded prompt is fed at a time, so every position is attended to.
    # The mask does not change between samples, so build it once per prompt.
    attention_mask = torch.ones_like(input_ids)

    for _ in tqdm(range(loop), total=loop, leave=False, ncols=0):
      with torch.no_grad():
        gen_tokens = lm.generate(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 generation_config=generation_config)

      # Keep only the tokens that follow the prompt.
      gen_tokens = gen_tokens[:, prompt_len:]

      for gen_seq in tokenizer.batch_decode(gen_tokens, skip_special_tokens=True):
        completion_seq = gen_seq
        # Successive truncation leaves the text cut at the earliest stop marker.
        for stop_seq in STOP_SEQS:
          index = completion_seq.find(stop_seq)
          if index != -1:
            completion_seq = completion_seq[:index]

        completion_seqs.append(
            {'task_id': task_id,
             'completion': completion_seq.replace('\t', '    ')
             }
        )

  print("Saving results to {}".format(output_file))

  write_jsonl(output_file, completion_seqs)

In [5]:
# One sample per task at temperature 0.2 from the CodeT5+ 2B checkpoint.
generate_completion_samples_codeT5(
    model="Salesforce/codet5p-2b",
    temp=0.2,
    output_file="codet5_2b_samples.jsonl",
    loop=1,
)

  return self.fget.__get__(instance, owner)()
  0% 0/164 [00:00<?, ?it/s]
  0% 0/1 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:50256 for open-end generation.

  0% 0/164 [02:16<?, ?it/s]A


KeyboardInterrupt: 

In [11]:
# One sample per task at temperature 0.2 from the phi-1 checkpoint.
generate_completion_samples_phi1(
    model="microsoft/phi-1",
    temp=0.2,
    output_file="phi_1_samples.jsonl",
    loop=1,
)

  0% 0/164 [00:00<?, ?it/s]
  0% 0/1 [00:00<?, ?it/s][A
100% 1/1 [00:16<00:00, 16.27s/it][A
  1% 1/164 [00:16<44:12, 16.27s/it]A
  0% 0/1 [00:00<?, ?it/s][A
100% 1/1 [00:15<00:00, 15.18s/it][A
  1% 2/164 [00:31<42:13, 15.64s/it]A
  0% 0/1 [00:00<?, ?it/s][A
100% 1/1 [00:14<00:00, 14.81s/it][A
  2% 3/164 [00:46<40:57, 15.26s/it]A
  0% 0/1 [00:00<?, ?it/s][A
100% 1/1 [00:15<00:00, 15.25s/it][A
  2% 4/164 [01:01<40:41, 15.26s/it]A
  0% 0/1 [00:00<?, ?it/s][A
100% 1/1 [00:14<00:00, 14.91s/it][A
  3% 5/164 [01:16<40:06, 15.14s/it]A
  0% 0/1 [00:00<?, ?it/s][A
100% 1/1 [00:14<00:00, 14.95s/it][A
  4% 6/164 [01:31<39:42, 15.08s/it]A
  0% 0/1 [00:00<?, ?it/s][A
100% 1/1 [00:15<00:00, 15.16s/it][A
  4% 7/164 [01:46<39:31, 15.11s/it]A
  0% 0/1 [00:00<?, ?it/s][A
100% 1/1 [00:15<00:00, 15.21s/it][A
  5% 8/164 [02:01<39:22, 15.14s/it]A
  0% 0/1 [00:00<?, ?it/s][A
100% 1/1 [00:14<00:00, 14.76s/it][A
  5% 9/164 [02:16<38:48, 15.02s/it]A
  0% 0/1 [00:00<?, ?it/s][A
100% 1/1 [00:14<

Saving results to phi_1_samples.jsonl





In [None]:
# NOTE(review): generate_completion_samples_llama is not defined in any cell visible in
# this notebook — confirm it is defined before this cell runs, otherwise the call raises
# NameError on a fresh kernel.
generate_completion_samples_llama("decapoda-research/llama-7b-hf",0.2,"llama_7b_hf_samples.jsonl",1)