In [9]:
import os, sys, json, random, io, pytz, argparse
import numpy as np
from tqdm import tqdm
import pandas as pd
import torch
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
from peft import (
        get_peft_model, 
        prepare_model_for_kbit_training, 
        LoraConfig, 
        PeftModel,
        AutoPeftModelForCausalLM
    )
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from pprint import pprint

In [6]:
# Seed every RNG this notebook relies on (Python, NumPy, PyTorch CPU and CUDA)
# so that repeated runs from a fresh kernel produce the same numbers.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seed all visible GPUs, not only the current one.
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and switch off its auto-tuner, which may
# otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Read the question template from disk. The lines returned by readlines() are
# joined with "\n" and the result is stripped of surrounding whitespace.
prompt_path = "../data/feed_decoder_LM/regular/len/q_prompt.txt"
with open(prompt_path, "r") as f:
    template_lines = f.readlines()
prompt_template = "\n".join(template_lines).strip()
def preprocess(prompt_template, input_str, answer=None, eos_token="</s>"):
  """Build the "### Question / ### Answer" text for one example.

  Args:
    prompt_template: format string with one positional placeholder that
      receives ``input_str``.
    input_str: the value substituted into the template.
    answer: optional gold answer. When it is ``None`` the answer slot is left
      open (inference prompt); otherwise ``"<answer>.<eos_token>"`` is appended.
    eos_token: end-of-sequence marker written after the answer.

  Returns:
    The formatted text. A single trailing space always follows the answer slot.
  """
  prompt = prompt_template.format(input_str)
  # Test against None explicitly: a legitimate answer of 0 is falsy and would
  # otherwise be silently dropped from the text.
  completion = "" if answer is None else f"{answer}.{eos_token}"
  response = completion + " "
  return "### Question: {}\n ### Answer: {}".format(prompt, response)

In [32]:
# Base checkpoint identifier passed to from_pretrained.
model_name = "meta-llama/Llama-2-7b-hf"

# Un-adapted base model in fp16; device_map={"": 0} maps the whole model to device 0.
model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map={"": 0})

# Fine-tuned LoRA checkpoint directory, loaded with the same dtype and device map.
lora_adapter = "../scripts/llama/output/1114_190018/ckpts/checkpoint-7880"
peft_model = AutoPeftModelForCausalLM.from_pretrained(lora_adapter, torch_dtype=torch.float16, device_map={"": 0})



Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]


In [28]:
# Load the tokenizer from the same directory as the LoRA checkpoint.
tokenizer = AutoTokenizer.from_pretrained(lora_adapter)


In [6]:
# Register a dedicated padding token so that prompts of different lengths can
# be batched together.
tokenizer.add_special_tokens({"pad_token":"<pad>"})
# A decoder-only model continues from the last position of each row, so batched
# generate() needs the padding on the left. With right padding, shorter prompts
# would be continued from pad tokens.
tokenizer.padding_side = 'left'
# Grow the base model's embedding matrix to match the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))

Embedding(32001, 4096)

In [10]:
# Load the "validation" split from the uniform-split directory and show its first record.
data = load_dataset("../data/finetune/len/uniform_split", split="validation")
pprint(data[0])

{'answer': 3, 'input_str': '602'}


In [11]:
# Render example 140 together with its gold answer to inspect the full formatted text.
print(preprocess(prompt_template, data[140]["input_str"], data[140]["answer"]))

### Question: What is length of the string "5169"?
 ### Answer: 4.</s> 


In [12]:
# Iterate the validation data in its stored order, 128 examples per batch.
dataloader = DataLoader(data, batch_size=128, shuffle=False)

In [None]:
# Greedy decoding with the base model over the whole dataloader. One JSON line
# is written per example: [input string, gold answer, decoded text].
with open("../data/output_decoder_LM/llama2-7b/len/zeroshot/val.jsonl", "w") as f:
  for batch in tqdm(dataloader):
    # Inference prompts: no answer is passed, so the answer slot stays open.
    prompts = [preprocess(prompt_template, raw) for raw in batch['input_str']]
    encoded = tokenizer(
        prompts,
        padding='longest',
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    input_tokens = encoded["input_ids"].to("cuda")
    attn_mask = encoded["attention_mask"].to("cuda")

    with torch.cuda.amp.autocast():
      generation_output = model.generate(
          input_ids=input_tokens,
          attention_mask=attn_mask,
          max_new_tokens=10,
          do_sample=False,
          temperature=1.0,
          top_k=1,
          num_return_sequences=1,
          eos_token_id=tokenizer.eos_token_id,
      )

    decoded = [tokenizer.decode(seq, skip_special_tokens=True) for seq in generation_output]
    for raw, gold, text in zip(batch['input_str'], batch['answer'], decoded):
      f.write(json.dumps([raw, gold.item(), text]) + "\n")


In [37]:
# Spot-check: greedy generation from the base model for validation example 790.
input_text = preprocess(prompt_template, data[790]["input_str"])
encoded_prompt = tokenizer(input_text, return_tensors="pt")
input_tokens = encoded_prompt["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
  generation_output = model.generate(
      input_ids=input_tokens,
      max_new_tokens=10,
      do_sample=False,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
  )
# Only one sequence was requested, so decode the first (and only) row.
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

### Question: What is length of the string "0572674832"?
 ### Answer:  
```
12
```

##


In [23]:
# Cast every submodule whose qualified name contains "norm" to float32.
# nn.Module.to() converts the module in place, so rebinding the loop variable is not needed.
for module_name, submodule in peft_model.named_modules():
    if "norm" not in module_name:
        continue
    submodule.to(torch.float32)

In [39]:
# Spot-check: greedy generation from the LoRA-adapted model for validation example 790.
input_text = preprocess(prompt_template, data[790]["input_str"])
encoded_prompt = tokenizer(input_text, return_tensors="pt")
input_tokens = encoded_prompt["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
  generation_output = peft_model.generate(
      input_ids=input_tokens,
      max_new_tokens=5,
      do_sample=False,
      eos_token_id=tokenizer.eos_token_id,
  )
# Decode the first returned sequence.
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)



<s> ### Question: What is length of the string "0572674832"?
 ### Answer:  38.</s>


In [37]:
# Length of a sample digit string computed directly in Python (presumably a manual reference check).
len("070193108739725")

15

### Sanity Check: It's indeed deterministic

In [20]:
import os, json
# Collect the answer text of every run whose file name contains "1113_".
# ANS ends up with one list of answer strings per matching file.
output_dir = "../data/output_decoder_LM/llama2-7b/len/zeroshot/"
ANS = []
for f in os.listdir(output_dir):
    if "1113_" in f:
        # Use a context manager so each result file is closed after reading.
        with open(os.path.join(output_dir, f), "r") as fh:
            # The last JSON field is the decoded text; keep only what follows the answer marker.
            ANS.append([json.loads(l)[-1].split("\n ### Answer:")[-1].strip() for l in fh])
print(len(ANS))

3


In [22]:
# Print every position where the second and third collected runs disagree.
for second_run_answer, third_run_answer in zip(ANS[1], ANS[2]):
    if second_run_answer != third_run_answer:
        print(second_run_answer)
        print(third_run_answer)

### Generate Data --- LEN

In [1]:
import os, random, json, re
import pandas as pd
import numpy as np
from tqdm import trange, tqdm

In [2]:
# Generate `samples_per_num_digit` distinct digit strings for every length in
# [129, 256] and store them as {length: [strings]} in a JSON file.
samples_per_num_digit = 1000
data = {}
for num_digit in trange(129, 257):
    if num_digit <= 6:
        # Short lengths: sample without replacement from all numbers of at most
        # num_digit digits, then left-pad with zeros to the target length.
        strings = random.sample([str(x) for x in range(10**num_digit)], samples_per_num_digit)
        strings = ["0"*(num_digit-len(s)) + s for s in strings]
    else:
        # Long lengths: draw random digit strings until enough unique ones exist.
        strings = set()
        while len(strings) < samples_per_num_digit:
            s = "".join([random.choice("1234567890") for i in range(num_digit)])
            strings.add(s)
    data[num_digit] = list(strings)
# Write through a context manager so the file is flushed and closed deterministically.
with open("../data/finetune/len/finetune_129_256.json", "w") as f:
    json.dump(data, f, indent=2)
            

In [2]:
# Uniform split: for every length, the first 100 strings go to train and the
# next 100 go to val.
samples_per_num_digit_train, samples_per_num_digit_val = 100, 100

with open("../data/finetune/len/finetune.json", "r") as f:
    data = json.load(f)
print("samples_per_num_digit = {}".format(len(data[list(data.keys())[0]])))

# Collect plain records and build each DataFrame once; concatenating one-row
# frames inside the loop copies the whole frame on every iteration.
train_rows, val_rows = [], []
for num_digit in tqdm(data):
    samples = data[num_digit]
    val_end = samples_per_num_digit_train + samples_per_num_digit_val
    train_rows.extend(
        {"input_str": s, "answer": num_digit} for s in samples[:samples_per_num_digit_train]
    )
    val_rows.extend(
        {"input_str": s, "answer": num_digit} for s in samples[samples_per_num_digit_train:val_end]
    )
train = pd.DataFrame(train_rows, columns=["input_str", "answer"])
val = pd.DataFrame(val_rows, columns=["input_str", "answer"])
print(len(train), len(val))

train.to_csv("../data/finetune/len/uniform_split/train.csv", index=False)
val.to_csv("../data/finetune/len/uniform_split/val.csv", index=False)

#val.to_csv("../data/finetune/len/uniform_large_split/val.csv", index=False)


samples_per_num_digit = 1000


  0%|          | 0/126 [00:00<?, ?it/s]

100%|██████████| 126/126 [00:06<00:00, 19.12it/s]


12600 12600


In [4]:
# odd_even_3:1 split: odd lengths contribute 150 training strings, even lengths
# contribute 50. The 100 strings after the training slice go to val.
samples_per_num_digit_val = 100

with open("../data/finetune/len/finetune.json", "r") as f:
    data = json.load(f)
print("samples_per_num_digit = {}".format(len(data[list(data.keys())[0]])))

# Collect plain records and build each DataFrame once; concatenating one-row
# frames inside the loop copies the whole frame on every iteration.
train_rows, val_rows = [], []
for num_digit in tqdm(data):
    samples_per_num_digit_train = 150 if int(num_digit)%2 == 1 else 50
    samples = data[num_digit]
    val_end = samples_per_num_digit_train + samples_per_num_digit_val
    train_rows.extend(
        {"input_str": s, "answer": num_digit} for s in samples[:samples_per_num_digit_train]
    )
    val_rows.extend(
        {"input_str": s, "answer": num_digit} for s in samples[samples_per_num_digit_train:val_end]
    )
train = pd.DataFrame(train_rows, columns=["input_str", "answer"])
val = pd.DataFrame(val_rows, columns=["input_str", "answer"])
print(len(train), len(val))

train.to_csv("../data/finetune/len/odd_even_3:1_split/train.csv", index=False)
val.to_csv("../data/finetune/len/odd_even_3:1_split/val.csv", index=False)


samples_per_num_digit = 1000


  0%|          | 0/126 [00:00<?, ?it/s]

100%|██████████| 126/126 [00:06<00:00, 18.87it/s]


12600 12600


In [18]:
# odd_even_9:1 split: odd lengths contribute 180 training strings, even lengths
# contribute 20. The 100 strings after the training slice go to val.
samples_per_num_digit_val = 100

with open("../data/finetune/len/finetune.json", "r") as f:
    data = json.load(f)
print("samples_per_num_digit = {}".format(len(data[list(data.keys())[0]])))

# Collect plain records and build each DataFrame once; concatenating one-row
# frames inside the loop copies the whole frame on every iteration.
train_rows, val_rows = [], []
for num_digit in tqdm(data):
    samples_per_num_digit_train = 180 if int(num_digit)%2 == 1 else 20
    samples = data[num_digit]
    val_end = samples_per_num_digit_train + samples_per_num_digit_val
    train_rows.extend(
        {"input_str": s, "answer": num_digit} for s in samples[:samples_per_num_digit_train]
    )
    val_rows.extend(
        {"input_str": s, "answer": num_digit} for s in samples[samples_per_num_digit_train:val_end]
    )
train = pd.DataFrame(train_rows, columns=["input_str", "answer"])
val = pd.DataFrame(val_rows, columns=["input_str", "answer"])
print(len(train), len(val))

train.to_csv("../data/finetune/len/odd_even_9:1_split/train.csv", index=False)
val.to_csv("../data/finetune/len/odd_even_9:1_split/val.csv", index=False)


samples_per_num_digit = 1000


100%|██████████| 126/126 [00:06<00:00, 18.62it/s]


12600 12600


In [None]:
# odd_only split: only odd lengths contribute training strings (100 each).
# Every length contributes the 100 strings after its training slice to val.
samples_per_num_digit_val = 100

with open("../data/finetune/len/finetune.json", "r") as f:
    data = json.load(f)
print("samples_per_num_digit = {}".format(len(data[list(data.keys())[0]])))

# Collect plain records and build each DataFrame once; concatenating one-row
# frames inside the loop copies the whole frame on every iteration.
train_rows, val_rows = [], []
for num_digit in tqdm(data):
    samples_per_num_digit_train = 100 if int(num_digit)%2 == 1 else 0
    samples = data[num_digit]
    val_end = samples_per_num_digit_train + samples_per_num_digit_val
    train_rows.extend(
        {"input_str": s, "answer": num_digit} for s in samples[:samples_per_num_digit_train]
    )
    val_rows.extend(
        {"input_str": s, "answer": num_digit} for s in samples[samples_per_num_digit_train:val_end]
    )
train = pd.DataFrame(train_rows, columns=["input_str", "answer"])
val = pd.DataFrame(val_rows, columns=["input_str", "answer"])
print(len(train), len(val))

train.to_csv("../data/finetune/len/odd_only_split/train.csv", index=False)
val.to_csv("../data/finetune/len/odd_only_split/val.csv", index=False)


In [3]:
# length % k = 3 split: only lengths whose remainder modulo k equals 3
# contribute training strings (100 each). Every length contributes the 100
# strings after its training slice to val.
k = 20
samples_per_num_digit_val = 100

with open("../data/finetune/len/finetune.json", "r") as f:
    data = json.load(f)
print("samples_per_num_digit = {}".format(len(data[list(data.keys())[0]])))

# Collect plain records and build each DataFrame once; concatenating one-row
# frames inside the loop copies the whole frame on every iteration.
train_rows, val_rows = [], []
for num_digit in tqdm(data):
    samples_per_num_digit_train = 100 if int(num_digit)%k == 3 else 0
    samples = data[num_digit]
    val_end = samples_per_num_digit_train + samples_per_num_digit_val
    train_rows.extend(
        {"input_str": s, "answer": num_digit} for s in samples[:samples_per_num_digit_train]
    )
    val_rows.extend(
        {"input_str": s, "answer": num_digit} for s in samples[samples_per_num_digit_train:val_end]
    )
train = pd.DataFrame(train_rows, columns=["input_str", "answer"])
val = pd.DataFrame(val_rows, columns=["input_str", "answer"])
print(len(train), len(val))

train.to_csv(f"../data/finetune/len/length_mod_{k}=3_split/train.csv", index=False)
val.to_csv(f"../data/finetune/len/length_mod_{k}=3_split/val.csv", index=False)


samples_per_num_digit = 1000


  0%|          | 0/126 [00:00<?, ?it/s]

100%|██████████| 126/126 [00:03<00:00, 35.45it/s]


700 12600


### Compute ACC

In [4]:
from collections import Counter
import json, re, math
from pprint import pprint
import numpy as np

In [5]:
# Exact-match accuracy on the test samples file. The first line is a JSON
# header; each remaining line is a JSON pair [ground truth text, prediction text].
with open("../scripts/llama/output/1205_105919/test_samples/1205_113333.txt", "r") as f:
    raw_lines = f.readlines()
header = json.loads(raw_lines[0])
lines = raw_lines[1:]
pprint(header)

em = 0
all_preds = []
for l in lines:
    gth, pred = json.loads(l)
    # The gold length is the text before the first "." of the ground truth.
    gth_len = int(gth.split(".")[0])
    # The predicted length is the first run of digits; -1 marks "no number found".
    first_number = re.search(r'(\d+)', pred)
    pred_len = int(first_number.group(1)) if first_number else -1
    all_preds.append(pred_len)
    em += int(gth_len == pred_len)
print(f"testing acc = {em / len(lines)}")

{'data_dir': '../../data/finetune/len/length_extrapolation',
 'load_from_ckpt': 'checkpoint-440'}
testing acc = 0.0


In [6]:
# Ten most frequent predicted lengths: first as (value, count) pairs, then the values alone.
most_common = Counter(all_preds).most_common(10)
print(most_common)
print([value for value, _count in most_common])

[(123, 6767), (153, 4108), (103, 969), (158, 333), (122, 129), (128, 110), (1533, 76), (108, 59), (120, 58), (163, 37)]
[123, 153, 103, 158, 122, 128, 1533, 108, 120, 163]


In [8]:
# Exact-match accuracy and mean absolute error on the eval samples file.
# Every line is a JSON pair [ground truth text, prediction text].
with open("../scripts/llama/output/1205_105919/eval_samples/1205_111535.txt", "r") as f:
    lines = f.readlines()

em = 0
abs_errors = []
all_preds = []
for l in lines:
    gth, pred = json.loads(l)
    # The gold length is the text before the first "." of the ground truth.
    gth_len = int(gth.split(".")[0])
    # The predicted length is the first run of digits; -1 marks "no number found"
    # and is also what enters the absolute error for such rows.
    first_number = re.search(r'(\d+)', pred)
    pred_len = int(first_number.group(1)) if first_number else -1
    all_preds.append(pred_len)
    em += int(gth_len == pred_len)
    abs_errors.append(np.abs(gth_len - pred_len))
print(f"testing acc = {em / len(lines)}")
print("avg abs_error = ", np.mean(abs_errors))

testing acc = 0.06285714285714286
avg abs_error =  10.661269841269842
