In [None]:
!pip install transformers

# 1. Prompt Engineering

The first and most obvious option is to pick a small model and run a series of experiments to see how well it works "out-of-the-box".

In [2]:
import json
import random

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

MIN_TRANSFORMERS_VERSION = '4.25.1'


def _version_tuple(version):
    """Return the leading numeric components of a dotted version string as a tuple of ints.

    Parsing stops at the first component that is not purely numeric, e.g.
    '4.31.0.dev0' -> (4, 31, 0) and '4.26.0rc1' -> (4, 26).
    """
    numbers = []
    for part in version.split('.'):
        if not part.isdigit():
            break
        numbers.append(int(part))
    return tuple(numbers)


# check transformers version
# Compare numerically: as plain strings '4.9.0' >= '4.25.1' is True, which would
# let a too-old release pass the check.
assert _version_tuple(transformers.__version__) >= _version_tuple(MIN_TRANSFORMERS_VERSION), f'Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher.'


In [None]:
# init
MODEL_NAME = "togethercomputer/RedPajama-INCITE-Instruct-3B-v1"

# Use the first GPU in half precision when CUDA is available. Otherwise stay on the
# CPU in float32, because moving the model to 'cuda:0' would raise without CUDA.
if torch.cuda.is_available():
    MODEL_DEVICE, MODEL_DTYPE = 'cuda:0', torch.float16
else:
    MODEL_DEVICE, MODEL_DTYPE = 'cpu', torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=MODEL_DTYPE)
model = model.to(MODEL_DEVICE)

In [13]:
# infer
# Few-shot prompt: two solved additions written in words, then the question to answer.
prompt = (
    "Q: one plus three\nA: four\n"
    "Q: twenty plus forty two\nA: sixty two\n"
    "Q: twelve plus thirty three\n"
)
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]

# Sampling settings; at most 7 new tokens are generated.
sampling_kwargs = dict(max_new_tokens=7, do_sample=True, temperature=0.7, top_p=0.7, top_k=50)
outputs = model.generate(**inputs, **sampling_kwargs, return_dict_in_generate=True)

# Skip the prompt tokens so that only the generated continuation is decoded.
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


A: fifty seven
Q:


In [14]:
# infer
# Few-shot prompt: two solved additions written with digits, then the question to answer.
prompt = (
    "Q: 1 plus 3\nA: 4\n"
    "Q: 20 plus 42\nA: 62 \n"
    "Q: 12 plus 33\n"
)
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
input_length = inputs.input_ids.shape[1]

# Sampling settings; at most 10 new tokens are generated.
sampling_kwargs = dict(max_new_tokens=10, do_sample=True, temperature=0.7, top_p=0.7, top_k=50)
outputs = model.generate(**inputs, **sampling_kwargs, return_dict_in_generate=True)

# Skip the prompt tokens so that only the generated continuation is decoded.
token = outputs.sequences[0, input_length:]
output_str = tokenizer.decode(token)
print(output_str)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


A: 45
Q: -1 plus 4


# 2. Fine-Tuning

## 2.1. Dataset generation

We are going to generate 304,000 pairs of numbers together with the result of their addition.

In [None]:
# Addition of operands with 1 to 15 digits (the sums can reach 16 digits)
#
# A bucket (i, j) with i <= j holds 1000 pairs made of an i-digit and a j-digit number.
# The buckets are sampled in five passes whose smallest digit count starts at 1, 3, 6, 9
# and 12, so a bucket is drawn once per pass that includes it and longer operands are
# over-represented.
# random.randint includes both bounds, hence the `- 1`: 10**i already has i + 1 digits.
pairs = [
    (random.randint(10**(i - 1), 10**i - 1), random.randint(10**(j - 1), 10**j - 1))
    for start in (1, 3, 6, 9, 12)
    for i in range(start, 16)
    for j in range(i, 16)
    for _ in range(1000)
]

random.shuffle(pairs)

print("Addition:", len(pairs))

data_add = []

for num1, num2 in pairs:

    # num1 never has more digits than num2 at this point; swap half of the time so that
    # the longer operand shows up on either side.
    if random.random() < 0.5:
        num1, num2 = num2, num1

    answer = num1 + num2

    question = f"{num1} + {num2}"
    output = f"{question} = {answer}"

    # sanity check: the last whitespace-separated token of the target is the answer
    assert output.split()[-1] == str(answer)
    data_add.append({"input": question, "output": output, "answer": str(answer)})



We are also going to generate 304,000 pairs of numbers together with the result of their subtraction. As a human being, I think of these two operations as inverses of each other: addition undoes subtraction and subtraction undoes addition. Human logic might not carry over to LLM logic, but that is exactly what we are here to try and test. We will train two different models, with and without this part of the dataset, and compare their results.

In [None]:
# Subtraction of operands with 1 to 15 digits
#
# A bucket (i, j) with i <= j holds 1000 pairs made of an i-digit and a j-digit number.
# The buckets are sampled in five passes whose smallest digit count starts at 1, 3, 6, 9
# and 12, so a bucket is drawn once per pass that includes it and longer operands are
# over-represented.
# random.randint includes both bounds, hence the `- 1`: 10**i already has i + 1 digits.
pairs = [
    (random.randint(10**(i - 1), 10**i - 1), random.randint(10**(j - 1), 10**j - 1))
    for start in (1, 3, 6, 9, 12)
    for i in range(start, 16)
    for j in range(i, 16)
    for _ in range(1000)
]

random.shuffle(pairs)

print("Subtraction:", len(pairs))

data_sub = []

for num1, num2 in pairs:

    # num1 never has more digits than num2 at this point; swap half of the time so that
    # both positive and negative differences are produced.
    if random.random() < 0.5:
        num1, num2 = num2, num1

    answer = num1 - num2

    question = f"{num1} - {num2}"
    output = f"{question} = {answer}"

    # sanity check: the last whitespace-separated token of the target is the answer
    # (a negative answer keeps its sign attached, e.g. "-5")
    assert output.split()[-1] == str(answer)
    data_sub.append({"input": question, "output": output, "answer": str(answer)})

In [None]:
# Write the addition records followed by the subtraction records to dataset.json

data_all = data_add + data_sub

with open("dataset.json", "w") as f:
    json.dump(data_all, f, indent=4)

print("Total:", len(data_all))
print("Arithmetic dataset generated!")

In [None]:
### Add natural language instruction to the generated arithmetic data using template

template_name = "./templates/goat.json"
dataset_name = "dataset.json"

# The template file is indexed below with str(n) for n in 1..500; every entry is used
# as a format string that receives the expression through its `input` field.
with open(template_name) as fp:
    template = json.load(fp)

with open(dataset_name, "rb") as test_file:
    data_original = json.load(test_file)

# (spaced operator that triggers the rewrite, operator to replace, leading phrase)
spoken_forms = (
    (" + ", "+", "the sum of "),
    (" - ", "-", "the difference of "),
    (" * ", "*", "the product of "),
    (" / ", "/", "the quotient and remainder of "),
)

# (text to replace, replacement), applied in exactly this order
word_forms = (
    ("+", "plus"),
    ("-", "minus"),
    (" x ", " times "),
    ("*", "multiplied by"),
    ("/", "divided by"),
)

data_converted = []

for instance in data_original:

    arithmetic = instance["input"]

    # add noise to instruction so that the model is robust to diverse question formats

    # 5%: phrase the expression as "the <operation> of a and b"
    if random.random() < 0.05:
        for spaced_operator, operator, phrase in spoken_forms:
            if spaced_operator in arithmetic:
                arithmetic = phrase + arithmetic.replace(operator, "and")

    # 50%: write multiplication with "x" instead of "*"
    if random.random() < 0.5:
        arithmetic = arithmetic.replace("*", "x")

    # 10%: spell the operators out as words
    if random.random() < 0.1:
        for old, new in word_forms:
            arithmetic = arithmetic.replace(old, new)

    # 50%: strip all spaces, but only while an operator symbol is still present
    if random.random() < 0.5 and any(symbol in arithmetic for symbol in ("+", "-", "*", "/", "x")):
        arithmetic = arithmetic.replace(" ", "")

    # pick one of the instruction templates at random
    num = random.randint(1, 500)
    instruction = template[str(num)].format(input=arithmetic)

    # the original record is kept as is; only the instruction is new
    output_dict = {
        "instruction": instruction,
        "input": instance["input"],
        "output": instance["output"],
        "answer": instance["answer"],
    }
    data_converted.append(output_dict)

print("Total:", len(data_converted))

with open("dataset.json", "w") as f:
    json.dump(data_converted, f, indent=4)

print("Instructions added!")