In [1]:
from datasets import Dataset
from transformers import AutoTokenizer
import pandas as pd
import json
import random

MAX_LENGTH = 680

# Path to the JSON-lines file: each line is one JSON object providing the
# "results" and "random_code" fields read below.
DATA_PATH = 'data/random_func_array1d_arithmetic_test.json'

instructions = []
responses = []

print("Loading data ...")
# A context manager guarantees the handle is closed, and iterating over the
# file object streams it line by line instead of materialising every line.
with open(DATA_PATH, 'r') as fin:
    for line in fin:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        datum = json.loads(line)
        # Flatten the list of input/output records into a compact prompt string:
        # drop quotes, colons and commas, and shorten 'ZeroDivisionError' to 'error'.
        instruction = str(datum["results"]).replace("\'", '').replace('ZeroDivisionError', 'error').replace(':', '').replace(',', '')
        response = datum["random_code"]
        # Ten passes of halving runs of spaces (collapses runs of up to 2**10 spaces).
        for i in range(10):
            response = response.replace('  ', ' ')
        responses.append(response)
        instructions.append(instruction)
        if len(responses) % 10000 == 0:
            print(len(responses), "lines loaded")

print("Creating dataset ...")
# Create a DataFrame and then a Dataset
data = {"instruction": instructions, "response": responses}
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

train_test_split = dataset.train_test_split(test_size=0.05, seed=42)  # 5% for test, 95% for train
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("xhyi/PT_GPTNEO350_ATG")
tokenizer.pad_token = tokenizer.eos_token  # Reuse EOS as the padding token
# Read the EOS id directly from the tokenizer instead of re-encoding the EOS
# string and taking element 0, which would break if encode() ever prepended
# another special token.
eos_encoded = tokenizer.eos_token_id

# Tokenize and process the dataset
def preprocess_function(examples):
    """Tokenize a batch of (instruction, response) pairs and build causal-LM labels.

    Each example is encoded as ``instruction + " " + response``, truncated or
    padded to ``MAX_LENGTH``. Labels are derived from the encoded ``input_ids``
    themselves so that every supervised position carries exactly the token
    that appears in the input:

    * instruction positions -> -100 (excluded from the loss)
    * response positions    -> the corresponding ``input_ids``
    * first padding slot    -> ``eos_encoded`` (teaches the model to stop)
    * remaining padding     -> -100

    Args:
        examples: batch mapping with "instruction" and "response" lists of str.

    Returns:
        The tokenizer output for the batch with an added "labels" entry; every
        label list has the same length as its ``input_ids``.
    """
    texts = [instruction + " " + response for instruction, response in zip(examples["instruction"], examples["response"])]
    tokenized_data = tokenizer(texts, truncation=True, add_special_tokens=True, padding="max_length", max_length=MAX_LENGTH)

    labels = []
    for i, instruction in enumerate(examples["instruction"]):
        input_ids = tokenized_data["input_ids"][i]
        # Number of real (non-padding) tokens; assumes padding is appended on
        # the right, as the original label layout also did.
        num_real = sum(tokenized_data["attention_mask"][i])
        # The instruction alone is tokenized only to measure its length; it is
        # clamped in case truncation cut into the instruction itself.
        num_instruction = min(len(tokenizer(instruction, add_special_tokens=False)["input_ids"]), num_real)

        label = [-100] * len(input_ids)
        # Copy the response tokens from the encoded sequence rather than from a
        # separate tokenization of `response`: tokenized on its own, the response
        # lacks the joining space, so its ids could differ from the input's.
        label[num_instruction:num_real] = input_ids[num_instruction:num_real]
        if num_real < len(input_ids):
            label[num_real] = eos_encoded
        labels.append(label)

    # Update the tokenized_data with labels
    tokenized_data["labels"] = labels
    return tokenized_data

# Apply the batched preprocessing to both splits.
tokenized_train_dataset = train_dataset.map(function=preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(function=preprocess_function, batched=True)

# Each tokenized split now carries input_ids, attention_mask and labels.
print(tokenized_train_dataset, tokenized_test_dataset, sep="\n")

Loading data ...
Creating dataset ...


Map:   0%|          | 0/1900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'response', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1900
})
Dataset({
    features: ['instruction', 'response', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})


In [2]:
#tokenized_train_dataset.save_to_disk('data/random_func_array1d_arithmetic_train_tokenized_v1.dataset')
#tokenized_test_dataset.save_to_disk('data/random_func_array1d_arithmetic_test_tokenized_v1.dataset')

In [5]:
x = tokenized_train_dataset[0]

ids = x["input_ids"]
labels = x["labels"]
# Find the first occurrence of token id 50256 (the id this notebook also uses
# as padding filler). The scan is bounded so a row that fills the whole
# sequence without any such token no longer raises IndexError.
ids_idx = 0
while ids_idx < len(ids) and ids[ids_idx] != 50256:
    ids_idx+=1
print(ids[:ids_idx])
print(labels)

[58, 90, 15414, 25915, 1065, 513, 60, 5072, 532, 1065, 92, 1391, 15414, 685, 1954, 532, 19, 532, 18, 532, 1415, 352, 60, 5072, 532, 18, 92, 1391, 15414, 25915, 1065, 532, 20, 532, 1415, 1105, 60, 5072, 532, 20, 92, 1391, 15414, 685, 21, 532, 2624, 718, 604, 1367, 60, 5072, 718, 92, 1391, 15414, 685, 16, 532, 19, 604, 60, 5072, 352, 92, 1391, 15414, 25915, 17, 718, 60, 5072, 532, 17, 92, 1391, 15414, 25915, 1238, 532, 16, 860, 604, 513, 513, 532, 1065, 60, 5072, 513, 92, 1391, 15414, 25915, 16, 532, 19, 860, 642, 60, 5072, 532, 19, 92, 1391, 15414, 25915, 17, 532, 20, 532, 22, 678, 657, 60, 5072, 532, 22, 92, 1391, 15414, 685, 23, 513, 532, 16, 1478, 532, 16, 60, 5072, 532, 16, 92, 1391, 15414, 25915, 16, 532, 1314, 352, 532, 1238, 513, 2808, 532, 20, 60, 5072, 513, 92, 1391, 15414, 685, 1065, 657, 1105, 1367, 532, 17, 532, 22, 532, 17, 513, 60, 5072, 532, 22, 92, 1391, 15414, 685, 18, 352, 657, 1987, 60, 5072, 352, 92, 1391, 15414, 25915, 1485, 657, 642, 2681, 532, 21, 532, 23, 362, 53

In [7]:
# Decode one full training row and report how many labels it has.
x = tokenized_train_dataset[100]
print(tokenizer.decode(x['input_ids']))
print(len(x['labels']))

# Walk a second row position by position: the input token is always shown,
# the label token only where the label is not the -100 placeholder.
it = 230
x=tokenized_train_dataset[it]
for i in range(len(x["labels"])):
    token_id = x["input_ids"][i]
    print("ids", i, token_id, tokenizer.decode(token_id))
    label_id = x["labels"][i]
    if label_id != -100:
        print("labels", i, label_id, tokenizer.decode(label_id))
# for i in x["labels"]:
#     try:
#         print(tokenizer.decode([i]))
#     except:
#         if i == -100:
#             print("eos")
#         else:
#             print("err")

[{input [9 8 14] output 1} {input [2 5 4 -10 11 60] output 5} {input [-1 -2] output 1} {input [-15 -2 6 -1 4 8 1 -4] output 6} {input [-3 -5] output 2} {input [-12 -5 -2 6] output -1} {input [0 -10 0 5 15] output -8} {input [-4 -3 -9 -14 -2 -3 -9 3] output -4} {input [27 1 -3 1 -6 6] output 3} {input [-5 -5 -5 -1 -7 -8 8] output 11} {input [-20 10] output -2} {input [8 -5 20 1 -6 2 1 0] output 0} {input [-4 10 1 3] output -2} {input [8 1] output 0} {input [9 -13 4] output 5} {input [-9 -2 6 3 6] output 9} {input [-10 -5] output 2} {input [-34 4 4] output 1} {input [-17 -17 1] output 1} {input [20 14 3 -2 -12] output 6} {input [-3 -10 16 -13 15 -2] output 7} {input [0 -7 2 11 3 -5 5] output -4} {input [10 2 0 -3 3] output 0} {input [-13 -1 13 4 -34] output -2} {input [2 -1 18 15] output -1} {input [-1 0 -1 2] output 10} {input [18 4 2] output 2} {input [-1 -3] output 2} {input [4 1 12 2 6 -12 5 2] output -2} {input [14 -7 -5 -2 -9] output 2} {input [-2 27 -4 0 -9 -2 -27] output 1} {inpu

In [2]:
from datasets import load_dataset, load_from_disk

# Reload the tokenized splits from their on-disk dataset directories.
train_dataset_dir = 'data/random_func_array1d_arithmetic_train_tokenized_v1.dataset'
test_dataset_dir = 'data/random_func_array1d_arithmetic_test_tokenized_v1.dataset'
tokenized_train_dataset = load_from_disk(train_dataset_dir)
tokenized_test_dataset = load_from_disk(test_dataset_dir)

Loading dataset from disk:   0%|          | 0/40 [00:00<?, ?it/s]

In [2]:
i=5
row = tokenized_test_dataset[i]  # fetch the row once instead of three times
print(tokenizer.decode(row['input_ids']))
print(len(row['labels']))
# -100 is a loss-masking placeholder, not a token id, so it is dropped before
# decoding. The previous max(i, x) clamp used the row index as a floor, which
# rendered masked positions (and any real id below i) as an arbitrary token.
print(tokenizer.decode([token_id for token_id in row['labels'] if token_id != -100]))

[{input [-9 24 9 1] output 22} {input [7 -12 3 5 -1] output 0} {input [-6 -19 -10] output 7} {input [8 2 -3 15 12 -8] output 11} {input [19 6 2 -1 18 -29] output 8} {input [-4 -6 8 3 1 -6 22] output -14} {input [-1 0 7 20 -3 0] output 20} {input [-19 5] output 0} {input [6 -1 6 -4 5 17 15] output 2} {input [11 -12 11] output 15} {input [-8 3 3 2 -8 0] output 7} {input [4 -27 4 2 0] output 0} {input [2 -6 -15 -8 -17 6 12] output -5} {input [3 -6 10 -2] output 4} {input [11 -25 -2 0 -1] output 4} {input [-4 6] output 0} {input [-24 -4] output 0} {input [8 2 -2 -3] output 7} {input [-14 0 -2 4 -1 24] output 24} {input [11 10 -4 -5 -2 5 -8] output 1} {input [-4 3 6] output 7} {input [14 6 -4 -2 -23 -3 3 24] output 31} {input [-2 2 -23 -4 8 -2 11 -5] output 3} {input [-18 7 10 -2 11] output -7} {input [-2 24 -7] output -10} {input [-3 -8 -6] output 7} {input [9 -15 -9 5] output 7} {input [34 9 11 4 0] output 23} {input [3 5] output 0} {input [2 -6 18 -7 -4 -5 15 8] output 11} {input [-8 -6 

In [5]:
input_ids = tokenized_test_dataset[0]['input_ids']
# Iterate over the row's actual length rather than MAX_LENGTH: the datasets
# may have been loaded from disk in a kernel where MAX_LENGTH is undefined,
# and a row of a different length would otherwise raise IndexError.
for i, token_id in enumerate(input_ids):
    print(i, tokenizer.decode([token_id]))

0 [
1 {
2 input
3  [
4 1
5  6
6  -
7 2
8  1
9  8
10 ]
11  output
12  1
13 }
14  {
15 input
16  [
17 0
18  3
19  -
20 10
21 ]
22  output
23  0
24 }
25  {
26 input
27  [
28 21
29  -
30 17
31  1
32 ]
33  output
34  21
35 }
36  {
37 input
38  [
39 4
40  -
41 1
42 ]
43  output
44  4
45 }
46  {
47 input
48  [-
49 7
50  -
51 9
52  -
53 4
54  -
55 2
56  3
57 ]
58  output
59  -
60 7
61 }
62  {
63 input
64  [-
65 9
66  2
67  -
68 6
69  -
70 15
71  14
72  10
73  -
74 1
75 ]
76  output
77  -
78 9
79 }
80  {
81 input
82  [-
83 10
84  4
85  4
86  -
87 1
88  17
89  11
90 ]
91  output
92  -
93 10
94 }
95  {
96 input
97  [-
98 6
99  -
100 32
101  -
102 5
103  6
104 ]
105  output
106  -
107 6
108 }
109  {
110 input
111  [-
112 4
113  -
114 8
115  8
116 ]
117  output
118  -
119 4
120 }
121  {
122 input
123  [
124 16
125  -
126 2
127  3
128  0
129 ]
130  output
131  16
132 }
133  {
134 input
135  [
136 6
137  -
138 2
139  1
140  -
141 4
142  2
143 ]
144  output
145  6
146 }
147  {
148 input
149  [-
150 16

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
import torch.nn.init as init
from transformers import AutoModel

def initialize_weights(model, init_type='xavier'):
    """Re-initialise every ``torch.nn.Linear`` layer of ``model`` in place.

    Args:
        model: module whose ``modules()`` are scanned for ``torch.nn.Linear``.
        init_type: ``'xavier'`` for Xavier/Glorot uniform, or ``'he'`` for
            Kaiming/He uniform with ``nonlinearity='relu'``.

    Raises:
        ValueError: if ``init_type`` is not one of the supported names.
            Previously an unknown name silently left the weights untouched
            while still zeroing the biases.

    Biases of the affected layers, when present, are set to zero.
    """
    weight_initializers = {
        'xavier': init.xavier_uniform_,
        'he': lambda weight: init.kaiming_uniform_(weight, nonlinearity='relu'),
    }
    # Validate before touching any parameter so a typo cannot half-modify the model.
    if init_type not in weight_initializers:
        raise ValueError(
            "Unknown init_type %r; expected one of %s" % (init_type, sorted(weight_initializers))
        )
    init_weight = weight_initializers[init_type]

    for module in model.modules():
        if isinstance(module, torch.nn.Linear):
            init_weight(module.weight)
            if module.bias is not None:
                init.zeros_(module.bias)

# Load the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("xhyi/PT_GPTNEO350_ATG") #"EleutherAI/gpt-neo-1.3B")
# tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("xhyi/PT_GPTNEO350_ATG") # alternative checkpoint tried: "EleutherAI/gpt-neo-1.3B"

# Re-initialise the weights of every torch.nn.Linear layer (and zero their biases),
# replacing the values loaded from the checkpoint for those layers.
initialize_weights(model, init_type='he')  # 'he' = He/Kaiming init; pass 'xavier' for Xavier init


# # Tokenize and preprocess the dataset
# def preprocess_function(examples):
#     inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=1024)
#     inputs["labels"] = inputs.input_ids.copy()
#     return inputs

# tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
# tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

  return self.fget.__get__(instance, owner)()


In [None]:
# Hyper-parameters and bookkeeping for the training run
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./data/results-gptneo350m",
    logging_dir="./data/logs",
    # optimisation settings
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    fp16=True,
    # evaluate, checkpoint and log on the same 20000-step cadence
    evaluation_strategy="steps",
    eval_steps=20000,
    save_steps=20000,
    logging_steps=20000,
)

# Wire the model and the tokenized splits into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Run training
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mxiaoxinyin[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
20000,0.1627,0.097327
40000,0.0892,0.083678
60000,0.0789,0.074212
80000,0.0719,0.069186
100000,0.0658,0.063345
120000,0.0625,0.060568
140000,0.06,0.05854
160000,0.0584,0.057395
180000,0.0568,0.055768
200000,0.0557,0.054824


In [None]:
import os
import torch

MODEL_PATH = 'data/results/gptneo350m-random_func_array1d_arithmetic-v1-417600.model'

# torch.save does not create missing parent directories, and the trainer in
# this notebook writes to a different directory (./data/results-gptneo350m),
# so make sure the target directory exists first.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
# Serialises the whole model object (pickle), not just its state_dict.
torch.save(model, MODEL_PATH)

In [None]:
import torch

# The file holds a fully pickled model object (written with torch.save(model, ...)),
# so it must be loaded with weights_only=False; recent PyTorch releases default
# to weights_only=True, which rejects arbitrary pickled objects.
# SECURITY NOTE: unpickling can execute arbitrary code - only load files you
# created yourself or otherwise trust.
model = torch.load('data/results/gptneo350m-random_func_array1d_arithmetic-v1-417600.model', weights_only=False)

In [None]:
example = tokenized_test_dataset[0]
label = example['labels']
input_ids = list(example['input_ids'])

# The instruction occupies the leading positions whose label is -100. The scan
# is bounded so a row whose labels are all -100 no longer raises IndexError.
i = 0
while i < len(label) and label[i] == -100:
    i += 1
instruction_len = i

# Overwrite everything after the instruction with token id 50256.
input_ids[instruction_len:] = [50256] * (len(input_ids) - instruction_len)

In [None]:
# Display the value of `example` as this cell's output.
example

In [None]:
model.eval()
example = tokenized_test_dataset[30]
label = example['labels']
input_ids = list(example['input_ids'])

# The instruction occupies the leading positions whose label is -100
# (bounded scan: a row with only -100 labels must not raise IndexError).
i = 0
while i < len(label) and label[i] == -100:
    i += 1
# Prompt = instruction plus the first 5 response tokens.
instruction_len = min(i + 5, len(input_ids))

# Blank out everything after the prompt with token id 50256.
for i in range(instruction_len, len(input_ids)):
    input_ids[i] = 50256

print(example['instruction'])
print(example['response'])

START_IDX = instruction_len

inputs_ids = torch.tensor(input_ids)
attention_mask = torch.ones(len(input_ids))
# Run on whatever device the model lives on instead of assuming 'cuda:0'.
device = next(model.parameters()).device
idx = START_IDX  # next position to fill

# Greedy decoding; no gradients are needed for inference.
with torch.no_grad():
    while idx < len(input_ids):
        outputs = model(input_ids=torch.unsqueeze(inputs_ids, 0).to(device),
                        attention_mask=torch.unsqueeze(attention_mask, 0).to(device))
        logits = outputs.logits
        # A causal LM's logits at position t score the token at position t + 1,
        # so the token for slot `idx` comes from position idx - 1 (the last
        # known token). Reading logits at `idx` conditioned on the filler token
        # and left a stray EOS between the prompt and the generated tokens.
        predicted_token_id = torch.argmax(logits[:, idx - 1, :], dim=-1)
        if predicted_token_id == eos_encoded:
            break
        # Convert the token ID to the actual token
        predicted_token = tokenizer.decode(predicted_token_id)
        print(idx, predicted_token)
        # Write before advancing so the final slot is filled without indexing
        # past the end of the sequence.
        inputs_ids[idx] = predicted_token_id.item()
        idx += 1

In [None]:
# Testing: reference functions that the model is later asked to re-infer from input/output examples.

# Sample array; not referenced by any other code visible in this notebook.
a = [1,2,-3,4,5]

def squared_sum(arr):
    """Return the sum of each element of ``arr`` multiplied by itself."""
    total = 0
    for value in arr:
        total += value * value
    return total

def abs_sum(arr):
    """Return the sum of the absolute values of the elements of ``arr``."""
    return sum(map(abs, arr))

def sum2(arr):
    """Return twice the sum of ``arr``."""
    total = sum(arr)
    return 2 * total

def sum5(arr):
    """Return five times the sum of ``arr``."""
    total = sum(arr)
    return 5 * total

def sum_plus_len(arr):
    """Return the sum of ``arr`` plus the number of elements in it."""
    total = sum(arr)
    return total + len(arr)

def sum_plus_2len(arr):
    """Return the sum of ``arr`` plus twice the number of elements in it."""
    total = sum(arr)
    return total + 2 * len(arr)

def sum_minus_len(arr):
    """Return the sum of ``arr`` minus the number of elements in it."""
    total = sum(arr)
    return total - len(arr)

def sum_positive(arr):
    """Return the sum of the elements of ``arr`` that are greater than zero."""
    total = 0
    for value in arr:
        if value > 0:
            total += value
    return total

def sum_negative(arr):
    """Return the sum of the elements of ``arr`` that are less than zero."""
    total = 0
    for value in arr:
        if value < 0:
            total += value
    return total

def sum_greater_than_3(arr):
    """Return the sum of the elements of ``arr`` that are strictly greater than 3."""
    total = 0
    for value in arr:
        if value > 3:
            total += value
    return total

def sum_mod_3(arr):
    """Return the sum of ``x % 3`` over the elements ``x`` of ``arr``."""
    total = 0
    for value in arr:
        total += value % 3
    return total

def interleaving_substract(arr):
    """Return ``arr[0] - (arr[1] - (arr[2] - ...))``, computed recursively.

    A one-element array yields that element.
    """
    if len(arr) != 1:
        head, tail = arr[0], arr[1:]
        return head - interleaving_substract(tail)
    return arr[0]
    
def product(arr):
    """Return the product of all elements of ``arr``, computed recursively.

    A one-element array yields that element.
    """
    if len(arr) != 1:
        head, tail = arr[0], arr[1:]
        return head * product(tail)
    return arr[0]
    
def product_plus1(arr):
    """Return the product of ``x + 1`` over every element ``x`` of ``arr``.

    The recursive step now calls ``product_plus1`` itself. It previously
    delegated to ``product``, so only the first element received the ``+ 1``
    for arrays longer than one element, which contradicted the one-element
    base case (where the single element does receive it).

    Raises:
        IndexError: if ``arr`` is empty (unchanged).
    """
    if len(arr) == 1:
        return arr[0] + 1
    else:
        return (arr[0] + 1) * product_plus1(arr[1:])
    
def product_plus2(arr):
    """Return the product of ``x + 2`` over every element ``x`` of ``arr``.

    The recursive step now calls ``product_plus2`` itself. It previously
    delegated to ``product``, so only the first element received the ``+ 2``
    for arrays longer than one element, which contradicted the one-element
    base case (where the single element does receive it).

    Raises:
        IndexError: if ``arr`` is empty (unchanged).
    """
    if len(arr) == 1:
        return arr[0] + 2
    else:
        return (arr[0] + 2) * product_plus2(arr[1:])
    
def first(arr):
    """Return the element at index 0 of ``arr``."""
    head = arr[0]
    return head

def last(arr):
    """Return the element at index -1 (the final element) of ``arr``."""
    tail = arr[-1]
    return tail

def second(arr):
    """Return the element at index 1 of ``arr``."""
    item = arr[1]
    return item

def third(arr):
    """Return the element at index 2 of ``arr``."""
    item = arr[2]
    return item

def second2last(arr):
    """Return the element at index -2 (second from the end) of ``arr``."""
    item = arr[-2]
    return item

def third2last(arr):
    """Return the element at index -3 (third from the end) of ``arr``."""
    item = arr[-3]
    return item
    
# Functions whose behaviour the model is asked to re-infer, in evaluation order.
# (second, third, second2last and third2last are defined above but not included.)
test_functions = [
    sum,
    max,
    min,
    squared_sum,
    abs_sum,
    sum2,
    sum5,
    sum_plus_len,
    sum_plus_2len,
    sum_minus_len,
    sum_positive,
    sum_negative,
    sum_greater_than_3,
    sum_mod_3,
    interleaving_substract,
    product,
    product_plus1,
    product_plus2,
    first,
    last,
]

In [None]:
import numpy as np
import random

def format_code(code):
    """Rebuild indentation for whitespace-stripped source text using a simple heuristic.

    Every line is stripped and lines of two characters or fewer are dropped.
    The first kept line is emitted unindented. For each following line the
    indentation grows by four spaces when the previous kept line ends with
    ':', and otherwise shrinks by four spaces (never below zero).

    Args:
        code: source text with lines separated by newline characters.

    Returns:
        The re-indented text, every line terminated by a newline.

    Raises:
        IndexError: if no line survives the filtering.
    """
    indent_unit = "    "
    kept = [text for text in (raw.strip() for raw in code.split('\n')) if len(text) > 2]
    rendered = [kept[0]]
    depth = 0
    for previous, current in zip(kept, kept[1:]):
        if previous.endswith(':'):
            depth += 1
        else:
            depth = max(depth - 1, 0)
        rendered.append(indent_unit * depth + current)
    return '\n'.join(rendered) + '\n'

def generate_random_array():
    """Return a list of 2 to 8 randomly signed, rounded exponential samples.

    The length and each sign come from the ``random`` module; each magnitude is
    ``round(np.random.exponential(scale=8.0))``.
    """
    length = random.randint(2, 8)
    array = []
    for _ in range(length):
        # Sign is drawn before the magnitude, one draw of each per element.
        sign = -1 if random.randint(0, 1) else 1
        magnitude = round(np.random.exponential(scale=8.0))
        array.append(sign * magnitude)
    return array

def create_test_input(foo, num_random_array=40):
    """Build a prompt from ``foo``'s behaviour on freshly generated random arrays.

    Args:
        foo: callable applied to each random array.
        num_random_array: how many random arrays to generate.

    Returns:
        ``(instruction, response, random_arrays, random_results)`` where
        ``instruction`` is the flattened text of the input/output records,
        ``response`` is the ``"def foo(arr):\\n"`` code prefix, and the two
        lists hold the arrays and the matching results. When ``foo`` raises,
        the recorded result is the exception's class name.
    """
    random_arrays = []
    random_results = []
    records = []
    for _ in range(num_random_array):
        sample = generate_random_array()
        random_arrays.append(sample)
        try:
            outcome = foo(sample)
        except Exception as err:
            outcome = type(err).__name__
        random_results.append(outcome)
        # The array is stringified after foo has run, as before.
        records.append({"input": str(sample), "output": outcome})

    # Strip quotes, colons and commas; shorten 'ZeroDivisionError' to 'error'.
    instruction = str(records)
    for old, new in (("\'", ''), ('ZeroDivisionError', 'error'), (':', ''), (',', '')):
        instruction = instruction.replace(old, new)

    response = "def foo(arr):\n"
    for _ in range(10):
        response = response.replace('  ', ' ')
    return instruction, response, random_arrays, random_results

def get_input_ids(foo, num_random_array=40):
    """Tokenize the prompt built for ``foo``.

    Args:
        foo: callable forwarded to ``create_test_input``.
        num_random_array: number of random arrays forwarded to ``create_test_input``.

    Returns:
        ``(tokenized_data, random_arrays, random_results)``; ``tokenized_data``
        is the tokenizer output for ``instruction + " " + response``,
        truncated or padded to ``MAX_LENGTH``.
    """
    prompt_parts = create_test_input(foo, num_random_array=num_random_array)
    instruction, response, random_arrays, random_results = prompt_parts
    prompt = instruction + " " + response
    tokenized_data = tokenizer(prompt, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    return tokenized_data, random_arrays, random_results

def check_infer_function(f, verbose=False):
    """Ask the model to re-infer ``f`` from examples and score the generated code.

    Steps:
      1. Build and tokenize a prompt from ``f``'s outputs on random arrays.
      2. Greedily decode a continuation of the ``def foo(arr):`` prefix.
      3. Re-indent the generated text with ``format_code`` and ``exec`` it,
         which (re)defines the global ``foo``.
      4. Run ``foo`` on the same arrays and print how many results match.

    Args:
        f: reference function to infer.
        verbose: when true, print every array with expected and actual result.

    Returns:
        None. Progress, the inferred code and the score are printed. If
        generation or execution of the inferred code fails, the error is
        printed and the function returns early.
    """
    print("\nFUNCTION:", f.__name__)
    tokenized_data, random_arrays, random_results = get_input_ids(f)
    input_ids = tokenized_data['input_ids']
    # Number of real (non-padding) prompt tokens = index of the first slot to fill.
    prompt_len = sum(tokenized_data['attention_mask'])
    inputs_ids = torch.tensor(input_ids)
    attention_mask = torch.ones(len(input_ids))
    # Run on whatever device the model lives on instead of assuming 'cuda:0'.
    device = next(model.parameters()).device
    idx = prompt_len
    #
    try:
        output = "def foo(arr):\n"
        model.eval()
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            while idx < len(input_ids):
                outputs = model(input_ids=torch.unsqueeze(inputs_ids, 0).to(device),
                                attention_mask=torch.unsqueeze(attention_mask, 0).to(device))
                logits = outputs.logits
                # A causal LM's logits at position t score the token at t + 1,
                # so the token for slot `idx` is read from position idx - 1.
                # Reading at `idx` conditioned on a padding token and left a
                # stray EOS between the prompt and the generated tokens.
                predicted_token_id = torch.argmax(logits[:, idx - 1, :], dim=-1)
                if predicted_token_id == eos_encoded:
                    break
                # Convert the token ID to the actual token
                output += tokenizer.decode(predicted_token_id)
                # Write before advancing so the last slot can be filled without
                # indexing past the end of the sequence.
                inputs_ids[idx] = predicted_token_id.item()
                idx += 1
    except Exception as e:
        print(e)
        return
    #
    inferred_code = format_code(output)
    print(inferred_code)
    # SECURITY NOTE: this executes model-generated code in the notebook's global
    # namespace; only run it in an environment where that is acceptable.
    try:
        exec(inferred_code, globals())
    except Exception as e:
        # Malformed generated code should not abort the evaluation of the
        # remaining functions.
        print("Could not execute inferred code:", type(e).__name__, e)
        return
    #
    count_equal = 0
    for i in range(len(random_arrays)):
        try:
            result = foo(random_arrays[i])
        except Exception as e:
            # Mirror create_test_input: an exception is recorded by class name.
            result = type(e).__name__
        if verbose:
            print("Random array:", random_arrays[i])
            print("Expected result:", random_results[i])
            print("Actual result:  ", result)
        if random_results[i] == result:
            count_equal += 1
    #
    print(count_equal, '/', len(random_arrays))

# Seed both RNGs used by generate_random_array so the evaluation arrays, and
# therefore the reported scores, are reproducible across runs.
random.seed(42)
np.random.seed(42)

for f in test_functions:
    check_infer_function(f)

#check_infer_function(min, verbose=True)

In [None]:
# def foo(arr):
#         if len(arr) == 1:
#             return arr[0]
#         else:
#             return max([arr[0], foo(arr[1:])])

# Call the `foo` most recently defined by check_infer_function's exec(...).
# The cell previously ran `del foo` immediately before this call, which made
# the call raise NameError every time.
foo([8, 11, 13, -5, 6, -13, -8, 2, 8])

In [None]:
# NOTE(review): in the visible source `inferred_code` is assigned only as a
# local variable inside check_infer_function, so this cell raises NameError
# unless the name is defined elsewhere - TODO confirm.
exec(inferred_code)

In [None]:
# NOTE(review): `inferred_code` is only assigned inside check_infer_function in
# the visible source; this display raises NameError unless a global of that
# name exists - TODO confirm.
inferred_code