In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [117]:
import os
import json
from torch.utils.data import Dataset
from tqdm import tqdm
from pprint import pprint
import re
import hashlib
import torch
import torch.nn as nn

## Part 1: Load the data

In [11]:
def merge_jsons(folder_path, output_path):
    jsons = []
    # list to hold contents of jsons
    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith('.json'):  # Process only JSON files
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r') as file:
                # load json
                data = json.load(file)
                jsons.append(json.dumps(data))

    # combine all jsosn into string
    output_content = ',\n'.join(jsons)
    output_content = f"{output_content}"

    # save combined json
    with open(output_path, 'w') as output_file:
        output_file.write(output_content)

train_folder_path = '/content/drive/MyDrive/Final Project Datasets/data_symbolic_regression/train'
train_output_path = '/content/train_merged.json'
test_folder_path = '/content/drive/MyDrive/Final Project Datasets/data_symbolic_regression/test'
test_output_path = '/content/test_merged.json'
val_folder_path = '/content/drive/MyDrive/Final Project Datasets/data_symbolic_regression/val'
val_output_path = '/content/val_merged.json'

# merge the jsons together because
merge_jsons(train_folder_path, train_output_path)
merge_jsons(test_folder_path, test_output_path)
merge_jsons(val_folder_path, val_output_path)

100%|██████████| 748/748 [00:03<00:00, 243.77it/s]
100%|██████████| 162/162 [00:00<00:00, 237.31it/s]
100%|██████████| 161/161 [00:00<00:00, 227.30it/s]


## 2. Embed the formula

In [118]:
merged_json_path = '/content/train_merged.json'

unique_formulas = set()

with open(merged_json_path, 'r') as file:
        # Read the file line by line
        for line in file:
            # Remove trailing commas and whitespace
            line = line.strip().rstrip(',')

            # Parse the line as a JSON object
            if line:  # Skip empty lines
                try:
                    data = json.loads(line)
                    if 'formula' in data:
                        unique_formulas.add(data['formula'])
                except json.JSONDecodeError as e:
                    print(f"Error decoding line: {line}\n{e}")
# unique formulas
unique_formulas = list(unique_formulas)
pprint(unique_formulas[:5])

['mult(cos(add(var_0, var_2), N(N, N)), sqrt(mult(var_1, var_0), N(N, N)))',
 'add(add(sqrt(var_2, N), sqrt(var_1, N)), tanh(pow_2(var_0, N), N(N, N)))',
 'add(cosh(reverse(var_1, N), N(N, N)), cosh(add(var_0, var_2), N(N, N)))',
 'add(tanh(mult(var_1, var_2), N(N, N)), sqrt(pow_2(var_0, N), N(N, N)))',
 'add(sinh(pow_2(var_2, N), N(N, N)), sqrt(add(var_0, var_1), N(N, N)))']


In [15]:
# function to extract tokens from a formula
def extract_tokens(formulas):
    tokens = set()
    for formula in formulas:
        tokens.update(re.findall(r"[a-zA-Z_]\w*", formula))
    return sorted(tokens)

# generate vocabulary
vocabulary = extract_tokens(unique_formulas)
print("Vocabulary:", vocabulary)

Vocabulary: ['C_0', 'C_1', 'C_2', 'N', 'add', 'cos', 'cosh', 'exp', 'gaussian', 'log', 'mult', 'neg', 'pow_2', 'reverse', 'sin', 'sinh', 'sqrt', 'tan', 'tanh', 'var_0', 'var_1', 'var_2']


Here I made a lookup table of the equations, later I'll add the equations from the training and validation set so that no possible equations are left out. Maybe we can look for a better way to embed the equations later.

In [114]:
equation_to_index = {eq: idx for idx, eq in enumerate(unique_formulas)}
index_to_equation = {idx: eq for eq, idx in equation_to_index.items()}

num_equations = len(unique_formulas)
embedding_dim = 128  # embedding dimension

# embedding layer
equation_embeddings = nn.Embedding(num_equations, embedding_dim)

# get the embedding of an equation
def get_embedding(equation):
    index = torch.tensor([equation_to_index[equation]])
    embedding = equation_embeddings(index)
    return embedding.squeeze(0)

# to map back from embedding to equation (since embeddings are unique)
def find_equation(embedding):
    # create a lookup table
    all_embeddings = equation_embeddings.weight.detach()
    distances = torch.norm(all_embeddings - embedding, dim=1)
    closest_index = torch.argmin(distances).item()
    return index_to_equation[closest_index]

In [116]:
print(unique_formulas[0])
print(get_embedding(unique_formulas[0]))
print(find_equation(get_embedding(unique_formulas[0])))

mult(cos(add(var_0, var_2), N(N, N)), sqrt(mult(var_1, var_0), N(N, N)))
tensor([ 0.1693,  0.6093, -0.8928,  0.4239, -0.1169, -0.5382, -0.8167, -0.6550,
        -0.3950,  1.1551,  0.5824,  0.6986,  1.0476, -0.2248,  1.5310, -0.7199,
        -0.0892, -1.0237,  0.4608, -0.0519, -1.2601,  1.4363,  0.5297,  0.2367,
        -1.4412, -1.0423, -0.4750,  0.0193,  0.7185,  1.4224, -0.6880,  1.0155,
        -0.7254, -0.1509,  0.0666, -1.2250,  0.5450,  1.2968, -0.1680, -0.2055,
        -0.6902,  0.8764,  0.2365, -0.4168,  1.3870,  0.7504,  1.4949, -1.1196,
        -1.6821, -0.2091, -0.6663,  2.0757, -1.3531,  0.4083, -0.7152, -0.8465,
        -1.2863, -0.1185, -0.4894,  1.8533, -0.9660, -0.0037, -0.8134,  0.8750,
         1.2679,  0.2017, -0.7468, -2.2780, -1.5452, -0.6489, -0.0457,  0.3187,
         0.0820,  0.3762, -0.5442,  1.9085, -0.6823,  0.8009,  1.2310, -0.8032,
         2.0294,  0.5197, -1.7682, -1.1168,  0.2611, -0.8051, -1.3532, -0.0782,
         0.5359,  0.5843, -0.9308,  0.7240, -0.

## 3. Dataset

In [None]:
class EquationDataset(Dataset):
    def __init__(self, data_point):
        self.formula = data_point["formula"]
        self.formula_human_readable = data_point["formula_human_readable"]
        self.formula_depth = data_point["formula_depth"]
        self.n_vars = data_point["n_vars"]
        self.n_consts = data_point["n_consts"]
        self.n_points = data_point["n_points"]
        self.var_bound_dict = data_point["var_bound_dict"]
        self.const_value_dict = data_point["const_value_dict"]
        self.meta_list = data_point["meta_list"]
        self.points = data_point["points"]
        self.target = data_point["target"]

        self. embedded_formula = get_embedding(self.formula)

        # add code here to use the pre-trained t-net to embed the points