In [None]:
import pandas as pd
import wandb
from datetime import datetime
import numpy as np

### Define Results class where the important values from the logs are saved 

In [None]:
class Results():
    """Collects per-run metrics into one table per metric and approach.

    ``result_dict[target][approach]`` is a DataFrame with one row per seed
    (``0 .. NUM_SEEDS - 1``) and one column per dataset (``nguyen_1`` ..
    ``nguyen_12``). All cells start out as NaN and are filled by
    :meth:`fill_results`.
    """

    NUM_SEEDS = 5
    DATASETS = [f"nguyen_{i}" for i in range(1, 13)]

    # Summary keys that are extracted from every run.
    # NOTE(review): 'num_productions 0_999]' ends with a stray ']' that looks
    # like a typo; it is kept byte-identical because it has to match the key
    # that was logged - confirm against the logging code.
    TARGETS = (
        'num_states_to_0_999',
        'num simulation to 0_999',
        '_runtime',
        'num_productions 0_999]',
    )

    # Table names, in the order in which they are created for every target.
    APPROACHES = (
        'results_grammar_1_Normal',
        'results_uniform_1_Normal',
        'results_grammar_1_AmEx',
        'results_uniform_1_AmEx',
        'results_grammar_2_Normal',
        'results_uniform_2_Normal',
        'results_grammar_2_AmEx',
        'results_uniform_2_AmEx',
    )

    # Value of config['MCTS_engine'] -> suffix used in the table name.
    _ENGINE_LABELS = {'Endgame': 'AmEx', 'Normal': 'Normal'}
    _PRIORS = ('grammar', 'uniform')

    def __init__(self):
        self.result_dict = {
            target: {approach: self._empty_table() for approach in self.APPROACHES}
            for target in self.TARGETS
        }

    def _empty_table(self):
        """Return a fresh all-NaN seed x dataset table."""
        return pd.DataFrame(index=list(range(self.NUM_SEEDS)), columns=list(self.DATASETS))

    @classmethod
    def _approach_name(cls, grammar_search, engine, prior):
        """Map a run configuration to its table name, or None if it matches no table."""
        # '1' is checked before '2', so a value containing both digits counts as 1.
        if '1' in grammar_search:
            level = '1'
        elif '2' in grammar_search:
            level = '2'
        else:
            return None
        engine_label = cls._ENGINE_LABELS.get(engine)
        if engine_label is None or prior not in cls._PRIORS:
            return None
        return f'results_{prior}_{level}_{engine_label}'

    @staticmethod
    def _positive_or_nan(raw_value):
        """Return ``raw_value`` if it is greater than zero, otherwise NaN.

        Values that cannot be compared with a number (e.g. ``None`` or a
        string) are treated as missing instead of raising ``TypeError``.
        """
        try:
            if raw_value > 0:
                return raw_value
        except TypeError:
            pass
        return np.nan

    def fill_results(self, config, summary):
        """Store the metrics of one run in the table that matches its configuration.

        Parameters
        ----------
        config : dict
            Run configuration. The keys 'seed', 'data_path', 'grammar_search',
            'MCTS_engine' and 'prior_source' are read; a missing key raises
            ``KeyError``. The dataset name is the second '/'-separated
            component of ``config['data_path']``.
        summary : dict
            Run summary. For every entry of ``TARGETS`` the value is stored if
            it is present and greater than zero; otherwise NaN is stored.

        Runs whose configuration matches none of the tables are ignored.
        """
        seed = config['seed']
        grammar_search = config['grammar_search']
        engine = config['MCTS_engine']
        prior = config['prior_source']
        dataset = config['data_path'].split('/')[1]

        approach = self._approach_name(grammar_search, engine, prior)
        if approach is None:
            return

        for target in self.TARGETS:
            raw_value = summary[target] if target in summary else None
            self.result_dict[target][approach].loc[seed, dataset] = self._positive_or_nan(raw_value)

### Download logs from wandb to result object 

In [None]:
api = wandb.Api()
entity = "wwjbrugger"
project = "Test_NGSR_10_05_Nguyen_with_3_constant"

# Fetch all runs that were logged to the wandb project.
runs = api.runs(f"{entity}/{project}")

summary_list = []
config_list = []
name_list = []
results_obj = Results()

for run in runs:
    # run.summary holds the final values of the logged metrics;
    # ._json_dict is used to omit large files.
    summary_dict = run.summary._json_dict
    # Drop the config entries whose key starts with an underscore.
    config_dict = {key: val for key, val in run.config.items() if not key.startswith("_")}
    results_obj.fill_results(config_dict, summary_dict)

print('end')


In [None]:
### Define functions that read the DSO logs and save the important results

In [None]:
def fill_dso_dict_simulations(path_to_log, dso_dict, max_seed=5, simulation_budget=2_000_000):
    """Read the number of simulations per seed from one DSO log file.

    The dataset name is taken from the third whitespace-separated token of the
    first line containing 'Dataset' (lower-cased, '-' replaced by '_').
    ``dso_dict[dataset_name]`` is reset to an empty dict and then filled with
    one entry per seed.

    Parameters
    ----------
    path_to_log : str
        Path of the log file.
    dso_dict : dict
        Filled in place as ``dso_dict[dataset_name][seed] = num_simulations``.
        ``seed`` is the token from the log line, i.e. a string.
    max_seed : int, optional
        Results of seeds greater than this value are ignored.
    simulation_budget : int, optional
        A run that reports exactly this many simulations is stored as NaN
        (presumably the budget was exhausted without a fit - confirm).

    Raises
    ------
    ValueError
        If a result has to be stored before any 'Dataset' line was seen.
    """
    dataset_name = None
    current_seed = 0
    with open(path_to_log) as fp:
        for line in fp:
            if 'Dataset' in line and dataset_name is None:
                dataset_name = line.split()[2].lower().replace('-', '_')
                dso_dict[dataset_name] = {}
            if 'Starting seed        : ' in line:
                # we search for the 1 in "Starting seed        : 1"
                current_seed = line.split()[3]
            if 'Invalid expressions: ' in line:
                # we search for the 89000 in "Invalid expressions: 42183 of 89000 (47.4%)."
                num_simulations = int(line.split()[4])
                if int(current_seed) <= max_seed:
                    if dataset_name is None:
                        raise ValueError(
                            f"No 'Dataset' line found before the first result in {path_to_log}"
                        )
                    if num_simulations == simulation_budget:
                        num_simulations = np.nan
                    dso_dict[dataset_name][current_seed] = num_simulations


def fill_dso_time(path_to_log, dso_dict, max_seed=5):
    """Read the run time per seed from one DSO log file.

    The dataset name is taken from the third whitespace-separated token of the
    first line containing 'Dataset' (lower-cased, '-' replaced by '_').
    ``dso_dict[dataset_name]`` is reset to an empty dict and then filled with
    one entry per seed.

    Parameters
    ----------
    path_to_log : str
        Path of the log file.
    dso_dict : dict
        Filled in place as ``dso_dict[dataset_name][seed] = elapsed``.
        ``seed`` is the token from the log line, i.e. a string; ``elapsed`` is
        the integer that follows "in" on the completion line.
    max_seed : int, optional
        Results of seeds greater than this value are ignored.

    Raises
    ------
    ValueError
        If a result has to be stored before any 'Dataset' line was seen.
    """
    dataset_name = None
    current_seed = 0
    with open(path_to_log) as fp:
        for line in fp:
            if 'Dataset' in line and dataset_name is None:
                dataset_name = line.split()[2].lower().replace('-', '_')
                dso_dict[dataset_name] = {}
            if 'Starting seed        : ' in line:
                # we search for the 1 in "Starting seed        : 1"
                current_seed = line.split()[3]
            if 'INFO: Completed run 1 of 1 in' in line:
                # token 7 is the number that follows "... 1 of 1 in"
                elapsed = int(line.split()[7])
                if int(current_seed) <= max_seed:
                    if dataset_name is None:
                        raise ValueError(
                            f"No 'Dataset' line found before the first result in {path_to_log}"
                        )
                    dso_dict[dataset_name][current_seed] = elapsed


### Fill the results object with the results from the DSO logs

In [None]:

# Directory that contains one DSO log file per Nguyen equation.
# NOTE: absolute, machine-specific path - adapt it when running elsewhere.
DSO_LOG_DIR = '/home/jbrugger/PycharmProjects/deep-symbolic-optimization/logs'

# output_nguyen_1.txt ... output_nguyen_12.txt
paths_to_log = [f'{DSO_LOG_DIR}/output_nguyen_{i}.txt' for i in range(1, 13)]

# Number of simulations per dataset and seed, stored as approach 'DSO'.
dso_dict_sim = {}
for path in paths_to_log:
    fill_dso_dict_simulations(path, dso_dict_sim)
results_obj.result_dict['num simulation to 0_999']['DSO'] = pd.DataFrame(dso_dict_sim)

# Run time per dataset and seed, stored as approach 'DSO'.
dso_dict_time = {}
for path in paths_to_log:
    fill_dso_time(path, dso_dict_time)
results_obj.result_dict['_runtime']['DSO'] = pd.DataFrame(dso_dict_time)

### Save the mean of each approach over all Nguyen datasets and seeds

In [None]:
def fill_mean_of_approach(target, approach, mean_of_approach, result_dict=None):
    """Write the overall mean of one metric for one approach into a summary table.

    Parameters
    ----------
    target : str
        Metric name; first-level key of ``result_dict`` and column of
        ``mean_of_approach``.
    approach : str
        Approach name; second-level key of ``result_dict`` and row of
        ``mean_of_approach``. Nothing is written if the approach has no table
        for ``target``.
    mean_of_approach : pandas.DataFrame
        Summary table, modified in place. Cell ``[approach, target]`` receives
        the NaN-ignoring mean over all cells of the approach's table, rounded
        to 0 decimals; cell ``[approach, 'unsuccessful_fits']`` receives the
        number of NaN cells of that table (so it reflects the target that was
        processed last).
    result_dict : dict, optional
        ``result_dict[target][approach]`` is the table to aggregate. Defaults
        to ``results_obj.result_dict``.
    """
    if result_dict is None:
        result_dict = results_obj.result_dict
    if approach not in result_dict[target]:
        return

    df = result_dict[target][approach]
    # The tables may have dtype object; cast once so that the NaN-aware
    # reduction works on plain floats.
    values = df.to_numpy(dtype=float).reshape(-1)

    # A single .loc[row, column] assignment writes into the frame itself;
    # chained indexing (.loc[row][column] = ...) may only modify a temporary copy.
    mean_of_approach.loc[approach, target] = round(np.nanmean(values), 0)
    mean_of_approach.loc[approach, 'unsuccessful_fits'] = df.isna().sum().sum()


# Summary table: one row per approach, one column per metric.
mean_of_approach = pd.DataFrame(
    index=[
        'results_uniform_1_Normal',
        'results_uniform_1_AmEx',
        'results_uniform_2_Normal',
        'results_uniform_2_AmEx',
        'DSO',
    ],
    columns=[
        'num simulation to 0_999',
        'num_states_to_0_999',
        '_runtime',
        'unsuccessful_fits',
    ],
)

for approach in list(mean_of_approach.index):
    for target in ['num_states_to_0_999', '_runtime', 'num simulation to 0_999']:
        fill_mean_of_approach(
            target=target,
            approach=approach,
            mean_of_approach=mean_of_approach
        )

with open('table_Amex_vs_Classic.tex', 'w') as f:
    f.write(mean_of_approach.to_latex())

# Only the last expression of a cell is displayed, so the table is shown
# after the file has been written.
mean_of_approach

### Save the mean of each approach for each Nguyen equation, aggregated over the seeds

In [None]:
def fill_mean_for_each_nguyen(target, approach, results_per_equation_dict, result_dict=None):
    """Store per-equation statistics of one metric for one approach.

    Three entries are written into ``results_per_equation_dict``, keyed by
    ``(approach, label)``:

    * ``'$\\mu$'``        - column-wise mean over the seeds,
    * ``'$\\sigma^2$'``   - column-wise ``DataFrame.std()`` over the seeds,
    * ``'# No Fit'``     - column-wise number of NaN cells.

    Each entry is a Series indexed by the columns of the approach's table, or
    an empty dict if the approach has no table for ``target``.

    NOTE(review): the label '$\\sigma^2$' denotes a variance, but the stored
    value is a standard deviation. The key is kept unchanged because other
    code may look it up - confirm which of the two is intended.

    Parameters
    ----------
    target : str
        Metric name; first-level key of ``result_dict``.
    approach : str
        Approach name; second-level key of ``result_dict``.
    results_per_equation_dict : dict
        Filled in place.
    result_dict : dict, optional
        ``result_dict[target][approach]`` is the table to aggregate. Defaults
        to ``results_obj.result_dict``.
    """
    # Raw strings: '\m' and '\s' are not valid escape sequences.
    mean_key = (approach, r'$\mu$')
    std_key = (approach, r'$\sigma^2$')
    no_fit_key = (approach, '# No Fit')

    results_per_equation_dict[mean_key] = {}
    results_per_equation_dict[std_key] = {}
    results_per_equation_dict[no_fit_key] = {}

    if result_dict is None:
        result_dict = results_obj.result_dict
    if approach not in result_dict[target]:
        return

    df = result_dict[target][approach]
    # The tables may have dtype object; cast so that mean/std treat every
    # column as numeric regardless of the pandas version.
    numeric = df.astype(float)
    results_per_equation_dict[mean_key] = numeric.mean()
    results_per_equation_dict[std_key] = numeric.std()
    results_per_equation_dict[no_fit_key] = df.isna().sum()


# Dataset key -> row label used in the LaTeX table ('nguyen_1' -> 'Nguyen 1', ...).
data_set_names_for_plot = {f'nguyen_{i}': f'Nguyen {i}' for i in range(1, 13)}

# Dataset key -> LaTeX source of the equation.
# Raw strings keep the backslashes of the LaTeX commands without relying on
# invalid escape sequences such as '\s' or '\c'.
equations = {
    'nguyen_1': r'$x^3 + x^2 + x $',
    'nguyen_2': r'$x^4 + x^3 + x^2 + x $',
    'nguyen_3': r'$x^5 + x^4 +  x^3 + x^2 + x $',
    'nguyen_4': r'$x^6 + x^5 + x^4 + x^3 + x^2 + x $',
    'nguyen_5': r'$\sin (x_0^2) + \cos(x_0) -1  $',
    'nguyen_6': r'$\sin (x_0) + \sin (x_0 + x_0^2)$',
    'nguyen_7': r'$\log (x_0 + 1) + \log (x_0^2 + 1)$',
    'nguyen_8': r'$\sqrt{x_0}$',
    # \sin / \cos instead of plain sin / cos, which LaTeX would typeset as a
    # product of italic variables.
    'nguyen_9': r'$\sin(x_0) + \sin(x_1^2)$',
    'nguyen_10': r'$2 \cdot \sin (x_0) \cdot \cos(x_1)$',
    'nguyen_11': r'$x_0^{x_1}$',
    'nguyen_12': r'$x_0^4 - x_0^3 + 0.5 \cdot x_1^2 - x_1$',
}
approaches = [
    'results_uniform_1_Normal',
    'results_uniform_1_AmEx',
    'results_uniform_2_Normal',
    'results_uniform_2_AmEx',
    'DSO',
]
# The original (misspelled) name is kept as an alias in case other cells use it.
appraches = approaches

results_per_equation_dict = {}
for approach in approaches:
    fill_mean_for_each_nguyen(
        target='num simulation to 0_999',
        approach=approach,
        results_per_equation_dict=results_per_equation_dict
    )

results_per_equation_df = pd.DataFrame.from_dict(results_per_equation_dict)
# The labels below are assigned by position, so the rows are first put into
# the order of `equations`; combining Series with differently ordered indexes
# does not guarantee that order.
results_per_equation_df = results_per_equation_df.reindex(index=list(equations))
results_per_equation_df.loc[:, 'new_index'] = [f"{data_set_names_for_plot[equation]} " for equation in equations]
results_per_equation_df.insert(0, 'Equation', [f"{equations[equation]} " for equation in equations])
results_per_equation_df = results_per_equation_df.set_index('new_index')
print(results_per_equation_df)

# Set before the LaTeX export so that the equation column is not truncated.
pd.options.display.max_colwidth = None
results_rounded = results_per_equation_df.round(0)
with open('table_each_nguyen.tex', 'w') as f:
    # One table per group of columns, separated by a newline.
    f.write('\n'.join(
        results_rounded.filter(regex=column_pattern).to_latex(escape=False)
        for column_pattern in ('Equation|DSO', 'uniform_1', 'uniform_2')
    ))