In [1]:
import sys
from ezr import *

In [2]:
import random
import matplotlib.pyplot as plt
import numpy as np
import os

# Folder that is scanned for CSV files.
config_folder = 'data/optimize/config'

# Paths of every '*.csv' entry found directly inside config_folder.
# os.listdir() returns entries in arbitrary order, so the list is sorted to keep
# the dataset order stable across machines and re-runs.
datasets = sorted(
    os.path.join(config_folder, file)
    for file in os.listdir(config_folder)
    if file.endswith('.csv')
)

In [7]:
import numpy as np
import random
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from scipy.optimize import fmin_l_bfgs_b
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence two noisy warnings for the rest of the session:
#   1. every warning of category ConvergenceWarning;
#   2. any warning whose message starts with the "Predicted variances ..." text
#      (the `message` argument is a regular expression matched against the
#      beginning of the warning message).
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)
warnings.filterwarnings(
    action="ignore",
    message="Predicted variances smaller than 0. Setting those variances to 0.",
)

def UCB_GPM(d, todo, done):
    """Grow ``done`` by upper-confidence-bound search over a Gaussian process.

    Each round:
      1. a Gaussian process is refitted on the rows in ``done`` to predict
         ``-d.distance(row)`` (so a larger prediction means a smaller distance);
      2. ``todo`` is shuffled and its first ``the.buffer`` rows are scored with
         ``mean + kappa * std``;
      3. the best-scoring row is moved from ``todo`` to ``done``;
      4. the model fitted in step 1 (i.e. before the new row was added) is
         scored on 100 random row pairs and that score is printed.

    The loop ends when ``todo`` is empty or ``done`` holds ``the.Last`` rows.

    Args:
        d: data object; this function reads ``d.cols.x`` (columns with an
            ``at`` index, of type ``NUM`` or ``SYM``), ``d.rows`` and calls
            ``d.distance(row)``.
        todo: list of candidate rows. Shuffled and shrunk in place.
        done: list of already-selected rows. Extended in place.

    Returns:
        tuple: ``(done, iteration_results)`` where ``iteration_results[i]`` is
        the number of random pairs (out of 100) that the model ordered the same
        way as ``d.distance`` after round ``i``.

    Side effects:
        Resets ``the.iter`` to 0 and increments it once per round, prints one
        progress line per round, and consumes the global ``random`` state.
    """
    the.iter = 0
    n_pairs = 100  # random row pairs used to score the model each round

    def custom_optimizer(obj_func, initial_theta, bounds):
        # L-BFGS-B with an explicit iteration cap; returns (best theta, best value)
        # as the GaussianProcessRegressor optimizer hook expects.
        theta_opt, func_min, _ = fmin_l_bfgs_b(obj_func, initial_theta, bounds=bounds, maxiter=1000)
        return theta_opt, func_min

    kernel = C(1.0, (1e-8, 1e8)) * RBF(1.0, (1e-8, 1e8))
    gp = GaussianProcessRegressor(kernel=kernel, optimizer=custom_optimizer, n_restarts_optimizer=10)

    # Only the independent (x) columns feed the model: numeric ones are
    # standardised, symbolic ones are one-hot encoded.  Columns not listed here
    # are dropped by ColumnTransformer's default remainder.
    num_indexes = [col.at for col in d.cols.x if type(col) == NUM]
    sym_indexes = [col.at for col in d.cols.x if type(col) == SYM]

    # NOTE: the encoder is not pre-fitted here.  ColumnTransformer clones its
    # transformers when it is fitted, so a separate fit would be discarded.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_indexes),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), sym_indexes)])

    def _as_matrix(rows):
        # dtype=object keeps every cell's original Python type.  Training and
        # prediction must both go through here; letting numpy infer the dtype
        # would turn a mixed str/number row into all strings, and non-string
        # symbols would then stop matching the categories seen in training.
        return np.array(list(rows), dtype=object)

    def update_gp_model(done_set):
        # Refit both the preprocessing and the GP on everything selected so far.
        y_done = np.array([-d.distance(row) for row in done_set])
        gp.fit(preprocessor.fit_transform(_as_matrix(done_set)), y_done)

    def _predict(rows):
        # One batched transform + predict for all rows: (means, stds).
        return gp.predict(preprocessor.transform(_as_matrix(rows)), return_std=True)

    def ucb(rows, kappa=2.576):
        # Upper confidence bound per row.  kappa = 2.576 is the two-sided 99%
        # normal quantile -- presumably the intent; confirm with the author.
        mean, std = _predict(rows)
        return mean + kappa * std

    def model_ranking():
        # Count how many random pairs the current model orders the same way as
        # d.distance (the row with the smaller distance is the true winner).
        # The random.choice call order matches the original pair-by-pair draw.
        random_pairs = [(random.choice(d.rows), random.choice(d.rows)) for _ in range(n_pairs)]
        means, _ = _predict([row for pair in random_pairs for row in pair])
        correct_guesses = 0
        for i, (row1, row2) in enumerate(random_pairs):
            guess = row1 if means[2 * i] > means[2 * i + 1] else row2
            true_higher = row1 if d.distance(row1) < d.distance(row2) else row2
            if guess == true_higher:
                correct_guesses += 1
        return correct_guesses

    iteration_results = []

    while todo and len(done) < the.Last:
        update_gp_model(done)
        random.shuffle(todo)

        # The scored subset is a prefix of todo, so an index into the scores is
        # also a valid index into todo.
        ucb_values = ucb(todo[:the.buffer])
        best_idx = int(np.argmax(ucb_values))
        done.append(todo.pop(best_idx))

        iteration_result = model_ranking()
        iteration_results.append(iteration_result)
        print(f'{the.iter}. {iteration_result}')
        the.iter += 1

    return done, iteration_results

In [4]:
# Load dataset and initialize
dataset = 'data/optimize/misc/auto93.csv'
d = DATA().adds(csv(dataset))

print(dataset)
print(len(d.rows))

the.Last = 40       # UCB_GPM stops once `done` holds this many rows
the.branch = False  # False: keep the default split built inside the loop

N_TRIALS = 10  # number of independent repetitions
SEED = 10

first_iterations = []
last_iterations = []

def _ranked(rows):
    """Return the ``rows`` of ``d.clone(rows).distances()``.

    Presumably these are `rows` ordered by distance -- confirm in ezr.
    """
    return d.clone(rows).distances().rows

# Seed both generators.  `random` drives the shuffles and pair sampling, while
# scikit-learn's GaussianProcessRegressor (created without random_state) draws
# its optimizer restarts from NumPy's global RNG.
random.seed(SEED)
np.random.seed(SEED)

# Run trials
for trial in range(N_TRIALS):
    print(f'\nTrial {trial}')
    random.shuffle(d.rows)

    # Default split: the first `the.label` shuffled rows (ranked) are the
    # starting `done` set and everything else is the candidate pool.
    todo, done = d.rows[the.label:], _ranked(d.rows[:the.label])

    if the.branch:
        todo, done = d.branch(used = [])

    result, iteration_results = UCB_GPM(d, todo, done)

    if iteration_results:  # Store the first and last iteration results
        first_iterations.append(iteration_results[0])
        last_iterations.append(iteration_results[-1])

# Calculate and print the average values.  The trial count in the message is
# the number of trials that actually contributed a score.
n_scored = len(first_iterations)
if n_scored:
    average_first = sum(first_iterations) / n_scored
    average_last = sum(last_iterations) / n_scored
    print(f'\nAverage value of the first iteration across {n_scored} trials: {average_first}')
    print(f'Average value of the last iteration across {n_scored} trials: {average_last}')
else:
    # No trial ran a single round (e.g. `done` already had the.Last rows).
    average_first = average_last = None
    print('\nNo trial produced any iteration results; nothing to average.')

data/optimize/misc/auto93.csv
398

Trial 0
0. 75
1. 71
2. 73
3. 70
4. 71
5. 81
6. 77
7. 57
8. 54
9. 58
10. 71
11. 59
12. 58
13. 70
14. 55
15. 71
16. 65
17. 66
18. 64
19. 75
20. 74
21. 68
22. 66
23. 77
24. 76
25. 62
26. 74
27. 66
28. 64
29. 70
30. 72
31. 74
32. 77
33. 74
34. 82
35. 72

Trial 1
0. 76
1. 74
2. 70
3. 76
4. 77
5. 73
6. 80
7. 74
8. 72
9. 64
10. 66
11. 77
12. 69
13. 65
14. 69
15. 67
16. 71
17. 69
18. 73
19. 72
20. 66
21. 66
22. 61
23. 66
24. 73
25. 74
26. 75
27. 76
28. 74
29. 71
30. 71
31. 70
32. 77
33. 80
34. 74
35. 71

Trial 2
0. 81
1. 65
2. 58
3. 74
4. 74
5. 72
6. 71
7. 71
8. 74
9. 73
10. 65
11. 37
12. 81
13. 58
14. 57
15. 61
16. 38
17. 63
18. 61
19. 48
20. 60
21. 58
22. 40
23. 50
24. 46
25. 69
26. 70
27. 47
28. 41
29. 64
30. 54
31. 60
32. 52
33. 66
34. 59
35. 63

Trial 3
0. 75
1. 74
2. 68
3. 69
4. 69
5. 49
6. 47
7. 64
8. 60
9. 62
10. 61
11. 61
12. 76
13. 73
14. 79
15. 78
16. 77
17. 79
18. 74
19. 71
20. 74
21. 80
22. 77
23. 84
24. 78
25. 81
26. 71
27. 78
28. 85
29. 72
30. 

In [8]:
# Load dataset and initialize
dataset = 'data/optimize/misc/auto93.csv'
d = DATA().adds(csv(dataset))

print(dataset)
print(len(d.rows))

the.Last = 40      # UCB_GPM stops once `done` holds this many rows
the.branch = True  # True: replace the default split with d.branch(used=[])

N_TRIALS = 10  # number of independent repetitions
SEED = 10

first_iterations = []
last_iterations = []

def _ranked(rows):
    """Return the ``rows`` of ``d.clone(rows).distances()``.

    Presumably these are `rows` ordered by distance -- confirm in ezr.
    """
    return d.clone(rows).distances().rows

# Seed both generators.  `random` drives the shuffles and pair sampling, while
# scikit-learn's GaussianProcessRegressor (created without random_state) draws
# its optimizer restarts from NumPy's global RNG.
random.seed(SEED)
np.random.seed(SEED)

# Run trials
for trial in range(N_TRIALS):
    print(f'\nTrial {trial}')
    random.shuffle(d.rows)

    # Default split, computed first exactly as before; it is overridden below
    # when branching is enabled.
    todo, done = d.rows[the.label:], _ranked(d.rows[:the.label])

    if the.branch:
        todo, done = d.branch(used = [])
        print(done)  # show the starting rows chosen by branch()

    result, iteration_results = UCB_GPM(d, todo, done)

    if iteration_results:  # Store the first and last iteration results
        first_iterations.append(iteration_results[0])
        last_iterations.append(iteration_results[-1])

# Calculate and print the average values.  The trial count in the message is
# the number of trials that actually contributed a score.
n_scored = len(first_iterations)
if n_scored:
    average_first = sum(first_iterations) / n_scored
    average_last = sum(last_iterations) / n_scored
    print(f'\nAverage value of the first iteration across {n_scored} trials: {average_first}')
    print(f'Average value of the last iteration across {n_scored} trials: {average_last}')
else:
    # No trial ran a single round (e.g. `done` already had the.Last rows).
    average_first = average_last = None
    print('\nNo trial produced any iteration results; nothing to average.')

data/optimize/misc/auto93.csv
398

Trial 0
[[4, 116, 81, 76, 2, 2220, 16.9, 30], [8, 302, 140, 72, 1, 4294, 16, 10], [4, 97, 88, 71, 3, 2130, 14.5, 30], [4, 135, 84, 81, 1, 2490, 15.7, 30], [4, 88, 76, 71, 2, 2065, 14.5, 30]]
0. 82
1. 79
2. 65
3. 82
4. 69
5. 75
6. 81
7. 74
8. 72
9. 81
10. 80
11. 84
12. 72
13. 65
14. 77
15. 69
16. 71
17. 66
18. 68
19. 75
20. 69
21. 71
22. 68
23. 82
24. 73
25. 79
26. 69
27. 71
28. 71
29. 67
30. 76
31. 67
32. 75
33. 76
34. 70

Trial 1
[[4, 79, 67, 74, 2, 1963, 15.5, 30], [8, 305, 145, 77, 1, 3880, 12.5, 20], [4, 79, 58, 81, 3, 1755, 16.9, 40], [4, 98, 70, 80, 1, 2120, 15.5, 30], [4, 119, 97, 78, 3, 2405, 14.9, 20]]
0. 63
1. 66
2. 66
3. 80
4. 73
5. 73
6. 69
7. 75
8. 77
9. 77
10. 72
11. 78
12. 73
13. 81
14. 83
15. 68
16. 68
17. 69
18. 72
19. 73
20. 76
21. 70
22. 75
23. 75
24. 76
25. 74
26. 72
27. 75
28. 74
29. 78
30. 71
31. 74
32. 75
33. 81
34. 79

Trial 2
[[4, 90, 70, 76, 2, 1937, 14.2, 30], [8, 304, 150, 72, 1, 3892, 12.5, 20], [4, 108, 75, 81, 3, 2350, 1

In [3]:
# Check what d.branch() returns for auto93: load the data, branch with an
# empty `used` list, then report the sizes.
dataset = 'data/optimize/misc/auto93.csv'
d = DATA().adds(csv(dataset))
todo, done = d.branch(used=[])

# First line: total number of rows; second line: size of the `done` list.
print(len(d.rows), len(done), sep='\n')


39.89987468652001
398

39.89987468652001
199

39.89987468652001
99

39.89987468652001
49

39.89987468652001
24
398
5


In [4]:
# Same d.branch() size check as for auto93, this time on the SS-N dataset.
dataset = 'data/optimize/config/SS-N.csv'
d = DATA().adds(csv(dataset))
todo, done = d.branch(used=[])

# Report the total row count, then how many rows came back in `done`.
for size in (len(d.rows), len(done)):
    print(size)


463.3011979263598
53662

463.3011979263598
26831

463.3011979263598
13415

463.3011979263598
6707

463.3011979263598
3353

463.3011979263598
1676

463.3011979263598
838

463.3011979263598
419
53662
8
