<a href="https://colab.research.google.com/github/wfreinhart/sdmm-regression/blob/main/notebooks/hyperopt_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# setup

In [None]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian-optimization-1.2.0.tar.gz (14 kB)
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-py3-none-any.whl size=11685 sha256=31e7aa02be7e4f4e6433239a47fa2e979ab9e907f3e7c93914219e8f5b12f6ce
  Stored in directory: /root/.cache/pip/wheels/fd/9b/71/f127d694e02eb40bcf18c7ae9613b88a6be4470f57a8528c5b
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [None]:
from bayes_opt import BayesianOptimization
from bayes_opt import SequentialDomainReductionTransformer
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs

## raw data

In [None]:
import numpy as np
import pandas as pd
import json

# Read the dataset file: features, targets and the per-fold index lists.
with open('data-10-folds.json', 'r') as fid:
    buffer = json.load(fid)

X = np.array(buffer['X'])
y = np.array(buffer['y'])

# One dict per fold, holding the train/test slices of X and y.
fold_data = []
for fold in buffer['folds']:
    train_index, test_index = (np.array(fold[part]) for part in ('train', 'test'))

    trainX, trainy = X[train_index], y[train_index]
    testX, testy = X[test_index], y[test_index]

    fold_data.append({
        'train': {'X': trainX, 'y': trainy},
        'test': {'X': testX, 'y': testy},
    })

## k-mer tokens

In [None]:
import itertools
import tqdm.notebook
from math import factorial


def swap_monomers(seq):
    """Return `seq` with the letters 'A' and 'B' exchanged.

    A literal 'C' in the input is mapped to 'B'; every other character is
    passed through untouched.
    """
    exchange = str.maketrans('ABC', 'BAB')
    return seq.translate(exchange)


def make_base(degree):
    """Enumerate every A/B pattern of length 1..degree, one per reversal pair.

    A pattern and its mirror image are treated as the same token; only the
    lexicographically smaller orientation is kept (palindromes are kept once).
    The result is sorted lexicographically and is also printed.

    Patterns are generated directly with `itertools.product`, which visits each
    of the 2**length strings exactly once. Permuting a string with repeated
    letters would visit length! tuples per letter composition to reach the
    same set.

    Args:
        degree: maximum pattern length; values below 1 give an empty list.

    Returns:
        Sorted list of pattern strings.
    """
    base = []
    for length in range(1, degree + 1):
        for letters in itertools.product('AB', repeat=length):
            pattern = ''.join(letters)
            # Keep the canonical orientation only; the mirrored twin is skipped.
            if pattern <= pattern[::-1]:
                base.append(pattern)
    # Lexicographic order across all lengths (e.g. 'A' < 'AA' < 'AB' < 'B').
    base.sort()
    print(f'Finding {len(base)} patterns:', base)

    return base


def featurize(chain_sequences, base):
    """Build a (number of chains) x (number of patterns) matrix of pattern counts.

    Each chain is used verbatim when its string form contains an 'A';
    otherwise it is spelled out element by element, with entries equal to 0
    becoming 'A' and everything else 'B'. Every pattern in `base` is counted
    in the chain and in the reversed chain, and the two counts are averaged.
    Counting uses `str.count`, i.e. non-overlapping matches.

    A notebook progress bar is shown while iterating.
    """
    n_chains = len(chain_sequences)
    features = np.zeros([n_chains, len(base)])
    progress = tqdm.notebook.tqdm(enumerate(chain_sequences), total=n_chains)
    for row, chain in progress:
        seq = chain if 'A' in str(chain) else ''.join('A' if x == 0 else 'B' for x in chain)
        mirrored = seq[::-1]
        forward_counts = np.array([seq.count(pattern) for pattern in base])
        backward_counts = np.array([mirrored.count(pattern) for pattern in base])
        features[row] = 0.5 * (forward_counts + backward_counts)
    return features

## create training fold data with k-mers

In [None]:
# Pattern vocabulary up to length 10, and the per-chain pattern-count matrix.
base = make_base(degree=10)
token_X = featurize(X, base)

# Same fold indices as the raw data, but slicing the count features.
token_fold_data = []
for fold in buffer['folds']:
    train_index, test_index = fold['train'], fold['test']

    trainX, trainy = token_X[train_index], y[train_index]
    testX, testy = token_X[test_index], y[test_index]

    token_fold_data.append({
        'train': {'X': trainX, 'y': trainy},
        'test': {'X': testX, 'y': testy},
    })

Finding 1085 patterns: ['A', 'AA', 'AAA', 'AAAA', 'AAAAA', 'AAAAAA', 'AAAAAAA', 'AAAAAAAA', 'AAAAAAAAA', 'AAAAAAAAAA', 'AAAAAAAAAB', 'AAAAAAAAB', 'AAAAAAAABA', 'AAAAAAAABB', 'AAAAAAAB', 'AAAAAAABA', 'AAAAAAABAA', 'AAAAAAABAB', 'AAAAAAABB', 'AAAAAAABBA', 'AAAAAAABBB', 'AAAAAAB', 'AAAAAABA', 'AAAAAABAA', 'AAAAAABAAA', 'AAAAAABAAB', 'AAAAAABAB', 'AAAAAABABA', 'AAAAAABABB', 'AAAAAABB', 'AAAAAABBA', 'AAAAAABBAA', 'AAAAAABBAB', 'AAAAAABBB', 'AAAAAABBBA', 'AAAAAABBBB', 'AAAAAB', 'AAAAABA', 'AAAAABAA', 'AAAAABAAA', 'AAAAABAAAA', 'AAAAABAAAB', 'AAAAABAAB', 'AAAAABAABA', 'AAAAABAABB', 'AAAAABAB', 'AAAAABABA', 'AAAAABABAA', 'AAAAABABAB', 'AAAAABABB', 'AAAAABABBA', 'AAAAABABBB', 'AAAAABB', 'AAAAABBA', 'AAAAABBAA', 'AAAAABBAAA', 'AAAAABBAAB', 'AAAAABBAB', 'AAAAABBABA', 'AAAAABBABB', 'AAAAABBB', 'AAAAABBBA', 'AAAAABBBAA', 'AAAAABBBAB', 'AAAAABBBB', 'AAAAABBBBA', 'AAAAABBBBB', 'AAAAB', 'AAAABA', 'AAAABAA', 'AAAABAAA', 'AAAABAAAA', 'AAAABAAAAB', 'AAAABAAAB', 'AAAABAAABA', 'AAAABAAABB', 'AAAABAAB', 'AA

  0%|          | 0/2038 [00:00<?, ?it/s]

# scikit-learn models

In [None]:
from sklearn import linear_model
from sklearn import neighbors
from sklearn import ensemble

# Baseline regressors, all with default hyperparameters; only the random
# forest is given a fixed seed.
models = [linear_model.LinearRegression(),
          linear_model.Lasso(),
          linear_model.Ridge(),
          neighbors.KNeighborsRegressor(),
          ensemble.RandomForestRegressor(random_state=0)]

# NOTE(review): neither `train_evaluate_model` nor `dataset` is defined in the
# visible part of this notebook. Presumably they came from an earlier session
# or a removed cell — confirm, otherwise this loop raises NameError on a fresh
# kernel.
for model in models:
    print(model)
    metrics = train_evaluate_model(model, dataset)
    print(metrics)

# Disabled parity plots (`plot_parity` is likewise not defined in the visible notebook):
# for model in models:
#     plot_parity(model.predict, dataset, name=str(model))

## optimize linear models

In [None]:
from sklearn import linear_model


# Fold whose train/test split the objective below is evaluated on.
this_data = token_fold_data[0]

def loss_fn(**params):
    """Objective for the optimizer: negative test RMSE of a Ridge model.

    `params` is forwarded unchanged to `linear_model.Ridge`. The model is fit
    on the training rows stacked with a column-reversed copy of themselves
    (targets duplicated), and the test prediction is the mean of the
    predictions on the test rows and on their column-reversed copy.

    NOTE(review): `this_data` holds k-mer count features here, so `np.fliplr`
    reorders feature columns rather than reversing a monomer sequence —
    confirm this augmentation is intended for tokenized inputs.
    """
    regressor = linear_model.Ridge(**params)

    fit_X, fit_y = this_data['train']['X'], this_data['train']['y']
    regressor.fit(np.vstack([fit_X, np.fliplr(fit_X)]),
                  np.vstack([fit_y, fit_y]))

    eval_X, eval_y = this_data['test']['X'], this_data['test']['y']
    pred_fwd = regressor.predict(eval_X)
    pred_rev = regressor.predict(np.fliplr(eval_X))

    # average the two predictions before scoring
    residual = 0.5*(pred_fwd + pred_rev) - eval_y

    # negated so that maximizing the objective minimizes the RMSE
    return -np.sqrt(np.mean(residual**2))


# Search interval for the Ridge regularization strength.
# (log-scale alternative: pbounds = {'log_alpha': (-8, 8)})
pbounds = {'alpha': (0, 4)}

# Built but deliberately not handed to the optimizer in this cell.
bounds_transformer = SequentialDomainReductionTransformer()
optimizer = BayesianOptimization(f=loss_fn, pbounds=pbounds, random_state=0)

# Optional resume/logging hooks, currently disabled:
# load_logs(optimizer, logs=["bayes-opt-log-2.json"]);
# logger = JSONLogger(path="bayes-opt-lasso.json")
# optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

# 5 initial probes followed by 45 optimizer-driven steps with acq='ei'
# (alternative acquisition: acq='ucb', kappa=4).
optimizer.maximize(init_points=5, n_iter=45, acq='ei', xi=1e-2)

print(optimizer.max)

|   iter    |  target   |   alpha   |
-------------------------------------
| [0m 1       [0m | [0m-2.876   [0m | [0m 2.195   [0m |
| [0m 2       [0m | [0m-2.883   [0m | [0m 2.861   [0m |
| [0m 3       [0m | [0m-2.878   [0m | [0m 2.411   [0m |
| [95m 4       [0m | [95m-2.876   [0m | [95m 2.18    [0m |
| [95m 5       [0m | [95m-2.871   [0m | [95m 1.695   [0m |
| [95m 6       [0m | [95m-2.862   [0m | [95m 0.4356  [0m |
| [0m 7       [0m | [0m-2.862   [0m | [0m 0.000665[0m |
| [0m 8       [0m | [0m-2.895   [0m | [0m 3.999   [0m |
| [0m 9       [0m | [0m-2.865   [0m | [0m 0.9651  [0m |
| [0m 10      [0m | [0m-2.884   [0m | [0m 2.958   [0m |
| [0m 11      [0m | [0m-2.866   [0m | [0m 1.149   [0m |
| [0m 12      [0m | [0m-2.894   [0m | [0m 3.901   [0m |
| [0m 13      [0m | [0m-2.887   [0m | [0m 3.206   [0m |
| [0m 14      [0m | [0m-2.876   [0m | [0m 2.177   [0m |
| [0m 15      [0m | [0m-2.889   [0m | [0m 

In [None]:
# Show the best regularization strength found. The search space may be defined
# directly on 'alpha' or on 'log_alpha'; reading 'log_alpha' unconditionally
# raises KeyError when the bounds were given for 'alpha'.
best_params = optimizer.max['params']
best_params['alpha'] if 'alpha' in best_params else np.exp(best_params['log_alpha'])

0.00033546262790251185

In [None]:
# Objective value at alpha = 0, for comparison with the optimizer's best.
loss_fn(alpha=0)

-2.8622375238163715

In [None]:
this_data = fold_data[0]

model = linear_model.LinearRegression()

this_X = this_data['train']['X']
this_y = this_data['train']['y']

train_X = np.vstack([this_X, np.fliplr(this_X)])
train_y = np.vstack([this_y, this_y])

%timeit model.fit(train_X, train_y)

test_X = this_data['test']['X']
test_y = this_data['test']['y']

%timeit model.predict(test_X)

100 loops, best of 5: 3.42 ms per loop
The slowest run took 4.00 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 5: 53.6 µs per loop


## linear regression for k-mers over folds

In [None]:
# Per-fold test RMSE of plain linear regression on the k-mer count features.
# NOTE(review): on count features `np.fliplr` reorders feature columns rather
# than reversing a sequence — confirm the augmentation below is intended.
all_rmse = []

for i, this_data in enumerate(token_fold_data):
    model = linear_model.LinearRegression()

    # fit on the rows plus their column-reversed copies (targets duplicated)
    this_X, this_y = this_data['train']['X'], this_data['train']['y']
    train_X = np.vstack([this_X, np.fliplr(this_X)])
    train_y = np.vstack([this_y, this_y])
    model.fit(train_X, train_y)

    test_X, test_y = this_data['test']['X'], this_data['test']['y']
    pred_z_fwd = model.predict(test_X)              # rows as given
    pred_z_rev = model.predict(np.fliplr(test_X))   # column-reversed rows

    # average the two predictions, then score
    mean_prediction = 0.5*(pred_z_fwd + pred_z_rev)
    rmse_avg = np.sqrt(np.mean((mean_prediction - test_y)**2))

    all_rmse.append(rmse_avg)

print(np.mean(all_rmse), np.std(all_rmse))

3.059670594243504 0.14250770876776345


Linear: 3.0157332226636155 0.22095644516346405

Ridge: 2.780543994395317 0.1583177031987842


## k-neighbors

In [None]:
from sklearn import neighbors


# Cache of already-evaluated settings, and the fold the objective is scored on.
prev_results = {}
this_data = token_fold_data[0]

def loss_fn(**params):
    """Objective for the optimizer: negative test RMSE of a k-neighbors regressor.

    Every incoming value is rounded to the nearest integer. 'weights' is then
    decoded: a rounded value above 0.5 selects 'uniform', anything else
    'distance'. Scores are memoized in `prev_results`, keyed by the decoded
    values taken in sorted parameter-name order.

    The model is fit on the training rows stacked with a column-reversed copy
    (targets duplicated); the test prediction is the mean of the predictions
    on the test rows and on their column-reversed copy.

    NOTE(review): `this_data` holds k-mer count features here, so `np.fliplr`
    reorders feature columns rather than reversing a sequence — confirm this
    is intended.
    """
    params = {name: int(np.round(value)) for name, value in params.items()}
    params['weights'] = 'uniform' if params['weights'] > 0.5 else 'distance'

    tuple_key = tuple(params[name] for name in sorted(params))
    if tuple_key in prev_results:
        return prev_results[tuple_key]

    regressor = neighbors.KNeighborsRegressor(**params)

    fit_X, fit_y = this_data['train']['X'], this_data['train']['y']
    regressor.fit(np.vstack([fit_X, np.fliplr(fit_X)]),
                  np.vstack([fit_y, fit_y]))

    eval_X, eval_y = this_data['test']['X'], this_data['test']['y']
    pred_fwd = regressor.predict(eval_X)
    pred_rev = regressor.predict(np.fliplr(eval_X))

    # average the two predictions before scoring
    residual = 0.5*(pred_fwd + pred_rev) - eval_y
    score = -np.sqrt(np.mean(residual**2))

    # remember the score for this setting
    prev_results[tuple_key] = score

    return score


# Continuous search box; the objective rounds both coordinates before use.
pbounds = {'n_neighbors': (1, 128), 'weights': (0, 1)}

# Built but deliberately not handed to the optimizer in this cell.
bounds_transformer = SequentialDomainReductionTransformer()
optimizer = BayesianOptimization(f=loss_fn, pbounds=pbounds, random_state=0)

# Optional resume/logging hooks, currently disabled:
# load_logs(optimizer, logs=["bayes-opt-log-2.json"]);
# logger = JSONLogger(path="bayes-opt-lasso.json")
# optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

# 5 initial probes followed by 45 optimizer-driven steps with acq='ei'
# (alternative acquisition: acq='ucb', kappa=4).
optimizer.maximize(init_points=5, n_iter=45, acq='ei', xi=1e-2)

# Report the best point with 'weights' decoded to its label; 'n_neighbors' is
# printed as the raw value proposed by the optimizer.
params = optimizer.max['params']
params['weights'] = 'uniform' if params['weights'] > 0.5 else 'distance'
print(optimizer.max['target'], params)

|   iter    |  target   | n_neig... |  weights  |
-------------------------------------------------
| [0m 1       [0m | [0m-3.733   [0m | [0m 70.7    [0m | [0m 0.7152  [0m |
| [95m 2       [0m | [95m-3.731   [0m | [95m 77.55   [0m | [95m 0.5449  [0m |
| [95m 3       [0m | [95m-3.71    [0m | [95m 54.8    [0m | [95m 0.6459  [0m |
| [0m 4       [0m | [0m-3.715   [0m | [0m 56.57   [0m | [0m 0.8918  [0m |
| [0m 5       [0m | [0m-3.758   [0m | [0m 123.4   [0m | [0m 0.3834  [0m |
| [95m 6       [0m | [95m-3.651   [0m | [95m 44.58   [0m | [95m 0.008413[0m |
| [95m 7       [0m | [95m-3.635   [0m | [95m 38.05   [0m | [95m 0.0     [0m |
| [0m 8       [0m | [0m-3.659   [0m | [0m 22.23   [0m | [0m 0.9654  [0m |
| [0m 9       [0m | [0m-5.068   [0m | [0m 1.007   [0m | [0m 0.2338  [0m |
| [0m 10      [0m | [0m-3.772   [0m | [0m 101.7   [0m | [0m 1.0     [0m |
| [0m 11      [0m | [0m-3.691   [0m | [0m 28.5    [0m | [

In [None]:
# Objective with only 'weights' supplied; the regressor's other settings stay at their defaults.
loss_fn(weights=1)

-3.592616952617676

In [None]:
from sklearn import neighbors

# Per-fold test RMSE of a distance-weighted 15-neighbor regressor on the k-mer
# count features.
# NOTE(review): on count features `np.fliplr` reorders feature columns rather
# than reversing a sequence — confirm the augmentation below is intended.
rmse = []
for this_data in token_fold_data:
    model = neighbors.KNeighborsRegressor(n_neighbors=15, weights='distance')

    # fit on the rows plus their column-reversed copies (targets duplicated)
    this_X, this_y = this_data['train']['X'], this_data['train']['y']
    train_X = np.vstack([this_X, np.fliplr(this_X)])
    train_y = np.vstack([this_y, this_y])
    model.fit(train_X, train_y)

    test_X, test_y = this_data['test']['X'], this_data['test']['y']
    pred_z_fwd = model.predict(test_X)              # rows as given
    pred_z_rev = model.predict(np.fliplr(test_X))   # column-reversed rows

    # average the two predictions, then score
    mean_prediction = 0.5*(pred_z_fwd + pred_z_rev)
    rmse_avg = np.sqrt(np.mean((mean_prediction - test_y)**2))

    rmse.append(rmse_avg)

print(np.mean(rmse), np.std(rmse))

3.68340842867823 0.12327479452220283


In [None]:
# Mean and standard deviation of the values currently held in `rmse`.
print(*(statistic(rmse) for statistic in (np.mean, np.std)))

3.662855428853419 0.13203350578962403


## random forest

In [None]:
from sklearn import ensemble


# Cache of already-evaluated settings, and the fold the objective is scored on.
prev_results = {}
this_data = token_fold_data[0]

# Categorical hyperparameters: the optimizer proposes a number in [0, 1] that
# is rounded and used as an index into these option lists.
# `1.0` (use every feature) stands in for the former 'auto' option, which
# newer scikit-learn releases no longer accept for random forests.
named_params = {'bootstrap': [False, True],
                'max_features': [1.0, 'sqrt']}

def loss_fn(**params):
    """Objective for the optimizer: negative test RMSE of a random forest.

    Every incoming value is rounded to the nearest integer. Keys listed in
    `named_params` are then decoded by using the rounded value as an index
    into that key's option list. Keys that were not passed are left to the
    estimator's defaults, so the function may be called with a partial or
    empty set of hyperparameters.

    Scores are memoized in `prev_results`, keyed by the decoded
    (name, value) pairs. The forest is seeded so that a memoized score is
    reproducible; a `random_state` supplied by the caller takes precedence.

    The model is fit on the training rows stacked with a column-reversed copy
    (targets duplicated); the test prediction is the mean of the predictions
    on the test rows and on their column-reversed copy.

    NOTE(review): `this_data` is taken from `token_fold_data`, i.e. k-mer
    count features, so `np.fliplr` reorders feature columns rather than
    reversing a sequence — confirm this is intended.

    Returns:
        The negated RMSE, so that maximizing it minimizes the error.
    """
    params = {k: int(np.round(v)) for k, v in params.items()}
    for key, choices in named_params.items():
        # Decode only the categorical settings that were actually supplied;
        # indexing unconditionally raises KeyError on partial calls.
        if key in params:
            params[key] = choices[params[key]]

    # Include the names in the cache key so that two different partial
    # settings with equal values cannot collide.
    tuple_key = tuple(sorted(params.items()))
    if tuple_key in prev_results:
        return prev_results[tuple_key]

    model = ensemble.RandomForestRegressor(**{'random_state': 0, **params})

    this_X = this_data['train']['X']
    this_y = this_data['train']['y']

    train_X = np.vstack([this_X, np.fliplr(this_X)])
    train_y = np.vstack([this_y, this_y])

    model.fit(train_X, train_y)

    test_X = this_data['test']['X']
    test_y = this_data['test']['y']

    # rows as given
    pred_z_fwd = model.predict(test_X)

    # column-reversed rows
    pred_z_rev = model.predict(np.fliplr(test_X))

    # average the two predictions
    rmse_avg = np.sqrt(np.mean((0.5*(pred_z_fwd + pred_z_rev) - test_y)**2))

    # save the results
    prev_results[tuple_key] = -rmse_avg

    return -rmse_avg


# Continuous search box; the objective rounds every coordinate, and the two
# (0, 1) entries are indices into the categorical option lists.
pbounds = {
    'max_depth': (2, 32),
    'n_estimators': (4, 128),
    'min_samples_leaf': (2, 16),
    'min_samples_split': (2, 16),
    'bootstrap': (0, 1),
    'max_features': (0, 1),
}

# Unlike the earlier searches, the domain-reduction transformer is passed to the optimizer here.
bounds_transformer = SequentialDomainReductionTransformer()
optimizer = BayesianOptimization(f=loss_fn, pbounds=pbounds,
                                 bounds_transformer=bounds_transformer,
                                 random_state=0)

# 5 initial probes followed by 45 optimizer-driven steps with acq='ei'.
optimizer.maximize(init_points=5, n_iter=45, acq='ei', xi=1e-2)

print(optimizer.max)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-3.012   [0m | [0m 18.46   [0m | [0m 12.01   [0m | [0m 10.44   [0m | [0m 71.57   [0m |
| [95m 2       [0m | [95m-3.005   [0m | [95m 14.71   [0m | [95m 11.04   [0m | [95m 8.126   [0m | [95m 114.6   [0m |
| [95m 3       [0m | [95m-2.924   [0m | [95m 30.91   [0m | [95m 7.368   [0m | [95m 13.08   [0m | [95m 69.58   [0m |
| [0m 4       [0m | [0m-3.11    [0m | [0m 19.04   [0m | [0m 14.96   [0m | [0m 2.995   [0m | [0m 14.8    [0m |
| [0m 5       [0m | [0m-4.005   [0m | [0m 2.607   [0m | [0m 13.66   [0m | [0m 12.89   [0m | [0m 111.9   [0m |
| [0m 6       [0m | [0m-2.982   [0m | [0m 29.26   [0m | [0m 9.535   [0m | [0m 14.06   [0m | [0m 70.22   [0m |
| [95m 7       [0m | [95m-2.855   [0m | [95m 25.81   [0m | [95m 4.156   [0m | [95m 10.46   [0m | 

In [None]:
# Evaluate the objective with no hyperparameters passed.
loss_fn()

-2.8649303153066294

# timings

In [None]:
from sklearn import neural_network

this_data = token_fold_data[0]

# model = ensemble.RandomForestRegressor()
# model = neighbors.KNeighborsRegressor(weights='distance')
model = neural_network.MLPRegressor(max_iter=2000, random_state=0)

this_X = this_data['train']['X']
this_y = this_data['train']['y']

train_X = np.vstack([this_X, np.fliplr(this_X)])
train_y = np.vstack([this_y, this_y])

%timeit model.fit(train_X, train_y)

test_X = this_data['test']['X']
test_y = this_data['test']['y']

%timeit model.predict(test_X)

1 loop, best of 5: 3min 19s per loop
100 loops, best of 5: 2.42 ms per loop
