<a href="https://colab.research.google.com/github/wfreinhart/sdmm-regression/blob/main/notebooks/hyperopt_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# setup

## load data

In [None]:
import numpy as np
import pandas as pd
import json

# Read the pre-computed fold split: the JSON holds the full feature matrix
# under 'X', the targets under 'y', and one {'train', 'test'} pair of index
# lists per entry of 'folds'.
with open('data-10-folds.json', 'r') as fid:
    buffer = json.load(fid)

X, y = np.array(buffer['X']), np.array(buffer['y'])

# Materialize every fold as nested dicts of arrays so that later cells can
# index fold_data[k]['train' | 'test']['X' | 'y'] directly.
fold_data = []
for fold in buffer['folds']:
    train_index = np.array(fold['train'])
    test_index = np.array(fold['test'])

    trainX, trainy = X[train_index], y[train_index]
    testX, testy = X[test_index], y[test_index]

    fold_data.append({
        'train': {'X': trainX, 'y': trainy},
        'test': {'X': testX, 'y': testy},
    })

## imports

In [None]:
import torch
from torch import nn
import numpy as np
from matplotlib import pyplot as plt

!nvidia-smi -L

# Prefer the first CUDA device when one is visible; otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-ec9c04e2-3e55-7ac2-c9dd-338093d8a397)
cuda:0


In [None]:
class MLPModel(nn.Module):
    """Fully connected network: Linear + LeakyReLU per hidden layer, then a linear head.

    Args:
        input_dim: number of input features.
        hidden_dims: sequence of hidden-layer widths (list, tuple or integer
            array). May be empty, in which case the model is a single linear
            map from ``input_dim`` to ``output_dim``.
        output_dim: number of outputs.
    """

    def __init__(self, input_dim, hidden_dims, output_dim):
        super().__init__()

        # Cast to plain ints so that NumPy integer widths are accepted too.
        widths = [int(input_dim)] + [int(h) for h in hidden_dims]

        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.LeakyReLU())

        # widths[-1] is input_dim when there are no hidden layers, so an empty
        # hidden_dims no longer raises IndexError.
        layers.append(nn.Linear(widths[-1], int(output_dim)))

        # a deeper fully connected segment:
        self.dense = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the dense stack to ``x`` and return the result."""
        return self.dense(x)


class GRUModel(nn.Module):
    """Bidirectional, batch-first GRU followed by a linear read-out.

    The last two slices of the GRU's final hidden state are concatenated into
    an embedding of width ``2 * hidden_dim`` and mapped to ``output_dim``
    values by a dense layer.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        self.rnn = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=layer_dim,
            batch_first=True,
            dropout=dropout_prob,
            bidirectional=True,
        )
        # Two concatenated hidden-state slices -> twice the hidden size.
        self.dense = nn.Linear(2 * hidden_dim, output_dim)

    def embed(self, x):
        """Return the concatenation of the final hidden state's last two slices."""
        _, hn = self.rnn(x)
        return torch.cat((hn[-2], hn[-1]), dim=1)

    def forward(self, x):
        """Embed ``x`` and project the embedding through the dense layer."""
        return self.dense(self.embed(x))


class CNNRegressor(nn.Module):
    """Stack of 1-D convolutions followed by a lazily-sized linear read-out."""

    def __init__(self, latent_dim, n_ch=8, width=8, n_conv=1, imgChannels=1):
        super().__init__()

        self.act = nn.LeakyReLU()

        # The first convolution consumes imgChannels, every later one n_ch.
        # The shared activation sits between convolutions but not after the
        # last one.
        conv_list = []
        for i in range(n_conv):
            in_ch = imgChannels if i == 0 else n_ch
            conv_list.append(
                nn.Conv1d(in_ch, n_ch, kernel_size=width, stride=1, padding=width // 2)
            )
            if i != n_conv - 1:
                conv_list.append(self.act)

        self.conv = nn.Sequential(*conv_list)
        # LazyLinear infers its input width on the first forward pass.
        self.dense = nn.LazyLinear(latent_dim)

    def forward(self, x):
        """Convolve ``x``, flatten everything but dim 0, and apply the dense layer."""
        features = self.conv(x)
        flat = features.view(features.size(0), -1)  # flatten
        return self.dense(flat)

In [None]:
from torch.utils.data import DataLoader, Dataset
import tqdm.notebook


def cycle_loss(fwd, rev, y, alpha=0.5):
    """MSE of both passes against ``y`` plus ``alpha`` times the MSE between them.

    The last term penalizes the model for predicting differently on the
    forward and on the reversed input.
    """
    mse = torch.nn.functional.mse_loss
    target_term = mse(fwd, y) + mse(rev, y)
    consistency_term = mse(fwd, rev)
    return target_term + alpha * consistency_term

def avg_mse_loss(fwd, rev, y):
    """MSE between ``y`` and the mean of the forward and reversed predictions."""
    mean_prediction = 0.5 * (fwd + rev)
    return torch.nn.functional.mse_loss(mean_prediction, y)

def sym_loss(fwd, rev, y, alpha=1.0):
    """MSE of the averaged prediction plus ``alpha`` times a symmetry penalty.

    The first term compares the mean of ``fwd`` and ``rev`` with ``y``; the
    second is the MSE between ``fwd`` and ``rev`` themselves.
    """
    mse = torch.nn.functional.mse_loss
    avg_term = mse(0.5 * (fwd + rev), y)
    sym_term = mse(fwd, rev)
    return avg_term + alpha * sym_term

def train_model(model, train, test, loss_fn=cycle_loss, loss_args=(),
                batch_size = 128, optimizer = torch.optim.Adam,
                num_epochs = 1000, learning_rate = 0.01,
                n_print = 10, verbose = True):
    """Train ``model`` in place on forward and flipped inputs and return it.

    Every mini-batch is passed through the model twice, once as is and once
    after ``torch.fliplr``; both outputs and the targets go to ``loss_fn``.

    Args:
        model: module called as ``model(X)``.
        train: ``(X, y)`` pair used for optimization.
        test: ``(X, y)`` pair used only for the periodic progress report.
        loss_fn: callable ``loss_fn(fwd, rev, y, *loss_args)`` returning a
            scalar tensor.
        loss_args: extra positional arguments forwarded to ``loss_fn``.
        batch_size: number of samples per mini-batch.
        optimizer: optimizer class, instantiated as
            ``optimizer(model.parameters(), lr=learning_rate)``.
        num_epochs: number of passes over the training data.
        learning_rate: learning rate handed to the optimizer.
        n_print: approximate number of progress lines to print; a value of
            0 or less disables the report.
        verbose: show the tqdm progress bar when true.

    Returns:
        The same ``model`` object that was passed in.

    Note:
        ``torch.fliplr`` reverses dimension 1 only. If dimension 1 of the
        inputs has size 1, the "reversed" pass sees exactly the same tensor as
        the forward pass.
    """

    trainX, trainy = train
    testX, testy = test

    train_loader = DataLoader(np.arange(trainX.shape[0]), batch_size=batch_size, shuffle=True)

    optimizer = optimizer(model.parameters(), lr=learning_rate)
    mse = torch.nn.MSELoss()

    # Report interval in epochs. Clamped to at least 1 so that
    # num_epochs < n_print no longer yields a modulo-by-zero below.
    print_every = max(1, int(num_epochs / n_print)) if n_print > 0 else 0

    # Train the model
    for epoch in tqdm.notebook.tqdm(np.arange(num_epochs), disable=(not verbose)):
        train_loss = 0.0

        for idx in train_loader:
            batchX = trainX[idx]
            batchy = trainy[idx]

            optimizer.zero_grad()

            fwd = model(batchX)
            rev = model(torch.fliplr(batchX))

            loss = loss_fn(fwd, rev, batchy, *loss_args)
            loss.backward()

            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        if print_every and (epoch+1) % print_every == 0:
            # Evaluate once per direction, without building an autograd graph
            # and with train-only layers (e.g. dropout) switched off; the
            # previous train/eval mode is restored afterwards.
            was_training = model.training
            model.eval()
            with torch.no_grad():
                test_fwd = model(testX)
                test_rev = model(torch.fliplr(testX))
                test_loss = loss_fn(test_fwd, test_rev, testy, *loss_args)
                fwd_loss = mse(test_fwd, testy)
                rev_loss = mse(test_rev, testy)
                test_cyc_loss = mse(test_fwd, test_rev)
            model.train(was_training)
            print("Epoch: %5d, loss: %1.5f, test: %1.5f; fwd: %1.5f, rev: %1.5f, cyc: %1.5f" % (epoch+1, train_loss, test_loss.item(), fwd_loss.item(), rev_loss.item(), test_cyc_loss.item()))

    return model

In [None]:
import os

def mse_loss_fwd_rev(fwd, rev, y):
    """Mean of the two per-pass MSE losses of ``fwd`` and ``rev`` against ``y``."""
    mse = torch.nn.MSELoss()
    return 0.5 * (mse(fwd, y) + mse(rev, y))

# Cross-validation driver: train one model per fold, score it on that fold's
# test split, and save a TorchScript copy of it under model_path.
model_path = 'gru-opt-cv10-mse'
if not os.path.isdir(model_path):
    os.mkdir(model_path)

all_rmse = []  # per-fold RMSE of the averaged forward/reverse prediction
trained_models = []

loss_fn = mse_loss_fwd_rev

for i, this_data in enumerate(fold_data):

    # opt_hidden_layers = np.linspace(128, 57, 12).astype(int).tolist()
    # model = MLPModel(20, opt_hidden_layers, 2).to(device)

    # positional arguments: input_dim=20, hidden_dim=7, layer_dim=3,
    # output_dim=2, dropout_prob=0
    model = GRUModel(20, 7, 3, 2, 0).to(device)

    # NOTE(review): unsqueeze(1) inserts a size-1 axis at dim 1, and
    # torch.fliplr reverses dim 1, so every "reverse" pass below (and inside
    # train_model) receives the same tensor as the forward pass. This matches
    # the fwd == rev and cyc == 0 values in the printed progress lines.
    trainX = torch.tensor(this_data['train']['X'], dtype=torch.float).to(device).unsqueeze(1)
    testX = torch.tensor(this_data['test']['X'], dtype=torch.float).to(device).unsqueeze(1)
    trainy = torch.tensor(this_data['train']['y'], dtype=torch.float).to(device)
    testy = torch.tensor(this_data['test']['y'], dtype=torch.float).to(device)

    # train_model updates the model in place; its return value is not needed
    train_model(model, (trainX, trainy), (testX, testy), n_print=4,
                    loss_fn=loss_fn, loss_args=(),
                    optimizer=torch.optim.Adam,
                # learning_rate=0.005, num_epochs=800)
                learning_rate=0.01, num_epochs=800)
    
    # train_model(model, (trainX, trainy), (testX, testy), n_print=4,
    #             loss_fn=loss_fn, loss_args=(),
    #             optimizer=torch.optim.RMSprop, learning_rate=0.001, num_epochs=400)

    real_z = testy.to('cpu').detach().numpy()

    # forward sequences
    pred_z_fwd = model(testX).to('cpu').detach().numpy()
    rmse_fwd = np.sqrt(np.mean((pred_z_fwd - real_z)**2))

    # reverse sequences
    pred_z_rev = model(torch.fliplr(testX)).to('cpu').detach().numpy()
    rmse_rev = np.sqrt(np.mean((pred_z_rev - real_z)**2))

    # average the two predictions
    rmse_avg = np.sqrt(np.mean((0.5*(pred_z_fwd + pred_z_rev) - real_z)**2))

    # only the averaged RMSE is recorded; rmse_fwd and rmse_rev are not used
    # again in this cell
    all_rmse.append(rmse_avg)
    trained_models.append(model)

    # save to disk
    # model.cpu() moves the module in place, so the object just appended to
    # trained_models ends up on the CPU as well
    model_scripted = torch.jit.script(model.cpu())
    model_scripted.save(os.path.join(model_path, f'fold-{i:02d}-scripted.pt'))

# mean and standard deviation of the per-fold averaged RMSE
print(np.mean(all_rmse), np.std(all_rmse))

  0%|          | 0/800 [00:00<?, ?it/s]

Epoch:   200, loss: 1.05762, test: 3.15985; fwd: 3.15985, rev: 3.15985, cyc: 0.00000
Epoch:   400, loss: 0.53932, test: 3.59971; fwd: 3.59971, rev: 3.59971, cyc: 0.00000
Epoch:   600, loss: 0.32503, test: 4.08736; fwd: 4.08736, rev: 4.08736, cyc: 0.00000
Epoch:   800, loss: 0.24911, test: 4.53972; fwd: 4.53972, rev: 4.53972, cyc: 0.00000


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch:   200, loss: 1.09449, test: 3.02290; fwd: 3.02290, rev: 3.02290, cyc: 0.00000
Epoch:   400, loss: 0.58545, test: 3.45119; fwd: 3.45119, rev: 3.45119, cyc: 0.00000
Epoch:   600, loss: 0.38344, test: 3.91845; fwd: 3.91845, rev: 3.91845, cyc: 0.00000
Epoch:   800, loss: 0.28696, test: 4.20610; fwd: 4.20610, rev: 4.20610, cyc: 0.00000


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch:   200, loss: 0.98391, test: 3.46641; fwd: 3.46641, rev: 3.46641, cyc: 0.00000
Epoch:   400, loss: 0.54818, test: 4.02075; fwd: 4.02075, rev: 4.02075, cyc: 0.00000
Epoch:   600, loss: 0.34895, test: 4.32675; fwd: 4.32675, rev: 4.32675, cyc: 0.00000
Epoch:   800, loss: 0.25146, test: 4.65811; fwd: 4.65811, rev: 4.65811, cyc: 0.00000


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch:   200, loss: 1.22267, test: 3.23509; fwd: 3.23509, rev: 3.23509, cyc: 0.00000
Epoch:   400, loss: 0.65551, test: 3.67042; fwd: 3.67042, rev: 3.67042, cyc: 0.00000
Epoch:   600, loss: 0.44686, test: 3.93717; fwd: 3.93717, rev: 3.93717, cyc: 0.00000
Epoch:   800, loss: 0.32400, test: 4.18891; fwd: 4.18891, rev: 4.18891, cyc: 0.00000


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch:   200, loss: 1.18973, test: 3.47714; fwd: 3.47714, rev: 3.47714, cyc: 0.00000
Epoch:   400, loss: 0.64288, test: 4.38680; fwd: 4.38680, rev: 4.38680, cyc: 0.00000
Epoch:   600, loss: 0.40441, test: 5.04429; fwd: 5.04429, rev: 5.04429, cyc: 0.00000
Epoch:   800, loss: 0.31751, test: 5.25611; fwd: 5.25611, rev: 5.25611, cyc: 0.00000


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch:   200, loss: 1.09026, test: 3.18100; fwd: 3.18100, rev: 3.18100, cyc: 0.00000
Epoch:   400, loss: 0.54049, test: 3.88813; fwd: 3.88813, rev: 3.88813, cyc: 0.00000
Epoch:   600, loss: 0.36391, test: 4.41701; fwd: 4.41701, rev: 4.41701, cyc: 0.00000
Epoch:   800, loss: 0.28333, test: 4.48536; fwd: 4.48536, rev: 4.48536, cyc: 0.00000


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch:   200, loss: 1.08825, test: 3.24858; fwd: 3.24858, rev: 3.24858, cyc: 0.00000
Epoch:   400, loss: 0.57851, test: 3.67791; fwd: 3.67791, rev: 3.67791, cyc: 0.00000
Epoch:   600, loss: 0.38566, test: 4.02116; fwd: 4.02116, rev: 4.02116, cyc: 0.00000
Epoch:   800, loss: 0.28820, test: 4.14120; fwd: 4.14120, rev: 4.14120, cyc: 0.00000


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch:   200, loss: 1.04398, test: 3.56526; fwd: 3.56526, rev: 3.56526, cyc: 0.00000
Epoch:   400, loss: 0.55171, test: 3.72408; fwd: 3.72408, rev: 3.72408, cyc: 0.00000
Epoch:   600, loss: 0.36126, test: 3.97374; fwd: 3.97374, rev: 3.97374, cyc: 0.00000
Epoch:   800, loss: 0.27679, test: 4.13392; fwd: 4.13392, rev: 4.13392, cyc: 0.00000


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch:   200, loss: 0.95379, test: 3.32629; fwd: 3.32629, rev: 3.32629, cyc: 0.00000
Epoch:   400, loss: 0.49897, test: 3.95067; fwd: 3.95067, rev: 3.95067, cyc: 0.00000
Epoch:   600, loss: 0.32032, test: 4.33426; fwd: 4.33426, rev: 4.33426, cyc: 0.00000
Epoch:   800, loss: 0.24099, test: 4.45434; fwd: 4.45434, rev: 4.45434, cyc: 0.00000


  0%|          | 0/800 [00:00<?, ?it/s]

Epoch:   200, loss: 1.19046, test: 3.98460; fwd: 3.98460, rev: 3.98460, cyc: 0.00000
Epoch:   400, loss: 0.63256, test: 4.47647; fwd: 4.47647, rev: 4.47647, cyc: 0.00000
Epoch:   600, loss: 0.38911, test: 4.65485; fwd: 4.65485, rev: 4.65485, cyc: 0.00000
Epoch:   800, loss: 0.29576, test: 4.78422; fwd: 4.78422, rev: 4.78422, cyc: 0.00000
2.1162992 0.07795698


In [None]:
import zipfile
import pathlib

# Bundle every entry directly inside model_path into "<model_path>.zip",
# stored under its bare file name (no directory prefix).
directory = pathlib.Path(model_path)
archive = zipfile.ZipFile(f"{model_path}.zip", mode="w")
with archive:
    for file_path in directory.iterdir():
        archive.write(file_path, arcname=file_path.name)

In [None]:
# Reload the TorchScript model saved for each fold and score it on that
# fold's test split, averaging the forward and flipped predictions.
drive_prefix = 'drive/Shareddrives/Polymers-Data'
model_path = os.path.join(drive_prefix, 'models', 'gru-opt-cv10-sym')
trained_models = []
all_rmse = []
for i, this_data in enumerate(fold_data):
    model = torch.jit.load(os.path.join(model_path, f'fold-{i:02d}-scripted.pt'), map_location='cpu')
    model.eval()
    trained_models.append(model)

    testy = this_data['test']['y']

    # Shape the input according to the architecture recorded in the script
    # module's class name.
    testX = torch.tensor(this_data['test']['X'], dtype=torch.float)
    if 'MLP' in model.original_name:  # for MLP: used as is
        pass
    elif 'CNN' in model.original_name:  # for CNN: size-1 axis at dim 1
        # NOTE(review): torch.fliplr reverses dim 1, which has size 1 here, so
        # the "reverse" prediction below equals the forward one for CNNs.
        testX = testX.unsqueeze(1)
    elif 'GRU' in model.original_name:  # for RNN: size-1 axis at dim 2
        testX = testX.unsqueeze(2)
    else:
        # Previously an unrecognized name silently reused whatever testX was
        # left over from an earlier iteration or cell.
        raise ValueError(
            f"unrecognized model type {model.original_name!r}; "
            "expected a name containing 'MLP', 'CNN' or 'GRU'"
        )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # forward sequences
        pred_z_fwd = model(testX).numpy()
        # reverse sequences
        pred_z_rev = model(torch.fliplr(testX)).numpy()

    rmse_fwd = np.sqrt(np.mean((pred_z_fwd - testy)**2))
    rmse_rev = np.sqrt(np.mean((pred_z_rev - testy)**2))

    # average the two predictions
    rmse_avg = np.sqrt(np.mean((0.5*(pred_z_fwd + pred_z_rev) - testy)**2))

    all_rmse.append(rmse_avg)

# report results
print(np.mean(all_rmse), np.std(all_rmse))

1.407435983005784 0.09071761491331667


MSE: 1.7426296740369203 0.12961620540445742

SYM: 1.7219231569660685 0.1577552616169461

CNN: 1.7468464094376188 0.1305497250261983

GRU: 1.407435983005784 0.09071761491331667

# hyperparameter tuning

In [None]:
!pip install bayesian-optimization

## gru

In [None]:
from bayes_opt import BayesianOptimization
from bayes_opt import SequentialDomainReductionTransformer

# Cache of already evaluated configurations: parameter tuple -> negative RMSE.
prev_results = {}

def gru_loss_fn(**params):
    """Objective for tuning GRUModel: train one model and return its negative RMSE.

    Every received hyperparameter is rounded to the nearest integer, the
    fixed settings ``input_dim=1``, ``output_dim=2`` and ``dropout_prob=0.``
    are added, and the result is passed to ``GRUModel``. Configurations that
    round to an already evaluated parameter set are answered from
    ``prev_results`` without retraining.

    Reads the module-level ``trainX``, ``trainy``, ``testX``, ``testy`` and
    ``device``, which must have been defined by earlier cells.

    Returns:
        ``-rmse``, where ``rmse`` is the test RMSE of the mean of the forward
        and ``torch.fliplr``-flipped predictions (negated because the
        optimizer maximizes its objective).
    """
    params = {k: int(np.round(v)) for k, v in params.items()}
    params['input_dim'] = 1
    params['output_dim'] = 2
    params['dropout_prob'] = 0.

    tuple_key = tuple(params[k] for k in sorted(params))
    if tuple_key in prev_results:
        return prev_results[tuple_key]

    model = GRUModel(**params).to(device)

    train_model(model, (trainX, trainy), (testX, testy), n_print=4,
                loss_fn=sym_loss, loss_args=(),
                optimizer=torch.optim.Adam, learning_rate=0.005, num_epochs=400)

    # Score without building an autograd graph; only the averaged prediction
    # is used, so the separate forward/reverse RMSEs are no longer computed.
    model.eval()
    with torch.no_grad():
        real_z = testy.to('cpu').detach().numpy()
        pred_z_fwd = model(testX).to('cpu').numpy()
        pred_z_rev = model(torch.fliplr(testX)).to('cpu').numpy()

    # average the two predictions
    rmse_avg = np.sqrt(np.mean((0.5*(pred_z_fwd + pred_z_rev) - real_z)**2))

    # save the results
    prev_results[tuple_key] = -rmse_avg

    return -rmse_avg


# JSONLogger and Events are used below but this cell did not import them;
# import them here so the cell does not depend on another cell having been
# run first.
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

# Search ranges; gru_loss_fn rounds the sampled values to integers.
pbounds = {'hidden_dim': (1, 16), 'layer_dim': (1, 8)}
bounds_transformer = SequentialDomainReductionTransformer()
optimizer = BayesianOptimization(
    f=gru_loss_fn,
    pbounds=pbounds,
    bounds_transformer = bounds_transformer,
    random_state=0
    )

# Write every optimization step to a JSON log file.
logger = JSONLogger(path="bayes-opt-gru-1.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

optimizer.maximize(init_points=5, n_iter=15, acq='ei', xi=1e-2)

## mlp

In [None]:
from bayes_opt import BayesianOptimization
from bayes_opt import SequentialDomainReductionTransformer


def layers_from_params(n, m, b):
    """Return ``int(n)`` integer layer widths ``round(m * k + b)`` for k = 0, 1, ..."""
    layer_index = np.arange(int(n))
    widths = layer_index * m + b
    return np.round(widths).astype(int)


def layers_from_simple_params(n, s, e):
    """Return ``int(n)`` integer layer widths evenly spaced from ``s`` to ``e``."""
    n_layers = int(n)
    widths = np.linspace(s, e, n_layers)
    return np.round(widths).astype(int)


# Cache of already evaluated configurations: hidden-width tuple -> negative RMSE.
prev_results = {}


def mlp_loss_fn(**params):
    """Objective for tuning MLPModel: train one model and return its negative RMSE.

    ``params`` is forwarded to ``layers_from_simple_params`` to obtain the
    hidden-layer widths. Parameter sets that produce an already evaluated
    width tuple are answered from ``prev_results`` without retraining.

    Reads the module-level ``trainX``, ``trainy``, ``testX``, ``testy`` and
    ``device``, which must have been defined by earlier cells; dim 2 of the
    inputs is squeezed away before they reach the MLP.

    Returns:
        ``-rmse``, where ``rmse`` is the test RMSE of the mean of the forward
        and ``torch.fliplr``-flipped predictions (negated because the
        optimizer maximizes its objective).
    """

    # hidden_dims = layers_from_params(**params)
    hidden_dims = layers_from_simple_params(**params)

    tuple_key = tuple(hidden_dims)
    if tuple_key in prev_results:
        return prev_results[tuple_key]

    model = MLPModel(20, hidden_dims, 2).to(device)

    # Squeeze once and reuse for both training and scoring.
    flat_trainX = trainX.squeeze(2)
    flat_testX = testX.squeeze(2)

    train_model(model, (flat_trainX, trainy), (flat_testX, testy), n_print=4,
                loss_fn=sym_loss, loss_args=(),
                optimizer=torch.optim.Adam, learning_rate=0.005, num_epochs=400)

    # Score without building an autograd graph; only the averaged prediction
    # is used, so the separate forward/reverse RMSEs are no longer computed.
    model.eval()
    with torch.no_grad():
        real_z = testy.to('cpu').detach().numpy()
        pred_z_fwd = model(flat_testX).to('cpu').numpy()
        pred_z_rev = model(torch.fliplr(flat_testX)).to('cpu').numpy()

    # average the two predictions
    rmse_avg = np.sqrt(np.mean((0.5*(pred_z_fwd + pred_z_rev) - real_z)**2))

    # save the results
    prev_results[tuple_key] = -rmse_avg

    return -rmse_avg

# load_logs, JSONLogger and Events are used below but this cell did not
# import them; import them here so the cell does not depend on another cell
# having been run first.
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs

# Search ranges for layers_from_simple_params: n layers, widths from s to e.
pbounds = {'n': (1, 16), 's': (1, 256), 'e': (1, 128)}
bounds_transformer = SequentialDomainReductionTransformer()
optimizer = BayesianOptimization(
    f=mlp_loss_fn,
    pbounds=pbounds,
    bounds_transformer = bounds_transformer,
    random_state=0
    )

# Seed the optimizer with the points recorded by earlier runs.
load_logs(optimizer, logs=["bayes-opt-mlp-1.json", "bayes-opt-mlp-2.json"]);

# Write every optimization step of this run to a new JSON log file.
logger = JSONLogger(path="bayes-opt-mlp-3.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

optimizer.maximize(init_points=5, n_iter=45,
                   # acq='ei', xi=1e-2,
                   acq='ucb', kappa=10,
                   )

## cnn

In [None]:
from bayes_opt import BayesianOptimization
from bayes_opt import SequentialDomainReductionTransformer
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs

# Cache of already evaluated configurations: (n_ch, width, n_conv) -> negative RMSE.
prev_results = {}

def cnn_loss_fn(**params):
    """Objective for tuning CNNRegressor: train one model and return its negative RMSE.

    Every received hyperparameter is rounded to the nearest integer and
    ``latent_dim=2`` is added before the model is built. Results are cached in
    ``prev_results`` under ``(n_ch, width, n_conv)``.

    Reads the module-level ``X_train``, ``c_train``, ``X_test``, ``c_test``
    and ``device``, which must have been defined elsewhere.
    """
    cnn_kwargs = {name: int(np.round(value)) for name, value in params.items()}
    cnn_kwargs['latent_dim'] = 2

    cache_key = (cnn_kwargs['n_ch'], cnn_kwargs['width'], cnn_kwargs['n_conv'])
    try:
        return prev_results[cache_key]
    except KeyError:
        pass  # not evaluated yet: train below

    # CNNRegressor signature: latent_dim, n_ch=8, width=8, n_conv=1, imgChannels=1
    model = CNNRegressor(**cnn_kwargs).to(device)

    train_model(
        model, (X_train, c_train), (X_test, c_test),
        n_print=4,
        loss_fn=sym_loss,
        loss_args=(),
        optimizer=torch.optim.Adam,
        learning_rate=0.005,
        num_epochs=800,
    )

    target = c_test.to('cpu').detach().numpy()
    forward_pred = model(X_test).to('cpu').detach().numpy()
    reverse_pred = model(torch.fliplr(X_test)).to('cpu').detach().numpy()

    # score the mean of the forward and flipped predictions
    residual = 0.5*(forward_pred + reverse_pred) - target
    score = -np.sqrt(np.mean(residual**2))

    prev_results[cache_key] = score
    return score

# Search ranges; cnn_loss_fn rounds the sampled values to integers.
pbounds = {
    'n_ch': (1, 64),
    'width': (1, 20),
    'n_conv': (1, 16),
}
bounds_transformer = SequentialDomainReductionTransformer()
optimizer = BayesianOptimization(
    f=cnn_loss_fn,
    pbounds=pbounds,
    bounds_transformer=bounds_transformer,
    random_state=0,
)

# Seed the optimizer with the points recorded by an earlier run.
load_logs(optimizer, logs=["bayes-opt-log-2.json"])

# Write every optimization step of this run to a new JSON log file.
logger = JSONLogger(path="bayes-opt-log-3.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

optimizer.maximize(init_points=5, n_iter=45, acq='ei', xi=1e-2)

print(optimizer.max)

# timings

In [None]:
# Load the saved model of the first fold and build its test input so that a
# single forward pass can be timed.
drive_prefix = 'drive/Shareddrives/Polymers-Data'
model_path = os.path.join(drive_prefix, 'models', 'mlp-opt-cv10-sym')

i = 0
this_data = fold_data[0]

model = torch.jit.load(os.path.join(model_path, f'fold-{i:02d}-scripted.pt'), map_location='cpu')
model.eval()

testy = this_data['test']['y']

# Shape the input according to the architecture recorded in the script
# module's class name.
testX = torch.tensor(this_data['test']['X'], dtype=torch.float)
if 'MLP' in model.original_name:  # for MLP: used as is
    pass
elif 'CNN' in model.original_name:  # for CNN: size-1 axis at dim 1
    testX = testX.unsqueeze(1)
elif 'GRU' in model.original_name:  # for RNN: size-1 axis at dim 2
    testX = testX.unsqueeze(2)
else:
    # Previously an unrecognized name silently timed the model on whatever
    # testX was left over from an earlier cell.
    raise ValueError(
        f"unrecognized model type {model.original_name!r}; "
        "expected a name containing 'MLP', 'CNN' or 'GRU'"
    )

%timeit model(testX)

The slowest run took 9.00 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 5: 867 µs per loop
