In [1]:
# Libraries

import json
import numpy
import os
import shutil

In [2]:
# Tune

from ray.air                  import RunConfig
from ray.tune                 import JupyterNotebookReporter
from ray.tune                 import TuneConfig
from ray.tune                 import Tuner
from ray.tune.logger          import CSVLoggerCallback
from ray.tune.schedulers      import ASHAScheduler
from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.search.hyperopt import HyperOptSearch

from ray import tune

In [3]:
# Code

from src.cnn import raytune as cnn_raytune
from src.cnn import core    as cnn_core
from src.io  import loader  as data_loader

# 1. Setup

In [4]:
# Resolve the directories this notebook reads from and writes to.

OUT_SUBFOLDER = 'nbp06-raytune'
RES_SUBFOLDER = ''

# Everything is anchored at the current working directory.
CWD = os.getcwd()
OUT, RES = (os.path.join(CWD, folder) for folder in ('out', 'res'))

# OUT_DATA receives the results of this notebook; the two RES_NBP0x folders
# live under the output tree as well and are only read from below.
OUT_DATA, RES_NBP04, RES_NBP05 = (
	os.path.join(OUT, 'data', folder)
	for folder in (OUT_SUBFOLDER, 'nbp04-feature', 'nbp05-target')
)

# Always start from an empty output folder: drop whatever a previous run left
# behind (best effort, errors are ignored) and recreate the directory.
shutil.rmtree(OUT_DATA, ignore_errors = True)
os.makedirs(OUT_DATA, exist_ok = True)

print(f'     Root Directory : {CWD}')
print(f'   Output Directory : {OUT}')
print(f' Resource Directory : {RES}')

     Root Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis
   Output Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\out
 Resource Directory : C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\res


In [5]:
# Device
# Select the compute device through the project helper. DEVICE is stored in
# core_config['device'] further down.
# NOTE(review): only_cpu = False presumably allows a GPU to be picked when one
# is present (the recorded output shows `cuda`) - confirm in src.cnn.core.

DEVICE = cnn_core.get_device(only_cpu = False)

Graphic devices : 1
Selected device : cuda


In [6]:
# Lock the random state through the project helper and keep the seed it returns,
# so the value can be printed here and stored in core_config['random_seed'] later.

RANDOM_SEED = cnn_core.lock_random(generate_seed = True)
print('Random seed : {}'.format(RANDOM_SEED))

Random seed : 19546736


In [7]:
# Load the target order from the nbp05 output folder.
# It is used below to derive the output length for the selected target group.
# Only the order is loaded eagerly here; the feature and target files are declared
# as loader callables in core_config['files'].

tpm_order = data_loader.load_labels(
	filename = os.path.join(RES_NBP05, 'target-order.json')
)

# 2. Raytune

## 2.1 Config

In [8]:
# Define basic data parameters
#
# Fixed (non-tuned) settings; this dict is handed to cnn_raytune.tune_method as
# `core_config` for every trial, alongside the sampled hyperparameters.

core_config = {
	'random_seed' : RANDOM_SEED,
	'device'      : DEVICE,
	# Also reused as the Ray experiment name and as the prefix of the saved report file.
	'model_name'  : 'zrimec2020r',
	# NOTE(review): presumably the number of training epochs per trial - confirm in src.cnn.raytune.
	'epochs'      : 5,
	'expand_dims' : None,

	'split_size' : {
		'valid' : 0.2,
		'test'  : 0.2
	},
	'input' : {
		'channels' : 1,
		'height'   : 4,
		# NOTE(review): 2150 matches the `bp2150` sequence file below - presumably the sequence length; confirm.
		'width'    : 2150,
		'features' : 64
	},
	'output' : {
		'group0' : 'tissue',
		'group1' : 'tissue-mean',
		# Placeholder; overwritten in the next cell with the number of entries in tpm_order for group0.
		'length' : 8
	},
	# Zero-argument loaders: nothing is read from disk when this dict is built,
	# a file is only loaded when its callable is invoked.
	'files' : {
		'sequences' : lambda : data_loader.load_fasta(filename = os.path.join(RES_NBP04, 'features-bp2150.fasta'), to_string = True),
		'frequency' : lambda : data_loader.load_npz(filename = os.path.join(RES_NBP04, 'features-frequency.npz')),
		'stability' : lambda : data_loader.load_npz(filename = os.path.join(RES_NBP04, 'features-stability.npz')),
		'values'    : lambda : data_loader.load_labels(filename = os.path.join(RES_NBP05, 'target-values.json'), to_numpy = True),
		'order'     : lambda : data_loader.load_labels(filename = os.path.join(RES_NBP05, 'target-order.json'))
	}
}

In [9]:
# Replace the placeholder output length with the number of entries that the
# loaded target order lists for the selected group.

core_config['output'].update(
	length = len(tpm_order[core_config['output']['group0']])
)

## 2.2 Search Space

In [10]:
# Define search space
# https://docs.ray.io/en/latest/tune/api_docs/search_space.html
#
# Flat dict of hyperparameters passed to the Tuner as `param_space`. Entries wrapped in
# tune.choice are sampled per trial from the listed values; plain values are held fixed.
# NOTE(review): the 'section/sub/name' keys are presumably parsed by the trainable in
# src.cnn.raytune - confirm there before renaming any key.

# Notes
# batch_size = 256 : a bit too much for my system

param_space = {
	# Dataset
	'dataset/batch_size' : tune.choice([32, 64, 128]),
	# Optimizer
	'optimizer/lr' : tune.choice([1e-5, 1e-4, 1e-3, 1e-2]),
	'optimizer/l2' : tune.choice([0, 0.01, 0.05]),
	# Dropout
	'model/dropout' : tune.choice([0.1, 0.2, 0.3, 0.4]),
	# Convolution (the first layer's padding is fixed, the other two are tuned)
	'model/conv1/filters' : tune.choice([32, 64, 128, 256]),
	'model/conv1/kernel'  : tune.choice([5, 11, 21, 31, 41]),
	'model/conv1/padding' : 0,
	'model/conv2/filters' : tune.choice([32, 64, 128, 256]),
	'model/conv2/kernel'  : tune.choice([5, 11, 21, 31, 41]),
	'model/conv2/padding' : tune.choice([0, 'same']),
	'model/conv3/filters' : tune.choice([32, 64, 128, 256]),
	'model/conv3/kernel'  : tune.choice([5, 11, 21, 31, 41]),
	'model/conv3/padding' : tune.choice([0, 'same']),
	# Max Pooling (only the kernel size is tuned, padding is always 'same')
	'model/maxpool1/kernel'  : tune.choice([3, 5, 9]),
	'model/maxpool1/padding' : 'same',
	'model/maxpool2/kernel'  : tune.choice([3, 5, 9]),
	'model/maxpool2/padding' : 'same',
	'model/maxpool3/kernel'  : tune.choice([3, 5, 9]),
	'model/maxpool3/padding' : 'same',
	# Dense
	'model/fc1/features' : tune.choice([64, 128, 256, 512]),
	'model/fc2/features' : tune.choice([64, 128, 256, 512])
}

## 2.3 Tuning

In [11]:
# Create a hyperparameter optimization tuner and run it

# Wrap the project trainable so that every trial receives the sampled
# hyperparameters (`x`) together with the fixed core configuration, and
# request one CPU and one GPU per trial.
tune_method = tune.with_resources(
	lambda x : cnn_raytune.tune_method(
		tune_config = x,
		core_config = core_config
	),
	{
		'cpu' : 1,
		'gpu' : 1
	}
)

# Minimise the validation loss. search_alg = None keeps Ray Tune's default
# search over `param_space`; ASHA stops unpromising trials early.
tune_config = TuneConfig(
	metric      = 'valid_loss',
	mode        = 'min',
	search_alg  = None,
	scheduler   = ASHAScheduler(
		time_attr        = 'training_iteration',
		# Tie the scheduler horizon to the configured training length instead of a
		# hard-coded 100, so the two cannot drift apart when `epochs` is changed.
		# NOTE(review): assumes the trainable reports once per epoch (the recorded
		# per-iteration tables stop at training_iteration == epochs) - confirm.
		max_t            = core_config['epochs'],
		grace_period     = 1,
		reduction_factor = 4,
		brackets         = 1
	),
	num_samples           = 2,
	# Use the bare trial id for both the trial name and its directory name.
	trial_name_creator    = lambda x : str(x.trial_id),
	trial_dirname_creator = lambda x : str(x.trial_id)
)

# Results are written to OUT_DATA/<model_name>/<trial_id>/.
run_config = RunConfig(
	name              = core_config['model_name'],
	local_dir         = OUT_DATA,
	callbacks         = None,
	log_to_file       = True,
	verbose           = 0,
	progress_reporter = JupyterNotebookReporter(
		max_column_length = 32,
		max_progress_rows = 20,
		parameter_columns = [],
		metric_columns    = []
	)
)

tuner = Tuner(
	trainable   = tune_method,
	tune_config = tune_config,
	run_config  = run_config,
	param_space = param_space
)

# Blocks until all trials have finished; `report` is the result grid used by the cells below.
report = tuner.fit()

2023-01-27 13:38:27,735	INFO worker.py:1538 -- Started a local Ray instance.


## 2.4 Reports

In [12]:
# Report whether any trial ended with an error

print('At least one of the trials has failed.' if report.errors else 'No errors.')

No errors.


In [13]:
# Summarise the run: range of total training times, then one row per trial

dataframe = report.get_dataframe()

print('Shortest training time : {:.2f} seconds'.format(dataframe.loc[:, 'time_total_s'].min()))
print(' Longest training time : {:.2f} seconds'.format(dataframe.loc[:, 'time_total_s'].max()))
print()

# Keep only the identifying columns and the reported metrics for display.
trials = dataframe.loc[:, [
	'trial_id', 'experiment_id', 'time_total_s', 'training_iteration',
	'valid_loss', 'valid_r2', 'valid_mae', 'train_loss'
]]
trials

Shortest training time : 97.12 seconds
 Longest training time : 129.95 seconds



Unnamed: 0,trial_id,experiment_id,time_total_s,training_iteration,valid_loss,valid_r2,valid_mae,train_loss
0,e202e_00000,ab3e23b259d94b90b6a980a0dec67add,97.12,5,0.923,-0.796,0.769,1.015
1,e202e_00001,ab3e23b259d94b90b6a980a0dec67add,129.949,5,0.793,-0.484,0.706,0.481


### 2.4.1 Visualization

In [14]:
# Placeholder cell: the TensorBoard visualization of the tuning run is not implemented yet.
print('TODO - tensorboard')

TODO - tensorboard


### 2.4.2 Best

In [15]:
# Show the best trial (according to the metric and mode configured on the tuner)

best = report.get_best_result()

print('Best metric : loss = {: 8.5f} r2 = {: 8.5f}'.format(
	*(best.metrics[key] for key in ('valid_loss', 'valid_r2'))
))

# Header line followed by the tab-indented JSON dump of the winning configuration.
print('Best config : ', json.dumps(best.config, indent = '\t'), sep = '\n')

Best metric : loss =  0.79289 r2 = -0.48367
Best config : 
{
	"dataset/batch_size": 128,
	"optimizer/lr": 0.001,
	"optimizer/l2": 0.01,
	"model/dropout": 0.2,
	"model/conv1/filters": 64,
	"model/conv1/kernel": 41,
	"model/conv1/padding": 0,
	"model/conv2/filters": 64,
	"model/conv2/kernel": 5,
	"model/conv2/padding": 0,
	"model/conv3/filters": 128,
	"model/conv3/kernel": 5,
	"model/conv3/padding": "same",
	"model/maxpool1/kernel": 3,
	"model/maxpool1/padding": "same",
	"model/maxpool2/kernel": 9,
	"model/maxpool2/padding": "same",
	"model/maxpool3/kernel": 9,
	"model/maxpool3/padding": "same",
	"model/fc1/features": 512,
	"model/fc2/features": 64
}


In [16]:
# Per-iteration metrics of the best trial, read from the progress.csv in its log directory

progress = data_loader.load_csv(
	filename = os.path.join(best.log_dir, 'progress.csv')
)[[
	'trial_id', 'experiment_id', 'time_total_s', 'training_iteration',
	'valid_loss', 'valid_r2', 'valid_mae', 'train_loss'
]]
progress

Unnamed: 0,trial_id,experiment_id,time_total_s,training_iteration,valid_loss,valid_r2,valid_mae,train_loss
0,e202e_00001,ab3e23b259d94b90b6a980a0dec67add,59.489,1,1.135,-1.152,0.854,1.053
1,e202e_00001,ab3e23b259d94b90b6a980a0dec67add,77.099,2,0.893,-0.675,0.749,0.649
2,e202e_00001,ab3e23b259d94b90b6a980a0dec67add,94.704,3,0.866,-0.609,0.735,0.522
3,e202e_00001,ab3e23b259d94b90b6a980a0dec67add,112.316,4,0.547,-0.029,0.598,0.495
4,e202e_00001,ab3e23b259d94b90b6a980a0dec67add,129.949,5,0.793,-0.484,0.706,0.481


In [17]:
# Placeholder cell: graphs for the best trial are not implemented yet.
print('TODO - graphs')

TODO - graphs


### 2.4.3 Worst

In [18]:
# Show the worst trial: the same lookup as for the best one, but maximising the validation loss

worst = report.get_best_result(mode = 'max', metric = 'valid_loss')

print('Worst metric : loss = {: 8.5f} r2 = {: 8.5f}'.format(
	*(worst.metrics[key] for key in ('valid_loss', 'valid_r2'))
))

# Header line followed by the tab-indented JSON dump of the losing configuration.
print('Worst config : ', json.dumps(worst.config, indent = '\t'), sep = '\n')

Worst metric : loss =  0.92320 r2 = -0.79642
Worst config : 
{
	"dataset/batch_size": 128,
	"optimizer/lr": 1e-05,
	"optimizer/l2": 0.05,
	"model/dropout": 0.1,
	"model/conv1/filters": 32,
	"model/conv1/kernel": 11,
	"model/conv1/padding": 0,
	"model/conv2/filters": 32,
	"model/conv2/kernel": 31,
	"model/conv2/padding": "same",
	"model/conv3/filters": 256,
	"model/conv3/kernel": 41,
	"model/conv3/padding": 0,
	"model/maxpool1/kernel": 5,
	"model/maxpool1/padding": "same",
	"model/maxpool2/kernel": 5,
	"model/maxpool2/padding": "same",
	"model/maxpool3/kernel": 3,
	"model/maxpool3/padding": "same",
	"model/fc1/features": 128,
	"model/fc2/features": 512
}


In [19]:
# Per-iteration metrics of the worst trial, read from the progress.csv in its log directory

progress = data_loader.load_csv(
	filename = os.path.join(worst.log_dir, 'progress.csv')
)[[
	'trial_id', 'experiment_id', 'time_total_s', 'training_iteration',
	'valid_loss', 'valid_r2', 'valid_mae', 'train_loss'
]]
progress

Unnamed: 0,trial_id,experiment_id,time_total_s,training_iteration,valid_loss,valid_r2,valid_mae,train_loss
0,e202e_00000,ab3e23b259d94b90b6a980a0dec67add,51.651,1,1.595,-1.986,1.03,1.951
1,e202e_00000,ab3e23b259d94b90b6a980a0dec67add,63.166,2,1.423,-1.689,0.964,1.671
2,e202e_00000,ab3e23b259d94b90b6a980a0dec67add,74.494,3,1.213,-1.314,0.883,1.457
3,e202e_00000,ab3e23b259d94b90b6a980a0dec67add,85.695,4,1.033,-1.0,0.813,1.215
4,e202e_00000,ab3e23b259d94b90b6a980a0dec67add,97.12,5,0.923,-0.796,0.769,1.015


In [20]:
# Placeholder cell: graphs for the worst trial are not implemented yet.
print('TODO - graphs')

TODO - graphs


## 2.5 Save

In [21]:
# Write the per-trial summary table to <OUT_DATA>/<model_name>-report.csv

report.get_dataframe().to_csv(
	path_or_buf = os.path.join(OUT_DATA, core_config['model_name'] + '-report.csv')
)