In [1]:
# Libraries

import os
import platform
import sys

In [2]:
# Ensure source path
#
# Walk up from the current working directory until a path ending with
# 'upolanc-thesis' is found. If the walk runs out of path before that, fall
# back to a per-platform default location. The resulting ROOT is appended to
# sys.path (if missing) and made the working directory.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	ROOT = os.path.abspath(os.path.join(ROOT, os.pardir))

	# Path is still long enough to hold the project name; let the loop
	# condition re-check it
	if len(ROOT) >= len('upolanc-thesis') :
		continue

	# Too short to ever match : use the known default for this platform
	if   platform.system() == 'Linux' :
		ROOT = '/d/hpc/home/up4472/workspace/upolanc-thesis'
	elif platform.system() == 'Windows' :
		ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
	else :
		raise ValueError('Could not locate upolanc-thesis and no default directory is known for platform : {}'.format(platform.system()))

	print(f'Warning : could not find correct directory, using default : {ROOT}')
	break

if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [3]:
# Code

from source.python.report import report_load
from source.python.report import report_concat
from source.python.report import report_filter
from source.python.report import report_plot
from source.python.io     import loader
from source.python.io     import writer
from source.python        import runtime

# Run the three runtime setup helpers, in their original order
for configure in (runtime.set_numpy_format, runtime.set_pandas_format, runtime.set_plot_theme) :
	configure()

In [4]:
# List reports folder content
#
# Prints every sub-folder of the dated reports directory followed by the names
# of the entries it contains.

ROOT_DIR = os.path.join(ROOT, 'reports', '2023-05-31')

# Only directories can be listed further; a stray regular file would make the
# nested os.listdir call raise. Sorting keeps the output stable, since
# os.listdir returns entries in arbitrary order.
report_folders = sorted(
	entry for entry in os.listdir(ROOT_DIR)
	if os.path.isdir(os.path.join(ROOT_DIR, entry))
)

for index, name in enumerate(report_folders, start = 0) :
	path_l0 = os.path.join(ROOT_DIR, name)

	# Blank line between folders, but not before the first one
	if index == 0 : print('{} :'.format(path_l0))
	else          : print('\n{} :'.format(path_l0))

	# Separate loop variable so the outer folder name is not overwritten
	for child in sorted(os.listdir(path_l0)) :
		print('...\\{}'.format(child))

C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\reports\2023-05-31\regression-cnn :
...\washburn-bp2150-f2-0500-00-global-mean
...\washburn-bp2150-f4-0500-72-global-mean
...\washburn-bp6150-f2-0500-00-global-mean
...\washburn-bp6150-f2-0500-72-global-mean
...\washburn-bp6150-f4-0500-72-global-mean
...\washburn-bp6150-f5-0500-00-global-mean
...\zrimec-bp2150-f2-0500-00-global-mean
...\zrimec-bp2150-f2-0500-72-global-mean
...\zrimec-bp2150-f4-0500-00-global-mean
...\zrimec-bp2150-f4-0500-72-global-mean
...\zrimec-bp2150-f5-0500-72-global-mean
...\zrimec-bp6150-f2-0500-00-global-mean
...\zrimec-bp6150-f2-0500-72-global-mean
...\zrimec-bp6150-f4-0500-00-global-mean
...\zrimec-bp6150-f5-0500-00-global-mean
...\zrimec-bp6150-f5-0500-72-global-mean

C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\reports\2023-05-31\regression-tuner-cnn :
...\zrimec-bp2150-f2-1000-50-72-global-mean


In [5]:
# Load existing configurations
#
# Reads the previously saved tuner configurations (one JSON file per model
# family) so that newly found configurations can be merged into them later.

CONFIG_DIR = os.path.join(ROOT, 'resources', 'tuner')

ZCONFIG_PATH = os.path.join(CONFIG_DIR, 'zrimec2020.json')
WCONFIG_PATH = os.path.join(CONFIG_DIR, 'washburn2019.json')

ZCONFIG = list()
WCONFIG = list()

CONFIG_MAX       = 50     # maximum number of configurations kept per file
CONFIG_BEST      = 5      # number of top rows taken from each tune report
CONFIG_OVERWRITE = False  # if True, existing files are not loaded (start from empty lists)

if os.path.exists(ZCONFIG_PATH) and not CONFIG_OVERWRITE :
	ZCONFIG = loader.load_json(filename = ZCONFIG_PATH)

	print('Successfully loaded configuration [{:84s}] with [{:2d}] elements'.format(ZCONFIG_PATH, len(ZCONFIG)))

if os.path.exists(WCONFIG_PATH) and not CONFIG_OVERWRITE :
	WCONFIG = loader.load_json(filename = WCONFIG_PATH)

	print('Successfully loaded configuration [{:84s}] with [{:2d}] elements'.format(WCONFIG_PATH, len(WCONFIG)))

Sucessufully loaded configuration [C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\resources\tuner\zrimec2020.json] with [15] elements
Sucessufully loader configuration [C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\resources\tuner\washburn2019.json] with [ 9] elements


# 1. CNN

## 1.1 Model

In [6]:
# Load cnn model reports from the dated reports directory

report_cnn_model = report_load.load_cnn_reports(root = ROOT_DIR)

### 1.1.1 Regression

In [7]:
# Display model performance for regression
# (shows nothing when no regression reports were loaded)

report = report_cnn_model['regression'] if len(report_cnn_model['regression']) > 0 else None
report

Unnamed: 0,Model,Sequence,Filter,Epochs,Features,Target0,Target1,Target2,Optimizer,LR,Beta1,Beta2,Decay,Dropout,Scheduler,Gamma,Batch,Epoch,MSE,R2
4,washburn,bp2150,f4,500,72,global,mean,,adam,0.00038,0.50536,0.97635,0.00189,0.27059,exponential,0.99501,32,30,0.52918,0.36125
2,washburn,bp6150,f2,500,72,global,mean,,adam,0.00038,0.50536,0.97635,0.00189,0.27059,exponential,0.99501,32,15,0.56735,0.34649
1,washburn,bp6150,f4,500,72,global,mean,,adam,0.00038,0.50536,0.97635,0.00189,0.27059,exponential,0.99501,32,24,0.54434,0.33119
14,zrimec,bp2150,f2,500,72,global,mean,,adam,0.00731,0.88487,0.98237,0.00189,0.17652,exponential,0.90905,32,40,0.54811,0.31557
12,zrimec,bp2150,f4,500,72,global,mean,,adam,0.00731,0.88487,0.98237,0.00189,0.17652,exponential,0.90905,32,49,0.57701,0.28726
15,zrimec,bp2150,f2,500,0,global,mean,,adam,0.00731,0.88487,0.98237,0.00189,0.17652,exponential,0.90905,32,28,0.60459,0.27814
9,zrimec,bp6150,f2,500,72,global,mean,,adam,0.00731,0.88487,0.98237,0.00189,0.17652,exponential,0.90905,32,18,0.5954,0.25658
5,washburn,bp2150,f2,500,0,global,mean,,adam,0.00038,0.50536,0.97635,0.00189,0.27059,exponential,0.99501,32,20,0.56812,0.24591
8,zrimec,bp6150,f4,500,0,global,mean,,adam,0.00731,0.88487,0.98237,0.00189,0.17652,exponential,0.90905,32,21,0.62498,0.23848
3,washburn,bp6150,f2,500,0,global,mean,,adam,0.00038,0.50536,0.97635,0.00189,0.27059,exponential,0.99501,32,20,0.61651,0.21829


### 1.1.2 Classification

In [8]:
# Display model performance for classification
# (shows nothing when no classification reports were loaded)

report = report_cnn_model['classification'] if len(report_cnn_model['classification']) > 0 else None
report

## 1.2 Tune Model

In [9]:
# Load tune model reports from the dated reports directory

report_tune_model = report_load.load_cnn_tune_reports(root = ROOT_DIR, show = False, n = 5)

C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\reports\2023-05-31\regression-tuner-cnn\zrimec-bp2150-f2-1000-50-72-global-mean\report.csv


### 1.2.1 Regression

In [10]:
# Display overall regression prediction
#
# One line per tune report : row count, rows with a missing validation loss,
# and rows strictly above / strictly below the R2 threshold.

R2_THRESHOLD = 0.00

for key, dataframe in report_tune_model['regression'].items() :
	valid_r2 = dataframe['valid_r2']

	nan_mse = dataframe['valid_loss'].isnull().to_numpy().sum()
	pos_r2  = (valid_r2 > R2_THRESHOLD).sum()
	neg_r2  = (valid_r2 < R2_THRESHOLD).sum()

	print('{:64s} : Size = {:4d} || NaN Loss = {:4d} || Pos R2 = {:4d} || Neg R2 = {:4d}'.format(key, len(dataframe), nan_mse, pos_r2, neg_r2))

zrimec-bp2150-f2-1000-50-72-global-mean                          : Size = 1000 || NaN Loss =    0 || Pos R2 =  518 || Neg R2 =  482


In [11]:
# Concat the regression tune reports and display the first 25 rows

report = report_concat.concat_cnn_tune_reports(reports = report_tune_model, mode = 'regression', n = None)
report = None if report is None else report.head(n = 25)
report

Unnamed: 0,Model,Sequence,Filter,Target0,Target1,Target2,ID,Valid_MSE,Valid_MAE,Valid_R2,Train_MSE,Epoch,Optimizer,LR,Decay,Scheduler,Batch,Dropout
0,zrimec,bp2150,f2,global,mean,,3bb0442a,0.62823,0.64081,0.23789,0.54267,50,adam,0.00071,0.09353,exponential,32,0.07367
1,zrimec,bp2150,f2,global,mean,,f6f35bcc,0.63681,0.64987,0.23073,0.55375,50,adam,0.0009,0.08489,exponential,32,0.06289
2,zrimec,bp2150,f2,global,mean,,553441a0,0.64006,0.65168,0.22889,0.60933,50,adam,0.00099,0.09306,exponential,32,0.07094
3,zrimec,bp2150,f2,global,mean,,cc983bc2,0.6373,0.64538,0.22653,0.41392,50,adam,0.00055,0.07392,exponential,32,0.1198
4,zrimec,bp2150,f2,global,mean,,4dabef6c,0.64229,0.64287,0.22204,0.52192,50,adam,0.00075,0.08244,exponential,32,0.08794
5,zrimec,bp2150,f2,global,mean,,f00aff25,0.6437,0.65116,0.22201,0.48231,50,adam,0.00075,0.09213,exponential,32,0.09302
6,zrimec,bp2150,f2,global,mean,,be335b02,0.64318,0.6472,0.21622,0.47712,50,adam,0.00059,0.08392,exponential,32,0.06264
7,zrimec,bp2150,f2,global,mean,,d631dcfa,0.64789,0.65001,0.21617,0.54161,50,adam,0.0009,0.09995,exponential,32,0.05026
8,zrimec,bp2150,f2,global,mean,,9fb6886a,0.65223,0.65609,0.21552,0.50814,50,adam,0.00099,0.09919,exponential,32,0.06441
9,zrimec,bp2150,f2,global,mean,,08e54884,0.65081,0.65574,0.21456,0.5861,50,adam,0.00099,0.09853,exponential,32,0.06308


In [12]:
# Save top model configurations (to same format as model params)
#
# For every regression tune report, take the CONFIG_BEST rows with the highest
# 'valid_r2', convert each row to a flat dict and merge it into the matching
# configuration list. Both lists are then sorted by 'valid_r2', truncated to
# CONFIG_MAX entries and written back to their JSON files.

for key in report_tune_model['regression'].keys() :
	df = report_tune_model['regression'][key]
	df = df.sort_values('valid_r2', ascending = False)

	# layers = [last conv index, last maxpool index] : shared 'convx' keys are
	# expanded to conv 2..layers[0], shared 'maxpoolx' keys to maxpool 1..layers[1]
	if   key.startswith('zrimec')   : config, layers = ZCONFIG, [3, 3]
	elif key.startswith('washburn') : config, layers = WCONFIG, [6, 3]
	else : raise ValueError('Unknown model prefix in tune report key : {}'.format(key))

	# A report may hold fewer than CONFIG_BEST rows; never index past its end
	for row in range(min(CONFIG_BEST, len(df))) :
		item = dict()

		for k, v in df.iloc[row, :].to_dict().items() :
			# Keep metrics as they are, strip the 'config/' prefix from
			# hyperparameters, drop every other column
			if   k.startswith('valid')  : pass
			elif k.startswith('train')  : pass
			elif k.startswith('config') : k = k[7:]
			else : continue

			if k.startswith('model/convx/') :
				for layer in range(2, layers[0] + 1) :
					item[k.replace('x/', str(layer) + '/')] = v

			elif k.startswith('model/maxpoolx/') :
				for layer in range(1, layers[1] + 1) :
					item[k.replace('x/', str(layer) + '/')] = v

			else :
				item[k] = v

		# Skip entries that are already present (loaded from disk or added by
		# an earlier run of this cell) so re-running does not pile up
		# duplicates that push distinct configurations out of the top list
		if item not in config :
			config.append(item)

ZCONFIG = sorted(ZCONFIG, key = lambda x : x['valid_r2'], reverse = True)
WCONFIG = sorted(WCONFIG, key = lambda x : x['valid_r2'], reverse = True)

ZCONFIG = ZCONFIG[:CONFIG_MAX]
WCONFIG = WCONFIG[:CONFIG_MAX]

writer.write_json(
	filename = ZCONFIG_PATH,
	data     = ZCONFIG
)

writer.write_json(
	filename = WCONFIG_PATH,
	data     = WCONFIG
)

### 1.2.2 Classification

In [13]:
# Concat the classification tune reports and display the result

report = report_concat.concat_cnn_tune_reports(reports = report_tune_model, mode = 'classification', n = None)
report

## 1.3 Tune Data

In [14]:
# Load data tune reports from the dated reports directory
# (called with show = False; the results are displayed by the cells below)

report_tune_data = report_load.load_data_tune_reports(root = ROOT_DIR, show = False, n = 25)

### 1.3.1 Regression

In [15]:
# Concat the regression data tune reports (n = 25) and display the result

report = report_concat.concat_data_tune_reports(reports = report_tune_data, mode = 'regression', n = 25)
report

### 1.3.2 Classification

In [16]:
# Concat the classification data tune reports (n = 25) and display the result

report = report_concat.concat_data_tune_reports(reports = report_tune_data, mode = 'classification', n = 25)
report

## 1.4 Tune Feature

In [17]:
# Load feature tune reports from the dated reports directory
# (called with show = False; the results are displayed by the cells below)

report_tune_feature = report_load.load_feature_tune_reports(root = ROOT_DIR, show = False, n = 25)

### 1.4.1 Regression

In [18]:
# Concat the regression feature tune reports, keep only rows whose 'Target1'
# is 'mean' and display the first 50 of them

report = report_concat.concat_feature_tune_reports(reports = report_tune_feature, mode = 'regression', n = None)

if report is not None :
	mean_only = report['Target1'] == 'mean'
	report    = report[mean_only].head(n = 50)

report

### 1.4.2 Classification

In [19]:
# Concat the classification feature tune reports (n = 25) and display the result

report = report_concat.concat_feature_tune_reports(reports = report_tune_feature, mode = 'classification', n = 25)
report

# 2. DNABert

## 2.1 Model

In [20]:
# Load bert model reports from the dated reports directory

report_bert_model = report_load.load_bert_reports(root = ROOT_DIR, show = False)

In [21]:
# Plot
#
# The bert reports are filtered first (keep_only / drop_only lists) and the
# filtered reports are then handed to the R2 plot.

bert_reports_filtered = report_filter.filter_bert_reports(
	reports   = report_bert_model,
	keep_only = ['12'],
	drop_only = ['lamb', '6-00']
)

report_plot.models_bert_r2(data = bert_reports_filtered, mode = 'regression', x = 12, y = None, filename = None, step = 'epoch')

In [22]:
# Concat the regression bert reports (metric 'eval_r2', descending) and
# display the first 25 rows

report = report_concat.concat_bert_reports(data = report_bert_model, mode = 'regression', metric = 'eval_r2', ascending = False)
report = None if report is None else report.head(n = 25)
report