In [1]:
# Libraries

import os
import platform
import sys

In [2]:
# Ensure source path
#
# Walk upwards from the current working directory until a path ending with the project name is found.
# When no such directory exists, fall back to a platform specific default location. The resulting
# directory is appended to sys.path and becomes the working directory.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	parent = os.path.abspath(os.path.join(ROOT, os.pardir))

	# The parent of a filesystem root is the root itself, so an unchanged path means there is nothing left
	# to search (this also terminates on roots longer than the project name, such as UNC shares). A path
	# shorter than the project name can never match either, so the search stops early in that case too.
	if parent == ROOT or len(parent) < len('upolanc-thesis') :
		system = platform.system()

		if   system == 'Linux' :
			ROOT = '/d/hpc/home/up4472/workspace/upolanc-thesis'
		elif system == 'Windows' :
			ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else :
			raise ValueError('No default project directory is defined for platform : {}'.format(system))

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		break

	ROOT = parent

if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [3]:
# Project modules (resolved through the ROOT entry on sys.path)

from source.python.report import report_load, report_concat, report_filter, report_plot
from source.python.io     import loader, writer
from source.python        import runtime

# Call the project's runtime setup helpers : numpy format, pandas format, plot theme

runtime.set_numpy_format()
runtime.set_pandas_format()
runtime.set_plot_theme()

In [4]:
# List reports folder content
#
# Prints every first level directory of the reports folder as a header, followed by the names of the
# entries found two levels below it. Plain files at the first two levels are skipped, because only
# directories can be listed, and names are sorted so the output does not depend on filesystem order.

ROOT_DIR = os.path.join(ROOT, 'reports', 'final')

is_first = True

for name_l0 in sorted(os.listdir(ROOT_DIR)) :
	path_l0 = os.path.join(ROOT_DIR, name_l0)

	if not os.path.isdir(path_l0) :
		continue

	# Separate consecutive groups with an empty line
	if is_first : print('{} :'.format(path_l0))
	else        : print('\n{} :'.format(path_l0))

	is_first = False

	for name_l1 in sorted(os.listdir(path_l0)) :
		path_l1 = os.path.join(path_l0, name_l1)

		if not os.path.isdir(path_l1) :
			continue

		for name_l2 in sorted(os.listdir(path_l1)) :
			# Use the platform separator so the entry matches the style of the header path
			print('...{}{}'.format(os.sep, name_l2))

C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\reports\final\bert :

C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\reports\final\cnn :

C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\reports\final\tune :
...\washburn-transcript-f1-1000-25-72-global-mean
...\zrimec-promoter-f1-2000-25-72-global-mean
...\zrimec-transcript-f1-1000-25-72-global-mean
...\zrimec-transcript-f2-1000-25-72-global-mean
...\zrimec-transcript-f5-1000-25-72-global-mean


In [5]:
# Load existing configurations
#
# Stored tuner configurations are reused only when CONFIG_OVERWRITE is False and the file exists,
# otherwise the corresponding list starts empty.

CONFIG_DIR = os.path.join(ROOT, 'resources', 'tuner')

ZCONFIG_PATH = os.path.join(CONFIG_DIR, 'zrimec2020.json')
WCONFIG_PATH = os.path.join(CONFIG_DIR, 'washburn2019.json')

CONFIG_MAX       = 25
CONFIG_BEST      = 3
CONFIG_OVERWRITE = True

def load_existing_config(path) :
	"""
	Return the configurations stored in the json file at ``path``.

	An empty list is returned when CONFIG_OVERWRITE is set or when the file does not exist. A short
	message with the path and the number of loaded elements is printed after a successful load.
	"""

	if CONFIG_OVERWRITE or not os.path.exists(path) :
		return list()

	config = loader.load_json(filename = path)

	print('Successfully loaded configuration [{:84s}] with [{:2d}] elements'.format(path, len(config)))

	return config

ZCONFIG = load_existing_config(path = ZCONFIG_PATH)
WCONFIG = load_existing_config(path = WCONFIG_PATH)

# 1. CNN

## 1.1 Model

In [6]:
# Read the cnn model reports found under the reports folder

report_cnn_model = report_load.load_cnn_reports(root = ROOT_DIR)

### 1.1.1 Regression

In [7]:
# Display model performance for regression
#
# Jupyter renders only the last top-level expression of a cell; an expression nested inside an `if`
# statement produces no output. The check is therefore written as a conditional expression, which
# evaluates to None (nothing is rendered) when the report is empty.

report_cnn_model['regression'] if len(report_cnn_model['regression']) > 0 else None

### 1.1.2 Classification

In [8]:
# Display model performance for classification
#
# Jupyter renders only the last top-level expression of a cell; an expression nested inside an `if`
# statement produces no output. The check is therefore written as a conditional expression, which
# evaluates to None (nothing is rendered) when the report is empty.

report_cnn_model['classification'] if len(report_cnn_model['classification']) > 0 else None

## 1.2 Tune Model

In [9]:
# Read the cnn tune reports found under the reports folder

report_tune_model = report_load.load_cnn_tune_reports(root = ROOT_DIR, show = False, n = 5)

C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\reports\final\tune\cnn-regression\zrimec-promoter-f1-2000-25-72-global-mean\report.csv
C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\reports\final\tune\cnn-regression\zrimec-transcript-f1-1000-25-72-global-mean\report.csv
C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\reports\final\tune\cnn-regression\zrimec-transcript-f2-1000-25-72-global-mean\report.csv
C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\reports\final\tune\cnn-regression\zrimec-transcript-f5-1000-25-72-global-mean\report.csv
C:\Developer\Workspace\PyCharm\Projects\upolanc-thesis\reports\final\tune\cnn-regression\washburn-transcript-f1-1000-25-72-global-mean\report.csv


### 1.2.1 Regression

In [10]:
# Summarise every regression tune report : row count, missing validation losses and the number of
# validation R2 values strictly above / strictly below the threshold

R2_THRESHOLD = 0.00

for key, dataframe in report_tune_model['regression'].items() :
	valid_r2 = dataframe['valid_r2']

	print('{:64s} : Size = {:4d} || NaN Loss = {:4d} || Pos R2 = {:4d} || Neg R2 = {:4d}'.format(
		key,
		len(dataframe),
		dataframe['valid_loss'].isna().to_numpy().sum(),
		(valid_r2 > R2_THRESHOLD).sum(),
		(valid_r2 < R2_THRESHOLD).sum()
	))

zrimec-promoter-f1-2000-25-72-global-mean                        : Size = 2000 || NaN Loss =    0 || Pos R2 = 1239 || Neg R2 =  761
zrimec-transcript-f1-1000-25-72-global-mean                      : Size = 1000 || NaN Loss =    0 || Pos R2 =  318 || Neg R2 =  682
zrimec-transcript-f2-1000-25-72-global-mean                      : Size = 1000 || NaN Loss =    0 || Pos R2 =  505 || Neg R2 =  495
zrimec-transcript-f5-1000-25-72-global-mean                      : Size = 1000 || NaN Loss =    0 || Pos R2 =  589 || Neg R2 =  411
washburn-transcript-f1-1000-25-72-global-mean                    : Size = 1000 || NaN Loss =    0 || Pos R2 =  701 || Neg R2 =  299


In [11]:
# Concat reports and display top few

report = report_concat.concat_cnn_tune_reports(
	reports = report_tune_model,
	mode    = 'regression',
	n       = 25
)

report

Unnamed: 0,Model,Sequence,Filter,Target0,Target1,Target2,ID,Valid_MSE,Valid_MAE,Valid_R2,Train_MSE,Epoch,Optimizer,Learning_Rate,Decay,Scheduler,Batch_Size,Dropout
0,washburn,transcript,f1,global,mean,,7ef97ae2,0.570024207,0.609821786,0.34703895,0.595069735,25,adam,0.000544388,0.000122204,linear,128,0.365
1,washburn,transcript,f1,global,mean,,0aafbac3,0.561583796,0.609860527,0.34289968,0.545087287,25,adam,0.000531909,0.001713518,linear,64,0.311
2,washburn,transcript,f1,global,mean,,35d812b8,0.565464515,0.604954705,0.340912,0.565714359,25,adam,0.00101506,5.3782e-05,linear,64,0.33
3,washburn,transcript,f1,global,mean,,d4343ae0,0.564567613,0.608256436,0.34080377,0.53700126,25,adam,0.000758836,0.000133459,plateau,64,0.29
4,washburn,transcript,f1,global,mean,,dd04e4a0,0.564029921,0.610085047,0.34040952,0.482817544,25,adam,0.000489443,2.2376e-05,exponential,64,0.298
5,washburn,transcript,f1,global,mean,,916a3364,0.575245946,0.616814337,0.336771,0.585328399,25,adam,0.001012936,0.000182368,linear,128,0.369
6,washburn,transcript,f1,global,mean,,acfb8d5c,0.575034637,0.621437761,0.33579603,0.577052662,25,adam,0.001102548,4.0476e-05,linear,128,0.373
7,washburn,transcript,f1,global,mean,,8cb535f7,0.569429691,0.607819972,0.33464584,0.482634468,25,adam,0.000726321,4.6484e-05,plateau,64,0.205
8,washburn,transcript,f1,global,mean,,47153c98,0.577103387,0.61517696,0.33438563,0.592214389,25,adam,0.002041047,1.9075e-05,linear,128,0.378
9,washburn,transcript,f1,global,mean,,786474da,0.579391778,0.617812843,0.33416605,0.605706154,25,adam,0.000366639,0.000312436,linear,128,0.439


In [12]:
# Save top model configurations (to same format as model params)
#
# For every regression tune report the best CONFIG_BEST rows (by valid_r2) are converted to configuration
# dicts and appended to the matching list. Both lists are then sorted by valid_r2, truncated to CONFIG_MAX
# elements and written back to their json files.

def expand_layer_key(key, prefix, indices) :
	"""
	Expand a key shared by several layers into one key per layer index.

	Only the last character of ``prefix`` (the 'x' placeholder) is substituted. For example the key
	'model/maxpoolx/kernel' with prefix 'model/maxpoolx' and indices [1, 2] gives
	['model/maxpool1/kernel', 'model/maxpool2/kernel']. Using str.replace('x', ...) on the whole key
	would also rewrite the 'x' in 'max'.
	"""

	stem = prefix[:-1]
	tail = key[len(prefix):]

	return [stem + str(index) + tail for index in indices]

def row_to_config(row, layers) :
	"""
	Convert one report row, given as a column -> value dict, into a configuration dict.

	Columns starting with 'valid' or 'train' are kept unchanged, columns starting with 'config' lose their
	first seven characters and every other column is dropped. Keys starting with 'model/convx' are copied
	to layers 2 .. layers[0] and keys starting with 'model/maxpoolx' to layers 1 .. layers[1].
	"""

	item = dict()

	for column, value in row.items() :
		if column.startswith('config') :
			column = column[7:]
		elif not column.startswith(('valid', 'train')) :
			continue

		if column.startswith('model/convx') :
			targets = expand_layer_key(column, 'model/convx', range(2, layers[0] + 1))
		elif column.startswith('model/maxpoolx') :
			targets = expand_layer_key(column, 'model/maxpoolx', range(1, layers[1] + 1))
		else :
			targets = [column]

		for target in targets :
			item[target] = value

	return item

for key in report_tune_model['regression'].keys() :
	df = report_tune_model['regression'][key]
	df = df.sort_values('valid_r2', ascending = False)

	if   key.startswith('zrimec')   : config, layers = ZCONFIG, [3, 3]
	elif key.startswith('washburn') : config, layers = WCONFIG, [6, 3]
	else : raise ValueError('Unsupported tune report : {}'.format(key))

	# A report may contain fewer rows than CONFIG_BEST
	for position in range(min(CONFIG_BEST, len(df))) :
		config.append(row_to_config(
			row    = df.iloc[position, :].to_dict(),
			layers = layers
		))

ZCONFIG = sorted(ZCONFIG, key = lambda x : x['valid_r2'], reverse = True)
WCONFIG = sorted(WCONFIG, key = lambda x : x['valid_r2'], reverse = True)

ZCONFIG = ZCONFIG[:CONFIG_MAX]
WCONFIG = WCONFIG[:CONFIG_MAX]

writer.write_json(
	filename = ZCONFIG_PATH,
	data     = ZCONFIG
)

writer.write_json(
	filename = WCONFIG_PATH,
	data     = WCONFIG
)

### 1.2.2 Classification

In [13]:
# Merge the classification tune reports into a single table and show it

report = report_concat.concat_cnn_tune_reports(reports = report_tune_model, mode = 'classification', n = None)

report

## 1.3 Tune Data

In [14]:
# Read the data tune reports found under the reports folder

report_tune_data = report_load.load_data_tune_reports(root = ROOT_DIR, show = False, n = 25)

### 1.3.1 Regression

In [15]:
# Merge the regression data tune reports into a single table and show it

report = report_concat.concat_data_tune_reports(reports = report_tune_data, mode = 'regression', n = 25)

report

### 1.3.2 Classification

In [16]:
# Merge the classification data tune reports into a single table and show it

report = report_concat.concat_data_tune_reports(reports = report_tune_data, mode = 'classification', n = 25)

report

## 1.4 Tune Feature

In [17]:
# Read the feature tune reports found under the reports folder

report_tune_feature = report_load.load_feature_tune_reports(root = ROOT_DIR, show = False, n = 25)

### 1.4.1 Regression

In [18]:
# Merge the regression feature tune reports into a single table and show it

report = report_concat.concat_feature_tune_reports(reports = report_tune_feature, mode = 'regression', n = None)

# Keep the first 50 rows whose Target1 equals 'mean' (skipped when the concat returned None)
if report is not None :
	is_mean = report['Target1'] == 'mean'
	report  = report.loc[is_mean].head(n = 50)

report

### 1.4.2 Classification

In [19]:
# Merge the classification feature tune reports into a single table and show it

report = report_concat.concat_feature_tune_reports(reports = report_tune_feature, mode = 'classification', n = 25)

report

# 2. DNABert

## 2.1 Model

In [20]:
# Read the bert model reports found under the reports folder

report_bert_model = report_load.load_bert_reports(root = ROOT_DIR, show = False)

In [21]:
# Plot the regression R2 of the bert reports that pass the keep / drop filter

report_bert_filtered = report_filter.filter_bert_reports(
	reports   = report_bert_model,
	keep_only = ['12'],
	drop_only = ['lamb', '6-00']
)

report_plot.models_bert_r2(data = report_bert_filtered, mode = 'regression', x = 12, y = None, filename = None, step = 'epoch')

In [22]:
# Merge the regression bert reports, ordered by eval_r2 in descending order, and show the first 25 rows

report = report_concat.concat_bert_reports(data = report_bert_model, mode = 'regression', metric = 'eval_r2', ascending = False)

# The concat result may be None, in which case it is shown as it is
report = None if report is None else report.head(n = 25)

report