In [None]:
# Libraries

import argparse
import matplotlib
import numpy
import os
import pandas
import platform
import seaborn
import shutil
import sys

In [None]:
# Ensure source path
#
# Walk upwards from the current working directory until a path ending with 'upolanc-thesis'
# is found. If no such parent exists, fall back to a hard-coded, platform-specific default.
# The resulting ROOT is added to sys.path (if missing) and becomes the working directory.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	parent = os.path.abspath(os.path.join(ROOT, os.pardir))

	# The filesystem root is its own parent. Checking for it guarantees termination even when the
	# root path itself is not shorter than the project name, where the length check alone never fires.
	if parent == ROOT or len(parent) < len('upolanc-thesis') :
		if   platform.system() == 'Linux'   : ROOT = '/d/hpc/projects/FRI/up4472/upolanc-thesis'
		elif platform.system() == 'Windows' : ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else : raise ValueError(f'Could not find project directory and no default is defined for platform : {platform.system()}')

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		print()

		break

	ROOT = parent

if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [None]:
# Code

from source.python     import runtime
from source.python.cnn import cnn_model, cnn_occlusion
from source.python.io  import loader, writer

# Call the runtime helpers for numpy format, pandas format and plot theme (in this order)
runtime.set_numpy_format()
runtime.set_pandas_format()
runtime.set_plot_theme()

# 1. Setup

In [None]:
# Check for args
#
# The constants below are the defaults used when running interactively (.ipynb).
# When running as a plain script (.py) they are overridden by command line arguments.

# zrimec-0-tf2150-f2-0250-77-tissue-mean-explode
# washburn-0-tf2150-f2-0250-77-tissue-mean-explode

MODEL_FOLDER = 'washburn-0-tf2150-f2-0250-77-tissue-mean-explode'
MODEL_NAME   = 'washburn'
FILTER_ID    = 2

OCCLUSION_START  = 0
OCCLUSION_END    = 2150
OCCLUSION_SIZE   = 20
OCCLUSION_STRIDE = 50
OCCLUSION_TYPE   = 'zero'
OCCLUSION_METHOD = 'region'
RELEVANCE_TYPE   = 'r2'

def parse_model_folder (folder) :
	"""
	Extract the model name and the filter id from a report folder name.

	The name is split on '-'. The first token is returned as the model name, and the fourth
	token (a one character prefix followed by an integer, e.g. 'f2') is returned as the filter id.

	:param folder: folder name, e.g. 'washburn-0-tf2150-f2-0250-77-tissue-mean-explode'
	:return: tuple (model name, filter id)
	:raises ValueError: if the folder name has too few tokens or the fourth token has no integer part
	"""

	tokens = folder.split('-')

	try :
		return str(tokens[0]), int(tokens[3][1:])
	except (IndexError, ValueError) as error :
		raise ValueError(f'Invalid model folder name : {folder!r}') from error

if __name__ == '__main__' and 'get_ipython' in dir() :
	print('Running as .ipynb')
	print()

if __name__ == '__main__' and 'get_ipython' not in dir() :
	print('Running as .py')
	print()

	parser = argparse.ArgumentParser()

	# Default to the folder configured above; an empty default cannot be parsed further below
	parser.add_argument('--model_folder',     type = str, default = MODEL_FOLDER)
	parser.add_argument('--occlusion_size',   type = int, default = 10)
	parser.add_argument('--occlusion_stride', type = int, default = 10)
	parser.add_argument('--occlusion_type',   type = str, default = 'zero',   choices = ['zero', 'shuffle', 'random'])
	parser.add_argument('--occlusion_method', type = str, default = 'window', choices = ['window', 'region'])
	parser.add_argument('--relevance_type',   type = str, default = 'r2',     choices = ['r2', 'mse', 'mae'])

	args = vars(parser.parse_args())

	MODEL_FOLDER     = args['model_folder']
	OCCLUSION_SIZE   = args['occlusion_size']
	OCCLUSION_STRIDE = args['occlusion_stride']
	OCCLUSION_TYPE   = args['occlusion_type']
	OCCLUSION_METHOD = args['occlusion_method']
	RELEVANCE_TYPE   = args['relevance_type']

OCCLUSION_METHOD = OCCLUSION_METHOD.lower()
RELEVANCE_TYPE   = RELEVANCE_TYPE.upper()

MODEL_NAME, MODEL_FILTER = parse_model_folder(MODEL_FOLDER)

# Keep the filter id in sync with the selected folder instead of the hard-coded default, so that a
# different --model_folder does not silently keep using the 'filter2' resource directories.
# NOTE(review): assumes the fourth folder token (e.g. 'f2') is the filter id - confirm.
FILTER_ID = MODEL_FILTER

# Map the short folder prefix to the model name used for the saved model file
if MODEL_NAME == 'washburn' : MODEL_NAME = MODEL_NAME + '2019r'
if MODEL_NAME == 'zrimec'   : MODEL_NAME = MODEL_NAME + '2020r'

In [None]:
# Setup some directory paths

CWD = ROOT

OUT, RES, REP = (
	os.path.join(CWD, name)
	for name in ('output', 'resources', 'reports')
)

FID = f'filter{FILTER_ID}'

OUT_DATA = os.path.join(OUT, 'nbp16-occlusion')

# Outputs of earlier notebooks, one sub-directory per filter id
RES_NBP01, RES_NBP02, RES_NBP04, RES_NBP05 = (
	os.path.join(OUT, folder, FID)
	for folder in ('nbp01-filter', 'nbp02-anndata', 'nbp04-feature', 'nbp05-target')
)

# Remove any previous output directory and recreate it empty
shutil.rmtree(OUT_DATA, ignore_errors = True)
os.makedirs(OUT_DATA, exist_ok = True)

print(f'     Root Directory : {CWD}')
print(f'   Output Directory : {OUT_DATA}')

for resource in (RES_NBP01, RES_NBP02, RES_NBP04, RES_NBP05) :
	print(f' Resource Directory : {resource}')

print()

In [None]:
# System and device

DEVICE = runtime.get_device(only_cpu = False)
SYSTEM = runtime.get_system_info()

# Print one aligned line per system property
for name in SYSTEM :
	print('{:25s} : {}'.format(name, SYSTEM[name]))

print()

In [None]:
# Define paths

# Report folder of the selected model
folder_path = os.path.join(REP, 'regression-cnn', MODEL_FOLDER)

config_path = os.path.join(folder_path, 'config.json')
report_path = os.path.join(folder_path, 'report_keep.json')
models_path = os.path.join(folder_path, 'model', '{}-best.pth'.format(MODEL_NAME))

In [None]:
# Load config and fix split size

config = loader.load_json(config_path)

# Set both the test and the validation split size to zero
config.update({
	'dataset/split/test'  : 0.0,
	'dataset/split/valid' : 0.0
})

for template, key in (
	('Output Target  : {}', 'model/output/target'),
	('Output Type    : {}', 'model/output/type'),
	('Output Explode : {}', 'model/output/explode'),
	('Features       : {}', 'model/features'),
	('Seed           : {}', 'core/random'),
) :
	print(template.format(config[key]))

print()

# 2. Dataset

In [None]:
# Create dataset from evaluation transcripts only.

# Select sequences and features through the project helper, given the nbp04 feature
# directory and the report file of the selected model.
sequences, features = cnn_occlusion.select_only_evaluation_transcripts(
	directory = RES_NBP04,
	report    = report_path
)

# Build the dataset from the selection, given the nbp05 target directory. The helper also
# returns a config, which replaces the one loaded from the model folder.
dataset, config = cnn_occlusion.to_dataset(
	sequences = sequences,
	features  = features,
	directory = RES_NBP05,
	config    = config
)

# 3. Model

In [None]:
# Load pretrained model.

# The helper returns the model together with a parameter object (model_params).
# No dataloader is supplied at this point (dataloader = None).
model, model_params = cnn_occlusion.load_pretrained_model(
	config     = config,
	device     = DEVICE,
	path       = models_path,
	dataloader = None
)

# 4. Baseline Without Occlusion

In [None]:
# Run no occlusion evaluation

model_params['test_dataloader'] = cnn_occlusion.create_dataloader_without_occlusion(dataset = dataset, config = config)

report = cnn_model.eval_regressor(model = model, params = model_params)

# Reference scores : every metric flattened and averaged into a single value
baseline = {
	name : report['eval']['metric'][name].flatten().mean()
	for name in ('r2', 'mse', 'mae')
}

for template, name in (
	('Baseline R2  : {:.5f}', 'r2'),
	('Baseline MSE : {:.5f}', 'mse'),
	('Baseline MAE : {:.5f}', 'mae'),
) :
	print(template.format(baseline[name]))

print()

# 5. Reports With Window Occlusion

In [None]:
# Compute scores with occlusion

if OCCLUSION_METHOD == 'window' :
	reports = []

	# Slide a window of OCCLUSION_SIZE over the sequence in steps of OCCLUSION_STRIDE
	for window_start in numpy.arange(OCCLUSION_START, OCCLUSION_END, OCCLUSION_STRIDE) :
		window_end = window_start + OCCLUSION_SIZE

		model_params['test_dataloader'] = cnn_occlusion.create_dataloader_with_occlusion(
			dataset = dataset,
			config  = config,
			start   = window_start,
			end     = window_end,
			method  = OCCLUSION_TYPE
		)

		scores = cnn_model.eval_regressor(model = model, params = model_params)['eval']['metric']

		evaluation = {
			'start' : int(window_start),
			'end'   : int(window_end),
			'mid'   : int((window_start + window_end) // 2)
		}

		# Mean score per metric for this window
		for metric in ('r2', 'mse', 'mae') :
			evaluation[metric] = float(scores[metric].flatten().mean())

		# Relevance of this window, computed from the baseline and the occluded score
		for metric, key in (('r2', 'r2-relevance'), ('mse', 'mse-relevance'), ('mae', 'mae-relevance')) :
			evaluation[key] = float(cnn_occlusion.compute_relevance(
				base  = baseline[metric],
				value = evaluation[metric]
			))

		reports.append(evaluation)

In [None]:
# Save evaluations

if OCCLUSION_METHOD == 'window' :
	# File name carries the window size and stride
	filename = 'table-occlusion-w{}-s{}.json'.format(OCCLUSION_SIZE, OCCLUSION_STRIDE)

	writer.write_json(data = reports, filename = os.path.join(OUT_DATA, filename))

In [None]:
# Create data and save

if OCCLUSION_METHOD == 'window' :
	# Table column -> report key, in output column order
	columns = {
		'R2'       : 'r2-relevance',
		'MAE'      : 'mae-relevance',
		'MSE'      : 'mse-relevance',
		'Midpoint' : 'mid'
	}

	relevances = pandas.DataFrame.from_dict({
		column : [entry[key] for entry in reports]
		for column, key in columns.items()
	})

	filename = 'table-occlusion-w{}-s{}.csv'.format(OCCLUSION_SIZE, OCCLUSION_STRIDE)

	writer.write_csv(data = relevances, filename = os.path.join(OUT_DATA, filename))

In [None]:
# Plot relevance for entire sequence

if OCCLUSION_METHOD == 'window' :
	fig, ax = matplotlib.pyplot.subplots(figsize = (16, 10))
	fig.tight_layout()

	seaborn.lineplot(data = relevances, x = 'Midpoint', y = RELEVANCE_TYPE, ax = ax, linewidth = 2, alpha = 0.9)

	# Clear both axis labels
	ax.set_xlabel(None)
	ax.set_ylabel(None)

	filename = 'graph-occlusion-w{}-s{}-full.png'.format(OCCLUSION_SIZE, OCCLUSION_STRIDE)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, filename),
		format      = 'png',
		dpi         = 120,
		pad_inches  = 0,
		bbox_inches = 'tight'
	)

In [None]:
# Plot relevance for each region separately

if OCCLUSION_METHOD == 'window' :
	# (title, first position, end position) of every region; the end position is exclusive
	lengths = [1000, 300, 350, 500]
	regions = [
		('Promoter',      0, sum(lengths[:1])),
		('5\'UTR',     1000, sum(lengths[:2])),
		('3\'UTR',     1300, sum(lengths[:3])),
		('Terminator', 1650, sum(lengths[:4])),
	]

	fig, ax = matplotlib.pyplot.subplots(nrows = 1, ncols = 4, sharey = True, figsize = (16, 4))
	fig.tight_layout()

	# One panel per region, showing only the windows whose midpoint lies inside the region
	for axis, (title, lower, upper) in zip(ax, regions) :
		midpoints = relevances['Midpoint']
		subset    = relevances[(midpoints >= lower) & (midpoints < upper)]

		seaborn.lineplot(data = subset, x = 'Midpoint', y = RELEVANCE_TYPE, ax = axis, linewidth = 2, alpha = 0.9)

		axis.set_title(title)
		axis.set_xlabel(None)
		axis.set_ylabel(None)

	filename = 'graph-occlusion-w{}-s{}-seperate.png'.format(OCCLUSION_SIZE, OCCLUSION_STRIDE)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, filename),
		format      = 'png',
		dpi         = 120,
		pad_inches  = 0,
		bbox_inches = 'tight'
	)

# 6. Reports With Region Occlusion

In [None]:
# Compute scores with occlusion

if OCCLUSION_METHOD == 'region' :
	# (name, start, end) of every region that is occluded as a whole
	lengths = [1000, 300, 350, 500]
	regions = [
		('Promoter',      0, sum(lengths[:1])),
		('5\'UTR',     1000, sum(lengths[:2])),
		('3\'UTR',     1300, sum(lengths[:3])),
		('Terminator', 1650, sum(lengths[:4])),
	]

	reports = []

	for region_name, region_start, region_end in regions :
		model_params['test_dataloader'] = cnn_occlusion.create_dataloader_with_occlusion(
			dataset = dataset,
			config  = config,
			start   = region_start,
			end     = region_end,
			method  = OCCLUSION_TYPE
		)

		scores = cnn_model.eval_regressor(model = model, params = model_params)['eval']['metric']

		evaluation = {
			'region' : str(region_name),
			'start'  : int(region_start),
			'end'    : int(region_end),
			'mid'    : int((region_start + region_end) // 2)
		}

		# Mean score per metric with this region occluded
		for metric in ('r2', 'mse', 'mae') :
			evaluation[metric] = float(scores[metric].flatten().mean())

		# Relevance of this region, computed from the baseline and the occluded score
		for metric, key in (('r2', 'r2-relevance'), ('mse', 'mse-relevance'), ('mae', 'mae-relevance')) :
			evaluation[key] = float(cnn_occlusion.compute_relevance(
				base  = baseline[metric],
				value = evaluation[metric]
			))

		reports.append(evaluation)

In [None]:
# Save evaluations

if OCCLUSION_METHOD == 'region' :
	filename = os.path.join(OUT_DATA, 'table-occlusion-region.json')

	writer.write_json(data = reports, filename = filename)

In [None]:
# Create data and save

if OCCLUSION_METHOD == 'region' :
	# Table column -> report key, in output column order
	columns = {
		'R2'       : 'r2-relevance',
		'MAE'      : 'mae-relevance',
		'MSE'      : 'mse-relevance',
		'Midpoint' : 'mid'
	}

	relevances = pandas.DataFrame.from_dict({
		column : [entry[key] for entry in reports]
		for column, key in columns.items()
	})

	writer.write_csv(data = relevances, filename = os.path.join(OUT_DATA, 'table-occlusion-region.csv'))

In [None]:
# Summary table : region occlusion scores next to the baseline (shown as the cell output)

dataframe = None

if OCCLUSION_METHOD == 'region' :
	# Combine the region reports with a baseline (no occlusion) row into a NEW list. Appending to
	# `reports` in place would add another baseline row on every re-run of this cell.
	records = reports + [{
		'region'        : 'None',
		'start'         : 0,
		'end'           : 0,
		'mid'           : 0,
		'r2'            : float(baseline['r2']),
		'mse'           : float(baseline['mse']),
		'mae'           : float(baseline['mae']),
		'r2-relevance'  : float(0),
		'mse-relevance' : float(0),
		'mae-relevance' : float(0)
	}]

	dataframe = pandas.DataFrame(records)
	dataframe = dataframe.rename(columns = {
		'region'        : 'Occlusion',
		'r2'            : 'R2',
		'r2-relevance'  : 'R2-Relevance',
		'mae'           : 'MAE',
		'mae-relevance' : 'MAE-Relevance',
		'mse'           : 'MSE',
		'mse-relevance' : 'MSE-Relevance'
	})

	# Difference of every score to the baseline score
	dataframe[ 'R2-Delta'] = dataframe[ 'R2'] - baseline[ 'r2']
	dataframe['MAE-Delta'] = dataframe['MAE'] - baseline['mae']
	dataframe['MSE-Delta'] = dataframe['MSE'] - baseline['mse']

	# Only the R2 columns are kept for display
	dataframe = dataframe[['Occlusion', 'R2', 'R2-Delta', 'R2-Relevance']]

dataframe