In [None]:
# Libraries

import matplotlib
import numpy
import os
import pandas
import platform
import scipy
import seaborn
import shutil
import sys

In [None]:
# Ensure source path
#
# Climb from the current working directory until the path ends with the project
# name. Once the path has become shorter than the project name it can no longer
# match, so a per-platform default is used instead. The resulting ROOT is added
# to sys.path (if missing) and becomes the working directory.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	ROOT = os.path.abspath(os.path.join(ROOT, os.pardir))

	# Still long enough to possibly match : keep climbing
	if len(ROOT) >= len('upolanc-thesis') :
		continue

	if platform.system() == 'Linux' :
		ROOT = '/d/hpc/projects/FRI/up4472/upolanc-thesis'
	elif platform.system() == 'Windows' :
		ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
	else :
		raise ValueError()

	print(f'Warning : could not find correct directory, using default : {ROOT}')
	print()

	break

if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [None]:
# Code

from source.python              import runtime
from source.python.cnn          import cnn_plot
from source.python.data.feature import feature_processing
from source.python.io           import loader

# Configure numpy / pandas output formatting and the plot theme through the
# project's runtime helpers (their exact settings live in source.python.runtime,
# which is not visible from this notebook)
runtime.set_numpy_format()
runtime.set_pandas_format()
runtime.set_plot_theme()

# 1. Setup

In [None]:
# Check for args
#
# Report whether the code runs inside IPython / Jupyter (get_ipython is defined)
# or as a plain Python script

if __name__ == '__main__' :
	if 'get_ipython' in dir() :
		print('Running as .ipynb')
	else :
		print('Running as .py')

	print()

In [None]:
# Setup some directory paths
#
# NOTE : the plot output directory is deleted and recreated on every run

CWD = ROOT
OUT = os.path.join(CWD, 'output')
RES = os.path.join(CWD, 'resources')

OUT_DATA = os.path.join(OUT, 'nbp17-plots')

# Input folders read by the cells below; they sit under the output directory
RES_NBP01, RES_NBP02, RES_NBP04 = [
	os.path.join(OUT, folder)
	for folder in ['nbp01-filter', 'nbp02-anndata', 'nbp04-feature']
]

shutil.rmtree(OUT_DATA, ignore_errors = True)
os.makedirs(OUT_DATA, exist_ok = True)

print(
	f'     Root Directory : {CWD}',
	f'   Output Directory : {OUT_DATA}',
	f' Resource Directory : {RES_NBP01}',
	f' Resource Directory : {RES_NBP02}',
	f' Resource Directory : {RES_NBP04}',
	sep = '\n'
)
print()

# 2. Transcript RSD Distribution

In [None]:
# Compute RSD for every filter

def _append_rsd_rows (result, matrix, masks, target, group) :
	"""
	Append one summary row per transcript selection to `result`.

	For every (selection, mask) pair the columns of `matrix` are optionally
	subset, the 'boxcox1p' layer is read and the relative standard deviation
	(std / mean along axis 0, one value per column) is summarised.

	:param result: dict mapping column name to list; extended in place
	:param matrix: object supporting `matrix[:, mask]` and `.layers['boxcox1p']`
	:param masks: dict mapping selection name to a boolean column mask, or to
		None when every column should be used
	:param target: value stored in the 'Target' column
	:param group: value stored in the 'Group' column
	"""

	for selection, mask in masks.items() :
		if mask is None : data = matrix
		else            : data = matrix[:, mask].copy()

		data = data.layers['boxcox1p']

		# NOTE(review) : a column with zero mean yields inf / nan here, which
		# then propagates into the summary statistics below - confirm this
		# cannot happen for the filtered data
		rsd = data.std(axis = 0) / data.mean(axis = 0)

		result['Data'     ].append(selection)
		result['Target'   ].append(target)
		result['Group'    ].append(group)
		result['RSD'      ].append(rsd)
		result['Median'   ].append(numpy.median(rsd))
		result['Mean'     ].append(numpy.mean(rsd))
		result['Min'      ].append(numpy.min(rsd))
		result['Max'      ].append(numpy.max(rsd))
		result['Samples'  ].append(numpy.size(data, 0))
		result['RSD < 1.0'].append(numpy.sum(rsd < 1.0) / len(rsd))
		result['RSD < 0.5'].append(numpy.sum(rsd < 0.5) / len(rsd))

dataframes = dict()

for fid in [1, 2, 3, 4, 5, 6] :
	anndata = loader.load_h5ad(filename = os.path.join(RES_NBP04, 'filter{}'.format(fid), 'arabidopsis-r36.h5ad'))
	split   = loader.load_json(filename = os.path.join(RES_NBP01, 'filter{}'.format(fid), 'filter.json'))

	keep_transcript = split['data']['keep_transcript']
	drop_transcript = split['data']['drop_transcript']

	# Column masks depend only on the filter, so they are built once and reused
	# for the global matrix and for every tissue / group subset
	masks = {
		'all'  : None,
		'keep' : anndata.var.index.isin(keep_transcript),
		'drop' : anndata.var.index.isin(drop_transcript)
	}

	# Column order of the exported table
	result = {
		column : list()
		for column in [
			'Data', 'Group', 'Target', 'Median', 'Mean', 'Min', 'Max',
			'Samples', 'RSD', 'RSD < 1.0', 'RSD < 0.5'
		]
	}

	# Statistics over all samples
	_append_rsd_rows(result, anndata, masks, target = 'global-mean', group = '')

	# Statistics within every tissue, then within every group
	for target, column in [('tissue-mean', 'Tissue'), ('group-mean', 'Group')] :
		for group in anndata.obs[column].unique() :
			matrix = anndata[anndata.obs[column] == group, :].copy()

			_append_rsd_rows(result, matrix, masks, target = target, group = group)

	dataframes[fid] = pandas.DataFrame.from_dict(result)
	dataframes[fid].to_csv(os.path.join(OUT_DATA, 'table-rsd-f{}.csv'.format(fid)))

In [None]:
# Display RSD
#
# One figure per filter : for each (target, group) pair, the percentage of
# transcripts whose RSD is below a threshold, plotted against that threshold

search_tissue = [
	('tissue-mean', tissue)
	for tissue in ['seedling', 'seed', 'leaf', 'root', 'flower']
]

search_group = [
	('group-mean', stage)
	for stage in ['young_seedling', 'mature_seed', 'mature_leaf', 'mature_root', 'mature_flower']
]

search = [('global-mean', None)] + search_tissue + search_group

matplotlib.rcParams.update({'font.size' : 30})
seaborn.set_theme(font_scale = 2.5)

# One line colour per target kind
color = {
	'global-mean' : 'r',
	'tissue-mean' : 'g',
	'group-mean'  : 'b'
}

# RSD thresholds on the horizontal axis
thresholds = list(numpy.arange(0.00, 2.00, 0.05))

for fid in [1, 2, 3, 4, 5, 6] :
	fig, ax = matplotlib.pyplot.subplots(figsize = (16, 10))
	fig.tight_layout()

	for target, group in search :
		rows = dataframes[fid]
		rows = rows[rows['Data'] == 'all']

		if target is not None : rows = rows[rows['Target'] == target]
		if group  is not None : rows = rows[rows['Group']  == group]

		# The 'RSD' cell of the first matching row holds the per-transcript values
		values = rows['RSD'].iloc[0]

		percentages = [
			numpy.sum(values < threshold) / len(values) * 100.0
			for threshold in thresholds
		]

		label = target if target is not None else ''

		if group is not None :
			label = label + '-' + group

		seaborn.lineplot(
			x         = thresholds,
			y         = percentages,
			label     = label,
			ax        = ax,
			alpha     = 0.9,
			color     = color[target],
			linewidth = 4
		)

	ax.legend(
		loc  = 'lower right',
		prop = None
	)

	ax.set_xlabel('RSD')
	ax.set_ylabel('Percentage')

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-rsd-f{}.png'.format(fid)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

# Restore the default plot theme for the following cells
runtime.set_plot_theme()

# 3. Variance vs. TPM

In [None]:
# Compute variance
#
# Builds dictionary[tissue][transcript] = {'variance' : ..., 'stdev' : ...} from
# the 'boxcox1p' layer of the transcripts kept by filter 2

anndata = loader.load_h5ad(filename = os.path.join(RES_NBP04, 'filter2', 'arabidopsis-r36.h5ad'))
filters = loader.load_json(filename = os.path.join(RES_NBP01, 'filter2', 'filter.json'))
anndata = anndata[:, anndata.var.index.isin(filters['data']['keep_transcript'])]

dictionary = dict()
supergroup = 'Tissue'
layer      = 'boxcox1p'

for group in anndata.obs[supergroup].unique() :
	subdata = anndata[anndata.obs[supergroup].isin([group]), :]

	transcript = subdata.var.index.tolist()

	# Rows are observations and columns are transcripts, so per-transcript
	# statistics reduce over axis 0 (axis 1 would give one value per sample and
	# silently mis-pair them with the transcript names in the zip below)
	variance = subdata.layers[layer].var(axis = 0).tolist()
	stdev    = subdata.layers[layer].std(axis = 0).tolist()

	assert len(variance) == len(transcript), 'expected one variance value per transcript'
	assert len(stdev)    == len(transcript), 'expected one stdev value per transcript'

	dictionary.setdefault(group, dict())

	for k, v, s in zip(transcript, variance, stdev) :
		dictionary[group].setdefault(k, dict()).setdefault('variance', v)
		dictionary[group].setdefault(k, dict()).setdefault('stdev', s)

In [None]:
# Plot scatterplot variance

# Resolve the report folder against ROOT rather than a machine-specific absolute
# path, so the cell also runs outside the original Windows checkout
folder_path = os.path.join(ROOT, 'reports', 'regression-cnn')
models_path = os.path.join(folder_path, 'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
report_path = os.path.join(models_path, 'report_keep.json')

matplotlib.rcParams.update({'font.size' : 36})
seaborn.set_theme(font_scale = 3)

# The report and the collected rows do not depend on the plotted column,
# so they are loaded once instead of once per xtarget
report = loader.load_json(report_path)
report = report['eval']

scatterdict = {
	'ypred' : list(),
	'ytrue' : list(),
	'value' : list(),
	'group' : list()
}

for key, ypred, ytrue in zip(report['keys'], report['ypred'], report['ytrue']) :
	if isinstance(ypred, list) : ypred = ypred[0]
	if isinstance(ytrue, list) : ytrue = ytrue[0]

	# Keys have the form '<group>?<transcript>'
	parts      = key.split('?')
	group      = parts[0]
	transcript = parts[1]

	# Skip entries without a variance computed in the 'Compute variance' cell
	if group      not in dictionary        : continue
	if transcript not in dictionary[group] : continue

	scatterdict['ytrue'].append(ytrue)
	scatterdict['ypred'].append(ypred)
	scatterdict['value'].append(dictionary[group][transcript]['variance'])
	scatterdict['group'].append(group)

for xtarget in ['ypred', 'ytrue'] :
	# The selected column becomes the horizontal axis
	scatterdf = pandas.DataFrame.from_dict(scatterdict)
	scatterdf = scatterdf.rename(columns = {
		xtarget : 'TPM',
		'value' : 'Variance',
		'group' : 'Tissue'
	})

	fig, ax = matplotlib.pyplot.subplots(figsize = (14, 10))
	fig.tight_layout()

	seaborn.scatterplot(
		data  = scatterdf,
		x     = 'TPM',
		y     = 'Variance',
		hue   = 'Tissue',
		ax    = ax,
		s     = 300,
		alpha = 0.9
	)

	ax.set_xlabel('TPM')
	ax.set_ylabel('Variance')
	ax.legend(markerscale = 2)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-{}-scatterplot.png'.format(xtarget)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

runtime.set_plot_theme()

In [None]:
# Plot lmplot variance

# Resolve the report folder against ROOT rather than a machine-specific absolute
# path, so the cell also runs outside the original Windows checkout
folder_path = os.path.join(ROOT, 'reports', 'regression-cnn')
models_path = os.path.join(folder_path, 'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
report_path = os.path.join(models_path, 'report_keep.json')

# The report and the collected rows do not depend on the plotted column,
# so they are loaded once instead of once per xtarget
report = loader.load_json(report_path)
report = report['eval']

scatterdict = {
	'ypred' : list(),
	'ytrue' : list(),
	'value' : list(),
	'group' : list()
}

for key, ypred, ytrue in zip(report['keys'], report['ypred'], report['ytrue']) :
	if isinstance(ypred, list) : ypred = ypred[0]
	if isinstance(ytrue, list) : ytrue = ytrue[0]

	# Keys have the form '<group>?<transcript>'
	parts      = key.split('?')
	group      = parts[0]
	transcript = parts[1]

	# Skip entries without a variance computed in the 'Compute variance' cell
	if group      not in dictionary        : continue
	if transcript not in dictionary[group] : continue

	scatterdict['ytrue'].append(ytrue)
	scatterdict['ypred'].append(ypred)
	scatterdict['value'].append(dictionary[group][transcript]['variance'])
	scatterdict['group'].append(group)

for xtarget in ['ypred', 'ytrue'] :
	# The selected column becomes the horizontal axis
	scatterdf = pandas.DataFrame.from_dict(scatterdict)
	scatterdf = scatterdf.rename(columns = {
		xtarget : 'TPM',
		'value' : 'Variance',
		'group' : 'Tissue'
	})

	# lmplot creates its own figure, which savefig below picks up as the current one
	seaborn.lmplot(
		data   = scatterdf,
		x      = 'TPM',
		y      = 'Variance',
		hue    = 'Tissue',
		height = 10,
		aspect = 1.6
	)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-{}-lmplot.png'.format(xtarget)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

In [None]:
# Plot boxplot variance

# Resolve the report folder against ROOT rather than a machine-specific absolute
# path, so the cell also runs outside the original Windows checkout
folder_path = os.path.join(ROOT, 'reports', 'regression-cnn')
models_path = os.path.join(folder_path, 'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
report_path = os.path.join(models_path, 'report_keep.json')

matplotlib.rcParams.update({'font.size' : 36})
seaborn.set_theme(font_scale = 3)

# The report and the collected rows do not depend on the plotted column,
# so they are loaded once instead of once per xtarget
report = loader.load_json(report_path)
report = report['eval']

scatterdict = {
	'ypred' : list(),
	'ytrue' : list(),
	'value' : list(),
	'group' : list()
}

for key, ypred, ytrue in zip(report['keys'], report['ypred'], report['ytrue']) :
	if isinstance(ypred, list) : ypred = ypred[0]
	if isinstance(ytrue, list) : ytrue = ytrue[0]

	# Keys have the form '<group>?<transcript>'
	parts      = key.split('?')
	group      = parts[0]
	transcript = parts[1]

	# Skip entries without a variance computed in the 'Compute variance' cell
	if group      not in dictionary        : continue
	if transcript not in dictionary[group] : continue

	scatterdict['ytrue'].append(ytrue)
	scatterdict['ypred'].append(ypred)
	scatterdict['value'].append(dictionary[group][transcript]['variance'])
	scatterdict['group'].append(group)

for xtarget in ['ypred', 'ytrue'] :
	scatterdf = pandas.DataFrame.from_dict(scatterdict)
	scatterdf = scatterdf.rename(columns = {
		xtarget : 'TPM',
		'value' : 'Variance',
		'group' : 'Tissue'
	})

	fig, ax = matplotlib.pyplot.subplots(figsize = (7, 12))
	fig.tight_layout()

	# Only 'Variance' and 'Tissue' are drawn; xtarget just selects the file name
	seaborn.boxplot(
		data  = scatterdf,
		x     = 'Variance',
		y     = 'Tissue',
		ax    = ax
	)

	ax.set_xlabel('Variance')
	ax.set_ylabel('Tissue')

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-{}-boxplot.png'.format(xtarget)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

runtime.set_plot_theme()

In [None]:
# Plot scatterplot and boxplot variance

# Resolve the report folder against ROOT rather than a machine-specific absolute
# path, so the cell also runs outside the original Windows checkout
folder_path = os.path.join(ROOT, 'reports', 'regression-cnn')
models_path = os.path.join(folder_path, 'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
report_path = os.path.join(models_path, 'report_keep.json')

matplotlib.rcParams.update({'font.size' : 36})
seaborn.set_theme(font_scale = 3)

# The report and the collected rows do not depend on the plotted column,
# so they are loaded once instead of once per xtarget
report = loader.load_json(report_path)
report = report['eval']

scatterdict = {
	'ypred' : list(),
	'ytrue' : list(),
	'value' : list(),
	'group' : list()
}

for key, ypred, ytrue in zip(report['keys'], report['ypred'], report['ytrue']) :
	if isinstance(ypred, list) : ypred = ypred[0]
	if isinstance(ytrue, list) : ytrue = ytrue[0]

	# Keys have the form '<group>?<transcript>'
	parts      = key.split('?')
	group      = parts[0]
	transcript = parts[1]

	# Skip entries without a variance computed in the 'Compute variance' cell
	if group      not in dictionary        : continue
	if transcript not in dictionary[group] : continue

	scatterdict['ytrue'].append(ytrue)
	scatterdict['ypred'].append(ypred)
	scatterdict['value'].append(dictionary[group][transcript]['variance'])
	scatterdict['group'].append(group)

for xtarget in ['ypred', 'ytrue'] :
	# The selected column becomes the horizontal axis of the scatterplot
	scatterdf = pandas.DataFrame.from_dict(scatterdict)
	scatterdf = scatterdf.rename(columns = {
		xtarget : 'TPM',
		'value' : 'Variance',
		'group' : 'Tissue'
	})

	# Scatterplot (left, 60 %) and boxplot (right, 40 %) share the variance axis
	fig, ax = matplotlib.pyplot.subplots(
		nrows       = 1,
		ncols       = 2,
		sharey      = True,
		figsize     = (28, 12),
		gridspec_kw = {
			'width_ratios' : [60, 40]
		}
	)
	fig.tight_layout()

	ax1 = ax[0]
	ax2 = ax[1]

	seaborn.scatterplot(
		data  = scatterdf,
		x     = 'TPM',
		y     = 'Variance',
		hue   = 'Tissue',
		ax    = ax1,
		s     = 300,
		alpha = 0.9
	)

	ax1.set_xlabel('TPM')
	ax1.set_ylabel('Variance')
	ax1.legend(markerscale = 2)

	seaborn.boxplot(
		data  = scatterdf,
		y     = 'Variance',
		x     = 'Tissue',
		ax    = ax2
	)

	ax2.set_xlabel('Tissue')
	ax2.set_ylabel(None)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-{}-multiplot.png'.format(xtarget)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

runtime.set_plot_theme()

# 4. Prediction Error

In [None]:
# Plot prediction error
#
# One histogram of (predicted - actual) per group, for each of the two models

# Resolve the report folder against ROOT rather than a machine-specific absolute
# path, so the cell also runs outside the original Windows checkout
folder_path  = os.path.join(ROOT, 'reports', 'regression-cnn')
wmodels_path = os.path.join(folder_path,  'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
zmodels_path = os.path.join(folder_path,  'zrimec-0-tf2150-f2-0250-77-tissue-mean-explode')
wreport_path = os.path.join(wmodels_path, 'report_keep.json')
zreport_path = os.path.join(zmodels_path, 'report_keep.json')

matplotlib.rcParams.update({'font.size' : 48})
seaborn.set_theme(font_scale = 4)

for name, report_path in [('washburn', wreport_path), ('zrimec', zreport_path)] :
	report = loader.load_json(report_path)
	report = report['eval']

	# Keys have the form '<group>?<...>'; errors are collected per group
	keys = set([x.split('?')[0] for x in report['keys']])
	vals = dict()

	for key in keys :
		vals[key] = list()

	for key, ypred, ytrue in zip(report['keys'], report['ypred'], report['ytrue']) :
		if isinstance(ypred, list) : ypred = ypred[0]
		if isinstance(ytrue, list) : ytrue = ytrue[0]

		key = key.split('?')[0]
		val = ypred - ytrue

		vals[key].append(val)

	n, nrows, ncols = cnn_plot.compute_gridsize(
		n = len(keys)
	)

	# Prefer a tall grid over a wide one
	if nrows < ncols :
		nrows, ncols = ncols, nrows

	# squeeze = False keeps `ax` two-dimensional even for a single row or column,
	# so the [row, column] indexing below is valid for every grid shape
	fig, ax = matplotlib.pyplot.subplots(
		nrows, ncols,
		sharex  = True,
		sharey  = True,
		figsize = (10 * ncols, 10 * nrows),
		squeeze = False
	)
	fig.tight_layout()

	# Smallest / largest error over all groups (never above / below zero)
	minval = 0
	maxval = 0

	for key in keys :
		minval = min(numpy.min(vals[key]), minval)
		maxval = max(numpy.max(vals[key]), maxval)

	for index, key in enumerate(sorted(keys)) :
		axis = ax[index // ncols, index % ncols]

		seaborn.histplot(
			x     = vals[key],
			ax    = axis,
			alpha = 0.9
		)

		# Zero-error reference line
		axis.axvline(x = 0, color = 'r', linewidth = 4)

		# Anchor of the mean / median annotation, in data coordinates
		# NOTE(review) : 290 is a fixed vertical position, presumably tuned to
		# the histogram heights of the current reports - confirm when data changes
		gmin = minval - 0.1
		gmax = 290

		axis.text(gmin, 1.00 * gmax, r'$\bar{x}$')
		axis.text(gmin, 0.89 * gmax, r'$\tilde{x}$')

		axis.text(gmin + 0.38, 1.00 * gmax, '=')
		axis.text(gmin + 0.38, 0.89 * gmax, '=')

		axis.text(gmin + 0.65, 1.00 * gmax, '{: .3f}'.format(numpy.mean(vals[key])))
		axis.text(gmin + 0.65, 0.89 * gmax, '{: .3f}'.format(numpy.median(vals[key])))

		axis.set_title(key.title())
		axis.set_xlabel('Error')
		axis.set_ylabel('Count')

	# Hide the grid cells that received no histogram
	for index in range(n, nrows * ncols) :
		ax[index // ncols, index % ncols].axis('off')

	matplotlib.pyplot.subplots_adjust(
		left   = None,
		bottom = None,
		right  = None,
		top    = None,
		wspace = 0.05,
		hspace = 0.10
	)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-{}-prediction-error.png'.format(name)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

# 5. Actual vs. Predicted

In [None]:
# Plot actual vs. predicted
#
# One scatterplot with a least-squares line per group, for each of the two models

# Resolve the report folder against ROOT rather than a machine-specific absolute
# path, so the cell also runs outside the original Windows checkout
folder_path  = os.path.join(ROOT, 'reports', 'regression-cnn')
wmodels_path = os.path.join(folder_path,  'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
zmodels_path = os.path.join(folder_path,  'zrimec-0-tf2150-f2-0250-77-tissue-mean-explode')
wreport_path = os.path.join(wmodels_path, 'report_keep.json')
zreport_path = os.path.join(zmodels_path, 'report_keep.json')

matplotlib.rcParams.update({'font.size' : 48})
seaborn.set_theme(font_scale = 4)

for name, report_path in [('washburn', wreport_path), ('zrimec', zreport_path)] :
	report = loader.load_json(report_path)
	report = report['eval']

	# Keys have the form '<group>?<...>'; values are collected per group
	keys   = set([x.split('?')[0] for x in report['keys']])
	ypreds = dict()
	ytrues = dict()

	for key in keys :
		ypreds[key] = list()
		ytrues[key] = list()

	for key, ypred, ytrue in zip(report['keys'], report['ypred'], report['ytrue']) :
		if isinstance(ypred, list) : ypred = ypred[0]
		if isinstance(ytrue, list) : ytrue = ytrue[0]

		key = key.split('?')[0]

		ypreds[key].append(ypred)
		ytrues[key].append(ytrue)

	n, nrows, ncols = cnn_plot.compute_gridsize(
		n = len(keys)
	)

	# Prefer a tall grid over a wide one
	if nrows < ncols :
		nrows, ncols = ncols, nrows

	# squeeze = False keeps `ax` two-dimensional even for a single row or column,
	# so the [row, column] indexing below is valid for every grid shape
	fig, ax = matplotlib.pyplot.subplots(
		nrows, ncols,
		sharex  = True,
		sharey  = True,
		figsize = (10 * ncols, 10 * nrows),
		squeeze = False
	)
	fig.tight_layout()

	for index, key in enumerate(sorted(keys)) :
		axis = ax[index // ncols, index % ncols]

		x = numpy.array(ypreds[key])
		y = numpy.array(ytrues[key])

		seaborn.scatterplot(
			x     = x,
			y     = y,
			ax    = axis,
			alpha = 0.9
		)

		# Least-squares fit of actual on predicted
		res = scipy.stats.linregress(x, y)

		axis.plot(x, res.intercept + res.slope * x,
			color     = 'r',
			linewidth = 4
		)

		# Use the same range on both axes so the panel is square in data units
		xmin, xmax = axis.get_xlim()
		ymin, ymax = axis.get_ylim()

		gmin = min(xmin, ymin)
		gmax = max(xmax, ymax)

		axis.set_title(key.title())
		axis.set_xlim([gmin, gmax])
		axis.set_ylim([gmin, gmax])
		axis.set_aspect('equal')

		# Slope (k) and correlation coefficient (r) annotation, in data coordinates
		offset = 0.05

		axis.text(0.0 * gmax - offset, 0.92 * gmax, 'k')
		axis.text(0.0 * gmax - offset, 0.84 * gmax, 'r')

		axis.text(0.05 * gmax - offset, 0.92 * gmax, '=')
		axis.text(0.05 * gmax - offset, 0.84 * gmax, '=')

		axis.text(0.10 * gmax - offset, 0.92 * gmax, '{: .3f}'.format(res.slope))
		axis.text(0.10 * gmax - offset, 0.84 * gmax, '{: .3f}'.format(res.rvalue))

		axis.set_xlabel('Predicted')
		axis.set_ylabel('Actual')

	# Hide the grid cells that received no scatterplot
	for index in range(n, nrows * ncols) :
		ax[index // ncols, index % ncols].axis('off')

	matplotlib.pyplot.subplots_adjust(
		left   = None,
		bottom = None,
		right  = None,
		top    = None,
		wspace = 0.05,
		hspace = 0.10
	)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-{}-prediction-linefit.png'.format(name)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

# 6. Error vs. TPM

In [None]:
# Define

# Resolve the report folder against ROOT rather than a machine-specific absolute
# path, so the cell also runs outside the original Windows checkout
folder_path  = os.path.join(ROOT, 'reports', 'regression-cnn')
wmodels_path = os.path.join(folder_path,  'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
zmodels_path = os.path.join(folder_path,  'zrimec-0-tf2150-f2-0250-77-tissue-mean-explode')
wreport_path = os.path.join(wmodels_path, 'report_keep.json')
zreport_path = os.path.join(zmodels_path, 'report_keep.json')

data = {
	'washburn' : None,
	'zrimec'   : None,
}

# Bin edges for grouping samples by their true value
thresholds_boxcox = numpy.arange(0, 5, 0.25)
thresholds_tpm    = [0, 10, 50, 100, 250, 500, 1_000, 5_000, 10_000, 50_000, 250_000, 1_000_000]

def to_tpm (x) :
	"""
	Apply feature_processing.boxcox1p_inv to x with the fixed second argument
	-0.1455 (presumably the fitted Box-Cox lambda - confirm).
	"""

	return feature_processing.boxcox1p_inv(x, -0.1455)

def to_boxcox (x) :
	"""
	Return x unchanged.
	"""

	return x

def to_absolute_error (ytrue, ypred) :
	"""
	Return the absolute difference between the true and the predicted value.
	"""

	return abs(ytrue - ypred)

def to_squared_error (ytrue, ypred) :
	"""
	Return the squared difference between the true and the predicted value.
	"""

	return (ytrue - ypred) ** 2

In [None]:
# Plot error vs. tpm
#
# For each error metric, plot the per-bin error of both models against the
# threshold of the bin the true value falls into

combinations = [
	(to_squared_error,  'squared-error'),
	(to_absolute_error, 'absolute-error')
]

for to_metric, metric_name in combinations :
	data = {
		'washburn' : None,
		'zrimec'   : None,
	}

	for model_name, report_path in [('washburn', wreport_path), ('zrimec', zreport_path)] :
		report = loader.load_json(report_path)
		report = report['eval']

		temp = {
			'y' : list(),
			'x' : list()
		}

		for ytrue, ypred in zip(report['ytrue'], report['ypred']) :
			# Entries may be wrapped in a single-element list; accept both forms,
			# as the other cells reading these reports do
			if isinstance(ytrue, list) : ytrue = ytrue[0]
			if isinstance(ypred, list) : ypred = ypred[0]

			ytrue = to_boxcox(ytrue)
			ypred = to_boxcox(ypred)
			value = to_metric(ytrue, ypred)

			# Assign the sample to the first threshold not below its true value;
			# samples above the last threshold are not plotted
			for threshold in thresholds_boxcox :
				if ytrue <= threshold :
					temp['y'].append(value)
					temp['x'].append(threshold)

					break

		data[model_name] = temp

	fig, ax = matplotlib.pyplot.subplots(figsize = (16, 10))
	fig.tight_layout()

	seaborn.lineplot(
		data      = pandas.DataFrame.from_dict(data['washburn']),
		x         = 'x',
		y         = 'y',
		ax        = ax,
		linewidth = 4,
		label     = 'Washburn'
	)

	seaborn.lineplot(
		data      = pandas.DataFrame.from_dict(data['zrimec']),
		x         = 'x',
		y         = 'y',
		ax        = ax,
		linewidth = 4,
		label     = 'Zrimec'
	)

	# NOTE(review) : x holds the thresholds_boxcox bin edges, yet the axis is
	# labelled 'TPM' - confirm the label is intended
	ax.set_xlabel('TPM')
	ax.set_ylabel(metric_name.replace('-', ' ').title())

	matplotlib.pyplot.legend(
		loc = 'upper left'
	)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-{}.png'.format(metric_name)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

# 7. Relevance of Window Occlusion

In [None]:
# Load original occlusion relevance data

# Resolve the report folder against ROOT rather than a machine-specific absolute
# path, so the cell also runs outside the original Windows checkout
folder_path     = os.path.join(ROOT, 'reports', 'regression-cnn')
wocclusion_path = os.path.join(folder_path,  'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
zocclusion_path = os.path.join(folder_path,  'zrimec-0-tf2150-f2-0250-77-tissue-mean-explode')

OCCLUSION_SIZE   = 10
OCCLUSION_STRIDE = 1
OCCLUSION_JSON   = 'table-occlusion-w{}-s{}.json'.format(OCCLUSION_SIZE, OCCLUSION_STRIDE)
OCCLUSION_CSV    = 'table-occlusion-w{}-s{}.csv'.format(OCCLUSION_SIZE, OCCLUSION_STRIDE)
RELEVANCE_TYPE   = 'R2'

wocclusion_json = loader.load_json(os.path.join(wocclusion_path, 'occlusion', OCCLUSION_JSON))
zocclusion_json = loader.load_json(os.path.join(zocclusion_path, 'occlusion', OCCLUSION_JSON))
wocclusion_csv  = loader.load_csv(os.path.join(wocclusion_path, 'occlusion', OCCLUSION_CSV))
zocclusion_csv  = loader.load_csv(os.path.join(zocclusion_path, 'occlusion', OCCLUSION_CSV))

# Both JSON tables are reduced to the raw metrics plus the window midpoint,
# using the same column names as the CSV tables
occlusion_drop   = ['mae-relevance', 'mse-relevance', 'r2-relevance', 'start', 'end']
occlusion_rename = {'mae' : 'MAE', 'mse' : 'MSE', 'r2' : 'R2', 'mid' : 'Midpoint'}

wocclusion_json = pandas.DataFrame.from_dict(wocclusion_json)
wocclusion_json = wocclusion_json.drop(columns = occlusion_drop)
wocclusion_json = wocclusion_json.rename(columns = occlusion_rename)

zocclusion_json = pandas.DataFrame.from_dict(zocclusion_json)
zocclusion_json = zocclusion_json.drop(columns = occlusion_drop)
zocclusion_json = zocclusion_json.rename(columns = occlusion_rename)

WASHBURN_BASELINE = dict()
ZRIMEC_BASELINE   = dict()

for metric in ['R2', 'MAE', 'MSE'] :
	# Values from the CSV tables
	wr = wocclusion_csv[metric]
	zr = zocclusion_csv[metric]

	# Values from the JSON tables
	wo = wocclusion_json[metric]
	zo = zocclusion_json[metric]

	# Mean of json / (1 - csv) over all windows
	# NOTE(review) : presumably inverts relevance = 1 - occluded / baseline to
	# recover the unoccluded score - confirm against the occlusion code
	ZRIMEC_BASELINE[metric]   = (zo / (1 - zr)).to_numpy().mean()
	WASHBURN_BASELINE[metric] = (wo / (1 - wr)).to_numpy().mean()

	print('Baseline {:3s} for Washburn : {:.5f}'.format(metric, WASHBURN_BASELINE[metric]))
	print('Baseline {:3s} for Zrimec   : {:.5f}'.format(metric, ZRIMEC_BASELINE[metric]))

print()

In [None]:
# Increase stride artificially
#
# Keep only the rows whose midpoint is a multiple of the new stride

OCCLUSION_STRIDE = 10

wocclusion_csv = wocclusion_csv[wocclusion_csv['Midpoint'].mod(OCCLUSION_STRIDE).eq(0)]
zocclusion_csv = zocclusion_csv[zocclusion_csv['Midpoint'].mod(OCCLUSION_STRIDE).eq(0)]

wocclusion_json = wocclusion_json[wocclusion_json['Midpoint'].mod(OCCLUSION_STRIDE).eq(0)]
zocclusion_json = zocclusion_json[zocclusion_json['Midpoint'].mod(OCCLUSION_STRIDE).eq(0)]

In [None]:
# Plot relevance for entire sequence
#
# Two figures : the relevance curves of both models with and without vertical
# marker lines at positions 1000, 1300 and 1650

LINEWIDTH = 2
ALPHA     = 0.8

matplotlib.rcParams.update({'font.size' : 36})
seaborn.set_theme(font_scale = 3)

for vline in [True, False] :
	fig, ax = matplotlib.pyplot.subplots(figsize = (16, 8))
	fig.tight_layout()

	# Washburn first, then Zrimec, so the colour cycle assigns the same colours
	for label, frame in [('Washburn', wocclusion_csv), ('Zrimec', zocclusion_csv)] :
		seaborn.lineplot(
			data      = frame,
			x         = 'Midpoint',
			y         = RELEVANCE_TYPE,
			linewidth = LINEWIDTH,
			label     = label,
			ax        = ax,
			alpha     = ALPHA
		)

	if vline :
		for position in [1000, 1300, 1650] :
			ax.axvline(position, ymin = 0, ymax = 1, color = 'k')

	ax.set_xlabel('Position')
	ax.set_ylabel('Relevance')

	ax.legend(loc = 'upper left')

	path = (
		'graph-occlusion-relevance-w{}-s{}-full-with-vline.png'
		if vline else
		'graph-occlusion-relevance-w{}-s{}-full.png'
	).format(OCCLUSION_SIZE, OCCLUSION_STRIDE)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, path),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

runtime.set_plot_theme()

In [None]:
# Setting x ticks

def set_xticks_zero_right (ticks, left, right) :
	"""
	Shift positions so that the right edge of a region maps to zero.

	:param ticks: position(s) to shift; anything supporting subtraction
	:param left: left edge of the region (accepted for a uniform signature, unused)
	:param right: right edge of the region; subtracted from ticks
	:return: ticks - right
	"""

	shifted = ticks - right

	return shifted
def set_xticks_zero_left  (ticks, left, right) :
	"""
	Shift positions so that the left boundary becomes zero.

	ticks : positions to shift (anything that supports subtraction with left)
	left  : left boundary that is mapped to zero
	right : right boundary; not used, accepted so the signature matches set_xticks_zero_right

	Returns ticks - left.
	"""

	shifted = ticks - left

	return shifted

In [None]:
# Plot relevance for each region separately

# Each region is (name, first position, one past the last position); the upper bounds are
# the cumulative sums of the region lengths
lengths = [1000, 300, 350, 500]
regions = [
	('Promoter',      0, sum(lengths[:1])),
	('5\'UTR',     1000, sum(lengths[:2])),
	('3\'UTR',     1300, sum(lengths[:3])),
	('Terminator', 1650, sum(lengths[:4])),
]

LINEWIDTH = 2
ALPHA     = 0.8
NROWS     = 2
NCOLS     = 2

matplotlib.rcParams.update({'font.size' : 48})
seaborn.set_theme(font_scale = 4)

# squeeze = False keeps ax two-dimensional for every NROWS / NCOLS combination, so the
# row / column indexing below also works for a single row or a single column
fig, ax = matplotlib.pyplot.subplots(nrows = NROWS, ncols = NCOLS, sharey = True, squeeze = False, figsize = (NCOLS * 16, NROWS * 10))
fig.tight_layout()

for index, region in enumerate(regions) :
	axis = ax[index // NCOLS, index % NCOLS]

	# marker : landmark named in the x label
	# shift  : helper that moves x = 0 onto the right or the left edge of the region
	if   region[0] in ['Promoter']   : marker, shift = 'TSS', set_xticks_zero_right
	elif region[0] in ['5\'UTR']     : marker, shift = 'CDS', set_xticks_zero_right
	elif region[0] in ['3\'UTR']     : marker, shift = 'TTS', set_xticks_zero_right
	elif region[0] in ['Terminator'] : marker, shift = 'TTS', set_xticks_zero_left
	else : raise ValueError('Unknown region : {}'.format(region[0]))

	for model_name, model_frame in [('Washburn', wocclusion_csv), ('Zrimec', zocclusion_csv)] :
		# Keep only the windows whose midpoint lies in [start, end) of the region
		subset = model_frame[(model_frame['Midpoint'] >= region[1]) & (model_frame['Midpoint'] < region[2])]

		# assign returns a new frame instead of writing into the filtered slice, so pandas
		# does not emit SettingWithCopyWarning and the source table is never modified
		subset = subset.assign(Midpoint = shift(subset['Midpoint'], region[1], region[2]))

		seaborn.lineplot(
			data      = subset,
			x         = 'Midpoint',
			y         = RELEVANCE_TYPE,
			linewidth = LINEWIDTH,
			label     = model_name,
			ax        = axis,
			alpha     = ALPHA
		)

	axis.set_title(region[0])

	axis.set_xlabel('Position from {}'.format(marker))
	axis.set_ylabel('Relevance')

	axis.legend(loc = 'upper center')

matplotlib.pyplot.subplots_adjust(
	left   = None,
	bottom = None,
	right  = None,
	top    = None,
	wspace = 0.05,
	hspace = 0.40
)

# NOTE : the 'seperate' spelling is part of the output file name and is kept on purpose
matplotlib.pyplot.savefig(
	os.path.join(OUT_DATA, 'graph-occlusion-relevance-w{}-s{}-seperate.png'.format(OCCLUSION_SIZE, OCCLUSION_STRIDE)),
	dpi         = 120,
	format      = 'png',
	bbox_inches = 'tight',
	pad_inches  = 0
)

# Restore the project plot theme after the temporary font scaling
runtime.set_plot_theme()

In [None]:
# Plot the RELEVANCE_TYPE column ('R2') of the json occlusion tables for entire sequence,
# together with each model's baseline value (saved once without and once with vertical lines)

RELEVANCE_TYPE = 'R2'
PLOT_TWINX     = False
LINEWIDTH      = 2
ALPHA          = 0.8

# Arguments shared by both model curves
line_kwargs = dict(x = 'Midpoint', y = RELEVANCE_TYPE, linewidth = LINEWIDTH, alpha = ALPHA)

for vline in [False, True] :
	fig, ax1 = matplotlib.pyplot.subplots(figsize = (16, 8))
	fig.tight_layout()

	seaborn.lineplot(data = wocclusion_json, label = 'Washburn', ax = ax1, **line_kwargs)

	# With PLOT_TWINX the Zrimec curve gets its own y axis, otherwise both models share ax1
	ax2 = ax1.twinx() if PLOT_TWINX else ax1

	seaborn.lineplot(data = zocclusion_json, label = 'Zrimec', ax = ax2, **line_kwargs)

	# Dashed horizontal lines at the baseline value of each model
	ax1.axhline(WASHBURN_BASELINE[RELEVANCE_TYPE], color = 'b', linestyle = '--', alpha = ALPHA)
	ax2.axhline(ZRIMEC_BASELINE[RELEVANCE_TYPE],   color = 'r', linestyle = '--', alpha = ALPHA)

	if vline :
		for boundary in [1000, 1300, 1650] :
			ax1.axvline(boundary, ymin = 0, ymax = 1, color = 'k')

	ax1.set_xlabel('Position')

	if PLOT_TWINX :
		ax2.set_ylabel('Zrimec '   + RELEVANCE_TYPE)
		ax1.set_ylabel('Washburn ' + RELEVANCE_TYPE)
	else :
		ax1.set_ylabel(RELEVANCE_TYPE)

	# File name encodes the metric, the occlusion window size, the stride and the line flag
	path = (
		'graph-occlusion-{}-w{}-s{}-full-with-vline.png' if vline else
		'graph-occlusion-{}-w{}-s{}-full.png'
	).format(RELEVANCE_TYPE.lower(), OCCLUSION_SIZE, OCCLUSION_STRIDE)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, path),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

In [None]:
# Plot the RELEVANCE_TYPE column of the json occlusion tables for each region separately

# Each region is (name, first position, one past the last position); the upper bounds are
# the cumulative sums of the region lengths
lengths = [1000, 300, 350, 500]
regions = [
	('Promoter',      0, sum(lengths[:1])),
	('5\'UTR',     1000, sum(lengths[:2])),
	('3\'UTR',     1300, sum(lengths[:3])),
	('Terminator', 1650, sum(lengths[:4])),
]

matplotlib.rcParams.update({'font.size' : 48})
seaborn.set_theme(font_scale = 4)

LINEWIDTH = 2
ALPHA     = 0.8
NROWS     = 2
NCOLS     = 2

# squeeze = False keeps ax two-dimensional for every NROWS / NCOLS combination, so the
# row / column indexing below also works for a single row or a single column
fig, ax = matplotlib.pyplot.subplots(nrows = NROWS, ncols = NCOLS, sharey = True, squeeze = False, figsize = (NCOLS * 16, NROWS * 10))
fig.tight_layout()

for index, region in enumerate(regions) :
	axis = ax[index // NCOLS, index % NCOLS]

	# marker : landmark named in the x label
	# shift  : helper that moves x = 0 onto the right or the left edge of the region
	if   region[0] in ['Promoter']   : marker, shift = 'TSS', set_xticks_zero_right
	elif region[0] in ['5\'UTR']     : marker, shift = 'CDS', set_xticks_zero_right
	elif region[0] in ['3\'UTR']     : marker, shift = 'TTS', set_xticks_zero_right
	elif region[0] in ['Terminator'] : marker, shift = 'TTS', set_xticks_zero_left
	else : raise ValueError('Unknown region : {}'.format(region[0]))

	for model_name, model_frame in [('Washburn', wocclusion_json), ('Zrimec', zocclusion_json)] :
		# Keep only the windows whose midpoint lies in [start, end) of the region
		subset = model_frame[(model_frame['Midpoint'] >= region[1]) & (model_frame['Midpoint'] < region[2])]

		# assign returns a new frame instead of writing into the filtered slice, so pandas
		# does not emit SettingWithCopyWarning and the source table is never modified
		subset = subset.assign(Midpoint = shift(subset['Midpoint'], region[1], region[2]))

		seaborn.lineplot(
			data      = subset,
			x         = 'Midpoint',
			y         = RELEVANCE_TYPE,
			linewidth = LINEWIDTH,
			label     = model_name,
			ax        = axis,
			alpha     = ALPHA
		)

	axis.set_title(region[0])

	axis.set_xlabel('Position from {}'.format(marker))
	axis.set_ylabel(RELEVANCE_TYPE)

	# Dashed horizontal lines at the baseline value of each model
	axis.axhline(WASHBURN_BASELINE[RELEVANCE_TYPE], color = 'b', linestyle = '--', alpha = ALPHA)
	axis.axhline(ZRIMEC_BASELINE[RELEVANCE_TYPE],   color = 'r', linestyle = '--', alpha = ALPHA)

	axis.legend(loc = 'center right')

matplotlib.pyplot.subplots_adjust(
	left   = None,
	bottom = None,
	right  = None,
	top    = None,
	wspace = 0.05,
	hspace = 0.40
)

# NOTE : the 'seperate' spelling is part of the output file name and is kept on purpose
matplotlib.pyplot.savefig(
	os.path.join(OUT_DATA, 'graph-occlusion-{}-w{}-s{}-seperate.png'.format(RELEVANCE_TYPE.lower(), OCCLUSION_SIZE, OCCLUSION_STRIDE)),
	dpi         = 120,
	format      = 'png',
	bbox_inches = 'tight',
	pad_inches  = 0
)

# Restore the project plot theme after the temporary font scaling
runtime.set_plot_theme()

In [None]:
# Plot relevance and r2 for entire sequence : csv tables in the top panel, json tables in
# the bottom panel. RELEVANCE_TYPE and PLOT_TWINX are not set here; the values assigned by
# an earlier cell are used for both panels.

LINEWIDTH = 2
ALPHA     = 0.8

matplotlib.rcParams.update({'font.size' : 36})
seaborn.set_theme(font_scale = 3)

# Arguments shared by all four curves
line_kwargs = dict(x = 'Midpoint', y = RELEVANCE_TYPE, linewidth = LINEWIDTH, alpha = ALPHA)

for vline in [True, False] :
	fig, ax = matplotlib.pyplot.subplots(nrows = 2, ncols = 1, sharex = True, figsize = (16, 16))
	fig.tight_layout()

	ax1, ax2 = ax[0], ax[1]

	# Top panel : csv tables, both models on the same axis
	for model_name, model_frame in [('Washburn', wocclusion_csv), ('Zrimec', zocclusion_csv)] :
		seaborn.lineplot(data = model_frame, label = model_name, ax = ax1, **line_kwargs)

	if vline :
		for boundary in [1000, 1300, 1650] :
			ax1.axvline(boundary, ymin = 0, ymax = 1, color = 'k')

		# Right-aligned padding spreads the four region names across the title line;
		# presumably tuned so each name sits above its segment - verify on the saved figure
		ax1.set_title('{:>18s} {:>22s} {:>7s} {:>16s}'.format('Promoter', '5\'UTR', '3\'UTR', 'Terminator'))

	ax1.set_xlabel('Position')
	ax1.set_ylabel('Relevance')

	ax1.legend(loc = 'upper left')

	# Bottom panel : json tables with the baseline value of each model
	seaborn.lineplot(data = wocclusion_json, label = 'Washburn', ax = ax2, **line_kwargs)

	# With PLOT_TWINX the Zrimec curve gets its own y axis, otherwise both models share ax2
	ax3 = ax2.twinx() if PLOT_TWINX else ax2

	seaborn.lineplot(data = zocclusion_json, label = 'Zrimec', ax = ax3, **line_kwargs)

	ax2.axhline(WASHBURN_BASELINE[RELEVANCE_TYPE], color = 'b', linestyle = '--', alpha = ALPHA)
	ax3.axhline(ZRIMEC_BASELINE[RELEVANCE_TYPE],   color = 'r', linestyle = '--', alpha = ALPHA)

	if vline :
		for boundary in [1000, 1300, 1650] :
			ax2.axvline(boundary, ymin = 0, ymax = 1, color = 'k')

	ax2.set_xlabel('Position')

	if PLOT_TWINX :
		ax3.set_ylabel('Zrimec '   + RELEVANCE_TYPE)
		ax2.set_ylabel('Washburn ' + RELEVANCE_TYPE)
	else :
		ax2.set_ylabel(RELEVANCE_TYPE)

	# File name encodes the occlusion window size, the stride and whether lines were drawn
	path = (
		'graph-occlusion-both-w{}-s{}-full-with-vline.png' if vline else
		'graph-occlusion-both-w{}-s{}-full.png'
	).format(OCCLUSION_SIZE, OCCLUSION_STRIDE)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, path),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

# Restore the project plot theme after the temporary font scaling
runtime.set_plot_theme()

# 8. Relevance of Region Occlusion

In [None]:
# Load regions occlusion data

def _baseline_region (baseline) :
	"""
	Build the extra 'None' record that carries a model's baseline metrics.

	baseline : mapping with the keys 'MAE', 'MSE' and 'R2'

	All relevance fields are 0.0 and all position fields are infinite, so the record
	sorts after every real region when the table is ordered by start position.
	"""

	return {
		'end'           : numpy.inf,
		'mae'           : baseline['MAE'],
		'mae-relevance' : 0.0,
		'mid'           : numpy.inf,
		'mse'           : baseline['MSE'],
		'mse-relevance' : 0.0,
		'r2'            : baseline['R2'],
		'r2-relevance'  : 0.0,
		'region'        : 'None',
		'start'         : numpy.inf
	}

wregions = loader.load_json(os.path.join(wocclusion_path, 'occlusion', 'table-occlusion-region.json'))
zregions = loader.load_json(os.path.join(zocclusion_path, 'occlusion', 'table-occlusion-region.json'))

# The loaded tables are extended in place with one baseline record per model
wregions.append(_baseline_region(WASHBURN_BASELINE))
zregions.append(_baseline_region(ZRIMEC_BASELINE))

wregions = pandas.DataFrame.from_dict(wregions)
zregions = pandas.DataFrame.from_dict(zregions)

# Delta : difference between the r2 of a record and the baseline r2 of the same model
wregions['Delta'] = wregions['r2'] - WASHBURN_BASELINE['R2']
zregions['Delta'] = zregions['r2'] - ZRIMEC_BASELINE['R2']

# Stack both models, tidy the column names and index the table by (occlusion, model)
regions = (
	pandas.concat([wregions.assign(Model = 'Washburn'), zregions.assign(Model = 'Zrimec')])
	.rename(columns = {
		'start'        : 'Start',
		'region'       : 'Occlusion',
		'r2'           : 'R2',
		'r2-relevance' : 'Relevance',
	})
	.sort_values(['Start', 'Model'])
	.set_index(['Occlusion', 'Model'])
)

regions = regions[['R2', 'Delta', 'Relevance']]

regions

# 9. Updating Group Barplots

In [None]:
# Define path

# Resolve the analysis folder from OUT (ROOT/output) instead of a machine-specific absolute
# path, so the cell also runs from the Linux checkout handled by the ROOT detection. With
# the default Windows ROOT this is the same folder as before.
folder_path = os.path.join(OUT, 'nbp00-analysis')

# tissue_metadata only ever holds the loaded table, never the file path
tissue_metadata = pandas.read_csv(os.path.join(folder_path, 'tissue-metadata.csv'))

In [None]:
# Plot barplots : number of rows of tissue_metadata per tissue (left) and per age (right)

group1 = 'Tissue'
group2 = 'Age'

matplotlib.rcParams.update({'font.size' : 42})
seaborn.set_theme(font_scale = 3.33)

# Distinct values of each grouping column and the number of rows holding each value
unique1 = tissue_metadata[group1].unique()
unique2 = tissue_metadata[group2].unique()
counts1 = [int((tissue_metadata[group1] == name).sum()) for name in unique1]
counts2 = [int((tissue_metadata[group2] == name).sum()) for name in unique2]

df1 = pandas.DataFrame({group1 : unique1, 'Count' : counts1})
df1 = df1.sort_values('Count', ascending = False)

df2 = pandas.DataFrame({group2 : unique2, 'Count' : counts2})
df2 = df2.sort_values('Count', ascending = False)

fig, axis = matplotlib.pyplot.subplots(ncols = 2, nrows = 1, sharey = True, figsize = (28, 11))
fig.tight_layout()

# Arguments shared by both barplots; every bar is coloured by its own category
bar_kwargs = dict(y = 'Count', width = 0.8, alpha = 0.9, dodge = False)

g1 = seaborn.barplot(data = df1, x = group1, hue = group1, ax = axis[0], **bar_kwargs)
g2 = seaborn.barplot(data = df2, x = group2, hue = group2, ax = axis[1], **bar_kwargs)

# NOTE(review): the [1:] slice drops the first handle / label pair returned by each axis -
# confirm that the first entry is really meant to be left out of the legend
for panel in axis :
	handles, labels = panel.get_legend_handles_labels()
	panel.legend(handles = handles[1:], labels = labels[1:])

# The y label is kept on the left panel only; the tick labels on x are blanked on both
g2.set(ylabel = None)
g1.set(xticklabels = [])
g2.set(xticklabels = [])

matplotlib.pyplot.subplots_adjust(
	left   = None,
	bottom = None,
	right  = None,
	top    = None,
	wspace = 0.05,
	hspace = None
)

matplotlib.pyplot.savefig(
	os.path.join(OUT_DATA, 'class-distribution-tissue-age.png'),
	dpi         = 120,
	format      = 'png',
	bbox_inches = 'tight',
	pad_inches  = 0
)

# Restore the project plot theme after the temporary font scaling
runtime.set_plot_theme()

# 10. Updating Region Length Histograms

In [None]:
# Define path

# Resolve the analysis folder from OUT (ROOT/output) instead of a machine-specific absolute
# path, so the cell also runs from the Linux checkout handled by the ROOT detection. With
# the default Windows ROOT this is the same folder as before.
folder_path = os.path.join(OUT, 'nbp00-analysis')

# gene_annotation only ever holds the loaded table, never the file path
gene_annotation = pandas.read_csv(os.path.join(folder_path, 'gene-annotation.csv'))

In [None]:
# Group regions : one output row per (transcript, region type) with the span, the total
# length and the number of annotation rows of that type

regions = ['mRNA', 'UTR5', 'CDS', 'UTR3']
data    = gene_annotation
groupby = 'Transcript'

# Setting regions to None above selects every type present in the annotation
if regions is None :
	regions = data['Type'].unique().tolist()

entries = list()

for group, transcript_rows in data.groupby(groupby) :
	# Group-level fields are taken from the first row and read once per group
	seq        = transcript_rows['Seq'].iloc[0]
	strand     = transcript_rows['Strand'].iloc[0]
	gene       = transcript_rows['Gene'].iloc[0]
	transcript = transcript_rows['Transcript'].iloc[0]
	exon       = transcript_rows['Exon'].iloc[0]
	parent     = transcript_rows['Parent'].iloc[0]

	for region in regions :
		# A separate name is used for the matching rows so that data (the frame being
		# grouped) is not rebound while the loop is running
		matches = transcript_rows.loc[transcript_rows['Type'].isin([region])]

		# Missing region types are reported with Start = End = -1 and Length = 0
		if len(matches) == 0 :
			start  = -1
			end    = -1
			length =  0
		else :
			start  = matches['Start'].min()
			end    = matches['End'].max()
			length = matches['Length'].sum()

		entries.append([seq, strand, region, gene, transcript, exon, parent, start, end, length, len(matches)])

dataframe = pandas.DataFrame(
	data    = entries,
	columns = ['Seq', 'Strand', 'Type', 'Gene', 'Transcript', 'Exon', 'Parent', 'Start', 'End', 'Length', 'Regions']
)

In [None]:
# Plot region length histogram for the 5'UTR (left) and the 3'UTR (right)

# Length threshold marked on each histogram; only the UTR5 and UTR3 entries are read here
vlines = {
	'mRNA' : 0,
	'UTR5' : 300,
	'CDS'  : 0,
	'UTR3' : 350
}

FONTSIZE  = 42
LINEWIDTH = 4
ALPHA     = 0.9
KDE       = False
BINS      = 60
LOG       = True
VLINE     = True

# Regions with a length of zero are left out of the histograms
dataframe_utr5 = dataframe[dataframe['Type'] == 'UTR5']
dataframe_utr5 = dataframe_utr5[dataframe_utr5['Length'] > 0]

dataframe_utr3 = dataframe[dataframe['Type'] == 'UTR3']
dataframe_utr3 = dataframe_utr3[dataframe_utr3['Length'] > 0]

matplotlib.rcParams.update({'font.size' : 36})
seaborn.set_theme(font_scale = 3)

fig, axis = matplotlib.pyplot.subplots(ncols = 2, nrows = 1, sharey = True, figsize = (26, 10))
fig.tight_layout()

# Both histograms are drawn before any marker is placed : the panels share the y axis
# and the marker text is positioned from the y limit
for panel, region_frame in [(axis[0], dataframe_utr5), (axis[1], dataframe_utr3)] :
	seaborn.histplot(
		x         = 'Length',
		data      = region_frame,
		ax        = panel,
		log_scale = LOG,
		alpha     = ALPHA,
		kde       = KDE,
		bins      = BINS
	)

if VLINE :
	for key, panel, region_frame in [('UTR5', axis[0], dataframe_utr5), ('UTR3', axis[1], dataframe_utr3)] :
		if vlines[key] > 0 :
			# Percentile rank of the threshold among the plotted lengths
			percentile = scipy.stats.percentileofscore(region_frame['Length'], vlines[key], nan_policy = 'omit')

			# Text sits slightly to the right of the line at 90 % of the upper y limit
			x = 1.08 * vlines[key]
			y = 0.90 * panel.get_ylim()[-1]

			panel.axvline(vlines[key], color = '#F0665E', linewidth = LINEWIDTH, alpha = ALPHA)
			panel.text(x, y, f'{percentile:.1f}%', color = '#F0665E', fontsize = FONTSIZE, alpha = ALPHA)

axis[0].set_title('5\'UTR')
axis[1].set_title('3\'UTR')
axis[0].set_xlabel('Length')
axis[1].set_xlabel('Length')
axis[0].set_ylabel('Count')

matplotlib.pyplot.subplots_adjust(
	left   = None,
	bottom = None,
	right  = None,
	top    = None,
	wspace = 0.05,
	hspace = None
)

matplotlib.pyplot.savefig(
	os.path.join(OUT_DATA, 'length-distribution.png'),
	dpi         = 120,
	format      = 'png',
	bbox_inches = 'tight',
	pad_inches  = 0
)

# Restore the project plot theme after the temporary font scaling
runtime.set_plot_theme()

# 11. Tissue-Specific TPM Distribution

In [None]:
# Load the filter 2 data and split its columns into kept and dropped transcripts

anndata = loader.load_h5ad(filename = os.path.join(RES_NBP04, 'filter{}'.format(2), 'arabidopsis-r36.h5ad'))
split   = loader.load_json(filename = os.path.join(RES_NBP01, 'filter{}'.format(2), 'filter.json'))

keep_transcript = split['data']['keep_transcript']
drop_transcript = split['data']['drop_transcript']

# Boolean masks over anndata.var.index, one for each transcript list
keep_mask = anndata.var.index.isin(keep_transcript)
drop_mask = anndata.var.index.isin(drop_transcript)

# Column-wise selections of the loaded object (presumably AnnData views - verify against loader)
keep = anndata[:, keep_mask]
drop = anndata[:, drop_mask]

# Tissue name -> number of observations, ordered as returned by value_counts
tissues = keep.obs['Tissue'].value_counts().to_dict()

In [None]:
# Plot, for every tissue, the distribution of the per-transcript mean of the 'boxcox1p'
# layer, split into kept and dropped transcripts

NROWS = 3
NCOLS = 2
ALPHA = 0.8

matplotlib.rcParams.update({'font.size' : 48})
seaborn.set_theme(font_scale = 4)

# squeeze = False keeps ax two-dimensional for every NROWS / NCOLS combination
fig, ax = matplotlib.pyplot.subplots(
	nrows   = NROWS,
	ncols   = NCOLS,
	sharex  = True,
	sharey  = True,
	squeeze = False,
	figsize = (16 * NCOLS, 10 * NROWS)
)

fig.tight_layout()

for index, (k, n) in enumerate(tissues.items()) :
	row = index // NCOLS
	col = index  % NCOLS

	axis = ax[row, col]

	# Mean over the observations of tissue k; flattened to a plain 1-D array so that the
	# concatenation below also works if the layer mean comes back as a 2-D matrix
	keep_k = numpy.asarray(keep[keep.obs['Tissue'] == k, :].layers['boxcox1p'].mean(axis = 0)).ravel()
	drop_k = numpy.asarray(drop[drop.obs['Tissue'] == k, :].layers['boxcox1p'].mean(axis = 0)).ravel()

	keep_l = numpy.array(['Keep' for _ in keep_k])
	drop_l = numpy.array(['Drop' for _ in drop_k])

	df = pandas.DataFrame.from_dict({
		'TPM'   : numpy.concatenate((keep_k, drop_k)),
		'Label' : numpy.concatenate((keep_l, drop_l))
	})

	seaborn.histplot(
		data  = df,
		x     = 'TPM',
		ax    = axis,
		hue   = 'Label',
		alpha = ALPHA
	)

	axis.set_title(k.title())

# Hide every panel that received no tissue, instead of assuming that exactly the last
# panel of the grid is unused
for index in range(len(tissues), NROWS * NCOLS) :
	ax[index // NCOLS, index % NCOLS].axis('off')

matplotlib.pyplot.subplots_adjust(
	left   = None,
	bottom = None,
	right  = None,
	top    = None,
	wspace = 0.05,
	hspace = None
)

matplotlib.pyplot.savefig(
	os.path.join(OUT_DATA, 'tpm-distribution.png'),
	dpi         = 120,
	format      = 'png',
	bbox_inches = 'tight',
	pad_inches  = 0
)

# Restore the project plot theme after the temporary font scaling
runtime.set_plot_theme()