In [None]:
# Libraries

import matplotlib
import matplotlib.pyplot
import numpy
import os
import pandas
import platform
import scipy
import scipy.stats
import seaborn
import shutil
import sys

In [None]:
# Ensure source path

# Start at the current working directory and climb towards the filesystem root until a
# path ending in 'upolanc-thesis' is found.
ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	parent = os.path.abspath(os.path.join(ROOT, os.pardir))

	# The search is exhausted when the parent is too short to end with the project name,
	# or when the path no longer changes (the parent of a filesystem root is the root
	# itself, which would otherwise loop forever on roots longer than the project name).
	if parent == ROOT or len(parent) < len('upolanc-thesis') :
		if   platform.system() == 'Linux'   : ROOT = '/d/hpc/projects/FRI/up4472/upolanc-thesis'
		elif platform.system() == 'Windows' : ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else : raise ValueError(f'No default project directory is defined for platform : {platform.system()}')

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		print()

		break

	ROOT = parent

# Make the project importable from its root and resolve relative paths against it
if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [None]:
# Code

from source.python              import runtime
from source.python.cnn          import cnn_plot
from source.python.data.feature import feature_processing
from source.python.io           import loader

# Apply the project-wide display settings defined in source.python.runtime
# NOTE(review): presumably numpy / pandas print options and the plotting theme - confirm in runtime.py
runtime.set_numpy_format()
runtime.set_pandas_format()
runtime.set_plot_theme()

# 1. Setup

In [None]:
# Check for args

# Report how the file is being executed; the presence of `get_ipython` in the
# namespace is taken to mean that the code runs as a notebook.
if __name__ == '__main__' :
	if 'get_ipython' in dir() : print('Running as .ipynb')
	else                      : print('Running as .py')

	print()

In [None]:
# Setup some directory paths

# Base folders, all anchored at the project root
CWD = ROOT
OUT = os.path.join(CWD, 'output')
RES = os.path.join(CWD, 'resources')

# Folder written by this notebook
OUT_DATA = os.path.join(OUT, 'nbp17-plots')

# Folders read by this notebook (they live under the shared output folder)
RES_NBP01, RES_NBP02, RES_NBP04 = (
	os.path.join(OUT, folder)
	for folder in ['nbp01-filter', 'nbp02-anndata', 'nbp04-feature']
)

# Always start from an empty output folder : any previous content of OUT_DATA is deleted
shutil.rmtree(OUT_DATA, ignore_errors = True)
os.makedirs(OUT_DATA, exist_ok = True)

print(
	f'     Root Directory : {CWD}',
	f'   Output Directory : {OUT_DATA}',
	f' Resource Directory : {RES_NBP01}',
	f' Resource Directory : {RES_NBP02}',
	f' Resource Directory : {RES_NBP04}',
	'',
	sep = '\n'
)

# 2. Transcript RSD Distribution

In [None]:
# Compute RSD for every filter

def _append_rsd_rows (result, matrix, keep_mask, drop_mask, target, group) :
	"""
	Append one summary row per transcript selection ('all', 'keep', 'drop') to <result>.

	The relative standard deviation (std / mean) is computed along axis 0 of the
	'boxcox1p' layer, giving one value per column of the selected matrix.

	:param result    : dict of column name -> list, extended in place
	:param matrix    : annotated matrix (or a row subset of it) exposing .layers['boxcox1p']
	:param keep_mask : boolean column mask selecting the 'keep' transcripts
	:param drop_mask : boolean column mask selecting the 'drop' transcripts
	:param target    : value stored in the 'Target' column
	:param group     : value stored in the 'Group' column
	"""

	for selection, mask in [('all', None), ('keep', keep_mask), ('drop', drop_mask)] :
		data = matrix if mask is None else matrix[:, mask].copy()
		data = data.layers['boxcox1p']

		mean = data.mean(axis = 0)
		std  = data.std(axis = 0)
		rsd  = std / mean

		result['Data'     ].append(selection)
		result['Target'   ].append(target)
		result['Group'    ].append(group)
		result['RSD'      ].append(rsd)
		result['Median'   ].append(numpy.median(rsd))
		result['Mean'     ].append(numpy.mean(rsd))
		result['Min'      ].append(numpy.min(rsd))
		result['Max'      ].append(numpy.max(rsd))
		result['Samples'  ].append(numpy.size(data, 0))
		result['RSD < 1.0'].append(numpy.sum(rsd < 1.0) / len(rsd))
		result['RSD < 0.5'].append(numpy.sum(rsd < 0.5) / len(rsd))

dataframes = dict()

for fid in [1, 2, 3, 4, 5, 6] :
	anndata = loader.load_h5ad(filename = os.path.join(RES_NBP04, 'filter{}'.format(fid), 'arabidopsis-r36.h5ad'))
	split   = loader.load_json(filename = os.path.join(RES_NBP01, 'filter{}'.format(fid), 'filter.json'))

	keep_transcript = split['data']['keep_transcript']
	drop_transcript = split['data']['drop_transcript']

	# The column masks depend only on the filter, so they are computed once and reused
	# for every row subset (row subsets keep the same columns).
	keep_mask = anndata.var.index.isin(keep_transcript)
	drop_mask = anndata.var.index.isin(drop_transcript)

	# Column order defines the layout of the resulting table
	result = {
		column : list()
		for column in [
			'Data', 'Group', 'Target', 'Median', 'Mean', 'Min', 'Max',
			'Samples', 'RSD', 'RSD < 1.0', 'RSD < 0.5'
		]
	}

	# Statistics over all rows
	_append_rsd_rows(result, anndata, keep_mask, drop_mask, target = 'global-mean', group = '')

	# Statistics per value of the 'Tissue' column, then per value of the 'Group' column
	for target, column in [('tissue-mean', 'Tissue'), ('group-mean', 'Group')] :
		for group in anndata.obs[column].unique() :
			matrix = anndata[anndata.obs[column] == group, :].copy()

			_append_rsd_rows(result, matrix, keep_mask, drop_mask, target = target, group = group)

	# NOTE(review): the 'RSD' column holds whole arrays, so the csv stores their text form
	dataframes[fid] = pandas.DataFrame.from_dict(result)
	dataframes[fid].to_csv(os.path.join(OUT_DATA, 'table-rsd-f{}.csv'.format(fid)))

In [None]:
# Display RSD

# (target, group) pairs to draw; one curve each
search_tissue = [
	('tissue-mean', item)
	for item in ['seedling', 'seed', 'leaf', 'root', 'flower']
]

search_group = [
	('group-mean', item)
	for item in ['young_seedling', 'mature_seed', 'mature_leaf', 'mature_root', 'mature_flower']
]

search = [('global-mean', None)] + search_tissue + search_group

# One line color per target type
color = {
	'global-mean' : 'r',
	'tissue-mean' : 'g',
	'group-mean'  : 'b'
}

# RSD thresholds shown on the horizontal axis
thresholds = [threshold for threshold in numpy.arange(0.00, 2.00, 0.05)]

for fid in [1, 2, 3, 4, 5, 6] :
	fig, ax = matplotlib.pyplot.subplots(figsize = (16, 10))
	fig.tight_layout()

	# Only the rows computed over all transcripts are plotted
	table = dataframes[fid]
	table = table[table['Data'] == 'all']

	for target, group in search :
		rows = table

		if target is not None : rows = rows[rows['Target'] == target]
		if group  is not None : rows = rows[rows['Group']  == group]

		rsd = rows['RSD'].iloc[0]

		# Percentage of RSD values that fall below each threshold
		percentage = [
			numpy.sum(rsd < threshold) / len(rsd) * 100.0
			for threshold in thresholds
		]

		label = target if target is not None else ''

		if group is not None :
			label = label + '-' + group

		seaborn.lineplot(
			x         = thresholds,
			y         = percentage,
			label     = label,
			ax        = ax,
			alpha     = 0.9,
			color     = color[target],
			linewidth = 4
		)

	matplotlib.pyplot.legend(
		loc  = 'lower right',
		prop = None
	)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-rsd-f{}.png'.format(fid)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

# 3. Variance vs. TPM

In [None]:
# Compute variance

# Load the second filter and keep only the transcripts (columns) marked as 'keep'
anndata = loader.load_h5ad(filename = os.path.join(RES_NBP04, 'filter2', 'arabidopsis-r36.h5ad'))
filters = loader.load_json(filename = os.path.join(RES_NBP01, 'filter2', 'filter.json'))
anndata = anndata[:, anndata.var.index.isin(filters['data']['keep_transcript'])]

# dictionary[group][transcript] = {'variance' : ..., 'stdev' : ...}
dictionary = dict()
supergroup = 'Tissue'
layer      = 'boxcox1p'

for group in anndata.obs[supergroup].unique() :
	subdata = anndata[anndata.obs[supergroup].isin([group]), :]
	values  = subdata.layers[layer]

	# Reduce over the rows (axis 0) to get one value per transcript (column). Reducing
	# over axis 1 yields one value per row, which does not line up with the transcript
	# index and gets silently truncated by zip().
	transcript = subdata.var.index.tolist()
	variance   = numpy.ravel(values.var(axis = 0)).tolist()
	stdev      = numpy.ravel(values.std(axis = 0)).tolist()

	if len(variance) != len(transcript) :
		raise ValueError(f'Expected one variance per transcript, got {len(variance)} for {len(transcript)} transcripts')

	dictionary.setdefault(group, dict())

	# setdefault keeps the first entry if a transcript name repeats
	for k, v, s in zip(transcript, variance, stdev) :
		dictionary[group].setdefault(k, {'variance' : v, 'stdev' : s})

In [None]:
# Plot variance

# Reports are resolved relative to the project root instead of one machine's absolute path
folder_path = os.path.join(ROOT, 'reports', 'regression-cnn')
models_path = os.path.join(folder_path, 'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
report_path = os.path.join(models_path, 'report_keep.json')

# The report and the scatter data are the same for both figures, so build them once
report = loader.load_json(report_path)
report = report['eval']

scatterdict = {
	'ypred' : list(),
	'ytrue' : list(),
	'value' : list(),
	'group' : list()
}

for key, ypred, ytrue in zip(report['keys'], report['ypred'], report['ytrue']) :
	if isinstance(ypred, list) : ypred = ypred[0]
	if isinstance(ytrue, list) : ytrue = ytrue[0]

	# Keys have the form '<group>?<transcript>'
	parts      = key.split('?')
	group      = parts[0]
	transcript = parts[1]

	# Skip entries without a stored variance
	if group      not in dictionary        : continue
	if transcript not in dictionary[group] : continue

	scatterdict['ytrue'].append(ytrue)
	scatterdict['ypred'].append(ypred)
	scatterdict['value'].append(dictionary[group][transcript]['variance'])
	scatterdict['group'].append(group)

# One figure per horizontal axis : stored variance against predicted, then against true value
for xtarget in ['ypred', 'ytrue'] :
	fig, ax = matplotlib.pyplot.subplots(figsize = (16, 10))
	fig.tight_layout()

	seaborn.scatterplot(
		data  = scatterdict,
		x     = xtarget,
		y     = 'value',
		hue   = 'group',
		ax    = ax,
		s     = 300,
		alpha = 0.9
	)

	ax.set_xlabel(None)
	ax.set_ylabel(None)
	ax.legend(markerscale = 2)

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-{}.png'.format(xtarget)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

# 4. Prediction Error

In [None]:
# Plot prediction error

# Reports are resolved relative to the project root instead of one machine's absolute path
folder_path = os.path.join(ROOT, 'reports', 'regression-cnn')
wmodels_path = os.path.join(folder_path,  'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
zmodels_path = os.path.join(folder_path,  'zrimec-0-tf2150-f2-0250-77-tissue-mean-explode')
wreport_path = os.path.join(wmodels_path, 'report_keep.json')
zreport_path = os.path.join(zmodels_path, 'report_keep.json')

# NOTE : both calls change global plotting state and therefore affect later cells too
matplotlib.rcParams.update({'font.size' : 48})
seaborn.set_theme(font_scale = 4)

for name, report_path in [('washburn', wreport_path), ('zrimec', zreport_path)] :
	report = loader.load_json(report_path)
	report = report['eval']

	# Keys have the form '<group>?<...>'; sorting the groups keeps the panel order
	# identical between runs (the iteration order of a set of strings is not stable).
	keys = sorted(set(x.split('?')[0] for x in report['keys']))
	vals = {key : list() for key in keys}

	# Signed prediction error per group
	for key, ypred, ytrue in zip(report['keys'], report['ypred'], report['ytrue']) :
		if isinstance(ypred, list) : ypred = ypred[0]
		if isinstance(ytrue, list) : ytrue = ytrue[0]

		vals[key.split('?')[0]].append(ypred - ytrue)

	n, nrows, ncols = cnn_plot.compute_gridsize(
		n = len(keys)
	)

	# squeeze = False always returns a 2D array of axes, so the [row, column] indexing
	# below also works for grids with a single row or a single column.
	fig, ax = matplotlib.pyplot.subplots(nrows, ncols, sharex = True, sharey = True, figsize = (16 * ncols, 10 * nrows), squeeze = False)
	fig.tight_layout()

	# Smallest error over all groups (never above zero); anchors the text annotations
	minval = min([0] + [numpy.min(vals[key]) for key in keys])

	gmin = minval - 0.1
	gmax = 290 # NOTE(review): hand-picked vertical text position in axis units - adjust if the counts change

	for index, key in enumerate(keys) :
		axis = ax[index // ncols, index % ncols]

		seaborn.histplot(
			x     = vals[key],
			ax    = axis,
			alpha = 0.9
		)

		# Reference line at zero error
		axis.axvline(x = 0, color = 'r', linewidth = 4)

		# Group title followed by the mean and the median of the error
		axis.text(gmin, 1.00 * gmax, key.title())
		axis.text(gmin, 0.89 * gmax, r'$\bar{x}$')
		axis.text(gmin, 0.78 * gmax, r'$\tilde{x}$')

		axis.text(gmin + 0.38, 0.89 * gmax, '=')
		axis.text(gmin + 0.38, 0.78 * gmax, '=')

		axis.text(gmin + 0.65, 0.89 * gmax, '{: .3f}'.format(numpy.mean(vals[key])))
		axis.text(gmin + 0.65, 0.78 * gmax, '{: .3f}'.format(numpy.median(vals[key])))

		axis.set_xlabel(None)
		axis.set_ylabel(None)

	# Hide the unused cells of the grid
	for index in range(n, nrows * ncols) :
		ax[index // ncols, index % ncols].axis('off')

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-{}-prediction-error.png'.format(name)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

# 5. Actual vs. Predicted

In [None]:
# Plot actual vs. predicted

# Reports are resolved relative to the project root instead of one machine's absolute path
folder_path = os.path.join(ROOT, 'reports', 'regression-cnn')
wmodels_path = os.path.join(folder_path,  'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
zmodels_path = os.path.join(folder_path,  'zrimec-0-tf2150-f2-0250-77-tissue-mean-explode')
wreport_path = os.path.join(wmodels_path, 'report_keep.json')
zreport_path = os.path.join(zmodels_path, 'report_keep.json')

# NOTE : both calls change global plotting state and therefore affect later cells too
matplotlib.rcParams.update({'font.size' : 36})
seaborn.set_theme(font_scale = 3)

for name, report_path in [('washburn', wreport_path), ('zrimec', zreport_path)] :
	report = loader.load_json(report_path)
	report = report['eval']

	# Keys have the form '<group>?<...>'; sorting the groups keeps the panel order
	# identical between runs (the iteration order of a set of strings is not stable).
	keys   = sorted(set(x.split('?')[0] for x in report['keys']))
	ypreds = {key : list() for key in keys}
	ytrues = {key : list() for key in keys}

	for key, ypred, ytrue in zip(report['keys'], report['ypred'], report['ytrue']) :
		if isinstance(ypred, list) : ypred = ypred[0]
		if isinstance(ytrue, list) : ytrue = ytrue[0]

		key = key.split('?')[0]

		ypreds[key].append(ypred)
		ytrues[key].append(ytrue)

	n, nrows, ncols = cnn_plot.compute_gridsize(
		n = len(keys)
	)

	# squeeze = False always returns a 2D array of axes, so the [row, column] indexing
	# below also works for grids with a single row or a single column.
	fig, ax = matplotlib.pyplot.subplots(nrows, ncols, sharex = True, sharey = True, figsize = (10 * ncols, 10 * nrows), squeeze = False)
	fig.tight_layout()

	for index, key in enumerate(keys) :
		axis = ax[index // ncols, index % ncols]

		x = numpy.array(ypreds[key])
		y = numpy.array(ytrues[key])

		seaborn.scatterplot(
			x     = x,
			y     = y,
			ax    = axis,
			alpha = 0.9
		)

		# Least-squares line of the true values against the predicted values
		res = scipy.stats.linregress(x, y)

		axis.plot(x, res.intercept + res.slope * x,
			color     = 'r',
			linewidth = 4
		)

		# Use the same range on both axes so the panel is square
		xmin, xmax = axis.get_xlim()
		ymin, ymax = axis.get_ylim()

		gmin = min(xmin, ymin)
		gmax = max(xmax, ymax)

		axis.set_xlim([gmin, gmax])
		axis.set_ylim([gmin, gmax])
		axis.set_aspect('equal')

		offset = 0.05

		# Group title followed by the slope (k) and the correlation coefficient (r) of the fit
		axis.text(0.0 * gmax - offset, 0.92 * gmax, key.title())
		axis.text(0.0 * gmax - offset, 0.84 * gmax, 'k')
		axis.text(0.0 * gmax - offset, 0.76 * gmax, 'r')

		axis.text(0.05 * gmax - offset, 0.84 * gmax, '=')
		axis.text(0.05 * gmax - offset, 0.76 * gmax, '=')

		axis.text(0.10 * gmax - offset, 0.84 * gmax, '{: .3f}'.format(res.slope))
		axis.text(0.10 * gmax - offset, 0.76 * gmax, '{: .3f}'.format(res.rvalue))

		axis.set_xlabel(None)
		axis.set_ylabel(None)

	# Hide the unused cells of the grid
	for index in range(n, nrows * ncols) :
		ax[index // ncols, index % ncols].axis('off')

	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-{}-prediction-linefit.png'.format(name)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)

# 6. Error vs. TPM

In [None]:
# Define

# Reports are resolved relative to the project root instead of one machine's absolute path
folder_path = os.path.join(ROOT, 'reports', 'regression-cnn')
wmodels_path = os.path.join(folder_path,  'washburn-0-tf2150-f2-0250-77-tissue-mean-explode')
zmodels_path = os.path.join(folder_path,  'zrimec-0-tf2150-f2-0250-77-tissue-mean-explode')
wreport_path = os.path.join(wmodels_path, 'report_keep.json')
zreport_path = os.path.join(zmodels_path, 'report_keep.json')

# Placeholder for the per-model plot data
data = {
	'washburn' : None,
	'zrimec'   : None,
}

# Bin edges : each true value is assigned to the first threshold that is >= the value
thresholds_boxcox = numpy.arange(0, 5, 0.25)
thresholds_tpm    = [0, 10, 50, 100, 250, 500, 1_000, 5_000, 10_000, 50_000, 250_000, 1_000_000]

# Value converters
# NOTE(review): -0.1455 is presumably the lambda the 'boxcox1p' layer was built with - confirm
to_tpm     = lambda x : feature_processing.boxcox1p_inv(x, -0.1455)
to_boxcox  = lambda x : x

# Error metrics
to_absolute_error = lambda ytrue, ypred : abs(ytrue - ypred)
to_squared_error  = lambda ytrue, ypred : (ytrue - ypred) ** 2

In [None]:
# Plot error vs. tpm

combinations = [
	(to_squared_error,  'squared-error'),
	(to_absolute_error, 'absolute-error')
]

# The reports do not depend on the metric, so each one is loaded once
reports = {
	model : loader.load_json(report_path)['eval']
	for model, report_path in [('washburn', wreport_path), ('zrimec', zreport_path)]
}

for to_metric, name in combinations :
	data = {
		'washburn' : None,
		'zrimec'   : None,
	}

	# The model is bound to its own variable so that <name> keeps the metric name,
	# which is what the saved figure is named after.
	for model, report in reports.items() :
		temp = {
			'y' : list(),
			'x' : list()
		}

		for ytrue, ypred in zip(report['ytrue'], report['ypred']) :
			if isinstance(ytrue, list) : ytrue = ytrue[0]
			if isinstance(ypred, list) : ypred = ypred[0]

			ytrue = to_boxcox(ytrue)
			ypred = to_boxcox(ypred)
			value = to_metric(ytrue, ypred)

			# Assign the error to the first threshold that is >= the true value
			# NOTE : values above the largest threshold are not assigned to any bin
			for threshold in thresholds_boxcox :
				if ytrue <= threshold :
					temp['y'].append(value)
					temp['x'].append(threshold)

					break

		data[model] = temp

	fig, ax = matplotlib.pyplot.subplots(figsize = (16, 10))
	fig.tight_layout()

	seaborn.lineplot(
		data      = pandas.DataFrame.from_dict(data['washburn']),
		x         = 'x',
		y         = 'y',
		ax        = ax,
		linewidth = 4,
		label     = 'Washburn'
	)

	seaborn.lineplot(
		data      = pandas.DataFrame.from_dict(data['zrimec']),
		x         = 'x',
		y         = 'y',
		ax        = ax,
		linewidth = 4,
		label     = 'Zrimec'
	)

	ax.set_ylabel(None)
	ax.set_xlabel(None)

	matplotlib.pyplot.legend(
		loc = 'upper left'
	)

	# One file per metric : 'graph-squared-error.png' and 'graph-absolute-error.png'
	matplotlib.pyplot.savefig(
		os.path.join(OUT_DATA, 'graph-{}.png'.format(name)),
		dpi         = 120,
		format      = 'png',
		bbox_inches = 'tight',
		pad_inches  = 0
	)