In [None]:
# Libraries

import matplotlib
import numpy
import os
import pandas
import platform
import seaborn
import shutil
import sys
import warnings

In [None]:
# Ensure source path
#
# Climb from the current working directory until a directory whose path ends with 'upolanc-thesis'
# is found, then make that directory importable (sys.path) and the working directory.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	parent = os.path.abspath(os.path.join(ROOT, os.pardir))

	# Give up once the filesystem root is reached (the parent of a root is the root itself, so a root
	# that is not shorter than the project name, e.g. a UNC share, would otherwise loop forever), or
	# once the path is too short to possibly end with the project name.
	if parent == ROOT or len(parent) < len('upolanc-thesis') :
		if   platform.system() == 'Linux'   : ROOT = '/d/hpc/projects/FRI/up4472/upolanc-thesis'
		elif platform.system() == 'Windows' : ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else : raise ValueError(f'Could not find the project directory and no default is known for platform : {platform.system()}')

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		print()

		break

	ROOT = parent

if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [None]:
# Code

from source.python               import runtime
from source.python.data.analysis import analysis_statistics
from source.python.data.feature  import feature_anndata
from source.python.io            import loader
from source.python.io            import writer

# Apply the shared defaults through the helpers of source.python.runtime
# NOTE(review): presumably these set the numpy print format, the pandas display format and the plot theme - confirm in runtime
runtime.set_numpy_format()
runtime.set_pandas_format()
runtime.set_plot_theme()

# 1. Setup

In [None]:
# Resolve the directories used by this notebook for the selected filter and start from an empty output folder

FILTER_ID = 2
SUBFOLDER = f'filter{FILTER_ID}'

CWD = ROOT
OUT = os.path.join(CWD, 'output')
RES = os.path.join(CWD, 'resources')

OUT_DATA  = os.path.join(OUT, 'nbp02-anndata', SUBFOLDER)
OUT_PLOT  = os.path.join(OUT_DATA, 'plot')
RES_NBP01 = os.path.join(OUT, 'nbp01-filter', SUBFOLDER)

# Remove whatever a previous run left behind (OUT_PLOT is nested inside OUT_DATA, so it is removed as well)
shutil.rmtree(OUT_DATA, ignore_errors = True)

for directory in (OUT_DATA, OUT_PLOT) :
	os.makedirs(directory, exist_ok = True)

# Both OUT_DATA and OUT_PLOT are reported under the 'Output Directory' label
for label, directory in (
	('     Root Directory', CWD),
	('   Output Directory', OUT_DATA),
	('   Output Directory', OUT_PLOT),
	(' Resource Directory', RES_NBP01)
) :
	print(f'{label} : {directory}')

print()

In [None]:
# Load the metadata table, the tpm table and the filter description from the nbp01-filter output folder

tissue_metadata, tissue_tpm = (
	loader.load_csv(filename = os.path.join(RES_NBP01, basename))
	for basename in ('tissue-metadata.csv', 'tissue-tpm.csv')
)

filter_dict = loader.load_json(filename = os.path.join(RES_NBP01, 'filter.json'))

In [None]:
# Transcripts listed under the 'keep_transcript' and 'drop_transcript' entries of the loaded filter description

keep_transcript, drop_transcript = (
	filter_dict['data'][key]
	for key in ('keep_transcript', 'drop_transcript')
)

In [None]:
# Layer mapping : for every layer stored by this notebook, the value passed as the source `layer` when computing it
# NOTE(review): presumably None selects the unnamed main matrix - confirm in feature_anndata

mapping = dict(
	log1p    = None,
	boxcox1p = None,
	normal   = 'boxcox1p',
	standard = 'boxcox1p'
)

In [None]:
# Percentile helpers sharing the (x, axis) call shape, and the named summary functions used by the plots

def percentile90 (x, axis) :
	"""Return the 90th percentile of x along the given axis."""
	return numpy.percentile(x, 90, axis = axis)

def percentile75 (x, axis) :
	"""Return the 75th percentile of x along the given axis."""
	return numpy.percentile(x, 75, axis = axis)

def percentile60 (x, axis) :
	"""Return the 60th percentile of x along the given axis."""
	return numpy.percentile(x, 60, axis = axis)

# (name, function) pairs; the name is later used as the suffix of the saved plot filenames
functions = [
	(name, getattr(numpy, name))
	for name in ('mean', 'max', 'std', 'min')
]

In [None]:
# Example transcripts to boxplot, with the expression pattern noted for each one

boxplot_transcripts = [
	'AT2G17360.1',  # expressed in all-to-most tissues
	'AT2G07713.1',  # expressed only in senescence
	'AT4G08160.1'   # expressed in all tissues but dominant in a single one
]

In [None]:
# Collect the sample names of every tissue and of every group (groups whose name starts with 'missing' are skipped)

# Despite its name this is the first value of the 'Transcript' column; it is used to preview the statistics tables
samples_name = tissue_tpm['Transcript'][0]

samples_tissue = {
	tissue : tissue_metadata.loc[tissue_metadata['Tissue'] == tissue, 'Sample'].tolist()
	for tissue in tissue_metadata['Tissue'].unique()
}

# Group entries are added after the tissue entries; a group named like a tissue replaces that tissue's list
samples_tissue.update({
	group : tissue_metadata.loc[tissue_metadata['Group'] == group, 'Sample'].tolist()
	for group in tissue_metadata['Group'].unique()
	if not group.lower().startswith('missing')
})

# 2. Anndata

In [None]:
# Create anndata object from the metadata and tpm tables

# Every warning raised while the object is being created is silenced
with warnings.catch_warnings() :
	warnings.simplefilter('ignore')

	anndata = feature_anndata.create_anndata(
		mat = tissue_tpm,
		obs = tissue_metadata
	)

# Index labels of the var and obs tables, kept as plain lists
# NOTE(review): presumably var holds the transcripts and obs the samples, as the names suggest - confirm in create_anndata
transcripts = anndata.var.index.tolist()
samples     = anndata.obs.index.tolist()

feature_anndata.show_structure(anndata)

In [None]:
# Display anndata tpm matrix information (layer = None, so no named layer is requested)
# NOTE(review): presumably None selects the main anndata matrix - confirm in show_matrix

feature_anndata.show_matrix(anndata, layer = None)

# 3. Log

In [None]:
# Compute and save the log1p tpm values
# The source layer is mapping['log1p'] (None), base is 2 and the result is stored into the 'log1p' layer

anndata = feature_anndata.compute_log1p(
	data       = anndata,
	layer      = mapping['log1p'],
	base       = 2,
	store_into = 'log1p'
)

In [None]:
# Display anndata log1p tpm matrix information (the 'log1p' layer stored by compute_log1p)

feature_anndata.show_matrix(anndata, layer = 'log1p')

In [None]:
# Compute basic statistics of the log1p layer : once per tissue / group and once over all samples ('Global')

def _basic_statistics_log1p (subset, label) :
	"""Run generate_basic_statistics on the transposed 'log1p' layer of subset, labelled with label."""
	return analysis_statistics.generate_basic_statistics(
		data       = subset.layers['log1p'].T,
		transcript = transcripts,
		tissue     = label,
		axis       = 1
	)

dataframes = [
	_basic_statistics_log1p(anndata[names], label)
	for label, names in samples_tissue.items()
]

dataframes.append(_basic_statistics_log1p(anndata, 'Global'))

# Preview the rows of a single transcript (samples_name holds a transcript identifier)
statistic_log1p_basic = pandas.concat(dataframes).set_index(['Transcript', 'Tissue'])
statistic_log1p_basic.xs(samples_name, level = 0)

In [None]:
# Compute advanced statistics of the log1p layer : once per tissue / group and once over all samples ('Global')

def _advance_statistics_log1p (subset, label) :
	"""Run genearte_advance_statistics (project spelling) on the transposed 'log1p' layer of subset."""
	return analysis_statistics.genearte_advance_statistics(
		data       = subset.layers['log1p'].T,
		transcript = transcripts,
		tissue     = label,
		axis       = 1
	)

dataframes = [
	_advance_statistics_log1p(anndata[names], label)
	for label, names in samples_tissue.items()
]

dataframes.append(_advance_statistics_log1p(anndata, 'Global'))

# Preview the rows of a single transcript (samples_name holds a transcript identifier)
statistic_log1p_advance = pandas.concat(dataframes).set_index(['Transcript', 'Tissue'])
statistic_log1p_advance.xs(samples_name, level = 0)

In [None]:
# Compute normality statistics of the log1p layer : once per tissue / group and once over all samples ('Global')

def _normality_statistics_log1p (subset, label) :
	"""Run generate_normality_statistics on the transposed 'log1p' layer of subset, labelled with label."""
	return analysis_statistics.generate_normality_statistics(
		data       = subset.layers['log1p'].T,
		transcript = transcripts,
		tissue     = label
	)

# Every warning raised while the statistics are computed is silenced
with warnings.catch_warnings() :
	warnings.simplefilter('ignore')

	dataframes = [
		_normality_statistics_log1p(anndata[names], label)
		for label, names in samples_tissue.items()
	]

	dataframes.append(_normality_statistics_log1p(anndata, 'Global'))

# Preview the rows of a single transcript (samples_name holds a transcript identifier)
statistic_log1p_normality = pandas.concat(dataframes).set_index(['Transcript', 'Tissue'])
statistic_log1p_normality.xs(samples_name, level = 0)

## 3.2 Distribution

In [None]:
# Display the log1p tpm value distribution of the kept transcripts, one plot per summary function except 'min'

for label, reducer in ((name, function) for name, function in functions if name != 'min') :
	feature_anndata.tpm_histplot(
		data     = anndata[:, keep_transcript],
		layer    = 'log1p',
		function = reducer,
		filters  = None,
		filename = os.path.join(OUT_PLOT, f'distribution-log1p-keep-{label}')
	)

	matplotlib.pyplot.show()

In [None]:
# Display the log1p tpm value distribution of the dropped transcripts, one plot per summary function except 'min'

for label, reducer in ((name, function) for name, function in functions if name != 'min') :
	feature_anndata.tpm_histplot(
		data     = anndata[:, drop_transcript],
		layer    = 'log1p',
		function = reducer,
		filters  = None,
		filename = os.path.join(OUT_PLOT, f'distribution-log1p-drop-{label}')
	)

	matplotlib.pyplot.show()

## 3.3 Transcripts

In [None]:
# Boxplot the log1p expression per tissue for each example transcript that is present in the data

for identifier in (item for item in boxplot_transcripts if item in anndata.var.index) :
	feature_anndata.gene_boxplot(
		data       = anndata,
		transcript = identifier,
		groupby    = 'Tissue',
		layer      = 'log1p',
		filename   = os.path.join(OUT_PLOT, f'tissue-log1p-{identifier}')
	)

	matplotlib.pyplot.show()

# 4. BoxCox

In [None]:
# Compute and save the boxcox1p tpm values with a fixed eps and lambda, then report the lambda that was returned

anndata, boxcox_factors = feature_anndata.compute_boxcox1p(
	data       = anndata,
	store_into = 'boxcox1p',
	layer      = mapping['boxcox1p'],
	eps        =  1.0,
	lmbda      = -0.1455266110158969
)

print(f"Lambda : {boxcox_factors['lambda']}", end = '\n\n')

In [None]:
# Display anndata boxcox1p tpm matrix information (the 'boxcox1p' layer stored by compute_boxcox1p)

feature_anndata.show_matrix(anndata, layer = 'boxcox1p')

## 4.1 Statistics

In [None]:
# Compute basic statistics of the boxcox1p layer : once per tissue / group and once over all samples ('Global')

def _basic_statistics_boxcox1p (subset, label) :
	"""Run generate_basic_statistics on the transposed 'boxcox1p' layer of subset, labelled with label."""
	return analysis_statistics.generate_basic_statistics(
		data       = subset.layers['boxcox1p'].T,
		transcript = transcripts,
		tissue     = label,
		axis       = 1
	)

dataframes = [
	_basic_statistics_boxcox1p(anndata[names], label)
	for label, names in samples_tissue.items()
]

dataframes.append(_basic_statistics_boxcox1p(anndata, 'Global'))

# Preview the rows of a single transcript (samples_name holds a transcript identifier)
statistic_boxcox1p_basic = pandas.concat(dataframes).set_index(['Transcript', 'Tissue'])
statistic_boxcox1p_basic.xs(samples_name, level = 0)

In [None]:
# Compute advanced statistics of the boxcox1p layer : once per tissue / group and once over all samples ('Global')

def _advance_statistics_boxcox1p (subset, label) :
	"""Run genearte_advance_statistics (project spelling) on the transposed 'boxcox1p' layer of subset."""
	return analysis_statistics.genearte_advance_statistics(
		data       = subset.layers['boxcox1p'].T,
		transcript = transcripts,
		tissue     = label,
		axis       = 1
	)

dataframes = [
	_advance_statistics_boxcox1p(anndata[names], label)
	for label, names in samples_tissue.items()
]

dataframes.append(_advance_statistics_boxcox1p(anndata, 'Global'))

# Preview the rows of a single transcript (samples_name holds a transcript identifier)
statistic_boxcox1p_advance = pandas.concat(dataframes).set_index(['Transcript', 'Tissue'])
statistic_boxcox1p_advance.xs(samples_name, level = 0)

In [None]:
# Compute normality statistics of the boxcox1p layer : once per tissue / group and once over all samples ('Global')

def _normality_statistics_boxcox1p (subset, label) :
	"""Run generate_normality_statistics on the transposed 'boxcox1p' layer of subset, labelled with label."""
	return analysis_statistics.generate_normality_statistics(
		data       = subset.layers['boxcox1p'].T,
		transcript = transcripts,
		tissue     = label
	)

# Every warning raised while the statistics are computed is silenced
with warnings.catch_warnings() :
	warnings.simplefilter('ignore')

	dataframes = [
		_normality_statistics_boxcox1p(anndata[names], label)
		for label, names in samples_tissue.items()
	]

	dataframes.append(_normality_statistics_boxcox1p(anndata, 'Global'))

# Preview the rows of a single transcript (samples_name holds a transcript identifier)
statistic_boxcox1p_normality = pandas.concat(dataframes).set_index(['Transcript', 'Tissue'])
statistic_boxcox1p_normality.xs(samples_name, level = 0)

## 4.2 Distribution

In [None]:
# Display the boxcox1p tpm value distribution of the kept transcripts, one plot per summary function except 'min'

for label, reducer in ((name, function) for name, function in functions if name != 'min') :
	feature_anndata.tpm_histplot(
		data     = anndata[:, keep_transcript],
		layer    = 'boxcox1p',
		function = reducer,
		filters  = None,
		filename = os.path.join(OUT_PLOT, f'distribution-boxcox1p-keep-{label}')
	)

	matplotlib.pyplot.show()

In [None]:
# Display the boxcox1p tpm value distribution of the dropped transcripts, one plot per summary function except 'min'

for label, reducer in ((name, function) for name, function in functions if name != 'min') :
	feature_anndata.tpm_histplot(
		data     = anndata[:, drop_transcript],
		layer    = 'boxcox1p',
		function = reducer,
		filters  = None,
		filename = os.path.join(OUT_PLOT, f'distribution-boxcox1p-drop-{label}')
	)

	matplotlib.pyplot.show()

## 4.3 Transcripts

In [None]:
# Boxplot the boxcox1p expression per tissue for each example transcript that is present in the data

for identifier in (item for item in boxplot_transcripts if item in anndata.var.index) :
	feature_anndata.gene_boxplot(
		data       = anndata,
		transcript = identifier,
		groupby    = 'Tissue',
		layer      = 'boxcox1p',
		filename   = os.path.join(OUT_PLOT, f'tissue-boxcox1p-{identifier}')
	)

	matplotlib.pyplot.show()

# 5. Normal

In [None]:
# Compute and save the normalized tpm values from the mapping['normal'] layer, then report the returned factors

anndata, normal_factors = feature_anndata.compute_normalized(
	data       = anndata,
	layer      = mapping['normal'],
	store_into = 'normal'
)

for label, key in (('Min', 'min'), ('Max', 'max')) :
	print(f'{label} : {normal_factors[key]}')

print()

In [None]:
# Display anndata normalized tpm matrix information
# The 'normal' layer is computed with layer = mapping['normal'], which is 'boxcox1p' (not log1p)

feature_anndata.show_matrix(anndata, layer = 'normal')

## 5.1 Distribution

In [None]:
# Display the normal tpm value distribution of the kept transcripts, one plot per summary function except 'min'

for name, function in functions :
	if name == 'min' : continue

	# The prefix is spelled 'normal' so that it matches the 'distribution-normal-drop-' plots
	feature_anndata.tpm_histplot(
		data     = anndata[:, keep_transcript],
		layer    = 'normal',
		function = function,
		filters  = None,
		filename = os.path.join(OUT_PLOT, 'distribution-normal-keep-' + name)
	)

	matplotlib.pyplot.show()

In [None]:
# Display the normal tpm value distribution of the dropped transcripts, one plot per summary function except 'min'

for label, reducer in ((name, function) for name, function in functions if name != 'min') :
	feature_anndata.tpm_histplot(
		data     = anndata[:, drop_transcript],
		layer    = 'normal',
		function = reducer,
		filters  = None,
		filename = os.path.join(OUT_PLOT, f'distribution-normal-drop-{label}')
	)

	matplotlib.pyplot.show()

## 5.2 Transcripts

In [None]:
# Boxplot the normal expression per tissue for each example transcript that is present in the data

for identifier in (item for item in boxplot_transcripts if item in anndata.var.index) :
	feature_anndata.gene_boxplot(
		data       = anndata,
		transcript = identifier,
		groupby    = 'Tissue',
		layer      = 'normal',
		filename   = os.path.join(OUT_PLOT, f'tissue-normal-{identifier}')
	)

	matplotlib.pyplot.show()

# 6. Standard

In [None]:
# Compute and save the standardized tpm values from the mapping['standard'] layer, then report the returned factors

anndata, standard_factors = feature_anndata.compute_standardized(
	data       = anndata,
	layer      = mapping['standard'],
	store_into = 'standard',
	axis       = None
)

for label, key in (('Mean', 'mean'), (' Std', 'std')) :
	print(f'{label} : {standard_factors[key]}')

print()

In [None]:
# Display anndata standardized tpm matrix information
# The 'standard' layer is computed with layer = mapping['standard'], which is 'boxcox1p'

feature_anndata.show_matrix(anndata, layer = 'standard')

## 6.1 Distribution

In [None]:
# Display the standard tpm value distribution of the kept transcripts, one plot per summary function except 'min'

for label, reducer in ((name, function) for name, function in functions if name != 'min') :
	feature_anndata.tpm_histplot(
		data     = anndata[:, keep_transcript],
		layer    = 'standard',
		function = reducer,
		filters  = None,
		filename = os.path.join(OUT_PLOT, f'distribution-standard-keep-{label}')
	)

	matplotlib.pyplot.show()

In [None]:
# Display the standard tpm value distribution of the dropped transcripts, one plot per summary function except 'min'

for label, reducer in ((name, function) for name, function in functions if name != 'min') :
	feature_anndata.tpm_histplot(
		data     = anndata[:, drop_transcript],
		layer    = 'standard',
		function = reducer,
		filters  = None,
		filename = os.path.join(OUT_PLOT, f'distribution-standard-drop-{label}')
	)

	matplotlib.pyplot.show()

## 6.2 Transcripts

In [None]:
# Boxplot the standard expression per tissue for each example transcript that is present in the data

for identifier in (item for item in boxplot_transcripts if item in anndata.var.index) :
	feature_anndata.gene_boxplot(
		data       = anndata,
		transcript = identifier,
		groupby    = 'Tissue',
		layer      = 'standard',
		filename   = os.path.join(OUT_PLOT, f'tissue-standard-{identifier}')
	)

	matplotlib.pyplot.show()

# 7. Save

In [None]:
# Save the annotated data, with the layers stored so far, as 'arabidopsis-r36.h5ad' inside OUT_DATA

writer.write_h5ad(
	data     = anndata,
	filename = os.path.join(OUT_DATA, 'arabidopsis-r36.h5ad')
)

In [None]:
# Save processing info into OUT_DATA : layer mapping and transformation factors as json, statistics tables as csv

for basename, payload in (
	('layer-mapping.json',    mapping),
	('factors-boxcox.json',   boxcox_factors),
	('factors-normal.json',   normal_factors),
	('factors-standard.json', standard_factors)
) :
	writer.write_json(
		data     = payload,
		filename = os.path.join(OUT_DATA, basename)
	)

# The statistics tables are indexed by (Transcript, Tissue), so the index is written as well
for basename, table in (
	('statistics-log1p-basic.csv',        statistic_log1p_basic),
	('statistics-log1p-advance.csv',      statistic_log1p_advance),
	('statistics-log1p-normality.csv',    statistic_log1p_normality),
	('statistics-boxcox1p-basic.csv',     statistic_boxcox1p_basic),
	('statistics-boxcox1p-advance.csv',   statistic_boxcox1p_advance),
	('statistics-boxcox1p-normality.csv', statistic_boxcox1p_normality)
) :
	writer.write_csv(
		data        = table,
		filename    = os.path.join(OUT_DATA, basename),
		write_index = True
	)

# 8. Distributions

In [None]:
# Plot multiple lambda distributions to compare
#
# For a range of lambdas (including the one used for the 'boxcox1p' layer) the boxcox1p transform is
# recomputed with layer = None and the mean over axis 0 of the result is drawn as a histogram, one
# panel per lambda. Every transform is stored as an extra 'boxcox1p-<index>' layer on anndata; these
# layers remain on the in-memory object after the cell has run.

LMBDA = boxcox_factors['lambda']
EPS   = boxcox_factors['eps']

lambdas = sorted([
	-0.9, -0.8, -0.7, -0.6, -0.5,
	-0.4, -0.3, -0.2, -0.1,  0.0,
	 0.1,  0.2,  0.3,  0.4, LMBDA
])

# 3 x 5 grid : one panel for each of the 15 lambdas, filled row by row
fig, ax = matplotlib.pyplot.subplots(
	nrows   = 3,
	ncols   = 5,
	figsize = (16, 10),
	sharex  = False,
	sharey  = True
)

for index, (panel, lmbda) in enumerate(zip(ax.flat, lambdas)) :
	layer = 'boxcox1p-{}'.format(index)

	anndata, _ = feature_anndata.compute_boxcox1p(
		data       = anndata,
		store_into = layer,
		layer      = None,
		eps        = EPS,
		lmbda      = lmbda
	)

	data = pandas.DataFrame.from_dict({
		'Values' : numpy.mean(anndata.layers[layer], axis = 0)
	})

	seaborn.histplot(
		data  = data,
		x     = 'Values',
		alpha = 0.9,
		color = '#799FCB',
		ax    = panel,
		kde   = False
	)

	panel.set_title('Lambda {:.3f}'.format(lmbda), loc = 'center')
	panel.set_ylabel(None)
	panel.set_xlabel(None)

# Save through the figure object so the output does not depend on which figure is currently active
fig.savefig(
	os.path.join(OUT_PLOT, 'distribution-lambda.png'),
	dpi    = 120,
	format = 'png'
)