In [None]:
# Libraries

import os
import platform
import shutil
import sys
import warnings

import matplotlib
import matplotlib.pyplot
import numpy
import pandas

In [None]:
# Ensure source path
#
# Walk upwards from the current working directory until a directory whose path ends with
# 'upolanc-thesis' is found. That directory is then appended to sys.path (so that the
# <source.python> package can be imported) and becomes the working directory.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	PARENT = os.path.abspath(os.path.join(ROOT, os.pardir))

	# Give up when the filesystem root has been reached (the parent of the root is the root itself, which
	# would otherwise loop forever on long root paths), or when the path is too short to ever match
	if PARENT == ROOT or len(PARENT) < len('upolanc-thesis') :
		if   platform.system() == 'Linux'   : ROOT = '/d/hpc/projects/FRI/up4472/upolanc-thesis'
		elif platform.system() == 'Windows' : ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else : raise ValueError(f'Could not locate <upolanc-thesis> and no default is defined for platform : {platform.system()}')

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		print()

		break

	ROOT = PARENT

if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [None]:
# Code

from source.python               import runtime
from source.python.data.analysis import analysis_annotation
from source.python.data.analysis import analysis_assembly
from source.python.data.analysis import analysis_metadata
from source.python.data.analysis import analysis_statistics
from source.python.data.analysis import analysis_tpm
from source.python.io            import loader
from source.python.io            import writer

# Apply the project-wide runtime settings once, before anything is printed or plotted
# NOTE(review): the actual options are defined in <source.python.runtime>; judging by the names these
# are the numpy / pandas print formats and the plotting theme - confirm there
runtime.set_numpy_format()
runtime.set_pandas_format()
runtime.set_plot_theme()

# 1. Setup

In [None]:
# Resolve the directories used by this notebook, and start from an empty output directory

CWD = ROOT
OUT = os.path.join(CWD, 'output')
RES = os.path.join(CWD, 'resources')

OUT_DATA   = os.path.join(OUT,      'nbp00-analysis')
OUT_PLOT   = os.path.join(OUT_DATA, 'plot')
RES_GENOME = os.path.join(RES,      'genome')

# Any output of a previous run is removed (a missing directory is not an error), then recreated empty
shutil.rmtree(OUT_DATA, ignore_errors = True)

for directory in (OUT_DATA, OUT_PLOT) :
	os.makedirs(directory, exist_ok = True)

print(
	f'     Root Directory : {CWD}',
	f'   Output Directory : {OUT_DATA}',
	f' Resource Directory : {RES_GENOME}',
	'',
	sep = '\n'
)

In [None]:
# Load the cleaned resource tables (non-relevant columns removed, columns renamed to the same format, etc..)

data = loader.load_resources(directory = RES_GENOME, plant = 'arabidopsis-r36', clean = True)

# Unpack the four tables this notebook works with
gene_annotation, gene_assembly, tissue_metadata, tissue_tpm = (
	data[key]
	for key in ('gene_annotation', 'gene_assembly', 'tissue_metadata', 'tissue_tpm')
)

# 2. Sync

In [None]:
# Drop every annotation located on the 'Mt' or 'Pt' sequence, reporting the counts before and after

# Number of mRNA rows per dropped sequence (reported only, not used afterwards)
is_mrna = gene_annotation['Type'] == 'mRNA'

mt = gene_annotation[(gene_annotation['Seq'] == 'Mt') & is_mrna]
pt = gene_annotation[(gene_annotation['Seq'] == 'Pt') & is_mrna]

print('Transcript (Mt) : {:}'.format(len(mt)))
print('Transcript (Pt) : {:}'.format(len(pt)))
print()

# Unique identifier counts before the filter
gcount = gene_annotation['Gene'].nunique()
tcount = gene_annotation['Transcript'].nunique()

gene_annotation = gene_annotation[~gene_annotation['Seq'].isin(['Pt', 'Mt'])]

print('Gene       : {:} -> {:}'.format(gcount, gene_annotation['Gene'].nunique()))
print('Transcript : {:} -> {:}'.format(tcount, gene_annotation['Transcript'].nunique()))
print()

In [None]:
# Check for sample matching between tpm and metadata tables, and keep only the samples present in both
#
# The first column of the tpm table is skipped when collecting its sample names (it is not a sample column).

x = set(tissue_metadata['Sample'].dropna())
y = set(tissue_tpm.columns[1:].dropna())

# Sets have no stable order between kernel restarts, so the lists are sorted to keep the report reproducible
sample_i = sorted(x.intersection(y))
sample_x = sorted(x.difference(y))
sample_y = sorted(y.difference(x))

print(f'Samples in <tissue-metadata> : {len(sample_i)} / {len(x)}')
print(f'Samples in <tissue-tpm>      : {len(sample_i)} / {len(y)}')
print()
print(f'Samples missing in <tissue-metadata> : [{len(sample_y)}] ' + ' '.join(sample_y[:5]) + (' ...' if len(sample_y) > 5 else ''))
print(f'Samples missing in <tissue-tpm>      : [{len(sample_x)}] ' + ' '.join(sample_x[:5]) + (' ...' if len(sample_x) > 5 else ''))
print()

# Metadata rows without tpm readings are dropped
if len(sample_x) > 0 :
	tissue_metadata = tissue_metadata[~tissue_metadata['Sample'].isin(sample_x)]

# Tpm columns without metadata are dropped
if len(sample_y) > 0 :
	tissue_tpm = tissue_tpm.drop(columns = sample_y)

# 3. Data

## 3.1 Gene Annotation

In [None]:
# Show the first few gene annotations
# (only the table is passed, so the helper's defaults for the query and the row limit apply)

analysis_annotation.show(gene_annotation)

In [None]:
# Inspect gene annotation columns, and display the number of unique values (as well as the first few values)
# NOTE(review): columns = None presumably selects every column - confirm in analysis_annotation.inspect_columns

analysis_annotation.inspect_columns(
	data    = gene_annotation,
	columns = None
)

In [None]:
# Display a full gene with all its regions
# The table is queried on its 'Transcript' column for the example transcript 'AT1G01010.1'
# NOTE(review): rows = None presumably lifts the row limit so every region is listed - confirm

analysis_annotation.show(
	data     = gene_annotation,
	query    = 'AT1G01010.1',
	query_by = 'Transcript',
	rows     = None
)

In [None]:
# Report how many distinct gene and transcript identifiers remain in the annotation table

gene_ids       = gene_annotation['Gene'].nunique()
transcript_ids = gene_annotation['Transcript'].nunique()

print(f'      Gene IDs : {gene_ids:,}')
print(f'Transcript IDs : {transcript_ids:,}')
print()

In [None]:
# Display region type count per transcript (rows are grouped by the 'Transcript' column)
# Only the four listed region types are considered

analysis_annotation.type_distribution(
	data    = gene_annotation,
	groupby = 'Transcript',
	regions = ['mRNA', 'UTR5', 'CDS', 'UTR3']
)

In [None]:
# Display the region length statistics for each region type
# (computed on the annotation table after the Mt / Pt rows have been dropped)

analysis_annotation.length_statistics(
	data = gene_annotation
)

## 3.2 Gene Assembly

In [None]:
# Display the length and short nucleotide sequences for each chromosome
# NOTE(review): head / tail presumably give the number of leading / trailing nucleotides shown - confirm

analysis_assembly.show(
	data = gene_assembly,
	head = 25,
	tail = 25
)

In [None]:
# Display the nucleotide frequency within each chromosome
# NOTE(review): relative = False presumably reports absolute counts instead of fractions - confirm

analysis_assembly.show_nucleotide_frequency(
	data     = gene_assembly,
	relative = False
)

## 3.3 Tissue Metadata

In [None]:
# Display the first few annotated samples
# (the metadata table was already restricted to samples that also appear in the tpm table)

analysis_metadata.show(tissue_metadata)

In [None]:
# Inspect sample annotation columns, and display the number of unique values (as well as the first few values)
# NOTE(review): columns = None presumably selects every column - confirm in analysis_metadata.inspect_columns

analysis_metadata.inspect_columns(
	data    = tissue_metadata,
	columns = None
)

In [None]:
# Display tissue samples : one line per tissue with its sample count and share of all samples

# The label column is padded to the longest tissue name. str.len() yields floats when the column holds
# missing values, and a float width such as '12.0' would be read as width 12 / precision 0 (which
# truncates every label to nothing), hence the explicit int() cast
width    = int(tissue_metadata['Tissue'].str.len().max())
template = '{:' + str(width) + 's} : {:4d} [{:6.2f} %]'

for tissue, dataframe in tissue_metadata.groupby('Tissue') :
	length  = len(dataframe)
	percent = 100 * length / len(tissue_metadata)

	print(template.format(tissue, length, percent))

print()

In [None]:
# Display age samples : one line per age with its sample count and share of all samples

# The label column is padded to the longest age label. str.len() yields floats when the column holds
# missing values, and a float width such as '12.0' would be read as width 12 / precision 0 (which
# truncates every label to nothing), hence the explicit int() cast
width    = int(tissue_metadata['Age'].str.len().max())
template = '{:' + str(width) + 's} : {:4d} [{:6.2f} %]'

for age, dataframe in tissue_metadata.groupby('Age') :
	length  = len(dataframe)
	percent = 100 * length / len(tissue_metadata)

	print(template.format(age, length, percent))

print()

In [None]:
# Display group samples : one line per group with its sample count and share of all samples

# The label column is padded to the longest group name. str.len() yields floats when the column holds
# missing values, and a float width such as '12.0' would be read as width 12 / precision 0 (which
# truncates every label to nothing), hence the explicit int() cast
width    = int(tissue_metadata['Group'].str.len().max())
template = '{:' + str(width) + 's} : {:4d} [{:6.2f} %]'

for group, dataframe in tissue_metadata.groupby('Group') :
	length  = len(dataframe)
	percent = 100 * length / len(tissue_metadata)

	print(template.format(group, length, percent))

print()

In [None]:
# Display perturbation samples : one line per perturbation with its sample count and share of all samples

# The label column is padded to the longest perturbation name. str.len() yields floats when the column
# holds missing values, and a float width such as '12.0' would be read as width 12 / precision 0 (which
# truncates every label to nothing), hence the explicit int() cast
width    = int(tissue_metadata['Perturbation'].str.len().max())
template = '{:' + str(width) + 's} : {:4d} [{:6.2f} %]'

for perturbation, dataframe in tissue_metadata.groupby('Perturbation') :
	length  = len(dataframe)
	percent = 100 * length / len(tissue_metadata)

	print(template.format(perturbation, length, percent))

## 3.4 Tissue TPM

In [None]:
# Display the first few tpm values
# (the tpm table was already restricted to samples that also appear in the metadata table)

analysis_tpm.show(tissue_tpm)

In [None]:
# Print the global tpm statistics, computed over every reading of every sample at once

# Every column but the first one holds sample readings
matrix = tissue_tpm.iloc[:, 1:].to_numpy()

summary = [
	('   Max TPM : {:13,.5f}', numpy.max),
	('  Mean TPM : {:13,.5f}', numpy.mean),
	('Median TPM : {:13,.5f}', numpy.median),
	('   Min TPM : {:13,.5f}', numpy.min)
]

for pattern, reducer in summary :
	print(pattern.format(reducer(matrix)))

print()

In [None]:
# Print the amount of tpm readings below or equal to 1.0

# Every column but the first one holds sample readings
matrix = tissue_tpm.iloc[:, 1:].to_numpy()

total = numpy.size(matrix)
where = numpy.count_nonzero(matrix <= 1.0)
other = total - where

# The labels state the actual threshold (the counts are not zero / non-zero counts)
print('Elements          : {:11,d}'.format(total))
print('Elements <= 1.000 : {:11,d} [{:5.2f} %]'.format(where, 100 * where / total))
print('Elements >  1.000 : {:11,d} [{:5.2f} %]'.format(other, 100 * other / total))
print()

In [None]:
# Print the amount of tpm readings below or equal to 0.0

# Every column but the first one holds sample readings
matrix = tissue_tpm.iloc[:, 1:].to_numpy()

total = numpy.size(matrix)
where = numpy.count_nonzero(matrix <= 0.0)
other = total - where

print(f'Elements          : {total:11,d}')
print(f'Elements <= 0.000 : {where:11,d} [{100 * where / total:5.2f} %]')
print(f'Elements >  0.000 : {other:11,d} [{100 * other / total:5.2f} %]')
print()

In [None]:
# Replace exact zeros with a small positive value (0.001), then report how many readings sit at or below it

tissue_tpm = tissue_tpm.replace(0.0, 0.001)

# Every column but the first one holds sample readings
matrix = tissue_tpm.iloc[:, 1:].to_numpy()

total = numpy.size(matrix)
where = numpy.count_nonzero(matrix <= 0.001)
other = total - where

print(f'Elements          : {total:11,d}')
print(f'Elements <= 0.001 : {where:11,d} [{100 * where / total:5.2f} %]')
print(f'Elements >  0.001 : {other:11,d} [{100 * other / total:5.2f} %]')
print()

# 4. Plots

In [None]:
# Display the distribution of samples over each metadata column, one barplot per column
#
# NOTE : <matplotlib.pyplot> is a submodule, and is only guaranteed to be reachable as an attribute of
# <matplotlib> once it has been imported explicitly (see the libraries cell at the top of the notebook)

for column in ['Tissue', 'Age', 'Group', 'Perturbation'] :
	analysis_metadata.distribution_barplot(
		data     = tissue_metadata,
		group    = column,
		filename = os.path.join(OUT_PLOT, 'distribution-' + column.lower())
	)

	matplotlib.pyplot.show()

In [None]:
# Display the region length distributions, one histogram per region type

# Position of the vertical marker line passed along with each region type
region_vlines = {
	'mRNA' : 0,
	'UTR5' : 300,
	'CDS'  : 0,
	'UTR3' : 350
}

# Every warning raised while plotting is silenced
with warnings.catch_warnings() :
	warnings.simplefilter('ignore')

	for region, vline in region_vlines.items() :
		analysis_annotation.length_histplot(
			data        = gene_annotation,
			value       = region,
			vline       = vline,
			linewidth   = 4,
			show_xlabel = False,
			show_ylabel = False,
			filename    = os.path.join(OUT_PLOT, 'length-region-' + region.lower())
		)

In [None]:
# Display the region length distributions per region type, after grouping the regions by transcript

region_vlines = {
	'mRNA' : 0,
	'UTR5' : 300,
	'CDS'  : 0,
	'UTR3' : 350
}

dataframe = analysis_annotation.group_regions(
	data    = gene_annotation,
	groupby = 'Transcript',
	regions = list(region_vlines.keys())
)

# Every warning raised while plotting is silenced
with warnings.catch_warnings() :
	warnings.simplefilter('ignore')

	for region, vline in region_vlines.items() :
		analysis_annotation.length_histplot(
			data        = dataframe,
			value       = region,
			vline       = vline,
			linewidth   = 4,
			show_xlabel = False,
			show_ylabel = False,
			filename    = os.path.join(OUT_PLOT, 'length-transcript-' + region.lower())
		)

# 5. Groups

In [None]:
# Collect the sample names per tissue and per group into a single label -> samples mapping
# (tissue labels are inserted first; a group label equal to a tissue label replaces its sample list)

samples_tissue = {}

for column in ['Tissue', 'Group'] :
	for label in tissue_metadata[column].unique() :
		samples_tissue[label] = tissue_metadata.loc[tissue_metadata[column] == label, 'Sample'].tolist()

# 6. Statistics

In [None]:
# Define transcripts
# The first column of the tpm table is taken as the transcript identifiers; the remaining columns are
# the per-sample readings (cf. the iloc[:, 1:] slices used for the tpm matrix)

transcripts = tissue_tpm.iloc[:, 0 ].values

In [None]:
# Compute basic statistics for every tissue / group label, plus once over all samples ('Global')

# One table per label, each computed on the sample columns of that label only
dataframes = [
	analysis_statistics.generate_basic_statistics(
		data       = tissue_tpm[samples],
		transcript = transcripts,
		tissue     = tissue,
		axis       = 1
	)

	for tissue, samples in samples_tissue.items()
]

# NOTE(review): the global call receives the whole tpm table, first (transcript) column included,
# while the per-label calls receive sample columns only - confirm the helper copes with that column
dataframes.append(
	analysis_statistics.generate_basic_statistics(
		data       = tissue_tpm,
		transcript = transcripts,
		tissue     = 'Global',
		axis       = 1
	)
)

statistic_basic = pandas.concat(dataframes)
statistic_basic = statistic_basic.set_index(['Transcript', 'Tissue'])

# Preview : every row of one example transcript
statistic_basic.xs('AT1G01010.1', level = 0)

In [None]:
# Compute advance statistics for every tissue / group label, plus once over all samples ('Global')
# (the helper name is spelled <genearte_advance_statistics> in the project module, and is called as such)

# One table per label, each computed on the sample columns of that label only
dataframes = [
	analysis_statistics.genearte_advance_statistics(
		data       = tissue_tpm[samples],
		transcript = transcripts,
		tissue     = tissue,
		axis       = 1
	)

	for tissue, samples in samples_tissue.items()
]

# NOTE(review): the global call receives the whole tpm table, first (transcript) column included,
# while the per-label calls receive sample columns only - confirm the helper copes with that column
dataframes.append(
	analysis_statistics.genearte_advance_statistics(
		data       = tissue_tpm,
		transcript = transcripts,
		tissue     = 'Global',
		axis       = 1
	)
)

statistic_advance = pandas.concat(dataframes)
statistic_advance = statistic_advance.set_index(['Transcript', 'Tissue'])

# Preview : every row of one example transcript
statistic_advance.xs('AT1G01010.1', level = 0)

In [None]:
# Compute normality statistics for every tissue / group label, plus once over all samples ('Global')

# Every warning raised while computing the statistics is silenced
with warnings.catch_warnings() :
	warnings.simplefilter('ignore')

	# One table per label, each computed on the sample columns of that label only
	dataframes = [
		analysis_statistics.generate_normality_statistics(
			data       = tissue_tpm[samples],
			transcript = transcripts,
			tissue     = tissue
		)

		for tissue, samples in samples_tissue.items()
	]

	# NOTE(review): the global call receives the whole tpm table, first (transcript) column included,
	# while the per-label calls receive sample columns only - confirm the helper copes with that column
	dataframes.append(
		analysis_statistics.generate_normality_statistics(
			data       = tissue_tpm,
			transcript = transcripts,
			tissue     = 'Global'
		)
	)

statistic_normality = pandas.concat(dataframes)
statistic_normality = statistic_normality.set_index(['Transcript', 'Tissue'])

# Preview : every row of one example transcript
statistic_normality.xs('AT1G01010.1', level = 0)

# 7. Save

In [None]:
# Save the updated and synchronized tables, followed by the statistic tables, into the output directory

# (filename, table, write index) - the statistic tables keep their (Transcript, Tissue) index
exports = [
	('tissue-metadata.csv',      tissue_metadata,     False),
	('tissue-tpm.csv',           tissue_tpm,          False),
	('gene-annotation.csv',      gene_annotation,     False),
	('statistics-basic.csv',     statistic_basic,     True),
	('statistics-advance.csv',   statistic_advance,   True),
	('statistics-normality.csv', statistic_normality, True)
]

for name, table, keep_index in exports :
	writer.write_csv(
		data        = table,
		filename    = os.path.join(OUT_DATA, name),
		write_index = keep_index
	)