In [None]:
# Libraries

import os
import platform
import shutil
import sys
import warnings

import matplotlib
from matplotlib import pyplot

In [None]:
# Ensure source path
#
# Walk up from the current working directory until a path ending with the
# project name is found. That directory is then added to sys.path and becomes
# the working directory. When no such ancestor exists a platform specific
# default is used instead.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	parent = os.path.abspath(os.path.join(ROOT, os.pardir))

	# Give up when the path is too short to hold the project name, or when the
	# filesystem root is reached (a root is its own parent). The second test
	# prevents an endless loop on roots longer than the project name.
	if len(parent) < len('upolanc-thesis') or parent == ROOT :
		if   platform.system() == 'Linux'   : ROOT = '/d/hpc/projects/FRI/up4472/upolanc-thesis'
		elif platform.system() == 'Windows' : ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else : raise ValueError(f'Could not locate <upolanc-thesis> and no default directory is defined for platform <{platform.system()}>')

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		print()

		break

	ROOT = parent

if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [None]:
# Code

from source.python               import runtime
from source.python.data.analysis import analysis_annotation
from source.python.data.analysis import analysis_assembly
from source.python.data.analysis import analysis_metadata
from source.python.data.analysis import analysis_tpm
from source.python.io            import loader
from source.python.io            import writer

# Apply the formatting / theme helpers exposed by source.python.runtime.
# NOTE(review): judging by the helper names these configure numpy printing,
# pandas display and the plot theme; the concrete settings live in that module.
runtime.set_numpy_format()
runtime.set_pandas_format()
runtime.set_plot_theme()

# 1. Setup

In [None]:
# Setup some directory paths

# Available filter presets (id - family - description)
#
# 0 -         none
# 1 -          all -          longest transcript per gene
# 2 -          all - maximum mean tpm transcript per gene
# 3 -          tpm - transcript global avg >  5
# 4 - perturbation - control + unstressed
# 5 -       strand - positive strand
# 6 -          tpm - transcript global avg > 10

FILTER_ID = 2
SUBFOLDER = f'filter{FILTER_ID}'

CWD = ROOT
OUT, RES = (os.path.join(CWD, name) for name in ('output', 'resources'))

RES_GENOME = os.path.join(RES,      'genome')
RES_NBP00  = os.path.join(OUT,      'nbp00-analysis')
OUT_DATA   = os.path.join(OUT,      'nbp01-filter', SUBFOLDER)
OUT_PLOT   = os.path.join(OUT_DATA, 'plot')

# Any previous output of this filter is removed before the folders are recreated
shutil.rmtree(OUT_DATA, ignore_errors = True)

for directory in (OUT_DATA, OUT_PLOT) :
	os.makedirs(directory, exist_ok = True)

print(f'     Root Directory : {CWD}')
print(f'   Output Directory : {OUT_DATA}')
print(f'   Output Directory : {OUT_PLOT}')
print(f' Resource Directory : {RES_GENOME}')
print()

In [None]:
# Load the updated and synchronized data
#
# The assembly comes from the genome resources, the three tables from the
# output folder of the nbp00 analysis notebook.

assembly_path   = os.path.join(RES_GENOME, 'arabidopsis-r36', 'gene-assembly.fa')
annotation_path = os.path.join(RES_NBP00,  'gene-annotation.csv')
metadata_path   = os.path.join(RES_NBP00,  'tissue-metadata.csv')
tpm_path        = os.path.join(RES_NBP00,  'tissue-tpm.csv')

genome_assembly   = loader.load_faidx(filename = assembly_path)
genome_annotation = loader.load_csv(filename = annotation_path, low_memory = False)
tissue_metadata   = loader.load_csv(filename = metadata_path)
tissue_tpm        = loader.load_csv(filename = tpm_path)

In [None]:
# Define filtering factors
#
# FILTER starts as the neutral configuration (preset 0) and is then specialised
# by the preset selected through FILTER_ID. The whole dictionary is written to
# filter.json when the tables are saved.

FILTER = {
	'id'                          : FILTER_ID,
	'desc'                        : 'none',

	# Which transcript to keep per gene : 'none', 'longest_per_mrna' or 'maximum_mean_per_mrna'
	'keep_transcript'             : 'none',

	# Allowed values per metadata / annotation column, None disables the filter
	'keep_age'                    : None,
	'keep_perturbation'           : None,
	'keep_tissue'                 : None,
	'keep_group'                  : None,
	'keep_strand'                 : None,

	# Tissues with fewer samples than this fraction of all samples are dropped
	'tissue_percentage_threshold' : 0.01,

	# Per sample filter (cutoff passed to analysis_tpm.filter_samples)
	'sample_filter_global' : {
		'max' : 0.0,
		'sum' : 0.0,
		'avg' : 0.0,
		'std' : 0.0,
		'px0' : [
			0.0,
			0.0
		]
	},

	# Per transcript filter (cutoff passed to analysis_tpm.filter_genes)
	'transcript_filter_global' : {
		'max' : 0.0,
		'sum' : 0.0,
		'avg' : 0.0,
		'std' : 0.0,
		'px0' : [
			0.0,
			0.0
		]
	},

	# Per tissue-transcript filter (cutoff passed to analysis_tpm.filter_genes_per_group)
	'transcript_filter_tissue' : {
		'max' : 0.0,
		'sum' : 0.0,
		'avg' : 0.0,
		'std' : 0.0,
		'px0' : [
			1.0,
			0.5
		]
	},

	# Transcript lists collected by the filtering cells and finalised before saving
	'data' : {
		'drop_transcript' : list(),
		'keep_transcript' : list()
	}
}

# Exactly one preset applies; an unknown id is rejected instead of silently
# running the neutral configuration into a misleading output folder
if FILTER_ID == 0 :
	pass

elif FILTER_ID == 1 :
	FILTER['desc'] = 'all - longest transcript per gene'
	FILTER['keep_transcript'] = 'longest_per_mrna'

elif FILTER_ID == 2 :
	FILTER['desc'] = 'all - maximum mean tpm transcript per gene'
	FILTER['keep_transcript'] = 'maximum_mean_per_mrna'

elif FILTER_ID == 3 :
	FILTER['desc'] = 'tpm - transcript global avg > 5'
	FILTER['transcript_filter_global']['avg'] = 5.0
	FILTER['keep_transcript'] = 'maximum_mean_per_mrna'

elif FILTER_ID == 4 :
	FILTER['desc'] = 'perturbation - control + unstressed'
	FILTER['keep_perturbation'] = ['control', 'unstressed']
	FILTER['keep_transcript'] = 'maximum_mean_per_mrna'

elif FILTER_ID == 5 :
	FILTER['desc'] = 'strand - positive'
	FILTER['keep_strand'] = ['+']
	FILTER['keep_transcript'] = 'maximum_mean_per_mrna'

elif FILTER_ID == 6 :
	FILTER['desc'] = 'tpm - transcript global avg > 10'
	FILTER['transcript_filter_global']['avg'] = 10.0
	FILTER['keep_transcript'] = 'maximum_mean_per_mrna'

else :
	raise ValueError(f'Unknown FILTER_ID : {FILTER_ID} (expected an integer between 0 and 6)')

# 2. Annotation

In [None]:
# Keep a single representative mRNA transcript per gene
#
# Depending on FILTER['keep_transcript'] the representative is the longest
# transcript or the one with the highest mean tpm; ties are broken by the
# smallest transcript identifier. Any other value leaves the table untouched.

gcount = genome_annotation['Gene'].nunique()
tcount = genome_annotation['Transcript'].nunique()

if FILTER['keep_transcript'] in ('longest_per_mrna', 'maximum_mean_per_mrna') :
	data = genome_annotation[genome_annotation['Type'] == 'mRNA'].copy()

	if FILTER['keep_transcript'] == 'longest_per_mrna' :
		score = 'Length'
	else :
		# Mean over every column after the first one, computed once for the
		# whole table instead of rescanning it for each annotation row
		tpm_mean       = tissue_tpm.iloc[:, 1:].mean(axis = 1)
		tpm_mean.index = tissue_tpm['Transcript'].to_numpy()

		# Only the first row of a repeated transcript is used
		tpm_mean = tpm_mean[~tpm_mean.index.duplicated(keep = 'first')]

		# Transcripts without a tpm row get NaN and are never selected
		data['TPM'] = data['Transcript'].map(tpm_mean)
		score = 'TPM'

	# String aggregation names are used because passing the builtins max / min
	# to transform is deprecated in recent pandas
	data = data[data.groupby('Gene')[score       ].transform('max') == data[score]]
	data = data[data.groupby('Gene')['Transcript'].transform('min') == data['Transcript']]

	genome_annotation = genome_annotation[genome_annotation['Transcript'].isin(data['Transcript'])]

print('Gene       : {:} -> {:}'.format(gcount, genome_annotation['Gene'].nunique(dropna = True)))
print('Transcript : {:} -> {:}'.format(tcount, genome_annotation['Transcript'].nunique(dropna = True)))
print()

# 3. Metadata

In [None]:
# Drop any tissue below a given threshold
#
# The relative threshold is converted to an absolute number of samples; every
# tissue with fewer samples than that is removed from the metadata.

threshold = round(FILTER['tissue_percentage_threshold'] * len(tissue_metadata))

if threshold > 0 :
	samples = [
		name
		for name, members in tissue_metadata.groupby('Tissue')
		if len(members) < threshold
	]

	tissue_metadata = tissue_metadata.loc[~tissue_metadata['Tissue'].isin(samples)]
else :
	samples = list()

print('Drop threshold : {}'.format(threshold))
print('Drop count     : {}'.format(len(samples)))
print('Drop groups    : {}'.format(', '.join(samples)))
print()

# 4. Filter Basic

In [None]:
# Filter samples that are problematic globally (looks at entire sample through all transcripts)

dataframe, sample_filter = analysis_tpm.filter_samples(
	data   = tissue_tpm,
	cutoff = FILTER['sample_filter_global']
)

sample_filter = [*sample_filter.keys()]

if sample_filter :
	# Show the expression summary of the first flagged sample as an example
	item = sample_filter[0]
	data = dataframe[item].to_numpy()

	print()
	print(f'Sample <{item}> has a sum expression : {data.sum():9.4f}')
	print(f'Sample <{item}> has a max expression : {data.max():9.4f}')
	print(f'Sample <{item}> has a min expression : {data.min():9.4f}')
	print()

	# Remove every flagged sample column from the tpm table
	tissue_tpm = tissue_tpm.drop(columns = sample_filter)

In [None]:
# Check for sample matching between tpm and metadata tables
#
# The first pass reports the mismatch and synchronizes both tables, the second
# pass reports the state after the synchronization.

for stage in ('before', 'after') :
	x = set(tissue_metadata['Sample'].dropna())
	y = set(tissue_tpm.columns[1:].dropna())

	# Sorted, because set iteration order of strings changes between runs and
	# would make the printed preview irreproducible
	sample_i = sorted(x.intersection(y), key = str)
	sample_x = sorted(x.difference(y),   key = str)
	sample_y = sorted(y.difference(x),   key = str)

	print(f'Samples in <tissue-metadata> : {len(sample_i)} / {len(x)}')
	print(f'Samples in <tissue-tpm>      : {len(sample_i)} / {len(y)}')
	print()
	print(f'Samples missing in <tissue-metadata> : [{len(sample_y)}] ' + ' '.join(sample_y[:5]) + (' ...' if len(sample_y) > 5 else ''))
	print(f'Samples missing in <tissue-tpm>      : [{len(sample_x)}] ' + ' '.join(sample_x[:5]) + (' ...' if len(sample_x) > 5 else ''))
	print()

	if stage == 'before' :
		# Metadata rows without a tpm column are dropped
		if len(sample_x) > 0 :
			tissue_metadata = tissue_metadata[~tissue_metadata['Sample'].isin(sample_x)]

		# Tpm columns without a metadata row are dropped
		if len(sample_y) > 0 :
			tissue_tpm = tissue_tpm.drop(columns = sample_y)

In [None]:
# Filter genes that are problematic globally (looks at entire transcript through all samples)

dataframe, gene_filter = analysis_tpm.filter_genes(
	data   = tissue_tpm,
	cutoff = FILTER['transcript_filter_global']
)

gene_filter = [*gene_filter.keys()]

if gene_filter :
	# Show the expression summary of the first flagged entry as an example
	item = gene_filter[0]
	data = dataframe[item].to_numpy()

	print()
	print(f'Gene <{item}> has a sum expression : {data.sum():9.4f}')
	print(f'Gene <{item}> has a avg expression : {data.mean():9.4f}')
	print(f'Gene <{item}> has a max expression : {data.max():9.4f}')
	print(f'Gene <{item}> has a min expression : {data.min():9.4f}')
	print()

# Flagged entries are only recorded here; the drop list is finalised before saving
FILTER['data']['drop_transcript'] += gene_filter

In [None]:
# Check for gene matching between tpm and annotation tables
#
# The first pass reports the mismatch and synchronizes both tables, the second
# pass reports the state after the synchronization.

for stage in ('before', 'after') :
	x = set(tissue_tpm['Transcript'].dropna())
	y = set(genome_annotation['Transcript'].dropna())

	# Sorted, because set iteration order of strings changes between runs and
	# would make the printed preview irreproducible
	gene_i = sorted(x.intersection(y), key = str)
	gene_x = sorted(x.difference(y),   key = str)
	gene_y = sorted(y.difference(x),   key = str)

	# These lines count transcripts, not samples
	print(f'Genes in <tissue-tpm>        : {len(gene_i)} / {len(x)}')
	print(f'Genes in <genome-annotation> : {len(gene_i)} / {len(y)}')
	print()
	print(f'Genes missing in <tissue-tpm>        : [{len(gene_y)}] ' + ' '.join(gene_y[:5]) + (' ...' if len(gene_y) > 5 else ''))
	print(f'Genes missing in <genome-annotation> : [{len(gene_x)}] ' + ' '.join(gene_x[:5]) + (' ...' if len(gene_x) > 5 else ''))
	print()

	if stage == 'before' :
		# Tpm rows without an annotation entry are dropped
		if len(gene_x) > 0 :
			tissue_tpm = tissue_tpm[~tissue_tpm['Transcript'].isin(gene_x)]

		# Annotation rows without a tpm entry are dropped
		if len(gene_y) > 0 :
			genome_annotation = genome_annotation[~genome_annotation['Transcript'].isin(gene_y)]

# 5. Filter Groups

In [None]:
# Filter genes that are problematic locally (looks at entire transcript through grouped samples; at least one group must satisfy the filter to keep)

tissue_cutoff = FILTER['transcript_filter_tissue']

gene_common, gene_filter = analysis_tpm.filter_genes_per_group(
	metadata = tissue_metadata,
	tpm      = tissue_tpm,
	group    = 'Tissue',
	cutoff   = tissue_cutoff
)

# Only the common entries are recorded for removal
FILTER['data']['drop_transcript'] += gene_common

In [None]:
# Check for gene matching between tpm and annotation tables
#
# The first pass reports the mismatch and synchronizes both tables, the second
# pass reports the state after the synchronization.

for stage in ('before', 'after') :
	x = set(tissue_tpm['Transcript'].dropna())
	y = set(genome_annotation['Transcript'].dropna())

	# Sorted, because set iteration order of strings changes between runs and
	# would make the printed preview irreproducible
	gene_i = sorted(x.intersection(y), key = str)
	gene_x = sorted(x.difference(y),   key = str)
	gene_y = sorted(y.difference(x),   key = str)

	# These lines count transcripts, not samples
	print(f'Genes in <tissue-tpm>        : {len(gene_i)} / {len(x)}')
	print(f'Genes in <genome-annotation> : {len(gene_i)} / {len(y)}')
	print()
	print(f'Genes missing in <tissue-tpm>        : [{len(gene_y)}] ' + ' '.join(gene_y[:5]) + (' ...' if len(gene_y) > 5 else ''))
	print(f'Genes missing in <genome-annotation> : [{len(gene_x)}] ' + ' '.join(gene_x[:5]) + (' ...' if len(gene_x) > 5 else ''))
	print()

	if stage == 'before' :
		# Tpm rows without an annotation entry are dropped
		if len(gene_x) > 0 :
			tissue_tpm = tissue_tpm[~tissue_tpm['Transcript'].isin(gene_x)]

		# Annotation rows without a tpm entry are dropped
		if len(gene_y) > 0 :
			genome_annotation = genome_annotation[~genome_annotation['Transcript'].isin(gene_y)]

# 6. Filter Samples

In [None]:
# Filter specific tags
#
# Each enabled filter keeps only the metadata rows whose column value is in the
# allowed list; filters are applied in the order listed below.

TAG_FILTERS = (
	('keep_age',          'Age'),
	('keep_perturbation', 'Perturbation'),
	('keep_tissue',       'Tissue'),
	('keep_group',        'Group')
)

for key, column in TAG_FILTERS :
	allowed = FILTER[key]

	# None disables the filter
	if allowed is None :
		continue

	rows_before     = len(tissue_metadata)
	tissue_metadata = tissue_metadata[tissue_metadata[column].isin(allowed)].copy()
	sample_count    = rows_before - len(tissue_metadata)

	print('Filterd [{:4d}] samples due to [{:12s}] not beeing in [{}]'.format(sample_count, column, ', '.join(allowed)))
	print()

In [None]:
# Check for sample matching between tpm and metadata tables
#
# The first pass reports the mismatch and synchronizes both tables, the second
# pass reports the state after the synchronization.

for stage in ('before', 'after') :
	x = set(tissue_metadata['Sample'].dropna())
	y = set(tissue_tpm.columns[1:].dropna())

	# Sorted, because set iteration order of strings changes between runs and
	# would make the printed preview irreproducible
	sample_i = sorted(x.intersection(y), key = str)
	sample_x = sorted(x.difference(y),   key = str)
	sample_y = sorted(y.difference(x),   key = str)

	print(f'Samples in <tissue-metadata> : {len(sample_i)} / {len(x)}')
	print(f'Samples in <tissue-tpm>      : {len(sample_i)} / {len(y)}')
	print()
	print(f'Samples missing in <tissue-metadata> : [{len(sample_y)}] ' + ' '.join(sample_y[:5]) + (' ...' if len(sample_y) > 5 else ''))
	print(f'Samples missing in <tissue-tpm>      : [{len(sample_x)}] ' + ' '.join(sample_x[:5]) + (' ...' if len(sample_x) > 5 else ''))
	print()

	if stage == 'before' :
		# Metadata rows without a tpm column are dropped
		if len(sample_x) > 0 :
			tissue_metadata = tissue_metadata[~tissue_metadata['Sample'].isin(sample_x)]

		# Tpm columns without a metadata row are dropped
		if len(sample_y) > 0 :
			tissue_tpm = tissue_tpm.drop(columns = sample_y)

# 7. Filter Genes

In [None]:
# Filter specific tags
#
# Keeps only the annotation rows on an allowed strand and reports how many
# genes disappeared as a result.

strands = FILTER['keep_strand']

if strands is not None :
	genes_before      = genome_annotation['Gene'].nunique()
	genome_annotation = genome_annotation[genome_annotation['Strand'].isin(strands)].copy()
	gene_count        = genes_before - genome_annotation['Gene'].nunique()

	print('Filterd [{:4d}] genes due to [{:12s}] not beeing in [{}]'.format(gene_count, 'Strand', ', '.join(strands)))
	print()

In [None]:
# Check for gene matching between tpm and annotation tables
#
# The first pass reports the mismatch and synchronizes both tables, the second
# pass reports the state after the synchronization.

for stage in ('before', 'after') :
	x = set(tissue_tpm['Transcript'].dropna())
	y = set(genome_annotation['Transcript'].dropna())

	# Sorted, because set iteration order of strings changes between runs and
	# would make the printed preview irreproducible
	gene_i = sorted(x.intersection(y), key = str)
	gene_x = sorted(x.difference(y),   key = str)
	gene_y = sorted(y.difference(x),   key = str)

	# These lines count transcripts, not samples
	print(f'Genes in <tissue-tpm>        : {len(gene_i)} / {len(x)}')
	print(f'Genes in <genome-annotation> : {len(gene_i)} / {len(y)}')
	print()
	print(f'Genes missing in <tissue-tpm>        : [{len(gene_y)}] ' + ' '.join(gene_y[:5]) + (' ...' if len(gene_y) > 5 else ''))
	print(f'Genes missing in <genome-annotation> : [{len(gene_x)}] ' + ' '.join(gene_x[:5]) + (' ...' if len(gene_x) > 5 else ''))
	print()

	if stage == 'before' :
		# Tpm rows without an annotation entry are dropped
		if len(gene_x) > 0 :
			tissue_tpm = tissue_tpm[~tissue_tpm['Transcript'].isin(gene_x)]

		# Annotation rows without a tpm entry are dropped
		if len(gene_y) > 0 :
			genome_annotation = genome_annotation[~genome_annotation['Transcript'].isin(gene_y)]

# 8. Save

In [None]:
# Modify transcripts
#
# Restrict the collected drop list to transcripts that are still present in the
# tpm table, and keep every other remaining transcript.

drop_transcript = set(FILTER['data']['drop_transcript'])
keep_transcript = set(tissue_tpm['Transcript'].tolist())

drop_transcript = drop_transcript & keep_transcript
keep_transcript = keep_transcript - drop_transcript

# Stored sorted, because set iteration order of strings changes between runs
# and would otherwise produce a different filter.json for identical inputs
FILTER['data'] = {
	'keep_transcript' : sorted(keep_transcript, key = str),
	'drop_transcript' : sorted(drop_transcript, key = str)
}

In [None]:
# Safety check
#
# Compares the stored keep list (x) against the transcripts present in the tpm
# table (y); the two numbers on each of the last two lines are expected to agree.

x = set(FILTER['data']['keep_transcript'])
y = set(tissue_tpm['Transcript'].dropna())

sample_i = list(x & y)
sample_x = list(x - y)
sample_y = list(y - x)

keep_total = len(FILTER['data']['keep_transcript'])
drop_total = len(FILTER['data']['drop_transcript'])

print('Total transcript : {:5d} | {:5s} [y    ]'.format(len(y), ''))
print(' Keep transcript : {:5d} | {:5s} [x    ]'.format(len(x), ''))
print(' Keep transcript : {:5d} | {:5d} [y & x]'.format(len(sample_i), keep_total))
print(' Drop transcript : {:5d} | {:5d} [y - x]'.format(len(sample_y), drop_total))
print()

In [None]:
# Save the updated and synchronized tables
#
# The three tables are written as csv files and the filter configuration as
# json, all into the output folder of the selected filter.

CSV_TABLES = (
	(tissue_metadata,   'tissue-metadata.csv'),
	(tissue_tpm,        'tissue-tpm.csv'),
	(genome_annotation, 'gene-annotation.csv')
)

for table, name in CSV_TABLES :
	writer.write_csv(
		data        = table,
		filename    = os.path.join(OUT_DATA, name),
		write_index = False
	)

writer.write_json(
	data     = FILTER,
	filename = os.path.join(OUT_DATA, 'filter.json')
)

# 9. Plot

In [None]:
# Display the distribution of groups for all samples
#
# One bar plot per metadata column; the column name is the capitalized group
# name and the figure is stored under the plot output folder.

for group in ['tissue', 'age', 'group', 'perturbation'] :
	analysis_metadata.distribution_barplot(
		data     = tissue_metadata,
		group    = group.capitalize(),
		filename = os.path.join(OUT_PLOT, 'distribution-' + group)
	)

	# Uses the explicitly imported pyplot module; matplotlib.pyplot is only
	# reachable through the bare package after something else has imported it
	pyplot.show()

In [None]:
# Display the region length distributions per each region
#
# Every region is paired with the position of the vertical marker line passed
# to the plotting helper; warnings raised while plotting are silenced.

REGION_VLINES = [('mRNA', 0), ('UTR5', 300), ('CDS', 0), ('UTR3', 350)]

with warnings.catch_warnings() :
	warnings.simplefilter('ignore')

	for region, marker in REGION_VLINES :
		analysis_annotation.length_histplot(
			data     = genome_annotation,
			value    = region,
			vline    = marker,
			filename = os.path.join(OUT_PLOT, f'length-region-{region.lower()}')
		)

In [None]:
# Display the region length distributions per each region (grouped regions)
#
# The annotation is first grouped per transcript, then the same histograms are
# drawn from the grouped table; warnings raised while plotting are silenced.

region_names  = ['mRNA', 'UTR5', 'CDS', 'UTR3']
region_vlines = [0, 300, 0, 350]

dataframe = analysis_annotation.group_regions(
	data    = genome_annotation,
	groupby = 'Transcript',
	regions = region_names
)

with warnings.catch_warnings() :
	warnings.simplefilter('ignore')

	for index, region in enumerate(region_names) :
		analysis_annotation.length_histplot(
			data     = dataframe,
			value    = region,
			vline    = region_vlines[index],
			filename = os.path.join(OUT_PLOT, f'length-transcript-{region.lower()}')
		)