In [None]:
# Libraries

import itertools
import numpy
import os
import platform
import shutil
import sys
import warnings

In [None]:
# Ensure source path
#
# Walk upwards from the current working directory until a path ending in
# 'upolanc-thesis' is found. When no such ancestor exists a per-platform default
# is used instead. The resulting root is appended to sys.path (so that the
# project imports resolve) and becomes the working directory.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	parent = os.path.abspath(os.path.join(ROOT, os.pardir))

	# A filesystem root is its own parent. Testing for that, and not only for a
	# short path, also ends the walk on long roots (e.g. a Windows UNC share)
	# where the length test alone would never trigger and the loop would spin.
	if parent == ROOT or len(parent) < len('upolanc-thesis') :
		if   platform.system() == 'Linux'   : ROOT = '/d/hpc/projects/FRI/up4472/upolanc-thesis'
		elif platform.system() == 'Windows' : ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else : raise ValueError(f'No default project directory is known for platform : {platform.system()}')

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		print()

		break

	ROOT = parent

if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [None]:
# Code
#
# Project-local modules ; they resolve because the project root was appended to sys.path above.

from source.python import runtime
from source.python.bert import bert_data
from source.python.data.feature import feature_target
from source.python.dataset import dataset_split
from source.python.io import loader, writer

# Run the runtime helpers that configure numpy / pandas formatting and the plot theme (per their names)
for configure in (runtime.set_numpy_format, runtime.set_pandas_format, runtime.set_plot_theme) :
	configure()

# 1. Setup

In [None]:
# Setup some directory paths
#
# Inputs are read from the nbp01 / nbp02 / nbp04 output folders of the selected
# filter ; everything written by this notebook goes under nbp05-target.

FILTER_ID = 2
SUBFOLDER = f'filter{FILTER_ID}'

CWD = ROOT
OUT = os.path.join(CWD, 'output')
RES = os.path.join(CWD, 'resources')

RES_NBP01 = os.path.join(OUT, 'nbp01-filter',  SUBFOLDER)
RES_NBP02 = os.path.join(OUT, 'nbp02-anndata', SUBFOLDER)
RES_NBP04 = os.path.join(OUT, 'nbp04-feature', SUBFOLDER)

OUT_DATA = os.path.join(OUT, 'nbp05-target', SUBFOLDER)
OUT_PLOT = os.path.join(OUT_DATA, 'plot')

# Start from an empty output folder : whatever a previous run left behind is deleted
# (errors during the removal are ignored)
shutil.rmtree(OUT_DATA, ignore_errors = True)

for directory in (OUT_DATA, OUT_PLOT) :
	os.makedirs(directory, exist_ok = True)

for line in (
	f'     Root Directory : {CWD}',
	f'   Output Directory : {OUT_DATA}',
	f'   Output Directory : {OUT_PLOT}',
	f' Resource Directory : {RES_NBP01}',
	f' Resource Directory : {RES_NBP02}',
	f' Resource Directory : {RES_NBP04}'
) :
	print(line)

print()

In [None]:
# Load the annotated and cleaned data

def _load_fasta_nbp04(basename) :
	"""Load a FASTA file from the nbp04-feature folder, passing to_string = True to the loader."""

	return loader.load_fasta(
		filename  = os.path.join(RES_NBP04, basename),
		to_string = True
	)

anndata = loader.load_h5ad(filename = os.path.join(RES_NBP04, 'arabidopsis-r36.h5ad'))

# Sequences ; every region comes as a keep / drop pair of files
sequence_prom_utr5_keep = _load_fasta_nbp04('sequences-promoter-utr5-keep.fasta')
sequence_prom_utr5_drop = _load_fasta_nbp04('sequences-promoter-utr5-drop.fasta')
sequence_prom_keep      = _load_fasta_nbp04('sequences-promoter-full-keep.fasta')
sequence_prom_drop      = _load_fasta_nbp04('sequences-promoter-full-drop.fasta')
sequence_2150_keep      = _load_fasta_nbp04('sequences-2150-keep.fasta')
sequence_2150_drop      = _load_fasta_nbp04('sequences-2150-drop.fasta')

# Base features, again as a keep / drop pair
feature_base_keep = loader.load_npz(filename = os.path.join(RES_NBP04, 'features-base-keep.npz'))
feature_base_drop = loader.load_npz(filename = os.path.join(RES_NBP04, 'features-base-drop.npz'))

# Filter description written by nbp01-filter
filter_dict = loader.load_json(filename = os.path.join(RES_NBP01, 'filter.json'))

In [None]:
# Filtered transcripts
#
# The transcripts to keep / drop, read from the 'data' entry of the loaded filter description.

keep_transcript, drop_transcript = (
	filter_dict['data'][key]
	for key in ('keep_transcript', 'drop_transcript')
)

In [None]:
# Load anndata information
#
# Four json files from the nbp02-anndata folder, loaded in the order they are listed.

mapping_layer, factor_boxcox, factor_normal, factor_standard = (
	loader.load_json(filename = os.path.join(RES_NBP02, basename))
	for basename in (
		'layer-mapping.json',
		'factors-boxcox.json',
		'factors-normal.json',
		'factors-standard.json'
	)
)

In [None]:
# Constants

# Switches that enable the individual bert dataset cells of section 5
GENERATE_BERT_TRANSCRIPT_2150 = True
GENERATE_BERT_PROMOTER_512 = True
GENERATE_BERT_PROMOTER_4096 = True
GENERATE_BERT_PROMOTER_UTR5_4096 = True

# Anndata layer the targets are read from, and the outlier handling passed to the extraction
TARGET_LAYER = 'boxcox1p'
OUTLIER_FILTER = 'zscore'
OUTLIER_PARAMS = {'factor-zscore' : 3.0, 'factor-iqr' : 1.5}

# K-mer sizes handed to the bert k-mer generation
KMERS = [3, 4, 5, 6]

# NOTE(review): seed = None with generate = True presumably lets the helper choose the
# seed itself ; it is printed below so that a run can be repeated - confirm in runtime.lock_random
RANDOM_SEED = runtime.lock_random(seed = None, generate = True)

print(f'Random Seed : {RANDOM_SEED}')
print()

# 2. Regression

## 2.1 Extraction

In [None]:
# Extract the transcript tpm values

# Aggregations as (name, callable) pairs ; only a nan-ignoring mean is requested
aggregations = [
	('mean', lambda x, axis : numpy.nanmean(x, axis = axis))
]

with warnings.catch_warnings() :
	# Every warning raised while extracting is silenced
	warnings.simplefilter('ignore')

	values, order = feature_target.extract_tpm_multi(
		data = anndata, layer = TARGET_LAYER, verbose = True,
		groups = ['Tissue'], functions = aggregations,
		outlier_filter = OUTLIER_FILTER, outlier_params = OUTLIER_PARAMS
	)

In [None]:
# Optional stop after the extraction step.
#
# This cell used to raise unconditionally, which aborted every "Run All" at this
# point. The stop is now opt-in : set the flag to True to halt here on purpose.

STOP_AFTER_EXTRACTION = False

if STOP_AFTER_EXTRACTION :
	raise ValueError('Stopped after the tpm extraction because STOP_AFTER_EXTRACTION is set')

In [None]:
# Extend to include the global mean and max per transcript

# Without a target layer the default matrix X is used, otherwise the selected layer
matrix = anndata.X if TARGET_LAYER is None else anndata.layers[TARGET_LAYER]

for position, transcript in enumerate(anndata.var.index) :
	# One matrix column per entry of anndata.var.index
	column = matrix[:, position]
	entry  = values[transcript]

	# Stored as single-element lists
	entry['global-mean'] = [numpy.nanmean(column)]
	entry['global-max']  = [numpy.nanmax (column)]

order['global'] = ['global']

In [None]:
# Define what values to filter out (none means keep all)
#
# For every grouping with a keep-list, `order` is reduced to the listed names
# that actually occur in it, and every value list whose key starts with the
# grouping name is reduced to the matching positions.

filters = {
	'tissue'       : None,
	'age'          : None,
	'group'        : ['mature_leaf', 'mature_flower', 'mature_root', 'mature_seed', 'young_seedling'],
	'perturbation' : None
}

for key, keep in filters.items() :
	if keep is None :
		continue

	# A grouping that is not present in `order` has nothing to filter ; report it
	# and move on instead of failing with a KeyError on order[key]
	if key not in order :
		print(f'Warning : grouping <{key}> is not present in the extracted order, filter ignored')
		continue

	keep  = [x for x in keep if x in order[key]]
	index = [order[key].index(x) for x in keep]

	order[key] = keep

	for transcript in values.keys() :
		for group, array in values[transcript].items() :
			if not group.startswith(key.lower()) :
				continue

			# Existing keys are reassigned only, so the dict size does not change while iterating
			values[transcript][group] = [array[x] for x in index]

In [None]:
# Display an example of the transcript tpm values (first transcript of anndata.var)

gene = anndata.var.index.to_list()[0]

for key, value in values[gene].items() :
	# One line per entry : key, number of values, then the values themselves
	formatted = ' '.join(f'{x:7.5f}' for x in value)

	print(f'{key:17s} : [{len(value):2d}] ' + formatted)

In [None]:
# Split into keep and drop

# keep_transcript / drop_transcript come from a json file (presumably as lists) ;
# set copies keep every membership test below constant-time instead of a linear scan
keep_lookup = set(keep_transcript)
drop_lookup = set(drop_transcript)

values_keep = {k : v for k, v in values.items() if k in keep_lookup}
values_drop = {k : v for k, v in values.items() if k in drop_lookup}

print(' Keep : {:5d}'.format(len(values_keep)))
print(' Drop : {:5d}'.format(len(values_drop)))
print('Total : {:5d} / {:5d}'.format(len(values_drop) + len(values_keep), len(values)))
print()

In [None]:
# Safety checks for nans
#
# Prints every kept transcript that has a nan in one of its '<group>-<function>' entries.

for t0, t1 in itertools.product(['tissue', 'age', 'group', 'perturbation', 'global'], ['mean', 'max']) :
	t = '{}-{}'.format(t0, t1)

	for k, v in values_keep.items() :
		# Not every group / function pair has to exist for a transcript (the extraction
		# above requests a single group and a single function) ; skip the missing ones
		# instead of failing with a KeyError
		if t not in v :
			continue

		if numpy.isnan(v[t]).any() :
			print('{} {:30s} {}'.format(k, t, v[t]))

print()

## 2.2 Distribution

In [None]:
# Compute distributions of the 'mean' values, first for the keep and then for the drop split

distribution_keep, distribution_drop = (
	feature_target.distribution_group(
		data   = subset,
		order  = order,
		select = 'mean',
		genes  = list(subset)
	)
	for subset in (values_keep, values_drop)
)

In [None]:
# Plot the keep and drop value distributions for the 'global' grouping

groupby = 'global'
outfile = os.path.join(OUT_PLOT, f'distribution-value-{groupby}-mean')

feature_target.distribution_histplot(
	data = [distribution_keep, distribution_drop], names = ['keep', 'drop'],
	groupby = groupby, discrete = False, title = False, filename = outfile
)

In [None]:
# Plot the keep and drop value distributions for the 'tissue' grouping

groupby = 'tissue'
outfile = os.path.join(OUT_PLOT, f'distribution-value-{groupby}-mean')

feature_target.distribution_histplot(
	data = [distribution_keep, distribution_drop], names = ['keep', 'drop'],
	groupby = groupby, discrete = False, title = False, filename = outfile
)

In [None]:
# Plot the keep and drop value distributions for the 'group' grouping

groupby = 'group'
outfile = os.path.join(OUT_PLOT, f'distribution-value-{groupby}-mean')

feature_target.distribution_histplot(
	data = [distribution_keep, distribution_drop], names = ['keep', 'drop'],
	groupby = groupby, discrete = False, title = False, filename = outfile
)

## 2.3 Save

In [None]:
# Save the target transcript tpm values (keep split, drop split and the group order)

for payload, basename in (
	(values_keep, 'target-values-keep.json'),
	(values_drop, 'target-values-drop.json'),
	(order,       'target-order.json')
) :
	writer.write_json(data = payload, filename = os.path.join(OUT_DATA, basename))

# 3. Classification

## 3.1 Extraction

In [None]:
# Classify the transcript tpm values per group into 5 classes ; works on the values
# extracted in the regression section (keep and drop together)

labels, bounds = feature_target.classify_tpm(data = values, classes = 5)

In [None]:
# Display bounds in different configurations (start, values and mapping are manually set based on nbp02-anndata)

# Factors taken from the nbp02-anndata json files loaded earlier
bounds_factors = {
	'min_value'  : factor_normal['min'],
	'max_value'  : factor_normal['max'],
	'box_lambda' : factor_boxcox['lambda'],
	'log_base'   : 2
}

feature_target.display_bounds_mapping(
	bounds = bounds['tissue-mean'], start = TARGET_LAYER,
	values = bounds_factors, mapping = mapping_layer
)

In [None]:
# Display an example of the transcript classification (first transcript of anndata.var)

gene = anndata.var.index.to_list()[0]

for key, value in labels[gene].items() :
	# One line per entry : key, number of labels, then the labels themselves
	formatted = ' '.join(f'{x:1d}' for x in value)

	print(f'{key:17s} : [{len(value):2d}] ' + formatted)

In [None]:
# Split into keep and drop

# keep_transcript / drop_transcript come from a json file (presumably as lists) ;
# set copies keep every membership test below constant-time instead of a linear scan
keep_lookup = set(keep_transcript)
drop_lookup = set(drop_transcript)

labels_keep = {k : v for k, v in labels.items() if k in keep_lookup}
labels_drop = {k : v for k, v in labels.items() if k in drop_lookup}

print(' Keep : {:5d}'.format(len(labels_keep)))
print(' Drop : {:5d}'.format(len(labels_drop)))
print('Total : {:5d} / {:5d}'.format(len(labels_drop) + len(labels_keep), len(labels)))
print()

## 3.2 Distribution

In [None]:
# Compute distributions of the 'mean' labels, first for the keep and then for the drop split
#
# Note : this rebinds distribution_keep / distribution_drop, which held the value
# distributions of the regression section up to this point.

distribution_keep, distribution_drop = (
	feature_target.distribution_group(
		data   = subset,
		order  = order,
		select = 'mean',
		genes  = list(subset)
	)
	for subset in (labels_keep, labels_drop)
)

In [None]:
# Plot the keep and drop class distributions for the 'tissue' grouping

groupby = 'tissue'
outfile = os.path.join(OUT_PLOT, f'distribution-class-{groupby}-mean')

feature_target.distribution_histplot(
	data = [distribution_keep, distribution_drop], names = ['keep', 'drop'],
	groupby = groupby, discrete = True, title = False, filename = outfile
)

In [None]:
# Plot the keep and drop class distributions for the 'group' grouping

groupby = 'group'
outfile = os.path.join(OUT_PLOT, f'distribution-class-{groupby}-mean')

feature_target.distribution_histplot(
	data = [distribution_keep, distribution_drop], names = ['keep', 'drop'],
	groupby = groupby, discrete = True, title = False, filename = outfile
)

In [None]:
# Plot the keep and drop class distributions for the 'global' grouping

groupby = 'global'
outfile = os.path.join(OUT_PLOT, f'distribution-class-{groupby}-mean')

feature_target.distribution_histplot(
	data = [distribution_keep, distribution_drop], names = ['keep', 'drop'],
	groupby = groupby, discrete = True, title = False, filename = outfile
)

## 3.3 Save

In [None]:
# Save the target transcript classification (keep split, then drop split)

for payload, basename in (
	(labels_keep, 'target-labels-keep.json'),
	(labels_drop, 'target-labels-drop.json')
) :
	writer.write_json(data = payload, filename = os.path.join(OUT_DATA, basename))

# 4. Mapping

## 4.1 Binarizer

In [None]:
# Create binarizers and mapping lookup from the values, labels and group order
# (all transcripts ; the keep / drop split is made in a later cell)

features_binarizer, features_grouped, features_exploded = feature_target.create_mapping(values = values, labels = labels, order = order)

In [None]:
# Show keys and labels (at most the first 8 classes per binarizer)

for key, binarizer in features_binarizer.items() :
	known = binarizer.classes_

	# Longer class lists are cut to 8 entries and marked with a trailing ellipsis
	shown, suffix = (known[:8], ' ...') if len(known) > 8 else (known, '')

	print('{:17s} : {}'.format(key, shown) + suffix)

print()

In [None]:
# Split keep and drop

def _select_transcripts(frames, transcripts) :
	"""For a dict of dataframes, return per key a copy of the rows whose 'Transcript' is in transcripts."""

	return {
		key : dataframe[dataframe['Transcript'].isin(transcripts)].copy()
		for key, dataframe in frames.items()
	}

features_grouped_keep  = _select_transcripts(features_grouped,  keep_transcript)
features_exploded_keep = _select_transcripts(features_exploded, keep_transcript)
features_grouped_drop  = _select_transcripts(features_grouped,  drop_transcript)
features_exploded_drop = _select_transcripts(features_exploded, drop_transcript)

## 4.2 Save

In [None]:
# Save the grouped and exploded mapping lookups of both splits
# (the binarizers themselves are not written by this cell)

for payload, basename in (
	(features_grouped_keep,  'mapping-grouped-keep.pkl'),
	(features_exploded_keep, 'mapping-exploded-keep.pkl'),
	(features_grouped_drop,  'mapping-grouped-drop.pkl'),
	(features_exploded_drop, 'mapping-exploded-drop.pkl')
) :
	writer.write_pickle(data = payload, filename = os.path.join(OUT_DATA, basename))

# 5. Bert

In [None]:
# Define generated combinations
#
# Four-field tuples handed to bert_data.data_prepare by the cells below. The first
# two fields name a grouping and a function ('mean').
# NOTE(review): the meaning of the third (bool) and fourth (name or None) field is
# defined by data_prepare and is not visible here - confirm before editing.

combinations = [('global', 'mean', False, None)]

# 'tissue' and 'group' each contribute the same three variants, the last one
# carrying a single named entry of that grouping
for grouping, single in (('tissue', 'seedling'), ('group', 'young_seedling')) :
	combinations.extend([
		(grouping, 'mean', False, None),
		(grouping, 'mean', True,  None),
		(grouping, 'mean', True,  single)
	])

## 5.1 Transcript - 2150bp

In [None]:
# Constants ; used for the output folder name '<name>-<size>' by the two cells that follow

name, size = 'transcript', 2150

In [None]:
# Generate full transcript tokens for bert model (keep split)
#
# Relies on `name` and `size` from the constants cell directly above.

if GENERATE_BERT_TRANSCRIPT_2150 :
	group     = 'keep'
	sequences = sequence_2150_keep
	features  = feature_base_keep

	# Yields (data, filename) pairs, one per prepared dataset
	prepared = bert_data.data_prepare(
		sequences    = sequences,
		features     = features,
		directory    = OUT_DATA,
		valid_split  = 0.0,
		test_split   = 0.2,
		filename     = f'mapping-grouped-{group}.pkl',
		combinations = combinations
	)

	for data, filename in prepared :
		# The two '{}' placeholders are left unformatted in the template passed on to create_kmers
		template = os.path.join(OUT_DATA, 'dnabert-{}', f'{name}-{size}', f'{filename}', f'{{}}-{group}.tsv')

		bert_data.create_kmers(
			filename    = template,
			generator   = dataset_split.generate_group_shuffle_split,
			max_tokens  = None,
			random_seed = RANDOM_SEED,
			kmers       = KMERS,
			data        = data
		)

In [None]:
# Generate full transcript tokens for bert model (drop split)
#
# Relies on `name` and `size` from the constants cell of this section.

if GENERATE_BERT_TRANSCRIPT_2150 :
	group     = 'drop'
	sequences = sequence_2150_drop
	features  = feature_base_drop

	# Yields (data, filename) pairs, one per prepared dataset
	prepared = bert_data.data_prepare(
		sequences    = sequences,
		features     = features,
		directory    = OUT_DATA,
		valid_split  = 0.0,
		test_split   = 0.2,
		filename     = f'mapping-grouped-{group}.pkl',
		combinations = combinations
	)

	for data, filename in prepared :
		# The two '{}' placeholders are left unformatted in the template passed on to create_kmers
		template = os.path.join(OUT_DATA, 'dnabert-{}', f'{name}-{size}', f'{filename}', f'{{}}-{group}.tsv')

		bert_data.create_kmers(
			filename    = template,
			generator   = dataset_split.generate_group_shuffle_split,
			max_tokens  = None,
			random_seed = RANDOM_SEED,
			kmers       = KMERS,
			data        = data
		)

## 5.2 Promoter - 512bp

In [None]:
# Constants ; used for the output folder name '<name>-<size>' and the token limit by the two cells that follow

name, size = 'promoter', 512

In [None]:
# Generate 512 promoter tokens for bert model (keep split)
#
# Relies on `name` and `size` from the constants cell directly above.

if GENERATE_BERT_PROMOTER_512 :
	group     = 'keep'
	sequences = sequence_prom_keep
	features  = feature_base_keep

	# Yields (data, filename) pairs, one per prepared dataset
	prepared = bert_data.data_prepare(
		sequences    = sequences,
		features     = features,
		directory    = OUT_DATA,
		valid_split  = 0.0,
		test_split   = 0.2,
		filename     = f'mapping-grouped-{group}.pkl',
		combinations = combinations
	)

	for data, filename in prepared :
		# The two '{}' placeholders are left unformatted in the template passed on to create_kmers
		template = os.path.join(OUT_DATA, 'dnabert-{}', f'{name}-{size}', f'{filename}', f'{{}}-{group}.tsv')

		# NOTE(review): the limit is passed negated ; presumably this selects the end of the sequence - confirm in create_kmers
		bert_data.create_kmers(
			filename    = template,
			generator   = dataset_split.generate_group_shuffle_split,
			max_tokens  = -size,
			random_seed = RANDOM_SEED,
			kmers       = KMERS,
			data        = data
		)

In [None]:
# Generate 512 promoter tokens for bert model (drop split)
#
# Relies on `name` and `size` from the constants cell of this section.

if GENERATE_BERT_PROMOTER_512 :
	group     = 'drop'
	sequences = sequence_prom_drop
	features  = feature_base_drop

	# Yields (data, filename) pairs, one per prepared dataset
	prepared = bert_data.data_prepare(
		sequences    = sequences,
		features     = features,
		directory    = OUT_DATA,
		valid_split  = 0.0,
		test_split   = 0.2,
		filename     = f'mapping-grouped-{group}.pkl',
		combinations = combinations
	)

	for data, filename in prepared :
		# The two '{}' placeholders are left unformatted in the template passed on to create_kmers
		template = os.path.join(OUT_DATA, 'dnabert-{}', f'{name}-{size}', f'{filename}', f'{{}}-{group}.tsv')

		# NOTE(review): the limit is passed negated ; presumably this selects the end of the sequence - confirm in create_kmers
		bert_data.create_kmers(
			filename    = template,
			generator   = dataset_split.generate_group_shuffle_split,
			max_tokens  = -size,
			random_seed = RANDOM_SEED,
			kmers       = KMERS,
			data        = data
		)

## 5.3 Promoter - 4096bp

In [None]:
# Constants ; used for the output folder name '<name>-<size>' and the token limit by the two cells that follow

name, size = 'promoter', 4096

In [None]:
# Generate 4096 promoter tokens for bert model (keep split)
#
# Relies on `name` and `size` from the constants cell directly above.

if GENERATE_BERT_PROMOTER_4096 :
	group     = 'keep'
	sequences = sequence_prom_keep
	features  = feature_base_keep

	# Yields (data, filename) pairs, one per prepared dataset
	prepared = bert_data.data_prepare(
		sequences    = sequences,
		features     = features,
		directory    = OUT_DATA,
		valid_split  = 0.0,
		test_split   = 0.2,
		filename     = f'mapping-grouped-{group}.pkl',
		combinations = combinations
	)

	for data, filename in prepared :
		# The two '{}' placeholders are left unformatted in the template passed on to create_kmers
		template = os.path.join(OUT_DATA, 'dnabert-{}', f'{name}-{size}', f'{filename}', f'{{}}-{group}.tsv')

		# NOTE(review): the limit is passed negated ; presumably this selects the end of the sequence - confirm in create_kmers
		bert_data.create_kmers(
			filename    = template,
			generator   = dataset_split.generate_group_shuffle_split,
			max_tokens  = -size,
			random_seed = RANDOM_SEED,
			kmers       = KMERS,
			data        = data
		)

In [None]:
# Generate 4096 promoter tokens for bert model (drop split)
#
# Relies on `name` and `size` from the constants cell of this section.

if GENERATE_BERT_PROMOTER_4096 :
	group     = 'drop'
	sequences = sequence_prom_drop
	features  = feature_base_drop

	# Yields (data, filename) pairs, one per prepared dataset
	prepared = bert_data.data_prepare(
		sequences    = sequences,
		features     = features,
		directory    = OUT_DATA,
		valid_split  = 0.0,
		test_split   = 0.2,
		filename     = f'mapping-grouped-{group}.pkl',
		combinations = combinations
	)

	for data, filename in prepared :
		# The two '{}' placeholders are left unformatted in the template passed on to create_kmers
		template = os.path.join(OUT_DATA, 'dnabert-{}', f'{name}-{size}', f'{filename}', f'{{}}-{group}.tsv')

		# NOTE(review): the limit is passed negated ; presumably this selects the end of the sequence - confirm in create_kmers
		bert_data.create_kmers(
			filename    = template,
			generator   = dataset_split.generate_group_shuffle_split,
			max_tokens  = -size,
			random_seed = RANDOM_SEED,
			kmers       = KMERS,
			data        = data
		)

## 5.4 Promoter - 4096bp (with UTR5)

In [None]:
# Constants ; used for the output folder name '<name>-<size>' and the token limit by the two cells that follow

name, size = 'promoter-utr5', 4096

In [None]:
# Generate 4096 promoter + utr5 tokens for bert model (keep split)
#
# Relies on `name` and `size` from the constants cell directly above.

if GENERATE_BERT_PROMOTER_UTR5_4096 :
	group     = 'keep'
	sequences = sequence_prom_utr5_keep
	features  = feature_base_keep

	# Yields (data, filename) pairs, one per prepared dataset
	prepared = bert_data.data_prepare(
		sequences    = sequences,
		features     = features,
		directory    = OUT_DATA,
		valid_split  = 0.0,
		test_split   = 0.2,
		filename     = f'mapping-grouped-{group}.pkl',
		combinations = combinations
	)

	for data, filename in prepared :
		# The two '{}' placeholders are left unformatted in the template passed on to create_kmers
		template = os.path.join(OUT_DATA, 'dnabert-{}', f'{name}-{size}', f'{filename}', f'{{}}-{group}.tsv')

		# NOTE(review): the limit is passed negated ; presumably this selects the end of the sequence - confirm in create_kmers
		bert_data.create_kmers(
			filename    = template,
			generator   = dataset_split.generate_group_shuffle_split,
			max_tokens  = -size,
			random_seed = RANDOM_SEED,
			kmers       = KMERS,
			data        = data
		)

In [None]:
# Generate 4096 promoter + utr5 tokens for bert model (drop split)
#
# Relies on `name` and `size` from the constants cell of this section.

if GENERATE_BERT_PROMOTER_UTR5_4096 :
	group     = 'drop'
	sequences = sequence_prom_utr5_drop
	features  = feature_base_drop

	# Yields (data, filename) pairs, one per prepared dataset
	prepared = bert_data.data_prepare(
		sequences    = sequences,
		features     = features,
		directory    = OUT_DATA,
		valid_split  = 0.0,
		test_split   = 0.2,
		filename     = f'mapping-grouped-{group}.pkl',
		combinations = combinations
	)

	for data, filename in prepared :
		# The two '{}' placeholders are left unformatted in the template passed on to create_kmers
		template = os.path.join(OUT_DATA, 'dnabert-{}', f'{name}-{size}', f'{filename}', f'{{}}-{group}.tsv')

		# NOTE(review): the limit is passed negated ; presumably this selects the end of the sequence - confirm in create_kmers
		bert_data.create_kmers(
			filename    = template,
			generator   = dataset_split.generate_group_shuffle_split,
			max_tokens  = -size,
			random_seed = RANDOM_SEED,
			kmers       = KMERS,
			data        = data
		)