In [None]:
# Libraries

import numpy
import os
import platform
import shutil
import sys

In [None]:
# Ensure source path
#
# Walk upwards from the current working directory until a path ending with 'upolanc-thesis'
# is found. The resulting directory is appended to sys.path (so that `source.python` can be
# imported) and becomes the working directory. When no such ancestor exists, fall back to a
# hard-coded, platform specific default location.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	parent = os.path.abspath(os.path.join(ROOT, os.pardir))

	# The search is exhausted when the path is too short to ever end with the project name,
	# or when the parent of ROOT is ROOT itself. The second check covers filesystem roots
	# whose path is not shorter than the project name (e.g. a UNC share root), which would
	# otherwise keep this loop spinning forever.
	exhausted = parent == ROOT or len(parent) < len('upolanc-thesis')
	ROOT      = parent

	if exhausted :
		if   platform.system() == 'Linux'   : ROOT = '/d/hpc/projects/FRI/up4472/upolanc-thesis'
		elif platform.system() == 'Windows' : ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else : raise ValueError(f'No default project directory is defined for platform : {platform.system()}')

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		print()

		break

if ROOT not in sys.path :
	sys.path.append(ROOT)

# Relative paths used by the project code are resolved against the project root.
os.chdir(ROOT)

In [None]:
# Code

from source.python                       import runtime
from source.python.dataset               import dataset_split
from source.python.dataset               import dataset_utils
from source.python.dataset.dataset_split import generate_group_shuffle_split
from source.python.dataset.dataset_split import generate_random_shuffle_split
from source.python.dataset.dataset_split import generate_stratified_shuffle_split
from source.python.io                    import loader
from source.python.io                    import writer

# Apply the project-wide display settings before any output is produced.
# NOTE(review): judging by their names these configure numpy / pandas print formatting and
# the plotting theme; the implementations live in source.python.runtime - confirm there.
runtime.set_numpy_format()
runtime.set_pandas_format()
runtime.set_plot_theme()

In [None]:
# System and device
#
# Ask the project runtime module for the compute device (CPU-only mode is not forced) and
# for a mapping describing the system, then print that mapping as an aligned table.

DEVICE = runtime.get_device(only_cpu = False)
SYSTEM = runtime.get_system_info()

for name, info in SYSTEM.items() :
	print(f'{name:25s} : {info}')

print()

# 1. Setup

In [None]:
# Define output and inputs
#
# FILTER_ID     : numeric id of the filter whose outputs are consumed
# SEQUENCE_TYPE : selects which sequence fasta file is loaded further below
# SUBFOLDER     : per-filter subfolder name used when building the resource directories

FILTER_ID     = 2
SEQUENCE_TYPE = 'transcript-2150'
SUBFOLDER     = f'filter{FILTER_ID}'

In [None]:
# Set up the directory paths used by this notebook.

CWD = ROOT
OUT = os.path.join(CWD, 'output')
RES = os.path.join(CWD, 'resources')

# This notebook writes to OUT_DATA and reads the outputs of notebooks nbp04 and nbp05.
OUT_DATA  = os.path.join(OUT, 'nbp11-tensorflow')
RES_NBP04 = os.path.join(OUT, 'nbp04-feature', SUBFOLDER)
RES_NBP05 = os.path.join(OUT, 'nbp05-target',  SUBFOLDER)

# Always start from an empty output directory : anything left from a previous run is removed.
shutil.rmtree(OUT_DATA, ignore_errors = True)
os.makedirs(OUT_DATA, exist_ok = True)

for label, path in (
	('     Root Directory', CWD),
	('   Output Directory', OUT_DATA),
	(' Resource Directory', RES_NBP04),
	(' Resource Directory', RES_NBP05)
) :
	print(f'{label} : {path}')

print()

In [None]:
# Define sequence paths
#
# Map every supported sequence type to its fasta file inside the nbp04 resource directory.
# An unsupported SEQUENCE_TYPE fails here with a clear message, instead of leaving
# `sequence_keep` undefined (or silently stale from an earlier run of this cell).

SEQUENCE_FILES = {
	'transcript-6150'    : 'sequences-6150-keep.fasta',
	'transcript-2150'    : 'sequences-2150-keep.fasta',
	'promoter-full-5000' : 'sequences-promoter-full-keep.fasta',
	'promoter-utr5-5000' : 'sequences-promoter-utr5-keep.fasta'
}

if SEQUENCE_TYPE not in SEQUENCE_FILES :
	raise ValueError(f'Unknown sequence type : {SEQUENCE_TYPE}, expected one of : {list(SEQUENCE_FILES)}')

sequence_keep = os.path.join(RES_NBP04, SEQUENCE_FILES[SEQUENCE_TYPE])

In [None]:
# Load the input transcript features
#
# NOTE(review): `sequence_keep` holds the fasta path on entry and is rebound to the loaded
# sequences here, so this cell can only be re-run after the path cell has been re-run.

sequence_keep = loader.load_fasta(filename = sequence_keep, to_string = True)

feature_path      = os.path.join(RES_NBP04, 'features-base-keep.npz')
feature_base_keep = loader.load_npz(filename = feature_path)

In [None]:
# Configuration file
#
# Flat configuration mapping; keys are namespaced with '/' (core, dataset, model).

CONFIG = {
	# core
	'core/device'             : DEVICE,
	'core/random'             : None,
	'core/rootdir'            : ROOT,
	'core/verbose'            : True,
	# dataset
	'dataset/batch/test'      : 32,
	'dataset/batch/train'     : 32,
	'dataset/batch/valid'     : 32,
	'dataset/expanddim'       : None,
	'dataset/sequence/start'  : None,
	'dataset/sequence/end'    : None,
	'dataset/sequence/type'   : SEQUENCE_TYPE,
	'dataset/split/generator' : 'group',
	'dataset/split/test'      : 0.2,
	'dataset/split/valid'     : 0.2,
	# model
	'model/mode'              : 'regression',
	'model/output/explode'    : False,
	'model/output/filter'     : None,
	'model/output/heads'      : None,
	'model/output/size'       : None,
	'model/output/target'     : 'global',
	'model/output/type'       : 'mean',
}

# Replace the seed placeholder with the value returned by the runtime module.
# NOTE(review): with seed = None and generate = True this presumably draws a fresh seed and
# seeds the random generators - confirm in runtime.lock_random.
CONFIG['core/random'] = runtime.lock_random(seed = CONFIG['core/random'], generate = True)

In [None]:
# Prints
#
# Report the configured output target / type and the random seed in use.

for label, option in (
	('Output Target', 'model/output/target'),
	('Output Type',   'model/output/type'),
	('Random Seed',   'core/random')
) :
	print(f'{label:13s} : {CONFIG[option]}')

print()

# 2. Dataset

In [None]:
# Create dataset
#
# Build the dataset from the loaded sequences and features together with the grouped target
# mapping stored in the nbp05 resource directory. The call returns four values : the dataset,
# a dataframe, the target values and the target order.

dataset_bundle = dataset_utils.get_dataset(
	config    = CONFIG,
	sequence  = sequence_keep,
	feature   = feature_base_keep,
	directory = RES_NBP05,
	filename  = 'mapping-grouped-keep.pkl',
	cached    = None,
	start     = None,
	end       = None
)

dataset_keep, dataframe_keep, target_value_keep, target_order_keep = dataset_bundle

In [None]:
# Split dataset into train, valid and test
#
# The split strategy is chosen by the prefix of CONFIG['dataset/split/generator']. Invalid
# values fail here with an explicit error, instead of falling through to an undefined (or
# stale) `generator` name.

split_name = CONFIG['dataset/split/generator']

if not isinstance(split_name, str) :
	raise TypeError(f'Split generator must be a string, got : {type(split_name).__name__}')

if   split_name.startswith('stratified') : split_function = generate_stratified_shuffle_split
elif split_name.startswith('group')      : split_function = generate_group_shuffle_split
elif split_name.startswith('random')     : split_function = generate_random_shuffle_split
else : raise ValueError(f'Unknown split generator : {split_name}')

generator = split_function(
	dataset     = dataset_keep,
	split_size  = {
		'test'  : CONFIG['dataset/split/test'],
		'valid' : CONFIG['dataset/split/valid']
	},
	random_seed = CONFIG['core/random']
)

# Only the first split produced by the generator is used.
indices = next(generator)

# NOTE(review): the valid / test entries may presumably be None; the cell below iterates over
# them unconditionally - confirm against the split generators.
train_indices = indices[0]
valid_indices = indices[1]
test_indices  = indices[2]

# 3. Splits

In [None]:
# Select train, valid, test data

def _stack_split (items) :
	"""
	Stack the fields of dataset items into numpy arrays.

	Every item is indexed positionally : 0 = key, 1 = one-hot sequence, 2 = feature, 3 = target.
	Axes 1 and 2 of the stacked one-hot array are swapped.
	NOTE(review): the swap presumably turns (channel, position) into (position, channel) for
	the downstream consumer - confirm.

	:param items: list of dataset items belonging to a single split
	:return: tuple (keys, onehot, feature, target) of numpy arrays
	"""

	keys    = numpy.array([item[0] for item in items], dtype = str)
	onehot  = numpy.array([item[1] for item in items], dtype = numpy.float64)
	onehot  = numpy.swapaxes(onehot, 1, 2)
	feature = numpy.array([item[2] for item in items], dtype = numpy.float64)
	target  = numpy.array([item[3] for item in items], dtype = numpy.float64)

	return keys, onehot, feature, target

def _print_split (title, keys, onehot, feature, target) :
	"""
	Print the shapes of the arrays of a single split as an aligned table.

	:param title: name of the split, printed as the table header
	:param keys: array of item keys
	:param onehot: array of one-hot sequences
	:param feature: array of features
	:param target: array of targets
	"""

	print(f'{title:10s} :')

	for label, array in (('Keys', keys), ('Sequences', onehot), ('Features', feature), ('Targets', target)) :
		print('{:10s} : {}'.format(label, ', '.join([str(x) for x in numpy.shape(array)])))

	print()

train_items = [dataset_keep[index] for index in train_indices]
train_keys, train_onehot, train_feature, train_target = _stack_split(train_items)

_print_split('Training', train_keys, train_onehot, train_feature, train_target)

valid_items = [dataset_keep[index] for index in valid_indices]
valid_keys, valid_onehot, valid_feature, valid_target = _stack_split(valid_items)

_print_split('Validation', valid_keys, valid_onehot, valid_feature, valid_target)

test_items = [dataset_keep[index] for index in test_indices]
test_keys, test_onehot, test_feature, test_target = _stack_split(test_items)

_print_split('Testing', test_keys, test_onehot, test_feature, test_target)

# 4. Save

In [None]:
# Save data
#
# The arrays in data.npz are passed positionally, so numpy stores them as arr_0 .. arr_5 in
# the order : train_onehot, test_onehot, train_feature, test_feature, train_target, test_target.
# NOTE(review): the validation arrays are not written to data.npz (only their keys are saved
# below) - confirm this is intended by the consumer of this file.

numpy.savez(
	os.path.join(OUT_DATA, 'data.npz'),
	train_onehot,  test_onehot,
	train_feature, test_feature,
	train_target,  test_target
)

# The keys of every split are stored separately under the name `id`.
for filename, keys in (
	('keys-train.npz', train_keys),
	('keys-valid.npz', valid_keys),
	('keys-test.npz',  test_keys)
) :
	numpy.savez(os.path.join(OUT_DATA, filename), id = keys)