In [None]:
# Libraries

import argparse
import json
import numpy
import os
import platform
import shutil
import sys

In [None]:
# Root directory
#
# Walk upwards from the current working directory until a path ending with the
# project name is found. If no such ancestor exists, fall back to a hard-coded,
# platform-specific default location and print a warning.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	root_parent = os.path.abspath(os.path.join(ROOT, os.pardir))

	# Give up when the filesystem root is reached (its parent is itself) or when
	# the path is too short to contain the project name. The fixed-point check
	# prevents an endless loop on long roots (e.g. a UNC share root) that the
	# length check alone would never catch.
	if root_parent == ROOT or len(root_parent) < len('upolanc-thesis') :
		if   platform.system() == 'Linux' :
			ROOT = '/d/hpc/home/up4472/workspace/upolanc-thesis'
		elif platform.system() == 'Windows' :
			ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else :
			raise ValueError(f'Could not find project root and no default is defined for platform : {platform.system()}')

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		break

	ROOT = root_parent

# Register the project root on the import path (once) and make it the
# working directory, so project imports and relative paths resolve from it.
if not any(entry == ROOT for entry in sys.path) :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [None]:
# Code

from source.python.cnn.models import Zrimec2020r

from source.python.cnn import core     as cnn_core
from source.python.cnn import dataset  as cnn_dataset
from source.python.cnn import model    as cnn_model
from source.python.cnn import mutation as cnn_mutation
from source.python.cnn import plot     as cnn_plot
from source.python.cnn import util     as cnn_util
from source.python.io  import loader   as data_loader
from source.python.io  import writer   as data_writer

# 1. Setup

In [None]:
# Directory layout : outputs of this notebook and resources from earlier ones.

CWD = ROOT
OUT = os.path.join(CWD, 'output')
RES = os.path.join(CWD, 'resources')

# Output locations of this notebook
OUT_DATA  = os.path.join(OUT,      'nbp07-zrimec2020', 'regression')
OUT_MODEL = os.path.join(OUT_DATA, 'model')
OUT_PLOT  = os.path.join(OUT_DATA, 'plot')

# Input locations, named after the notebooks nbp02 / nbp04 / nbp05
RES_NBP02 = os.path.join(OUT, 'nbp02-anndata')
RES_NBP04 = os.path.join(OUT, 'nbp04-feature')
RES_NBP05 = os.path.join(OUT, 'nbp05-target')

# Start from a clean slate : anything left under OUT_DATA is removed
# (errors ignored), then the directory tree is created again.
shutil.rmtree(OUT_DATA, ignore_errors = True)

for directory in (OUT_DATA, OUT_MODEL, OUT_PLOT) :
	os.makedirs(directory, exist_ok = True)

print(f'     Root Directory : {CWD}')
print(f'   Output Directory : {OUT_DATA}')
print(f'   Output Directory : {OUT_MODEL}')
print(f'   Output Directory : {OUT_PLOT}')
print(f' Resource Directory : {RES_NBP04}')
print(f' Resource Directory : {RES_NBP05}')

In [None]:
# Pick the compute device (CPU-only mode disabled) and print the system info.

DEVICE = cnn_core.get_device(only_cpu = False)
SYSTEM = cnn_core.get_system_info()

for info_name in SYSTEM :
	print('{:25s} : {}'.format(info_name, SYSTEM[info_name]))

In [None]:
# Load the inputs : mutation and regular sequences (fasta) with their base
# feature arrays (npz) from RES_NBP04, plus the tuner configuration (json).

mutation_feature_sequence = data_loader.load_fasta(filename = os.path.join(RES_NBP04, 'mutation-features-bp2150.fasta'), to_string = True)
mutation_feature_base     = data_loader.load_npz(filename = os.path.join(RES_NBP04, 'mutation-features-base.npz'))

feature_sequence = data_loader.load_fasta(filename = os.path.join(RES_NBP04, 'features-bp2150.fasta'), to_string = True)
feature_base     = data_loader.load_npz(filename = os.path.join(RES_NBP04, 'features-base.npz'))

model_config = data_loader.load_json(filename = os.path.join(RES, 'tuner', 'zrimec2020.json'))

# 2. Config

In [None]:
# Empty container that collects the evaluation results written out at the end.

final_report = {}

In [None]:
# Default run settings (overridden by command line arguments when run as .py)

# Model : tuner parameter index (None = keep defaults) and number of epochs
MODEL_PARAMS, MODEL_EPOCHS = None, 5

# Target : which grouping / aggregation of the target values to predict
TARGET_GROUP, TARGET_TYPE = 'global', 'mean'
TARGET_EXPLODE = False
TARGET_FILTER  = None
TARGET_SIZE    = None

# Input : placeholder for the feature count
INPUT_FEATURES = None

In [None]:
# Check for args
#
# When executed as a plain script (no IPython shell in scope) the run settings
# are read from the command line; inside a notebook the defaults defined above
# are kept unchanged.

if __name__ == '__main__' and 'get_ipython' in dir() :
	print('Running as .ipynb')

if __name__ == '__main__' and 'get_ipython' not in dir() :
	print('Running as .py')

	parser = argparse.ArgumentParser()

	# Both converters ignore case and surrounding whitespace, so that values
	# such as 'True' or 'None' are not silently misread as False / a filter name.
	parse_none = lambda x : None if x.strip().lower() == 'none' else x
	parse_bool = lambda x : x.strip().lower() == 'true'

	parser.add_argument('--target_group',   type = str, choices = ['tissue', 'group', 'age', 'global', 'perturbation'])
	parser.add_argument('--target_type',    type = str, choices = ['mean', 'max'])
	parser.add_argument('--target_explode', type = parse_bool)
	parser.add_argument('--target_filter',  type = parse_none)
	parser.add_argument('--model_epochs',   type = int)
	parser.add_argument('--model_params',   type = int)

	# Script defaults; note they differ from the notebook defaults
	# (target group 'tissue' and 100 epochs).
	parser.set_defaults(
		target_group   = 'tissue',
		target_type    = 'mean',
		target_explode = False,
		target_filter  = None,
		model_epochs   = 100,
		model_params   = None
	)

	args = vars(parser.parse_args())

	TARGET_GROUP   = args['target_group']
	TARGET_TYPE    = args['target_type']
	TARGET_EXPLODE = args['target_explode']
	TARGET_FILTER  = args['target_filter']
	MODEL_EPOCHS   = args['model_epochs']
	MODEL_PARAMS   = args['model_params']

# The global target is never exploded, regardless of what was requested.
if TARGET_GROUP == 'global' :
	TARGET_EXPLODE = False

In [None]:
# Load features
#
# Loads the target dataframe, the target values and the target order for the
# selected group / type. Only the selected group receives a filter (wrapped in
# a list); every other group filter stays None.

dataframe, target_value, target_order = data_loader.load_feature_targets(
	group    = f'{TARGET_GROUP}-{TARGET_TYPE}',
	explode  = TARGET_EXPLODE,
	filters  = {
		'tissue'       : None,
		'group'        : None,
		'age'          : None,
		'perturbation' : None,
		'global'       : None
	} | {
		TARGET_GROUP : TARGET_FILTER
		if TARGET_FILTER is None
		else [TARGET_FILTER]
	},
	directory = RES_NBP05,
	filename  = 'mapping-grouped.pkl'
)

# When the dataframe carries extra per-row features, append them to the base
# features; the base entry is looked up by the key part after the last '?'.
# NOTE : this rebinds feature_base, so re-running this cell without re-running
# the loading cell operates on the already extended dictionary.
if 'Feature' in dataframe.columns :
	feature_base = {
		key : numpy.concatenate((feature_base[key.split('?')[-1]], value))
		for key, value in dataframe['Feature'].to_dict().items()
	}

TARGET_SIZE   = len(target_order)
INPUT_FEATURE = len(next(iter(feature_base.values())))

# Keep the placeholder declared with the other defaults in sync, so that both
# spellings hold the actual feature count.
INPUT_FEATURES = INPUT_FEATURE

dataframe

In [None]:
# Single global config, assembled section by section (key order is preserved)

CONFIG = dict()

# Core
CONFIG.update({
	'core/random'  : None,
	'core/verbose' : False,
	'core/device'  : DEVICE,
	'core/rootdir' : ROOT
})

# Dataset
CONFIG.update({
	'dataset/expanddim'   : None,
	'dataset/batch/train' : 64,
	'dataset/batch/valid' : 64,
	'dataset/batch/test'  : 64,
	'dataset/split/valid' : 0.2,
	'dataset/split/test'  : 0.2
})

# Criterion, optimizer and scheduler
CONFIG.update({
	'criterion/name'               : 'mse',
	'criterion/reduction'          : 'mean',
	'optimizer/decay'              : 0.049112288361090203,
	'optimizer/lr'                 : 0.000855269783800900,
	'optimizer/momentum'           : 0.830793559890461997,
	'optimizer/name'               : 'adam',
	'scheduler/exponential/factor' : 0.993206525568888998,
	'scheduler/linear/factor'      : 0.044580434349908597,
	'scheduler/name'               : 'linear',
	'scheduler/plateau/factor'     : 0.255073201804133176,
	'scheduler/plateau/patience'   : 24,
	'scheduler/step/factor'        : 0.384810555304578694,
	'scheduler/step/patience'      : 14
})

# Model input and general settings
CONFIG.update({
	'model/input/channels' : 1,
	'model/input/height'   : 4,
	'model/input/width'    : 2150,
	'model/input/features' : INPUT_FEATURE,
	'model/epochs'         : MODEL_EPOCHS,
	'model/dropout'        : 0.09,
	'model/leakyrelu'      : 0.00
})

# Model convolution layers
CONFIG.update({
	'model/conv1/dilation' : 1,
	'model/conv1/filters'  : 256,
	'model/conv1/kernel'   : 11,
	'model/conv1/padding'  : 'none',
	'model/conv2/dilation' : 1,
	'model/conv2/filters'  : 32,
	'model/conv2/kernel'   : 5,
	'model/conv2/padding'  : 'none',
	'model/conv3/dilation' : 1,
	'model/conv3/filters'  : 32,
	'model/conv3/kernel'   : 31,
	'model/conv3/padding'  : 'none'
})

# Model pooling layers
CONFIG.update({
	'model/maxpool1/kernel'  : 5,
	'model/maxpool1/padding' : 'same',
	'model/maxpool2/kernel'  : 5,
	'model/maxpool2/padding' : 'same',
	'model/maxpool3/kernel'  : 9,
	'model/maxpool3/padding' : 'same'
})

# Model dense layers, output description and tuner parameter index
CONFIG.update({
	'model/fc1/features'  : 64,
	'model/fc2/features'  : 128,
	'model/fc3/features'  : TARGET_SIZE,
	'model/output/target' : TARGET_GROUP,
	'model/output/type'   : TARGET_TYPE,
	'model/output/size'   : TARGET_SIZE,
	'model/params/id'     : MODEL_PARAMS
})

# Lock the random state and store the value returned by lock_random.
CONFIG['core/random'] = cnn_core.lock_random(seed = CONFIG['core/random'], generate = True)

In [None]:
# Overlay the tuner parameters selected by a non-negative parameter index

params_id = CONFIG['model/params/id']

if params_id is not None and params_id >= 0 :
	CONFIG.update(model_config[params_id])

In [None]:
# Print the key facts of the resolved configuration

for template, config_key in (
	('Input Feature : {}', 'model/input/features'),
	('Output Target : {}', 'model/output/target'),
	('Output Type   : {}', 'model/output/type'),
	('Output Size   : {}', 'model/output/size'),
	('Random Seed   : {}', 'core/random')
) :
	print(template.format(CONFIG[config_key]))

# 3. Dataloaders

## 3.1 Normal

In [None]:
# Build the dataset from sequences, base features and target values (no groups)

dataset = cnn_dataset.to_dataset(sequences = feature_sequence, features = feature_base, targets = target_value, expand_dims = CONFIG['dataset/expanddim'], groups = None)

In [None]:
# Split the dataset into train / valid / test dataloaders

split_sizes = {
	'valid' : CONFIG['dataset/split/valid'],
	'test'  : CONFIG['dataset/split/test']
}

batch_sizes = {
	'train' : CONFIG['dataset/batch/train'],
	'valid' : CONFIG['dataset/batch/valid'],
	'test'  : CONFIG['dataset/batch/test']
}

dataloaders = cnn_dataset.to_dataloaders(
	dataset     = dataset,
	generator   = cnn_dataset.generate_group_shuffle_split,
	random_seed = CONFIG['core/random'],
	split_size  = split_sizes,
	batch_size  = batch_sizes
)

# The first three entries are used as train, valid and test, in that order.
train_dataloader, valid_dataloader, test_dataloader = dataloaders[0], dataloaders[1], dataloaders[2]

In [None]:
# Inspect the training dataloader

cnn_dataset.show_dataloader(dataloader = train_dataloader, verbose = CONFIG['core/verbose'])

In [None]:
# Inspect the validation dataloader

cnn_dataset.show_dataloader(dataloader = valid_dataloader, verbose = CONFIG['core/verbose'])

In [None]:
# Inspect the test dataloader

cnn_dataset.show_dataloader(dataloader = test_dataloader, verbose = CONFIG['core/verbose'])

## 3.2 Mutation

In [None]:
# Extend the mutation features with the per-row dataframe features
#
# Mutation keys are split on '-' : the first part selects the dataframe rows by
# transcript, the last part is appended to each matching row index to form the
# new key.

if 'Feature' in dataframe.columns :
	extended_features = dict()

	for mutation_key in mutation_feature_base.keys() :
		key_parts       = mutation_key.split('-')
		transcript_rows = dataframe.loc[dataframe['Transcript'] == key_parts[0]]

		for row_index, row in transcript_rows.iterrows() :
			extended_features[row_index + '-' + key_parts[-1]] = numpy.concatenate(
				(mutation_feature_base[mutation_key], row['Feature'])
			)

	mutation_feature_base = extended_features

In [None]:
# Build the dataloader over the mutated sequences

mutation_dataloader = cnn_mutation.create_dataloader(sequences = mutation_feature_sequence, features = mutation_feature_base, targets = target_value, expand_dims = CONFIG['dataset/expanddim'])

In [None]:
# Inspect the mutation dataloader

cnn_dataset.show_dataloader(dataloader = mutation_dataloader, verbose = CONFIG['core/verbose'])

# 4. Model

## 4.1 Structure

In [None]:
# Instantiate the model from the config and print its layer summary

model = Zrimec2020r(params = CONFIG)

summary_shape = {
	'batch_size'  : CONFIG['dataset/batch/train'],
	'in_height'   : CONFIG['model/input/height'],
	'in_width'    : CONFIG['model/input/width'],
	'in_features' : CONFIG['model/input/features']
}

model.summary(**summary_shape)

In [None]:
# Switch the model to double precision, then apply the weight and bias initializers

model = model.double()

for initializer in (cnn_model.he_uniform_weight, cnn_model.zero_bias) :
	model = model.apply(initializer)

## 4.2 Metrics

In [None]:
# Evaluation metrics (unreduced), created first and then moved to the device

metric_mse = cnn_model.get_criterion(reduction = 'none', weights = None, query = 'mse')
metric_mae = cnn_model.get_criterion(reduction = 'none', weights = None, query = 'mae')
metric_r2  = cnn_model.get_criterion(reduction = 'none', weights = None, query = 'r2', output_size = CONFIG['model/output/size'])

METRICS = {
	'mse' : metric_mse.to(DEVICE),
	'mae' : metric_mae.to(DEVICE),
	'r2'  : metric_r2.to(DEVICE)
}

## 4.3 Parameters

In [None]:
# Collect everything the training and evaluation routines receive

model_trainers = cnn_model.get_model_trainers(model = model, config = CONFIG, epochs = CONFIG['model/epochs'])

# Checkpoint files for the best and the last model state
checkpoint_best = os.path.join(OUT_MODEL, f'{model.__name__}-best.pth')
checkpoint_last = os.path.join(OUT_MODEL, f'{model.__name__}-last.pth')

model_params = {
	'model'     : model,
	'savebest'  : checkpoint_best,
	'savelast'  : checkpoint_last,
	'epochs'    : CONFIG['model/epochs'],
	'criterion' : model_trainers['criterion'],
	'optimizer' : model_trainers['optimizer'],
	'scheduler' : model_trainers['scheduler'],
	'device'    : DEVICE,
	'verbose'   : CONFIG['core/verbose'],
	'metrics'   : METRICS,
	'train_dataloader' : train_dataloader,
	'valid_dataloader' : valid_dataloader,
	'test_dataloader'  : test_dataloader
}

## 4.4 Training

In [None]:
# Run the training loop; the returned report feeds the plots below

report = cnn_model.train_regressor(model = model, params = model_params)

## 4.5 Visualization

In [None]:
# Plot the loss curves from the training report

cnn_plot.show_loss(report = report, title = 'Loss Function', filename = os.path.join(OUT_PLOT, model.__name__))

In [None]:
# Plot the learning rate curve from the training report

cnn_plot.show_lr(report = report, title = 'Learning Rate', filename = os.path.join(OUT_PLOT, model.__name__))

In [None]:
# Plot the r2 score curves from the training report

cnn_plot.show_r2(report = report, title = 'R2 Score', filename = os.path.join(OUT_PLOT, model.__name__))

In [None]:
# Plot the metric grid for the training mode

cnn_plot.show_metric_grid(report = report, mode = 'train', filename = os.path.join(OUT_PLOT, model.__name__ + '-train'))

In [None]:
# Plot the metric grid for the validation mode

cnn_plot.show_metric_grid(report = report, mode = 'valid', filename = os.path.join(OUT_PLOT, model.__name__ + '-valid'))

# 5. Evaluation

## 5.1 Last Model

### 5.1.1 Evaluation

In [None]:
# Restore the last saved checkpoint and record its epoch and loss

last_checkpoint_path = os.path.join(OUT_MODEL, model.__name__ + '-last.pth')
checkpoint = data_loader.load_torch(filename = last_checkpoint_path)

final_report['evaluation/last/epoch'] = checkpoint['epoch']
final_report['evaluation/last/loss']  = checkpoint['loss']

print('Epoch : ' + '{}'.format(final_report['evaluation/last/epoch']))
print(' Loss : ' + '{:.5f}'.format(final_report['evaluation/last/loss']))

# The checkpoint stores the model weights under the 'models' key.
model.load_state_dict(checkpoint['models'])

In [None]:
# Evaluate the last model and record the mean of each metric

report = cnn_model.eval_regressor(model = model, params = model_params)

eval_metrics = report['eval']['metric']

final_report['evaluation/last/r2/mean']  = eval_metrics['r2'].mean()
final_report['evaluation/last/mae/mean'] = eval_metrics['mae'].mean()
final_report['evaluation/last/mse/mean'] = eval_metrics['mse'].mean()

print('R2 Score : ' + '{: 8.5f}'.format(final_report['evaluation/last/r2/mean']))
print('MAE Loss : ' + '{: 8.5f}'.format(final_report['evaluation/last/mae/mean']))
print('MSE Loss : ' + '{: 8.5f}'.format(final_report['evaluation/last/mse/mean']))

## 5.2 Best Model

### 5.2.1 Evaluation

In [None]:
# Restore the best saved checkpoint and record its epoch and loss

best_checkpoint_path = os.path.join(OUT_MODEL, model.__name__ + '-best.pth')
checkpoint = data_loader.load_torch(filename = best_checkpoint_path)

final_report['evaluation/best/epoch'] = checkpoint['epoch']
final_report['evaluation/best/loss']  = checkpoint['loss']

print('Epoch : ' + '{}'.format(final_report['evaluation/best/epoch']))
print(' Loss : ' + '{:.5f}'.format(final_report['evaluation/best/loss']))

# The checkpoint stores the model weights under the 'models' key.
model.load_state_dict(checkpoint['models'])

In [None]:
# Evaluate the best model and record the mean of each metric

report = cnn_model.eval_regressor(model = model, params = model_params)

eval_metrics = report['eval']['metric']

final_report['evaluation/best/r2/mean']  = eval_metrics['r2'].mean()
final_report['evaluation/best/mae/mean'] = eval_metrics['mae'].mean()
final_report['evaluation/best/mse/mean'] = eval_metrics['mse'].mean()

print('R2 Score : ' + '{: 8.5f}'.format(final_report['evaluation/best/r2/mean']))
print('MAE Loss : ' + '{: 8.5f}'.format(final_report['evaluation/best/mae/mean']))
print('MSE Loss : ' + '{: 8.5f}'.format(final_report['evaluation/best/mse/mean']))

### 5.2.2 Custom

In [None]:
# Show two evaluation items from the best model report

cnn_util.display_regression_predictions(
	report = report,
	n      = 2
)

### 5.2.3 Visualization

In [None]:
# Plot the prediction error grid, ordered by target_order

cnn_plot.show_prediction_error_grid(report = report, order = target_order, filename = os.path.join(OUT_PLOT, model.__name__ + '-best'))

In [None]:
# Plot the linear regression grid, ordered by target_order

cnn_plot.show_linear_regression_grid(report = report, order = target_order, filename = os.path.join(OUT_PLOT, model.__name__ + '-best'))

### 5.2.4 Mutation

In [None]:
# Evaluate the model on the mutation dataloader
#
# The regular parameters are reused with the test dataloader swapped for the
# mutation one and the metrics restricted to mse and mae.

mutation_metrics = {
	'mse' : cnn_model.get_criterion(reduction = 'none', weights = None, query = 'mse').to(DEVICE),
	'mae' : cnn_model.get_criterion(reduction = 'none', weights = None, query = 'mae').to(DEVICE)
}

mutation_params = model_params | {
	'test_dataloader' : mutation_dataloader,
	'metrics'         : mutation_metrics
}

report = cnn_model.eval_regressor(model = model, params = mutation_params)

final_report['mutation/best/mae/mean'] = report['eval']['metric']['mae'].mean()
final_report['mutation/best/mse/mean'] = report['eval']['metric']['mse'].mean()

print('MAE Loss : ' + '{: 8.5f}'.format(final_report['mutation/best/mae/mean']))
print('MSE Loss : ' + '{: 8.5f}'.format(final_report['mutation/best/mse/mean']))

In [None]:
# Convert the evaluation report into a mutation report and print its sizes,
# using the first transcript and its first mutation as the sample.

mutation_report = cnn_mutation.get_mutation_report(report = report)

transcript = [*mutation_report.keys()][0]
mutation   = [*mutation_report[transcript].keys()][0]

print('Transcripts : {}'.format(len(mutation_report)))
print('Mutations   : {}'.format(len(mutation_report[transcript])))
print('Variants    : {}'.format(len(mutation_report[transcript][mutation]['label'])))

In [None]:
# Print the mutation report keys right-aligned, five per row

report_cells = []

for position, report_key in enumerate(mutation_report.keys(), start = 1) :
	cell = '{:>20s}'.format(report_key)

	# Every fifth entry closes the current row.
	if position % 5 == 0 :
		cell = cell + '\n'

	report_cells.append(cell)

print(' ' + ' '.join(report_cells))

In [None]:
# Plot the variant predictions for mutation M01 (no transcript specified)

cnn_mutation.plot_mutation_regression(report = mutation_report, transcript = None, mutation = 'M01', order = target_order, filename = os.path.join(OUT_PLOT, model.__name__ + '-mutation-m01'))

In [None]:
# Plot the variant predictions for mutation M05 (no transcript specified)

cnn_mutation.plot_mutation_regression(report = mutation_report, transcript = None, mutation = 'M05', order = target_order, filename = os.path.join(OUT_PLOT, model.__name__ + '-mutation-m05'))

In [None]:
# Plot the variant predictions for mutation M10 (no transcript specified)

cnn_mutation.plot_mutation_regression(report = mutation_report, transcript = None, mutation = 'M10', order = target_order, filename = os.path.join(OUT_PLOT, model.__name__ + '-mutation-m10'))

# 6. Save

In [None]:
# Save results
#
# Writes the collected evaluation results and the run configuration as json
# files into the output directory.

# Convert every NumPy scalar (not only float32) into the matching built-in
# Python value, so that integer or other float widths are serializable too.
# Only existing keys are reassigned, which is safe while iterating.
for key, value in final_report.items() :
	if isinstance(value, numpy.generic) :
		final_report[key] = value.item()

data_writer.write_json(
	data     = final_report,
	filename = os.path.join(OUT_DATA, 'report.json')
)

# The device entry is left out of the stored configuration.
data_writer.write_json(
	data     = {k : v for k, v in CONFIG.items() if k != 'core/device'},
	filename = os.path.join(OUT_DATA, 'config.json')
)