In [None]:
# Libraries

import json
import numpy
import os
import platform
import shutil
import sys

In [None]:
# Root directory
#
# Walk upwards from the current working directory until a path ending in the
# project folder name is found. The result becomes the working directory and is
# appended to sys.path so that packages below the project root can be imported.
# When no such folder exists, a hardcoded per-platform default is used instead.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	PARENT = os.path.abspath(os.path.join(ROOT, os.pardir))

	# The parent of a filesystem root normalizes to the root itself, so an unchanged
	# path means there is nothing left to search. Checking for that (and not only
	# for a short path) keeps the loop from spinning forever on long root paths.
	if PARENT == ROOT or len(PARENT) < len('upolanc-thesis') :
		if   platform.system() == 'Linux' :
			ROOT = '/d/hpc/home/up4472/workspace/upolanc-thesis'
		elif platform.system() == 'Windows' :
			ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else :
			raise ValueError(f'No default project directory for platform : {platform.system()}')

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		break

	ROOT = PARENT

if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

In [None]:
# Code

from source.python.cnn.models import Zrimec2020c

from source.python.cnn import core     as cnn_core
from source.python.cnn import dataset  as cnn_dataset
from source.python.cnn import model    as cnn_model
from source.python.cnn import mutation as cnn_mutation
from source.python.cnn import plot     as cnn_plot
from source.python.cnn import util     as cnn_util
from source.python.io  import loader   as data_loader
from source.python.io  import writer   as data_writer

# 1. Setup

In [None]:
# Deliberate hard stop : raising here aborts a "Run All", because this notebook is flagged as not maintained.
# The message names data_loader.load_feature_targets and TPM_Label as the reason.
raise Warning('Hardcoded Execution Stop - Not Maintained :: data_loader.load_feature_targets :: TPM_Label')

In [None]:
# Directory layout : every path is derived from the project root.

CWD = ROOT
OUT = os.path.join(CWD, 'output')
RES = os.path.join(CWD, 'resources')

# Folders written by this notebook
OUT_DATA  = os.path.join(OUT, 'nbp07-zrimec2020', 'classification')
OUT_MODEL = os.path.join(OUT_DATA, 'model')
OUT_PLOT  = os.path.join(OUT_DATA, 'plot')

# Resource folders (located below the output directory)
RES_NBP02 = os.path.join(OUT, 'nbp02-anndata')
RES_NBP04 = os.path.join(OUT, 'nbp04-feature')
RES_NBP05 = os.path.join(OUT, 'nbp05-target')

# Remove any previous results first, then recreate the empty folder tree
shutil.rmtree(OUT_DATA, ignore_errors = True)

for folder in (OUT_DATA, OUT_MODEL, OUT_PLOT) :
	os.makedirs(folder, exist_ok = True)

print(f'     Root Directory : {CWD}')
print(f'   Output Directory : {OUT_DATA}')
print(f'   Output Directory : {OUT_MODEL}')
print(f'   Output Directory : {OUT_PLOT}')
print(f' Resource Directory : {RES_NBP04}')
print(f' Resource Directory : {RES_NBP05}')

In [None]:
# Resolve the compute device (not restricted to cpu) and print the system information

DEVICE = cnn_core.get_device(only_cpu = False)
SYSTEM = cnn_core.get_system_info()

ROW = '{:25s} : {}'

for name, info in SYSTEM.items() :
	print(ROW.format(name, info))

In [None]:
# Read the sequence (fasta) and base feature (npz) files, first the mutation set and then the regular set

mutation_feature_sequence = data_loader.load_fasta(
	to_string = True,
	filename  = os.path.join(RES_NBP04, 'mutation-features-bp2150.fasta')
)

mutation_feature_base = data_loader.load_npz(filename = os.path.join(RES_NBP04, 'mutation-features-base.npz'))

feature_sequence = data_loader.load_fasta(
	to_string = True,
	filename  = os.path.join(RES_NBP04, 'features-bp2150.fasta')
)

feature_base = data_loader.load_npz(filename = os.path.join(RES_NBP04, 'features-base.npz'))

# 2. Config

In [None]:
# Define output and inputs
#
# TARGET_GROUP and TARGET_TYPE select which targets are loaded (they form the
# group name and the mapping filename passed to the loader). TARGET_SIZE is a
# placeholder that is overwritten once the targets have been loaded.
#
# TARGET_HEADS is read by the global config ('model/fc3/heads' and
# 'model/output/heads'), so it has to exist before that cell runs. None is only
# a placeholder - TODO confirm the head count that Zrimec2020c expects.
#
# NOTE(review): the loading cell assigns INPUT_FEATURE (singular) and that is the
# name the config reads; INPUT_FEATURES is kept as is for compatibility.

TARGET_GROUP   = 'tissue'
TARGET_TYPE    = 'mean'
TARGET_SIZE    = None
TARGET_HEADS   = None
INPUT_FEATURES = None

In [None]:
# Load the targets for the selected group and type
# FIXME : the loader returns tpm values, while this notebook needs one-hot encoded tpm labels

dataframe, target_value, target_order = data_loader.load_feature_targets(
	directory = RES_NBP05,
	filename  = f'mapping-{TARGET_TYPE}-grouped.pkl',
	group     = f'{TARGET_GROUP}-{TARGET_TYPE}',
	explode   = True,
	filters   = dict.fromkeys(('tissue', 'group', 'age', 'perturbation'), None)
)

# When the dataframe carries extra features, append them to the base features.
# The base features are looked up by the part of the row key that follows the last '?'.
if 'Feature' in dataframe.columns :
	extended = dict()

	for key, value in dataframe['Feature'].to_dict().items() :
		extended[key] = numpy.concatenate((feature_base[key.split('?')[-1]], value))

	feature_base = extended

first_feature = list(feature_base.values())[0]

TARGET_SIZE   = len(target_order)
INPUT_FEATURE = len(first_feature)

dataframe

In [None]:
# Single global config
#
# Flat dictionary with 'section/name' keys. It is handed to the model constructor
# and to the trainer factory, and individual entries are read when the dataset,
# the dataloaders and the model summary are created.
#
# NOTE(review): TARGET_HEADS must be assigned by an earlier cell before this one runs.
# NOTE(review): 'criterion/name' is 'mse' although training uses train_classifier with
#               entropy / accuracy metrics - confirm that this is the intended criterion.

CONFIG = {
	# Core : the random seed is replaced after the dictionary is built
	'core/random'                  : None,
	'core/verbose'                 : False,
	'core/device'                  : DEVICE,
	'core/rootdir'                 : ROOT,
	# Dataset : batch sizes and split sizes
	'dataset/expanddim'            : None,
	'dataset/batch/train'          : 64,
	'dataset/batch/valid'          : 64,
	'dataset/batch/test'           : 64,
	'dataset/split/valid'          : 0.2,
	'dataset/split/test'           : 0.2,
	# Criterion, optimizer and scheduler
	'criterion/name'               : 'mse',
	'criterion/reduction'          : 'mean',
	'optimizer/decay'              : 0.049112288361090203,
	'optimizer/lr'                 : 0.000855269783800900,
	'optimizer/momentum'           : 0.830793559890461997,
	'optimizer/name'               : 'adam',
	'scheduler/exponential/factor' : 0.993206525568888998,
	'scheduler/linear/factor'      : 0.044580434349908597,
	'scheduler/name'               : 'linear',
	'scheduler/plateau/factor'     : 0.255073201804133176,
	'scheduler/plateau/patience'   : 24,
	'scheduler/step/factor'        : 0.384810555304578694,
	'scheduler/step/patience'      : 14,
	# Model input : 'model/input/features' is the length of a single base feature entry
	'model/input/channels'         : 1,
	'model/input/height'           : 4,
	'model/input/width'            : 2150,
	'model/input/features'         : INPUT_FEATURE,
	'model/epochs'                 : 10,
	'model/dropout'                : 0.09,
	'model/leakyrelu'              : 0.00,
	# Convolution and max pooling layers
	'model/conv1/dilation'         : 1,
	'model/conv1/filters'          : 256,
	'model/conv1/kernel'           : 11,
	'model/conv1/padding'          : 'none',
	'model/conv2/dilation'         : 1,
	'model/conv2/filters'          : 32,
	'model/conv2/kernel'           : 5,
	'model/conv2/padding'          : 'none',
	'model/conv3/dilation'         : 1,
	'model/conv3/filters'          : 32,
	'model/conv3/kernel'           : 31,
	'model/conv3/padding'          : 'none',
	'model/maxpool1/kernel'        : 5,
	'model/maxpool1/padding'       : 'same',
	'model/maxpool2/kernel'        : 5,
	'model/maxpool2/padding'       : 'same',
	'model/maxpool3/kernel'        : 9,
	'model/maxpool3/padding'       : 'same',
	# Fully connected layers : the last one and the output size both use TARGET_SIZE
	'model/fc1/features'           : 64,
	'model/fc2/features'           : 128,
	'model/fc3/features'           : TARGET_SIZE,
	'model/fc3/heads'              : TARGET_HEADS,
	'model/output/target'          : TARGET_GROUP,
	'model/output/type'            : TARGET_TYPE,
	'model/output/size'            : TARGET_SIZE,
	'model/output/heads'           : TARGET_HEADS
}

# Store whatever lock_random returns as the seed; it is printed as 'Random Seed' and
# passed to the dataloaders (presumably the generated seed - confirm in cnn_core).
CONFIG['core/random'] = cnn_core.lock_random(
	seed     = CONFIG['core/random'],
	generate = True
)

In [None]:
# Print the config entries that were resolved at runtime

for template, entry in (
	('Input Feature : {}', 'model/input/features'),
	('Output Target : {}', 'model/output/target'),
	('Output Type   : {}', 'model/output/type'),
	('Output Size   : {}', 'model/output/size'),
	('Random Seed   : {}', 'core/random')
) :
	print(template.format(CONFIG[entry]))

# 3. Dataloaders

## 3.1 Normal

In [None]:
# Build the dataset from the sequences, the base features and the target values

dataset = cnn_dataset.to_dataset(
	expand_dims = CONFIG['dataset/expanddim'],
	targets     = target_value,
	features    = feature_base,
	sequences   = feature_sequence
)

In [None]:
# Split the dataset into dataloaders, using the split and batch sizes from the config

split_size = {
	'valid' : CONFIG['dataset/split/valid'],
	'test'  : CONFIG['dataset/split/test']
}

batch_size = {
	'train' : CONFIG['dataset/batch/train'],
	'valid' : CONFIG['dataset/batch/valid'],
	'test'  : CONFIG['dataset/batch/test']
}

dataloaders = cnn_dataset.to_dataloaders(
	dataset     = dataset,
	random_seed = CONFIG['core/random'],
	split_size  = split_size,
	batch_size  = batch_size
)

# The first three entries are used as train, valid and test, in that order
train_dataloader, valid_dataloader, test_dataloader = dataloaders[0], dataloaders[1], dataloaders[2]

In [None]:
# Show the content of the train dataloader

cnn_dataset.show_dataloader(verbose = CONFIG['core/verbose'], dataloader = train_dataloader)

In [None]:
# Show the content of the valid dataloader

cnn_dataset.show_dataloader(verbose = CONFIG['core/verbose'], dataloader = valid_dataloader)

In [None]:
# Show the content of the test dataloader

cnn_dataset.show_dataloader(verbose = CONFIG['core/verbose'], dataloader = test_dataloader)

## 3.2 Mutation

In [None]:
# Extend the mutation base features with the 'Feature' column of the dataframe
#
# Mutation keys are split on '-' : the first token selects the dataframe rows by
# 'Transcript', the last token is appended to the row index to form the new key.

if 'Feature' in dataframe.columns :
	extended = dict()

	for name in mutation_feature_base.keys() :
		tokens = name.split('-')
		rows   = dataframe.loc[dataframe['Transcript'] == tokens[0]]

		for index, row in rows.iterrows() :
			extended[index + '-' + tokens[-1]] = numpy.concatenate((mutation_feature_base[name], row['Feature']))

	mutation_feature_base = extended

In [None]:
# Build the dataloader over the mutation sequences and features

mutation_dataloader = cnn_mutation.create_dataloader(
	random_seed = CONFIG['core/random'],
	expand_dims = CONFIG['dataset/expanddim'],
	targets     = target_value,
	features    = mutation_feature_base,
	sequences   = mutation_feature_sequence
)

In [None]:
# Show the content of the mutation dataloader

cnn_dataset.show_dataloader(verbose = CONFIG['core/verbose'], dataloader = mutation_dataloader)

# 4. Model

## 4.1 Structure

In [None]:
# Instantiate the model from the global config and print its summary

model = Zrimec2020c(params = CONFIG)

model.summary(
	in_features = CONFIG['model/input/features'],
	in_width    = CONFIG['model/input/width'],
	in_height   = CONFIG['model/input/height'],
	batch_size  = CONFIG['dataset/batch/train']
)

In [None]:
# Cast the model to double, then apply the weight and the bias initializers (in that order)

model = model.double().apply(cnn_model.he_uniform_weight).apply(cnn_model.zero_bias)

## 4.2 Metrics

In [None]:
# Metrics tracked next to the loss : one unreduced criterion per name, moved to the device

METRICS = {
	name : cnn_model.get_criterion(reduction = 'none', weights = None, query = name).to(DEVICE)
	for name in ('entropy', 'accuracy')
}

## 4.3 Parameters

In [None]:
# Create the criterion / optimizer / scheduler and bundle everything the training and evaluation calls receive

model_trainers = cnn_model.get_model_trainers(
	epochs = CONFIG['model/epochs'],
	config = CONFIG,
	model  = model
)

model_params = {
	'model'    : model,
	# Checkpoint files for the best and the last model
	'savebest' : os.path.join(OUT_MODEL, f'{model.__name__}-best.pth'),
	'savelast' : os.path.join(OUT_MODEL, f'{model.__name__}-last.pth'),
	'epochs'   : CONFIG['model/epochs'],
	**{
		trainer : model_trainers[trainer]
		for trainer in ('criterion', 'optimizer', 'scheduler')
	},
	'device'   : DEVICE,
	'verbose'  : CONFIG['core/verbose'],
	'metrics'  : METRICS,
	'train_dataloader' : train_dataloader,
	'valid_dataloader' : valid_dataloader,
	'test_dataloader'  : test_dataloader
}

## 4.4 Training

In [None]:
# Run training and validation; the returned report feeds the plots that follow

report = cnn_model.train_classifier(params = model_params, model = model)

## 4.5 Visualization

In [None]:
# Plot the loss curves from the training report

cnn_plot.show_loss(
	filename = os.path.join(OUT_PLOT, model.__name__),
	title    = 'Loss Function',
	report   = report
)

In [None]:
# Plot the learning rate curves from the training report

cnn_plot.show_lr(
	filename = os.path.join(OUT_PLOT, model.__name__),
	title    = 'Learning Rate',
	report   = report
)

In [None]:
# Plot the accuracy curves from the training report

cnn_plot.show_accuracy(
	filename = os.path.join(OUT_PLOT, model.__name__),
	title    = 'Accuracy',
	report   = report
)

In [None]:
# Plot the metric grid for the 'train' mode

cnn_plot.show_metric_grid(
	filename = os.path.join(OUT_PLOT, model.__name__ + '-train'),
	mode     = 'train',
	report   = report
)

In [None]:
# Plot the metric grid for the 'valid' mode

cnn_plot.show_metric_grid(
	filename = os.path.join(OUT_PLOT, model.__name__ + '-valid'),
	mode     = 'valid',
	report   = report
)

# 5. Evaluation

## 5.1 Last Model

### 5.1.1 Evaluation

In [None]:
# Restore the checkpoint written for the last model and report its epoch and loss

checkpoint_file = os.path.join(OUT_MODEL, model.__name__ + '-last.pth')
checkpoint      = data_loader.load_torch(filename = checkpoint_file)

print('Epoch : ' + '{}'.format(checkpoint['epoch']))
print('Loss  : ' + '{:.5f}'.format(checkpoint['loss']))

# The model state is stored under the 'models' key
model.load_state_dict(checkpoint['models'])

In [None]:
# Evaluate the restored model and print the mean of each evaluation metric

report = cnn_model.eval_classifier(params = model_params, model = model)

eval_metric = report['eval']['metric']

print('Accuracy : ' + '{: 8.5f}'.format(eval_metric['accuracy'].mean()))
print('Entropy  : ' + '{: 8.5f}'.format(eval_metric['entropy'].mean()))

## 5.2 Best Model

### 5.2.1 Evaluation

In [None]:
# Restore the checkpoint written for the best model and report its epoch and loss

checkpoint_file = os.path.join(OUT_MODEL, model.__name__ + '-best.pth')
checkpoint      = data_loader.load_torch(filename = checkpoint_file)

print('Epoch : ' + '{}'.format(checkpoint['epoch']))
print(' Loss : ' + '{:.5f}'.format(checkpoint['loss']))

# The model state is stored under the 'models' key
model.load_state_dict(checkpoint['models'])

In [None]:
# Evaluate the restored model and print the mean of each evaluation metric

report = cnn_model.eval_classifier(params = model_params, model = model)

eval_metric = report['eval']['metric']

print('Accuracy : ' + '{: 8.5f}'.format(eval_metric['accuracy'].mean()))
print('Entropy  : ' + '{: 8.5f}'.format(eval_metric['entropy'].mean()))

### 5.2.2 Custom

In [None]:
# Show two evaluation items with their predictions and expectations

cnn_util.display_classification_predictions(n = 2, report = report)

In [None]:
# Display the evaluation accuracy
#
# The accuracy table is sorted by 'Accuracy' (descending) and kept under its own
# name, so that `dataframe` (the targets loaded earlier, which the mutation feature
# cell still reads) is not overwritten and earlier cells stay re-runnable.

accuracy_dataframe = cnn_util.display_classification_accuracy(
	report = report,
	order  = target_order
).sort_values('Accuracy', ascending = False)

# Column means of the accuracy table
print('Average Accuracy : {:8.5f} %'.format(accuracy_dataframe['Accuracy'].mean()))
print('Average Missed 1 : {:8.5f} %'.format(accuracy_dataframe['Missed_1'].mean()))
print('Average Missed 2 : {:8.5f} %'.format(accuracy_dataframe['Missed_2'].mean()))
print('Average Missed 3 : {:8.5f} %'.format(accuracy_dataframe['Missed_3'].mean()))
print('Average Missed 4 : {:8.5f} %'.format(accuracy_dataframe['Missed_4'].mean()))
print()

accuracy_dataframe

### 5.2.3 Visualization

### 5.2.4 Mutation

In [None]:
# Evaluate the model on the mutation dataloader
#
# The shared parameters are reused, with the test dataloader swapped for the
# mutation dataloader and a freshly created set of metrics.

mutation_metrics = {
	'entropy'  : cnn_model.get_criterion(reduction = 'none', weights = None, query = 'entropy' ).to(DEVICE),
	'accuracy' : cnn_model.get_criterion(reduction = 'none', weights = None, query = 'accuracy').to(DEVICE)
}

mutation_params = model_params | {
	'test_dataloader' : mutation_dataloader,
	'metrics'         : mutation_metrics
}

report = cnn_model.eval_classifier(params = mutation_params, model = model)

eval_metric = report['eval']['metric']

print('Accuracy : ' + '{: 8.5f}'.format(eval_metric['accuracy'].mean()))
print('Entropy  : ' + '{: 8.5f}'.format(eval_metric['entropy'].mean()))

In [None]:
# Convert the evaluation report into a mutation report and print its dimensions

mutation_report = cnn_mutation.get_mutation_report(report = report)

# Use the first transcript and its first mutation as the representative entry
transcript = list(mutation_report.keys())[0]
mutations  = mutation_report[transcript]
mutation   = list(mutations.keys())[0]

print('Transcripts : {}'.format(len(mutation_report)))
print('Mutations   : {}'.format(len(mutations)))
print('Variants    : {}'.format(len(mutations[mutation]['label'])))

In [None]:
# Print the report keys right-aligned to 20 characters, with a line break after every fifth key

cells = []

for position, name in enumerate(mutation_report.keys(), start = 1) :
	cell = '{:>20s}'.format(name)

	if position % 5 == 0 :
		cell = cell + '\n'

	cells.append(cell)

print(' ' + ' '.join(cells))

In [None]:
# Plot the classification of mutation 'M01' (no transcript is selected explicitly)

cnn_mutation.plot_mutation_classification(
	order      = target_order,
	mutation   = 'M01',
	transcript = None,
	report     = mutation_report
)

In [None]:
# Plot the classification of mutation 'M05' (no transcript is selected explicitly)

cnn_mutation.plot_mutation_classification(
	order      = target_order,
	mutation   = 'M05',
	transcript = None,
	report     = mutation_report
)

In [None]:
# Plot the classification of mutation 'M10' (no transcript is selected explicitly)

cnn_mutation.plot_mutation_classification(
	order      = target_order,
	mutation   = 'M10',
	transcript = None,
	report     = mutation_report
)