In [None]:
# Libraries

import argparse
import glob
import logging
import numpy
import os
import platform
import random
import re
import shutil
import sys
import torch

In [None]:
# Ensure source path
#
# Locate the project root (a path ending in 'upolanc-thesis') by walking upwards
# from the current working directory, make it importable, and switch into it so
# that the relative paths used further down resolve against the project root.

ROOT = os.getcwd()

while not ROOT.endswith('upolanc-thesis') :
	# Move one directory up
	ROOT = os.path.abspath(os.path.join(ROOT, os.pardir))

	# Once the path is shorter than the project name it can no longer match,
	# so fall back to a hard-coded, machine-specific default and stop searching
	if len(ROOT) < len('upolanc-thesis') :
		if   platform.system() == 'Linux' :
			ROOT = '/d/hpc/home/up4472/workspace/upolanc-thesis'
		elif platform.system() == 'Windows' :
			ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
		else :
			# No default is defined for any other platform
			raise ValueError()

		print(f'Warning : could not find correct directory, using default : {ROOT}')
		break

# Make the `source.*` imports resolvable
if ROOT not in sys.path :
	sys.path.append(ROOT)

os.chdir(ROOT)

# Expose the bundled dnabert sources so that `import transformers` can pick them up
# NOTE(review): these two appends are not guarded, so re-running the cell adds duplicates to sys.path
sys.path.append(os.path.join(ROOT, 'source', 'python', 'bert', 'dnabert', 'src'))
sys.path.append(os.path.join(ROOT, 'source', 'python', 'bert', 'dnabert', 'src', 'transformers'))

In [None]:
# Transformers

from transformers.data.processors.glue import DnaPromProcessor
from transformers.data.processors.glue import StsbProcessor

from transformers import WEIGHTS_NAME
from transformers import AdamW
from transformers import AlbertConfig
from transformers import AlbertForSequenceClassification
from transformers import AlbertTokenizer
from transformers import BertConfig
from transformers import BertForSequenceClassification
from transformers import BertForLongSequenceClassification
from transformers import BertForLongSequenceClassificationCat
from transformers import BertTokenizer
from transformers import DNATokenizer
from transformers import DistilBertConfig
from transformers import DistilBertForSequenceClassification
from transformers import DistilBertTokenizer
from transformers import FlaubertConfig
from transformers import FlaubertForSequenceClassification
from transformers import FlaubertTokenizer
from transformers import RobertaConfig
from transformers import RobertaForSequenceClassification
from transformers import RobertaTokenizer
from transformers import XLMConfig
from transformers import XLMForSequenceClassification
from transformers import XLMRobertaConfig
from transformers import XLMRobertaForSequenceClassification
from transformers import XLMRobertaTokenizer
from transformers import XLMTokenizer
from transformers import XLNetConfig
from transformers import XLNetForSequenceClassification
from transformers import XLNetTokenizer
from transformers import DataProcessor
from transformers import InputExample
from transformers import BertModel
from transformers import BertPreTrainedModel

from transformers import glue_convert_examples_to_features  as convert_examples_to_features
from transformers import glue_output_modes                  as output_modes
from transformers import glue_processors                    as processors
from transformers import glue_compute_metrics               as compute_metrics
from transformers import get_linear_schedule_with_warmup

try :
	from torch.utils.tensorboard import SummaryWriter
except :
	from tensorboardX import SummaryWriter

In [None]:
# Other

from multiprocessing              import Pool
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data             import DataLoader
from torch.utils.data             import RandomSampler
from torch.utils.data             import SequentialSampler
from torch.utils.data             import TensorDataset
from torch.nn                     import Softmax

from copy import deepcopy
from tqdm import tqdm
from tqdm import trange

In [None]:
# Code

from source.python.bert.bert_models     import RegressionBertFC1
from source.python.bert.bert_models     import RegressionBertFC3
from source.python.bert.bert_processors import RegressionProcessor
from source.python.bert.bert_metrics    import compute_metrics
from source.python                      import runtime

# Apply the project-wide numpy, pandas and plotting settings defined in source.python.runtime
# NOTE(review): what each call configures is not visible from this notebook — see the runtime module
runtime.set_numpy_format()
runtime.set_pandas_format()
runtime.set_plot_theme()

# 1. Setup

In [None]:
# Define logger
#
# Module-level logger shared by every function defined below (train, evaluate, predict, ...)

logger = logging.getLogger(__name__)

In [None]:
# Update models

# Flat tuple with every shortcut name listed in the pretrained config archive map
# of the supported configuration classes (only used in the --model_name_or_path help text)
ALL_MODELS = tuple(
	shortcut_name
	for config_class in (
		      BertConfig,  XLNetConfig,        XLMConfig,  RobertaConfig,
		DistilBertConfig, AlbertConfig, XLMRobertaConfig, FlaubertConfig
	)
	for shortcut_name in config_class.pretrained_config_archive_map.keys()
)

# Maps --model_type to its (config class, model class, tokenizer class) triple.
# The two 'rbertfc*' entries are the regression heads added by this project.
MODEL_CLASSES = {
	"dna"        : (      BertConfig,          BertForSequenceClassification,        DNATokenizer),
	"dnalong"    : (      BertConfig,      BertForLongSequenceClassification,        DNATokenizer),
	# Fixed : previously pointed at BertForLongSequenceClassification, which made
	# 'dnalongcat' identical to 'dnalong' and left the imported *Cat class unused
	"dnalongcat" : (      BertConfig,   BertForLongSequenceClassificationCat,        DNATokenizer),
	"bert"       : (      BertConfig,          BertForSequenceClassification,       BertTokenizer),
	"xlnet"      : (     XLNetConfig,         XLNetForSequenceClassification,      XLNetTokenizer),
	"xlm"        : (       XLMConfig,           XLMForSequenceClassification,        XLMTokenizer),
	"roberta"    : (   RobertaConfig,       RobertaForSequenceClassification,    RobertaTokenizer),
	"distilbert" : (DistilBertConfig,    DistilBertForSequenceClassification, DistilBertTokenizer),
	"albert"     : (    AlbertConfig,        AlbertForSequenceClassification,     AlbertTokenizer),
	"xlmroberta" : (XLMRobertaConfig,    XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
	"flaubert"   : (  FlaubertConfig,      FlaubertForSequenceClassification,   FlaubertTokenizer),
	'rbertfc1'   : (      BertConfig,                      RegressionBertFC1,        DNATokenizer),
	'rbertfc3'   : (      BertConfig,                      RegressionBertFC3,        DNATokenizer)
}

# Model types that receive `token_type_ids` (batch[2]) as a model input;
# every other model type gets `token_type_ids = None` (see train / evaluate / predict / visualize)
TOKEN_ID_GROUP = [
	'bert',
	'dnalong',
	'dnalongcat',
	'xlnet',
	'albert'
]

# Announce the model types this notebook adds on top of the stock ones
print('Added model_type = <rbertfc1>   :: bert for sequence regression with 1 FC layer')
print('Added model_type = <rbertfc3>   :: bert for sequence regression with 3 FC layer')

In [None]:
# Update processors and output modes

# Register the project's regression task next to the stock GLUE tasks; the union
# builds new dictionaries, so the mappings imported from transformers stay untouched
processors   = processors   | {'regression' : RegressionProcessor}
output_modes = output_modes | {'regression' : 'regression'}

print('Added task_type  = <regression> :: defines regression output_mode and processor')
print()

# List every known task together with its processor class
for task_name, task_processor in processors.items() :
	print(f'{task_name:12s} : {task_processor}')

print()

# List every known task together with its output mode
for task_name, task_mode in output_modes.items() :
	print(f'{task_name:12s} : {task_mode}')

# 2. Seed

In [None]:
def set_seed (args) :
	"""
	Seed the python, numpy and torch random number generators with `args.seed`.

	The CUDA generators of all devices are seeded as well when `args.n_gpu` is positive.
	"""

	seed = args.seed

	# CPU side generators, seeded in the order : python, numpy, torch
	for seed_function in (random.seed, numpy.random.seed, torch.manual_seed) :
		seed_function(seed)

	# GPU side generators, only touched when at least one GPU is configured
	if args.n_gpu > 0 :
		torch.cuda.manual_seed_all(seed)

# 3. Checkpoint

In [None]:
def _sorted_checkpoints (args, checkpoint_prefix = 'checkpoint', use_mtime = False) :
	"""
	Doc
	"""

	ordering_and_checkpoint_path = list()
	glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix)))

	for path in glob_checkpoints :
		if use_mtime :
			ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
		else :
			regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path)

			if regex_match and regex_match.groups() :
				ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

	checkpoints_sorted = sorted(ordering_and_checkpoint_path)
	checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]

	return checkpoints_sorted

def _rotate_checkpoints (args, checkpoint_prefix = 'checkpoint', use_mtime = False) :
	"""
	Delete the oldest checkpoint directories so that at most `args.save_total_limit` remain.

	Nothing happens when the limit is unset, zero or negative, or when the number of
	checkpoints returned by `_sorted_checkpoints` does not exceed the limit.
	"""

	limit = args.save_total_limit

	# Rotation is disabled
	if not limit or limit <= 0 :
		return

	# Oldest checkpoints come first
	existing = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
	surplus  = len(existing) - limit

	if surplus <= 0 :
		return

	for stale_checkpoint in existing[:surplus] :
		logger.info('Deleting older checkpoint [{}] due to args.save_total_limit'.format(stale_checkpoint))
		shutil.rmtree(stale_checkpoint)

# 4. Train

In [None]:
def train (args, train_dataset, model, tokenizer) :
	"""
	Fine-tune `model` on `train_dataset` and periodically log, evaluate and checkpoint it.

	Parameters
	----------
	args          : namespace with the run configuration. Read here : local_rank, n_gpu, device,
	                per_gpu_train_batch_size, gradient_accumulation_steps, max_steps, num_train_epochs,
	                weight_decay, warmup_steps, warmup_percent, learning_rate, adam_epsilon, beta1, beta2,
	                model_name_or_path, fp16, fp16_opt_level, max_grad_norm, model_type, task_name,
	                logging_steps, evaluate_during_training, early_stop, save_steps, output_dir.
	                Written here : train_batch_size, and num_train_epochs when max_steps > 0.
	train_dataset : dataset whose batches are (input_ids, attention_mask, token_type_ids, labels),
	                as built by load_and_cache_examples.
	model         : model to train; called with keyword inputs and expected to return the loss first.
	tokenizer     : tokenizer; saved next to every checkpoint and forwarded to evaluate.

	Returns
	-------
	(global_step, average_loss) where average_loss is the accumulated loss divided by global_step.

	Raises
	------
	ImportError when args.fp16 is set but apex is not installed.
	"""

	# Only the main process owns a TensorBoard writer
	if args.local_rank in [-1, 0] :
		tb_writer = SummaryWriter()

	args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

	train_sampler    = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
	train_dataloader = DataLoader(train_dataset, sampler = train_sampler, batch_size = args.train_batch_size)

	# Total number of optimizer steps; max_steps, when given, overrides the number of epochs
	if args.max_steps > 0 :
		t_total = args.max_steps
		args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
	else :
		t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

	#
	# Prepare optimizer and schedule (linear warmup and decay)
	#

	# Parameters whose name contains one of these substrings are excluded from weight decay
	no_decay = ['bias', 'LayerNorm.weight']
	optimizer_grouped_parameters = [
		{
			'params'       : [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
			'weight_decay' : args.weight_decay,
		},
		{
			'params'       : [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
			'weight_decay' : 0.0
		},
	]

	# A non-zero warmup_percent takes precedence over the absolute warmup_steps
	warmup_steps = args.warmup_steps if args.warmup_percent == 0 else int(args.warmup_percent * t_total)

	optimizer = AdamW(optimizer_grouped_parameters, lr = args.learning_rate, eps = args.adam_epsilon, betas = (args.beta1, args.beta2))
	scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps, num_training_steps = t_total)

	#
	# Check if saved optimizer or scheduler states exist
	#

	if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')) :
		optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
		scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))

	if args.fp16 :
		try :
			from apex import amp
		except ImportError :
			raise ImportError('Please install apex from https://www.github.com/nvidia/apex to use fp16 training.')

		model, optimizer = amp.initialize(model, optimizer, opt_level = args.fp16_opt_level)

	#
	# Multi GPU training
	#

	if args.n_gpu > 1 :
		model = torch.nn.DataParallel(model)

	#
	# Distributed training (should be after apex fp16 initialization)
	#

	if args.local_rank != -1 :
		model = torch.nn.parallel.DistributedDataParallel(model,
			device_ids    = [args.local_rank],
			output_device = args.local_rank,
			find_unused_parameters = True,
		)

	#
	# Start training
	#

	logger.info('***** Running training *****')
	logger.info('  Num examples = %d', len(train_dataset))
	logger.info('  Num Epochs = %d', args.num_train_epochs)
	logger.info('  Instantaneous batch size per GPU = %d', args.per_gpu_train_batch_size)

	logger.info(
		'  Total train batch size (w. parallel, distributed & accumulation) = %d',
		args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)
	)

	logger.info('  Gradient Accumulation steps = %d', args.gradient_accumulation_steps)
	logger.info('  Total optimization steps = %d', t_total)

	global_step = 0
	epochs_trained = 0
	steps_trained_in_current_epoch = 0

	#
	# Check for checkpoints
	#

	if os.path.exists(args.model_name_or_path) :
		# A checkpoint directory is named '<prefix>-<global_step>'; any other name restarts at step 0
		try :
			global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0])
		except ValueError :
			global_step = 0

		epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
		steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

		logger.info('  Continuing training from checkpoint, will skip to saved global_step')
		logger.info('  Continuing training from epoch %d', epochs_trained)
		logger.info('  Continuing training from global step %d', global_step)
		logger.info('  Will skip the first %d steps in the first epoch', steps_trained_in_current_epoch)

	tr_loss      = 0.0
	logging_loss = 0.0

	model.zero_grad()

	train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc = 'Epoch', disable = args.local_rank not in [-1, 0])
	set_seed(args)

	best_auc = 0
	last_auc = 0
	stop_count = 0

	# Most recent evaluation results; stays None until the first in-training evaluation has run
	results = None

	#
	# Train loop
	#

	for _ in train_iterator :
		epoch_iterator = tqdm(train_dataloader, desc = 'Iteration', disable = args.local_rank not in [-1, 0])

		for step, batch in enumerate(epoch_iterator) :
			# Fast-forward over the batches already consumed before the checkpoint was written
			if steps_trained_in_current_epoch > 0 :
				steps_trained_in_current_epoch -= 1
				continue

			model.train()

			batch = tuple(t.to(args.device) for t in batch)
			inputs = {
				'input_ids'      : batch[0],
				'attention_mask' : batch[1],
				'labels'         : batch[3]
			}

			# XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
			if args.model_type != 'distilbert' :
				inputs['token_type_ids'] = (batch[2] if args.model_type in TOKEN_ID_GROUP else None)

			outputs = model(**inputs)
			loss = outputs[0]

			# DataParallel returns one loss per GPU
			if args.n_gpu > 1 :
				loss = loss.mean()
			if args.gradient_accumulation_steps > 1 :
				loss = loss / args.gradient_accumulation_steps

			if args.fp16 :
				with amp.scale_loss(loss, optimizer) as scaled_loss :
					scaled_loss.backward()
			else :
				loss.backward()

			tr_loss += loss.item()

			# Update the weights only once every gradient_accumulation_steps batches
			if (step + 1) % args.gradient_accumulation_steps == 0 :
				if args.fp16 :
					torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
				else :
					torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

				optimizer.step()
				scheduler.step()
				model.zero_grad()

				global_step += 1

				if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0 :
					logs = {}

					# Only evaluate when single GPU otherwise metrics may not average well
					if (args.local_rank == -1 and args.evaluate_during_training) :
						results = evaluate(args, model, tokenizer)

						if args.task_name == 'dna690' :
							if results['auc'] > best_auc:
								best_auc = results['auc']

						#
						# Early stopping with AUC : stop after args.early_stop consecutive evaluations with a lower AUC
						#

						if args.early_stop != 0 :
							if results['auc'] < last_auc :
								stop_count = stop_count + 1
							else:
								stop_count = 0

							last_auc = results['auc']

							if stop_count == args.early_stop :
								logger.info('Early stop')

								# Release the TensorBoard writer before leaving early; this branch
								# only runs with local_rank == -1, so the writer always exists here
								tb_writer.close()

								return global_step, tr_loss / global_step

						for key, value in results.items():
							eval_key = 'eval_{}'.format(key)
							logs[eval_key] = value

					# Average loss over the steps since the previous log entry
					loss_scalar = (tr_loss - logging_loss) / args.logging_steps
					learning_rate_scalar = scheduler.get_lr()[0]

					logs['learning_rate'] = learning_rate_scalar
					logs['loss'] = loss_scalar

					logging_loss = tr_loss

					for key, value in logs.items() :
						tb_writer.add_scalar(key, value, global_step)

					print({**logs, **{'step' : global_step}})

				if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0 :
					# For dna690 only keep checkpoints that are at least as good as the best AUC seen so far.
					# Without any evaluation results yet there is nothing to compare against, so the
					# checkpoint is saved (previously this raised a NameError on the unbound `results`)
					if args.task_name == 'dna690' and results is not None and results['auc'] < best_auc :
						continue

					# Save model checkpoint
					checkpoint_prefix = 'checkpoint'
					output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))

					if not os.path.exists(output_dir) :
						os.makedirs(output_dir)

					# Unwrap DataParallel / DistributedDataParallel before saving
					model_to_save = (model.module if hasattr(model, 'module') else model)
					model_to_save.save_pretrained(output_dir)
					tokenizer.save_pretrained(output_dir)

					logger.info('Saving model checkpoint to %s', output_dir)

					_rotate_checkpoints(args, checkpoint_prefix)

					if args.task_name != 'dna690' :
						torch.save(args, os.path.join(output_dir, 'training_args.bin'))
						torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
						torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))

						# Only report the states when they were actually written
						logger.info('Saving optimizer and scheduler states to %s', output_dir)

			if args.max_steps > 0 and global_step > args.max_steps :
				epoch_iterator.close()
				break

		if args.max_steps > 0 and global_step > args.max_steps :
			train_iterator.close()
			break

	if args.local_rank in [-1, 0] :
		tb_writer.close()

	return global_step, tr_loss / global_step

# 5. Evaluate

In [None]:
def evaluate (args, model, tokenizer, prefix = '', evaluate = True) :
	"""
	Run `model` over the evaluation data of the configured task, compute the metrics and
	append them as one line to 'eval_results.txt'.

	Parameters
	----------
	args      : namespace with the run configuration. Read here : task_name, output_dir, result_dir,
	            data_dir, local_rank, n_gpu, device, per_gpu_eval_batch_size, model_type, output_mode,
	            do_ensemble_pred. Written here : eval_batch_size.
	model     : model to evaluate; expected to return (loss, logits, ...).
	tokenizer : tokenizer forwarded to load_and_cache_examples.
	prefix    : sub directory of the output directory that receives the results file; also used in log messages.
	evaluate  : forwarded to load_and_cache_examples (True selects the dev examples, False the train examples).
	            Note that inside this function the name shadows the function itself.

	Returns
	-------
	The metrics dictionary, or the tuple (results, eval_task, preds, out_label_ids, probs) of the last
	evaluated task when args.do_ensemble_pred is set.
	"""

	# Loop to handle MNLI double evaluation (matched, mis-matched)
	eval_task_names = ('mnli', 'mnli-mm') if args.task_name == 'mnli' else (args.task_name,)
	eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == 'mnli' else (args.output_dir,)

	# Softmax is only created for, and only used by, the 'dna*' classification tasks
	if args.task_name[:3] == 'dna' :
		softmax = Softmax(dim = 1)

	results = {}

	for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs) :
		eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=evaluate)

		if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0] :
			os.makedirs(eval_output_dir)

		args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

		# Sequential order keeps predictions aligned with the order of the dataset
		eval_sampler    = SequentialSampler(eval_dataset)
		eval_dataloader = DataLoader(eval_dataset, sampler = eval_sampler, batch_size = args.eval_batch_size)

		#
		# Multi GPU evaluation
		#

		if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel) :
			model = torch.nn.DataParallel(model)

		#
		# Start evaluation
		#

		logger.info('***** Running evaluation {} *****'.format(prefix))
		logger.info('  Num examples = %d', len(eval_dataset))
		logger.info('  Batch size = %d', args.eval_batch_size)

		eval_loss     = 0.0
		nb_eval_steps = 0
		preds         = None
		probs         = None
		out_label_ids = None

		#
		# Evaluation loop
		#

		for batch in tqdm(eval_dataloader, desc = 'Evaluating') :
			model.eval()
			batch = tuple(t.to(args.device) for t in batch)

			with torch.no_grad() :
				# Batch layout : (input_ids, attention_mask, token_type_ids, labels)
				inputs = {
					'input_ids'      : batch[0],
					'attention_mask' : batch[1],
					'labels'         : batch[3]
				}

				# XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
				if args.model_type != 'distilbert' :
					inputs['token_type_ids'] = (batch[2] if args.model_type in TOKEN_ID_GROUP else None)

				outputs = model(**inputs)
				tmp_eval_loss, logits = outputs[:2]
				eval_loss += tmp_eval_loss.mean().item()

			nb_eval_steps += 1

			# Accumulate logits and labels of all batches along the first axis
			if preds is None :
				preds = logits.detach().cpu().numpy()
				out_label_ids = inputs['labels'].detach().cpu().numpy()
			else :
				preds = numpy.append(preds, logits.detach().cpu().numpy(), axis = 0)
				out_label_ids = numpy.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis = 0)

		# NOTE(review): the averaged loss is computed but not added to the results or logged
		eval_loss = eval_loss / nb_eval_steps

		#
		# Handle classification tasks
		#

		if args.output_mode == 'classification' :
			if args.task_name[:3] == 'dna' and args.task_name != 'dnasplice' :
				# Ensemble prediction keeps the probabilities of all classes, otherwise only column 1 is kept
				if args.do_ensemble_pred :
					probs = softmax(torch.tensor(preds, dtype=torch.float32)).numpy()
				else :
					probs = softmax(torch.tensor(preds, dtype=torch.float32))[:, 1].numpy()
			elif args.task_name == 'dnasplice' :
				probs = softmax(torch.tensor(preds, dtype=torch.float32)).numpy()

			# Predicted class is the index of the largest logit
			preds = numpy.argmax(preds, axis = 1)

		#
		# Handle regression tasks
		#

		elif args.output_mode == 'regression' :
			preds = numpy.squeeze(preds)

		#
		# Compute metrics
		#

		# NOTE(review): probs is still None for regression and for non 'dna*' classification tasks,
		# so do_ensemble_pred would fail on `probs[:, 1]` there
		if args.do_ensemble_pred :
			result = compute_metrics(eval_task, preds, out_label_ids, probs[:, 1])
		else :
			result = compute_metrics(eval_task, preds, out_label_ids, probs)

		#
		# Update results
		#

		results.update(result)

		#
		# Handle DNA690 task
		#

		# dna690 writes its results file into args.result_dir instead of the output directory
		if args.task_name == 'dna690' :
			eval_output_dir = args.result_dir

			if not os.path.exists(args.result_dir) :
				os.makedirs(args.result_dir)

		#
		# Log evaluation results
		#

		# NOTE(review): the `prefix` sub directory is not created here, so it must already exist when prefix is not empty
		output_eval_file = os.path.join(eval_output_dir, prefix, 'eval_results.txt')

		# Append mode : every evaluation adds one line to the file
		with open(output_eval_file, 'a') as writer :
			# 'dna*' tasks start the line with the last component of the data directory
			if args.task_name[:3] == 'dna' :
				eval_result = args.data_dir.split('/')[-1] + ' '
			else:
				eval_result = ''

			logger.info('***** Eval results {} *****'.format(prefix))

			# Metrics are written in key order, each truncated to the first 5 characters of its string form
			for key in sorted(result.keys()) :
				logger.info('  %s = %s', key, str(result[key]))
				eval_result = eval_result + str(result[key])[:5] + ' '

			writer.write(eval_result + '\n')

	if args.do_ensemble_pred :
		return results, eval_task, preds, out_label_ids, probs
	else:
		return results

# 6. Predict

In [None]:
def predict (args, model, tokenizer, prefix = '') :
	"""
	Run `model` over the dev examples of the configured task, log the metrics and save the
	predictions to '<args.predict_dir>/pred_results.npy'.

	The saved array holds the class probabilities when they are computed ('dna*' classification
	tasks) and otherwise the predictions themselves (e.g. the squeezed model outputs for regression).

	Parameters
	----------
	args      : namespace with the run configuration. Read here : task_name, predict_dir, n_gpu, device,
	            per_gpu_pred_batch_size, model_type, output_mode, do_ensemble_pred.
	            Written here : pred_batch_size.
	model     : model to run; expected to return (loss, logits, ...).
	tokenizer : tokenizer forwarded to load_and_cache_examples.
	prefix    : label used in the log messages only.

	Returns
	-------
	None
	"""

	pred_task_names   = (args.task_name,)
	pred_outputs_dirs = (args.predict_dir,)

	# Single place that guarantees the output directory exists
	# (replaces three repeated checks, one of which called the non-existent os.makedir)
	os.makedirs(args.predict_dir, exist_ok = True)

	softmax = Softmax(dim = 1)

	for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs) :
		pred_dataset = load_and_cache_examples(args, pred_task, tokenizer, evaluate = True)

		args.pred_batch_size = args.per_gpu_pred_batch_size * max(1, args.n_gpu)

		# Sequential order keeps predictions aligned with the order of the dataset
		pred_sampler    = SequentialSampler(pred_dataset)
		pred_dataloader = DataLoader(pred_dataset, sampler = pred_sampler, batch_size = args.pred_batch_size)

		#
		# Multi GPU prediction
		#

		if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel) :
			model = torch.nn.DataParallel(model)

		#
		# Start prediction
		#

		logger.info('***** Running prediction {} *****'.format(prefix))
		logger.info('  Num examples = %d', len(pred_dataset))
		logger.info('  Batch size = %d', args.pred_batch_size)

		preds         = None
		out_label_ids = None

		# Stays None unless a 'dna*' classification branch below fills it in
		# (previously unbound in that case, which raised an UnboundLocalError for regression)
		probs         = None

		for batch in tqdm(pred_dataloader, desc = 'Predicting') :
			model.eval()
			batch = tuple(t.to(args.device) for t in batch)

			with torch.no_grad() :
				# Batch layout : (input_ids, attention_mask, token_type_ids, labels)
				inputs = {
					'input_ids'      : batch[0],
					'attention_mask' : batch[1],
					'labels'         : batch[3]
				}

				# XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
				if args.model_type != 'distilbert' :
					inputs['token_type_ids'] = (batch[2] if args.model_type in TOKEN_ID_GROUP else None)

				outputs = model(**inputs)
				_, logits = outputs[:2]

			# Accumulate logits and labels of all batches along the first axis
			if preds is None :
				preds = logits.detach().cpu().numpy()
				out_label_ids = inputs['labels'].detach().cpu().numpy()
			else :
				preds = numpy.append(preds, logits.detach().cpu().numpy(), axis = 0)
				out_label_ids = numpy.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis = 0)

		#
		# Handle classification tasks
		#

		if args.output_mode == 'classification' :
			if args.task_name[:3] == 'dna' and args.task_name != 'dnasplice' :
				# Ensemble prediction keeps the probabilities of all classes, otherwise only column 1 is kept
				if args.do_ensemble_pred :
					probs = softmax(torch.tensor(preds, dtype = torch.float32)).numpy()
				else :
					probs = softmax(torch.tensor(preds, dtype = torch.float32))[:, 1].numpy()
			elif args.task_name == 'dnasplice' :
				probs = softmax(torch.tensor(preds, dtype = torch.float32)).numpy()

			# Predicted class is the index of the largest logit
			preds = numpy.argmax(preds, axis = 1)

		#
		# Handle regression tasks
		#

		elif args.output_mode == 'regression' :
			preds = numpy.squeeze(preds)

		#
		# Compute metrics
		#

		# Column 1 can only be selected when probabilities were actually computed
		if args.do_ensemble_pred and probs is not None :
			result = compute_metrics(pred_task, preds, out_label_ids, probs[:, 1])
		else :
			result = compute_metrics(pred_task, preds, out_label_ids, probs)

		#
		# Log prediction results
		#

		output_pred_file = os.path.join(pred_output_dir, 'pred_results.npy')
		logger.info('***** Pred results {} *****'.format(prefix))

		for key in sorted(result.keys()) :
			logger.info('  %s = %s', key, str(result[key]))

		# Without probabilities (e.g. regression) the predictions themselves are the result worth saving
		numpy.save(output_pred_file, probs if probs is not None else preds)

# 7. Attention

In [None]:
def format_attention (attention) :
	"""
	Stack the per layer attention tensors into a single tensor with a leading axis of size one.

	Every element of `attention` must have exactly 4 dimensions (1 x num_heads x seq_len x seq_len),
	otherwise a ValueError is raised. The result is 1 x num_layers x num_heads x seq_len x seq_len.
	"""

	def _without_batch_axis (layer_attention) :
		# 1 x num_heads x seq_len x seq_len
		if len(layer_attention.shape) != 4 :
			raise ValueError('The attention tensor does not have the correct number of dimensions. Make sure you set output_attentions=True when initializing your model.')

		return layer_attention.squeeze(0)

	per_layer = [_without_batch_axis(layer_attention) for layer_attention in attention]

	# num_layers x num_heads x seq_len x seq_len, then a new leading axis
	stacked = torch.stack(per_layer)

	return stacked.unsqueeze(0)

# 8. Visualize

In [None]:
def visualize (args, model, tokenizer, kmer, prefix = '') :
	"""
	Run `model` over the examples of the configured task and turn its attention output into
	one normalized score per sequence position.

	Parameters
	----------
	args      : namespace with the run configuration. Read here : task_name, predict_dir, visualize_train,
	            local_rank, n_gpu, device, per_gpu_pred_batch_size, max_seq_length, model_type.
	            Written here : pred_batch_size.
	model     : model to run; its output must end with the attention tensors.
	tokenizer : tokenizer forwarded to load_and_cache_examples.
	kmer      : length of the k-mers; each k-mer score is spread over the `kmer` positions it covers.
	prefix    : label used in the log messages only.

	Returns
	-------
	(scores, probs) where scores is a [num_examples, max_seq_length] array of attention scores and
	probs holds the softmax probabilities (column 1 only, unless the task is 'dnasplice').
	"""

	pred_task_names   = (args.task_name,)
	pred_outputs_dirs = (args.predict_dir,)

	if not os.path.exists(args.predict_dir) :
		os.makedirs(args.predict_dir)

	softmax = torch.nn.Softmax(dim = 1)

	for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs) :
		# Local flag (shadows the module level evaluate function) : train examples when visualize_train is set, dev examples otherwise
		evaluate     = False if args.visualize_train else True
		pred_dataset = load_and_cache_examples(args, pred_task, tokenizer, evaluate = evaluate)

		if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0] :
			os.makedirs(pred_output_dir)

		args.pred_batch_size = args.per_gpu_pred_batch_size * max(1, args.n_gpu)

		# Sequential order is required : results are written into preallocated arrays by batch index
		pred_sampler    = SequentialSampler(pred_dataset)
		pred_dataloader = DataLoader(pred_dataset, sampler = pred_sampler, batch_size = args.pred_batch_size)

		#
		# Multi GPU visualization
		#

		if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel) :
			model = torch.nn.DataParallel(model)

		#
		# Start prediction
		#

		logger.info('***** Running prediction {} *****'.format(prefix))
		logger.info('  Num examples = %d', len(pred_dataset))
		logger.info('  Batch size = %d', args.pred_batch_size)

		# NOTE(review): pred_loss and nb_pred_steps are never used below
		pred_loss     = 0.0
		nb_pred_steps = 0
		batch_size    = args.pred_batch_size

		# One logit column per class : 3 for 'dnasplice', 2 for every other task
		if args.task_name != 'dnasplice' :
			preds = numpy.zeros([len(pred_dataset), 2])
		else :
			preds = numpy.zeros([len(pred_dataset), 3])

		# NOTE(review): the 12 is hard coded — presumably the number of attention heads of the model; confirm
		attention_scores = numpy.zeros([len(pred_dataset), 12, args.max_seq_length, args.max_seq_length])

		for index, batch in enumerate(tqdm(pred_dataloader, desc = 'Predicting')) :
			model.eval()
			batch = tuple(t.to(args.device) for t in batch)

			with torch.no_grad() :
				# Batch layout : (input_ids, attention_mask, token_type_ids, labels)
				inputs = {
					'input_ids'      : batch[0],
					'attention_mask' : batch[1],
					'labels'         : batch[3]
				}

				# XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
				if args.model_type != 'distilbert' :
					inputs['token_type_ids'] = (batch[2] if args.model_type in TOKEN_ID_GROUP else None)

				outputs   = model(**inputs)
				# Last element of the last model output — presumably the attention of the final layer; confirm
				attention = outputs[-1][-1]
				_, logits = outputs[:2]

				# Write this batch into its slice; the last batch may be shorter than batch_size
				preds[index * batch_size:index * batch_size + len(batch[0]), :] = logits.detach().cpu().numpy()
				attention_scores[index * batch_size:index * batch_size + len(batch[0]), :, :, :] = attention.cpu().numpy()

		#
		# Always applies Softmax no matter the task (so not good for regression)
		#

		if args.task_name != 'dnasplice' :
			probs = softmax(torch.tensor(preds, dtype = torch.float32))[:, 1].numpy()
		else :
			probs = softmax(torch.tensor(preds, dtype = torch.float32)).numpy()

		scores = numpy.zeros([attention_scores.shape[0], attention_scores.shape[-1]])

		for index, attention_score in enumerate(attention_scores) :
			attn_score = []

			# Sum over the first axis the attention from position 0 to every position i, skipping position 0 itself
			# (presumably position 0 is the [CLS] token and the first axis the heads — confirm)
			for i in range(1, attention_score.shape[-1] - kmer + 2) :
				attn_score.append(float(attention_score[:, 0, i].sum()))

			# Zero the last score in front of the first zero score, then stop
			# (presumably drops the closing special token in front of the padding — confirm)
			for i in range(len(attn_score) - 1) :
				if attn_score[i + 1] == 0 :
					attn_score[i] = 0
					break

			counts      = numpy.zeros([len(attn_score) + kmer - 1])
			real_scores = numpy.zeros([len(attn_score) + kmer - 1])

			# Spread every k-mer score over the `kmer` positions it covers
			for i, score in enumerate(attn_score) :
				for j in range(kmer) :
					counts[i + j] += 1.0
					real_scores[i + j] += score

			# Average per position, then scale the whole row to unit L2 norm
			real_scores = real_scores / counts
			real_scores = real_scores / numpy.linalg.norm(real_scores)

			scores[index] = real_scores

	return scores, probs

# 9. Cache

In [None]:
def load_and_cache_examples (args, task, tokenizer, evaluate = False) :
	"""
	Build the tensor dataset of a task, reusing the cached features file when one exists.

	Parameters
	----------
	args      : namespace with the run configuration. Read here : local_rank, data_dir, model_name_or_path,
	            max_seq_length, do_predict, overwrite_cache, model_type, n_process.
	task      : task name; key into the module level `processors` and `output_modes` mappings.
	tokenizer : tokenizer used to convert the examples into features.
	evaluate  : True loads the dev examples, False the train examples.

	Returns
	-------
	TensorDataset of (input_ids, attention_mask, token_type_ids, labels); labels are long for
	classification and float for regression.
	"""

	# Make sure only the first process in distributed training process the dataset, and the others will use the cache
	if args.local_rank not in [-1, 0] and not evaluate :
		torch.distributed.barrier()

	processor   = processors[task]()
	output_mode = output_modes[task]

	#
	# Load data features from cache or dataset file
	#

	# Cache name : cached_<dev|train>_<last component of model_name_or_path>_<max_seq_length>_<task>
	cached_features_file = os.path.join(
		args.data_dir,
		'cached_{}_{}_{}_{}'.format(
			'dev' if evaluate else 'train',
			list(filter(None, args.model_name_or_path.split('/'))).pop(),
			str(args.max_seq_length),
			str(task),
		),
	)

	# When predicting, the cache name does not include the model name
	if args.do_predict :
		cached_features_file = os.path.join(
			args.data_dir,
			'cached_{}_{}_{}'.format(
				'dev' if evaluate else 'train',
				str(args.max_seq_length),
				str(task),
			),
		)

	if os.path.exists(cached_features_file) and not args.overwrite_cache :
		logger.info('Loading features from cached file %s', cached_features_file)
		features = torch.load(cached_features_file)
	else :
		logger.info('Creating features from dataset file at %s', args.data_dir)
		label_list = processor.get_labels()

		# For roberta style models on MNLI the labels at index 1 and 2 are swapped
		if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta', 'xlmroberta'] :
			label_list[1], label_list[2] = label_list[2], label_list[1]

		examples = (
			processor.get_dev_examples(args.data_dir)
			if evaluate
			else processor.get_train_examples(args.data_dir)
		)

		# Params for convert_examples_to_features
		max_length  = args.max_seq_length
		pad_on_left = bool(args.model_type in ['xlnet'])
		pad_token   = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]

		if args.model_type in ['xlnet'] :
			pad_token_segment_id = 4
		else :
			pad_token_segment_id = 0

		# Single process conversion
		if args.n_process == 1 :
			features = convert_examples_to_features(
				examples,
				tokenizer,
				label_list  = label_list,
				max_length  = max_length,
				output_mode = output_mode,
				pad_on_left = pad_on_left,
				pad_token   = pad_token,
				pad_token_segment_id = pad_token_segment_id,
			)

		# Multi process conversion : the examples are split into one contiguous slice per worker
		else :
			nproc = int(args.n_process)

			# Evaluation uses a quarter of the workers (at least one)
			if evaluate :
				nproc = max(int(nproc / 4), 1)

			print('Number of processes for converting feature: ' + str(nproc))

			pool      = Pool(nproc)
			indexes   = [0]
			len_slice = int(len(examples) / nproc)

			# Slice boundaries; the last slice absorbs the remainder of the integer division
			for i in range(1, nproc + 1) :
				if i != nproc:
					indexes.append(len_slice * (i))
				else :
					indexes.append(len(examples))

			results  = list()
			features = list()

			for i in range(nproc) :
				# NOTE(review): arguments are positional; the meaning of the `None` and of the trailing
				# `True` depends on the signature of glue_convert_examples_to_features — confirm
				results.append(pool.apply_async(
					convert_examples_to_features,
					args = (
						examples[indexes[i]:indexes[i + 1]],
						tokenizer, max_length, None, label_list, output_mode,
						pad_on_left, pad_token, pad_token_segment_id, True
					)
				))
				print(str(i + 1) + ' processor started !')

			# Wait for all workers to finish before collecting their results
			pool.close()
			pool.join()

			# Collected in submission order, so the original example order is preserved
			for result in results :
				features.extend(result.get())

		if args.local_rank in [-1, 0] :
			logger.info('Saving features into cached file %s', cached_features_file)
			torch.save(features, cached_features_file)

	# Make sure only the first process in distributed training process the dataset, and the others will use the cache
	if args.local_rank == 0 and not evaluate :
		torch.distributed.barrier()

	# Convert to tensors and build dataset
	all_input_ids      = torch.tensor([f.input_ids      for f in features], dtype = torch.long)
	all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype = torch.long)
	all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype = torch.long)

	# NOTE(review): all_labels stays undefined for any output mode other than these two
	if output_mode == 'classification' :
		all_labels = torch.tensor([f.label for f in features], dtype = torch.long)
	elif output_mode == 'regression' :
		all_labels = torch.tensor([f.label for f in features], dtype = torch.float)

	return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

# 10. Main

In [None]:
# Main method

def main () :
	"""
	Command line entry point.

	Parses the arguments, sets up the device, logging and seed, resolves the task processor and,
	unless --do_visualize or --do_ensemble_pred is set, loads the config, tokenizer and model.
	Afterwards the stages selected with --do_train, --do_eval, --do_predict, --do_visualize and
	--do_ensemble_pred are executed in that order.

	Returns the results dictionary : empty by default, updated by the evaluation stage with keys
	suffixed by '_<global_step>', and rebound to the first value returned by the last evaluate()
	call when the ensemble stage runs.

	Raises ValueError when --should_continue finds no checkpoint, when the output directory is not
	empty and may not be overwritten, when the task is unknown, or when --do_ensemble_pred is used
	without a --result_dir ending in test.npy or train.npy.
	"""

	parser  =  argparse.ArgumentParser()

	# Required parameters
	parser.add_argument('--data_dir',           default = None, type = str, required = True, help = 'The input data dir. Should contain the .tsv files (or other data files) for the task.')
	parser.add_argument('--model_type',         default = None, type = str, required = True, help = 'Model type selected in the list: ' + ', '.join(MODEL_CLASSES.keys()))
	parser.add_argument('--model_name_or_path', default = None, type = str, required = True, help = 'Path to pre-trained model or shortcut name selected in the list: ' + ', '.join(ALL_MODELS))
	parser.add_argument('--task_name',          default = None, type = str, required = True, help = 'The name of the task to train selected in the list: ' + ', '.join(processors.keys()))
	parser.add_argument('--output_dir',         default = None, type = str, required = True, help = 'The output directory where the model predictions and checkpoints will be written.')

	# Other parameters
	parser.add_argument('--n_process',                    default = 2,      type = int,   help = 'The number of processes used for data process')
	parser.add_argument('--visualize_data_dir',           default = None,   type = str,   help = 'The input data dir. Should contain the .tsv files for the task.')
	parser.add_argument('--result_dir',                   default = None,   type = str,   help = 'The directory where the dna690 and mouse will save results.')
	parser.add_argument('--config_name',                  default = '',     type = str,   help = 'Pretrained config name or path if not the same as model_name')
	parser.add_argument('--tokenizer_name',               default = '',     type = str,   help = 'Pretrained tokenizer name or path if not the same as model_name')
	parser.add_argument('--cache_dir',                    default = '',     type = str,   help = 'Where do you want to store the pre-trained models downloaded from s3')
	parser.add_argument('--predict_dir',                  default = None,   type = str,   help = 'The output directory of predicted result. (when do_predict)')
	parser.add_argument('--max_seq_length',               default = 128,    type = int,   help = 'The maximum total input sequence length after tokenization.')
	parser.add_argument('--per_gpu_train_batch_size',     default = 8,      type = int,   help = 'Batch size per GPU/CPU for training.')
	parser.add_argument('--per_gpu_eval_batch_size',      default = 8,      type = int,   help = 'Batch size per GPU/CPU for evaluation.')
	parser.add_argument('--per_gpu_pred_batch_size',      default = 8,      type = int,   help = 'Batch size per GPU/CPU for prediction.')
	parser.add_argument('--early_stop',                   default = 0,      type = int,   help = 'set this to a positive integet if you want to perfrom early stop.')
	parser.add_argument('--predict_scan_size',            default = 1,      type = int,   help = 'Number of updates steps to accumulate before performing a backward/update pass.')
	parser.add_argument('--gradient_accumulation_steps',  default = 1,      type = int,   help = 'Number of updates steps to accumulate before performing a backward/update pass.')
	parser.add_argument('--learning_rate',                default = 5e-5,   type = float, help = 'The initial learning rate for Adam.')
	parser.add_argument('--weight_decay',                 default = 0.0,    type = float, help = 'Weight decay if we apply some.')
	parser.add_argument('--adam_epsilon',                 default = 1e-8,   type = float, help = 'Epsilon for Adam optimizer.')
	parser.add_argument('--beta1',                        default = 0.9,    type = float, help = 'Beta1 for Adam optimizer.')
	parser.add_argument('--beta2',                        default = 0.999,  type = float, help = 'Beta2 for Adam optimizer.')
	parser.add_argument('--max_grad_norm',                default = 1.0,    type = float, help = 'Max gradient norm.')
	parser.add_argument('--attention_probs_dropout_prob', default = 0.1,    type = float, help = 'Dropout rate of attention.')
	parser.add_argument('--hidden_dropout_prob',          default = 0.1,    type = float, help = 'Dropout rate of intermidiete layer.')
	parser.add_argument('--rnn_dropout',                  default = 0.0,    type = float, help = 'Dropout rate of intermidiete layer.')
	parser.add_argument('--rnn',                          default = 'lstm', type = str,   help = 'What kind of RNN to use')
	parser.add_argument('--num_rnn_layer',                default = 2,      type = int,   help = 'Number of rnn layers in dnalong model.')
	parser.add_argument('--rnn_hidden',                   default = 768,    type = int,   help = 'Number of hidden unit in a rnn layer.')
	parser.add_argument('--num_train_epochs',             default = 3.0,    type = float, help = 'Total number of training epochs to perform.')
	parser.add_argument('--max_steps',                    default = -1,     type = int,   help = 'If > 0: set total number of training steps to perform. Override num_train_epochs.')
	parser.add_argument('--warmup_steps',                 default = 0,      type = int,   help = 'Linear warmup over warmup_steps.')
	parser.add_argument('--warmup_percent',               default = 0,      type = float, help = 'Linear warmup over warmup_percent*total_steps.')
	parser.add_argument('--logging_steps',                default = 500,    type = int,   help = 'Log every X updates steps.')
	parser.add_argument('--save_steps',                   default = 500,    type = int,   help = 'Save checkpoint every X updates steps.')
	parser.add_argument('--save_total_limit',             default = None,   type = int,   help = 'Limit the total amount of checkpoints.')
	parser.add_argument('--visualize_models',             default = None,   type = int,   help = 'The model used to do visualization. If None, use 3456.')
	parser.add_argument('--seed',                         default = 42,     type = int,   help = 'random seed for initialization')
	parser.add_argument('--fp16_opt_level',               default = 'O1',   type = str,   help = 'For fp16 see details at https://nvidia.github.io/apex/amp.html')
	parser.add_argument('--local_rank',                   default = -1,     type = int,   help = 'For distributed training: local_rank')
	parser.add_argument('--server_ip',                    default = '',     type = str,   help = 'For distant debugging.')
	parser.add_argument('--server_port',                  default = '',     type = str,   help = 'For distant debugging.')

	parser.add_argument('--should_continue',          action = 'store_true', help = 'Whether to continue from latest checkpoint in output_dir')
	parser.add_argument('--do_train',                 action = 'store_true', help = 'Whether to run training.')
	parser.add_argument('--do_eval',                  action = 'store_true', help = 'Whether to run eval on the dev set.')
	parser.add_argument('--do_predict',               action = 'store_true', help = 'Whether to do prediction on the given dataset.')
	parser.add_argument('--do_visualize',             action = 'store_true', help = 'Whether to calculate attention score.')
	parser.add_argument('--visualize_train',          action = 'store_true', help = 'Whether to visualize train.tsv or dev.tsv.')
	parser.add_argument('--do_ensemble_pred',         action = 'store_true', help = 'Whether to do ensemble prediction with kmer 3456.')
	parser.add_argument('--evaluate_during_training', action = 'store_true', help = 'Run evaluation during training at each logging step.')
	parser.add_argument('--do_lower_case',            action = 'store_true', help = 'Set this flag if you are using an uncased model.')
	parser.add_argument('--eval_all_checkpoints',     action = 'store_true', help = 'Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number')
	parser.add_argument('--no_cuda',                  action = 'store_true', help = 'Avoid using CUDA when available')
	parser.add_argument('--overwrite_output_dir',     action = 'store_true', help = 'Overwrite the content of the output directory')
	parser.add_argument('--overwrite_cache',          action = 'store_true', help = 'Overwrite the cached training and evaluation sets')
	parser.add_argument('--fp16',                     action = 'store_true', help = 'Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit')

	args = parser.parse_args()

	# Resume from the last entry returned by _sorted_checkpoints() instead of --model_name_or_path
	if args.should_continue :
		sorted_checkpoints = _sorted_checkpoints(args)

		if len(sorted_checkpoints) == 0 :
			raise ValueError('Used --should_continue but no checkpoint was found in --output_dir.')
		else :
			args.model_name_or_path = sorted_checkpoints[-1]

	# Refuse to train into a populated output directory unless explicitly allowed
	if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir) :
		raise ValueError('Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.'.format(args.output_dir))

	# Optional remote debugger, only attached when both the ip and the port are given
	if args.server_ip and args.server_port:
		import ptvsd

		ptvsd.enable_attach(address = (args.server_ip, args.server_port), redirect_output = True)
		ptvsd.wait_for_attach()

	# Device selection : single process (optionally multi GPU) vs one GPU per distributed process
	if args.local_rank == -1 or args.no_cuda :
		device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
		args.n_gpu = torch.cuda.device_count()
	else :
		torch.cuda.set_device(args.local_rank)
		device = torch.device('cuda', args.local_rank)
		torch.distributed.init_process_group(backend = 'nccl')
		args.n_gpu = 1

	args.device = device

	# Only the main process logs at INFO level
	logging.basicConfig(
		format  = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
		datefmt = '%m/%d/%Y %H:%M:%S',
		level  = logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
	)

	logger.warning(
		'Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s',
		args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16
	)

	set_seed(args)

	#
	# Prepare GLUE task
	#

	args.task_name = args.task_name.lower()

	if args.task_name not in processors :
		raise ValueError('Task not found: %s' % (args.task_name))

	processor        = processors[args.task_name]()
	args.output_mode = output_modes[args.task_name]
	label_list       = processor.get_labels()
	num_labels       = len(label_list)

	# Load pretrained model and tokenizer
	# Make sure only the first process in distributed training will download model & vocab
	if args.local_rank not in [-1, 0] :
		torch.distributed.barrier()

	args.model_type = args.model_type.lower()

	config_class    = MODEL_CLASSES[args.model_type][0]
	model_class     = MODEL_CLASSES[args.model_type][1]
	tokenizer_class = MODEL_CLASSES[args.model_type][2]

	# The visualization and ensemble stages load their own per-kmer config, tokenizer and model
	if not args.do_visualize and not args.do_ensemble_pred :
		config = config_class.from_pretrained(
			args.config_name if args.config_name else args.model_name_or_path,
			num_labels      = num_labels,
			finetuning_task = args.task_name,
			cache_dir       = args.cache_dir if args.cache_dir else None
		)

		config.hidden_dropout_prob          = args.hidden_dropout_prob
		config.attention_probs_dropout_prob = args.attention_probs_dropout_prob

		if args.model_type in ['dnalong', 'dnalongcat'] :
			assert args.max_seq_length % 512 == 0

		config.split         = int(args.max_seq_length / 512)
		config.rnn           = args.rnn
		config.num_rnn_layer = args.num_rnn_layer
		config.rnn_dropout   = args.rnn_dropout
		config.rnn_hidden    = args.rnn_hidden

		tokenizer = tokenizer_class.from_pretrained(
			args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
			do_lower_case = args.do_lower_case,
			cache_dir     = args.cache_dir if args.cache_dir else None,
		)

		model = model_class.from_pretrained(
			args.model_name_or_path,
			from_tf   = bool('.ckpt' in args.model_name_or_path),
			config    = config,
			cache_dir = args.cache_dir if args.cache_dir else None,
		)

		logger.info('Finish loading model')

		# Make sure only the first process in distributed training will download model & vocab
		if args.local_rank == 0 :
			torch.distributed.barrier()

		model.to(args.device)

		logger.info('Training/evaluation parameters %s', args)

	#
	# Training
	#

	if args.do_train :
		train_dataset        = load_and_cache_examples(args, args.task_name, tokenizer, evaluate = False)
		global_step, tr_loss = train(args, train_dataset, model, tokenizer)

		logger.info('global_step = %s, average loss = %s', global_step, tr_loss)

	# Persist the fine-tuned model from the main process only; skipped for the dna690 task
	if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and args.task_name != 'dna690' :
		if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0] :
			os.makedirs(args.output_dir)

		logger.info('Saving model checkpoint to %s', args.output_dir)

		# Save a trained model, configuration and tokenizer using `save_pretrained()`.
		# They can then be reloaded using `from_pretrained()`
		# Good practice: save your training arguments together with the trained model
		model_to_save = (model.module if hasattr(model, 'module') else model)
		model_to_save.save_pretrained(args.output_dir)
		tokenizer.save_pretrained(args.output_dir)
		torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

		# Load a trained model and vocabulary that you have fine-tuned
		model     = model_class.from_pretrained(args.output_dir)
		tokenizer = tokenizer_class.from_pretrained(args.output_dir)

		model.to(args.device)

	#
	# Evaluation
	#

	results = dict()

	if args.do_eval and args.local_rank in [-1, 0] :
		tokenizer   = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
		checkpoints = [args.output_dir]

		if args.eval_all_checkpoints :
			# Every directory below output_dir that contains a weights file counts as a checkpoint
			checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive = True)))

			# Reduce logging
			logging.getLogger('transformers.modeling_utils').setLevel(logging.WARN)

		logger.info('Evaluate the following checkpoints: %s', checkpoints)

		for checkpoint in checkpoints :
			global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ''
			prefix      = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ''

			model = model_class.from_pretrained(checkpoint)
			model.to(args.device)

			# Suffix every metric with the step so results from several checkpoints do not collide
			result = evaluate(args, model, tokenizer, prefix = prefix)
			result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())

			results.update(result)

	#
	# Prediction
	#

	if args.do_predict and args.local_rank in [-1, 0] :
		tokenizer  = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
		checkpoint = args.output_dir
		prefix     = ''

		logger.info('Predict using the following checkpoint: %s', checkpoint)

		model = model_class.from_pretrained(checkpoint)
		model.to(args.device)

		# The value returned by predict() is not used by this function
		predict(args, model, tokenizer, prefix = prefix)

	#
	# Visualize
	#

	if args.do_visualize and args.local_rank in [-1, 0] :
		visualization_models = [3, 4, 5, 6] if not args.visualize_models else [args.visualize_models]

		scores    = None
		all_probs = None

		for kmer in visualization_models :
			output_dir = args.output_dir.replace('/690', '/690/' + str(kmer))

			tokenizer = tokenizer_class.from_pretrained(
				'dna' + str(kmer),
				do_lower_case = args.do_lower_case,
				cache_dir     = args.cache_dir if args.cache_dir else None,
			)

			checkpoint = output_dir
			logger.info('Calculate attention score using the following checkpoint: %s', checkpoint)
			prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ''

			config = config_class.from_pretrained(
				output_dir,
				num_labels      = num_labels,
				finetuning_task = args.task_name,
				cache_dir       = args.cache_dir if args.cache_dir else None,
			)

			config.output_attentions = True

			model = model_class.from_pretrained(
				checkpoint,
				from_tf   = bool('.ckpt' in args.model_name_or_path),
				config    = config,
				cache_dir = args.cache_dir if args.cache_dir else None,
			)

			model.to(args.device)
			attention_scores, probs = visualize(args, model, tokenizer, prefix = prefix, kmer = kmer)

			# Accumulate over the kmer models; the first iteration initializes the sums
			if scores is not None :
				all_probs += probs
				scores    += attention_scores
			else :
				all_probs = deepcopy(probs)
				scores    = deepcopy(attention_scores)

		# Probabilities are averaged over the models, attention scores are kept as a sum
		all_probs = all_probs / float(len(visualization_models))

		numpy.save(os.path.join(args.predict_dir, 'atten.npy'), scores)
		numpy.save(os.path.join(args.predict_dir, 'pred_results.npy'), all_probs)

	#
	# Ensemble
	#

	if args.do_ensemble_pred and args.local_rank in [-1, 0] :
		# Validate the target file up front, before any model is loaded
		if not args.result_dir :
			raise ValueError('--result_dir is required by --do_ensemble_pred and should end with test.npy or train.npy')

		result_name = args.result_dir.split('/')[-1]

		if result_name not in ['test.npy', 'train.npy'] :
			raise ValueError('File name in result_dir should be either test.npy or train.npy')

		# Each kmer reads from its own sub-directory of the original --data_dir
		base_data_dir = args.data_dir

		for kmer in range(3, 7) :
			output_dir = os.path.join(args.output_dir, str(kmer))
			tokenizer  = tokenizer_class.from_pretrained(
				'dna' + str(kmer),
				do_lower_case = args.do_lower_case,
				cache_dir     = args.cache_dir if args.cache_dir else None
			)

			checkpoint = output_dir
			logger.info('Calculate attention score using the following checkpoint: %s', checkpoint)
			prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ''

			config = config_class.from_pretrained(
				output_dir,
				num_labels      = num_labels,
				finetuning_task = args.task_name,
				cache_dir       = args.cache_dir if args.cache_dir else None
			)

			config.output_attentions = True

			# NOTE(review): weights come from --model_name_or_path while the config comes from the
			# per-kmer directory; the visualization stage loads from the checkpoint instead - confirm
			model = model_class.from_pretrained(
				args.model_name_or_path,
				from_tf   = bool('.ckpt' in args.model_name_or_path),
				config    = config,
				cache_dir = args.cache_dir if args.cache_dir else None,
			)

			model.to(args.device)

			# Built from the untouched base path, so no other path fragment is rewritten
			args.data_dir = os.path.join(base_data_dir, str(kmer))

			if result_name == 'test.npy' :
				results, eval_task, _, out_label_ids, probs = evaluate(args, model, tokenizer, prefix = prefix)
			else :
				results, eval_task, _, out_label_ids, probs = evaluate(args, model, tokenizer, prefix = prefix, evaluate = False)

			# all_probs holds the running sum, cat_probs the per-kmer probabilities side by side
			if kmer == 3 :
				all_probs = deepcopy(probs)
				cat_probs = deepcopy(probs)
			else :
				all_probs += probs
				cat_probs = numpy.concatenate((cat_probs, probs), axis = 1)

			print(cat_probs[0])

		# Average over the four kmer models (3, 4, 5, 6)
		all_probs = all_probs / 4.0
		all_preds = numpy.argmax(all_probs, axis = 1)

		# Save label and data for stuck ensemble
		labels = numpy.array(out_label_ids)
		labels = labels.reshape(labels.shape[0], 1)
		data   = numpy.concatenate((cat_probs, labels), axis = 1)

		# random.shuffle() swaps row views of a 2D array and thereby duplicates rows,
		# numpy.random.shuffle() permutes the rows in place without losing any
		numpy.random.shuffle(data)

		# A bare file name has no parent directory to create
		result_root = os.path.dirname(args.result_dir)

		if result_root and not os.path.exists(result_root) :
			os.makedirs(result_root)

		numpy.save(args.result_dir, data)

		ensemble_results = compute_metrics(eval_task, all_preds, out_label_ids, all_probs[:, 1])

		logger.info('***** Ensemble results {} *****'.format(prefix))

		for key in sorted(ensemble_results.keys()) :
			logger.info('  %s = %s', key, str(ensemble_results[key]))

	return results

# 11. Testing

In [None]:
# Testing RegressionProcessor vs DnaPromProcessor

def test_processor () :
	"""
	Load the training examples of the bundled sample data with DnaPromProcessor and
	RegressionProcessor, then print the guid, the first and last 30 characters of the
	text and the label of the first example produced by each processor.
	"""

	candidates = [
		('DnaProm',    DnaPromProcessor()),
		('Regression', RegressionProcessor()),
	]

	sample_dir = os.path.join(ROOT, 'source', 'python', 'bert', 'dnabert', 'examples', 'sample_data', 'ft', '6')

	# Load everything first, printing starts only after both processors succeeded
	loaded = [(label, proc.get_train_examples(sample_dir)) for label, proc in candidates]

	for label, items in loaded :
		first = items[0]
		text  = first.text_a

		print()
		print(label + 'Processor')
		print('GUID  : {}'.format(first.guid))
		print('Text  : {} ... {}'.format(text[:30], text[-30:]))
		print('Label : {}'.format(first.label))

# 12. Launcher

In [None]:
# The presence of get_ipython in the module namespace is used to tell a notebook run
# (processor smoke test) from a plain script run (full command line entry point)
if __name__ == '__main__' :
	if 'get_ipython' in dir() :
		print('Running as .ipynb')

		test_processor()
	else :
		print('Running as .py')

		main()