In [1]:
# Libraries

import argparse
import logging
import json
import os
import platform
import sys

In [2]:
# Ensure source path

def _locate_project_root (start, name = 'upolanc-thesis') :
	"""
	Walk upwards from `start` until a path ending with `name` is found.

	:param start: directory path to start the search from
	:param name: suffix the project directory path must end with
	:return: the first path (starting with `start` itself) that ends with `name`,
		or None when the search runs out of parents
	"""

	current = start

	while not current.endswith(name) :
		parent = os.path.abspath(os.path.join(current, os.pardir))

		# Give up when the parent is too short to possibly end with `name`, or
		# when the filesystem root was reached (its parent is itself); without
		# the second check a long root path would never terminate the loop.
		if parent == current or len(parent) < len(name) :
			return None

		current = parent

	return current

def _extend_sys_path (*paths) :
	"""
	Append every path that is not already listed in sys.path, so that running
	the cell again does not accumulate duplicate entries.

	:param paths: directory paths to make importable
	"""

	for path in paths :
		if path not in sys.path :
			sys.path.append(path)

ROOT = _locate_project_root(os.getcwd())

if ROOT is None :
	# Machine specific defaults, only used when the working directory is not
	# located inside the project tree.
	if   platform.system() == 'Linux' :
		ROOT = '/d/hpc/home/up4472/workspace/upolanc-thesis'
	elif platform.system() == 'Windows' :
		ROOT = 'C:\\Developer\\Workspace\\PyCharm\\Projects\\upolanc-thesis'
	else :
		raise ValueError(f'Could not find project directory and no default is defined for platform : {platform.system()}')

	print(f'Warning : could not find correct directory, using default : {ROOT}')

_extend_sys_path(ROOT)

# Relative paths used from here on are resolved against the project root.
os.chdir(ROOT)

# The bundled dnabert sources (and the transformers package inside them) are
# imported by name later, so their directories have to be on sys.path as well.
_extend_sys_path(
	os.path.join(ROOT, 'source', 'python', 'bert', 'dnabert', 'src'),
	os.path.join(ROOT, 'source', 'python', 'bert', 'dnabert', 'src', 'transformers')
)

In [3]:
# Code

from source.python.bert.bert_constants import MODELS
from source.python.bert.bert_constants import MODES
from source.python.bert.bert_constants import PRETRAINED_MODELS
from source.python.bert.bert_constants import PROCESSORS

from source.python.bert.bert_main      import bert_init_args
from source.python.bert.bert_main      import bert_init_classes
from source.python.bert.bert_main      import bert_train
from source.python.bert.bert_main      import bert_evaluate
from source.python.bert.bert_main      import bert_predict
from source.python.bert.bert_main      import bert_visualize
from source.python.bert.bert_main      import bert_ensamble
from source.python                     import runtime

# Apply the display settings provided by the project-local `runtime` module.
# NOTE(review): presumably these configure numpy printing, pandas display
# options and the plotting theme - confirm against source/python/runtime.
runtime.set_numpy_format()
runtime.set_pandas_format()
runtime.set_plot_theme()

# 1. Setup

In [4]:
# Display all possible models

def print_possible_models () :
	"""
	Print one aligned row per entry of MODELS, followed by an empty line.

	Each row holds the key and the `__name__` of the first three objects stored
	under that key (in the recorded output these are the config, model and
	tokenizer classes).
	"""

	template = '{:12s} : {:>16s} {:>36s} {:>20s}'

	for key, entry in MODELS.items() :
		names = [entry[index].__name__ for index in range(3)]

		print(template.format(key, *names))

	print()

In [5]:
# Display all possible processors

def print_possible_processors () :
	"""
	Print one aligned row per entry of PROCESSORS (key and the `__name__` of
	the object stored under it), followed by an empty line.
	"""

	for task, processor in PROCESSORS.items() :
		print(f'{task:12s} : {processor.__name__:>24s}')

	print()

In [6]:
# Display all possible output modes

def print_possible_output_modes () :
	"""
	Print one aligned row per entry of MODES (key and its value), followed by
	an empty line.
	"""

	for task, mode in MODES.items() :
		columns = [format(task, '12s'), format(mode, '>14s')]

		print(' : '.join(columns))

	print()

# 2. Main

In [7]:
# Main method

def _build_parser () :
	"""
	Create the command line parser with all options understood by main().

	:return: a configured argparse.ArgumentParser
	"""

	parser  =  argparse.ArgumentParser()

	# Required parameters
	parser.add_argument('--data_dir',           default = None, type = str, required = True, help = 'The input data dir. Should contain the .tsv files (or other data files) for the task.')
	parser.add_argument('--model_type',         default = None, type = str, required = True, help = 'Model type selected in the list: ' + ', '.join(MODELS.keys()))
	parser.add_argument('--model_name_or_path', default = None, type = str, required = True, help = 'Path to pre-trained model or shortcut name selected in the list: ' + ', '.join(PRETRAINED_MODELS))
	parser.add_argument('--task_name',          default = None, type = str, required = True, help = 'The name of the task to train selected in the list: ' + ', '.join(PROCESSORS.keys()))
	parser.add_argument('--output_dir',         default = None, type = str, required = True, help = 'The output directory where the model predictions and checkpoints will be written.')

	# Other parameters
	# NOTE(review): the help text of --predict_scan_size is identical to the one
	# of --gradient_accumulation_steps and the help text of --rnn_dropout is
	# identical to the one of --hidden_dropout_prob; both look copy-pasted -
	# confirm the intended meaning before rewording them.
	parser.add_argument('--n_process',                    default = 2,      type = int,   help = 'The number of processes used for data process')
	parser.add_argument('--visualize_data_dir',           default = None,   type = str,   help = 'The input data dir. Should contain the .tsv files for the task.')
	parser.add_argument('--result_dir',                   default = None,   type = str,   help = 'The directory where the dna690 and mouse will save results.')
	parser.add_argument('--config_name',                  default = '',     type = str,   help = 'Pretrained config name or path if not the same as model_name')
	parser.add_argument('--tokenizer_name',               default = '',     type = str,   help = 'Pretrained tokenizer name or path if not the same as model_name')
	parser.add_argument('--cache_dir',                    default = '',     type = str,   help = 'Where do you want to store the pre-trained models downloaded from s3')
	parser.add_argument('--predict_dir',                  default = None,   type = str,   help = 'The output directory of predicted result. (when do_predict)')
	parser.add_argument('--max_seq_length',               default = 128,    type = int,   help = 'The maximum total input sequence length after tokenization.')
	parser.add_argument('--per_gpu_train_batch_size',     default = 8,      type = int,   help = 'Batch size per GPU/CPU for training.')
	parser.add_argument('--per_gpu_eval_batch_size',      default = 8,      type = int,   help = 'Batch size per GPU/CPU for evaluation.')
	parser.add_argument('--per_gpu_pred_batch_size',      default = 8,      type = int,   help = 'Batch size per GPU/CPU for prediction.')
	parser.add_argument('--early_stop',                   default = 0,      type = int,   help = 'set this to a positive integet if you want to perfrom early stop.')
	parser.add_argument('--predict_scan_size',            default = 1,      type = int,   help = 'Number of updates steps to accumulate before performing a backward/update pass.')
	parser.add_argument('--gradient_accumulation_steps',  default = 1,      type = int,   help = 'Number of updates steps to accumulate before performing a backward/update pass.')
	parser.add_argument('--learning_rate',                default = 5e-5,   type = float, help = 'The initial learning rate for Adam.')
	parser.add_argument('--weight_decay',                 default = 0.0,    type = float, help = 'Weight decay if we apply some.')
	parser.add_argument('--adam_epsilon',                 default = 1e-8,   type = float, help = 'Epsilon for Adam optimizer.')
	parser.add_argument('--beta1',                        default = 0.9,    type = float, help = 'Beta1 for Adam optimizer.')
	parser.add_argument('--beta2',                        default = 0.999,  type = float, help = 'Beta2 for Adam optimizer.')
	parser.add_argument('--max_grad_norm',                default = 1.0,    type = float, help = 'Max gradient norm.')
	parser.add_argument('--attention_probs_dropout_prob', default = 0.1,    type = float, help = 'Dropout rate of attention.')
	parser.add_argument('--hidden_dropout_prob',          default = 0.1,    type = float, help = 'Dropout rate of intermidiete layer.')
	parser.add_argument('--rnn_dropout',                  default = 0.0,    type = float, help = 'Dropout rate of intermidiete layer.')
	parser.add_argument('--rnn',                          default = 'lstm', type = str,   help = 'What kind of RNN to use')
	parser.add_argument('--num_rnn_layer',                default = 2,      type = int,   help = 'Number of rnn layers in dnalong model.')
	parser.add_argument('--rnn_hidden',                   default = 768,    type = int,   help = 'Number of hidden unit in a rnn layer.')
	parser.add_argument('--num_train_epochs',             default = 3.0,    type = float, help = 'Total number of training epochs to perform.')
	parser.add_argument('--max_steps',                    default = -1,     type = int,   help = 'If > 0: set total number of training steps to perform. Override num_train_epochs.')
	parser.add_argument('--warmup_steps',                 default = 0,      type = int,   help = 'Linear warmup over warmup_steps.')
	parser.add_argument('--warmup_percent',               default = 0,      type = float, help = 'Linear warmup over warmup_percent*total_steps.')
	parser.add_argument('--logging_steps',                default = 500,    type = int,   help = 'Log every X updates steps.')
	parser.add_argument('--save_steps',                   default = 500,    type = int,   help = 'Save checkpoint every X updates steps.')
	parser.add_argument('--save_total_limit',             default = None,   type = int,   help = 'Limit the total amount of checkpoints.')
	parser.add_argument('--visualize_models',             default = None,   type = int,   help = 'The model used to do visualization. If None, use 3456.')
	parser.add_argument('--seed',                         default = 42,     type = int,   help = 'random seed for initialization')
	parser.add_argument('--fp16_opt_level',               default = 'O1',   type = str,   help = 'For fp16 see details at https://nvidia.github.io/apex/amp.html')
	parser.add_argument('--local_rank',                   default = -1,     type = int,   help = 'For distributed training: local_rank')
	parser.add_argument('--server_ip',                    default = '',     type = str,   help = 'For distant debugging.')
	parser.add_argument('--server_port',                  default = '',     type = str,   help = 'For distant debugging.')

	# Boolean switches, all False unless given on the command line
	parser.add_argument('--should_continue',          action = 'store_true', help = 'Whether to continue from latest checkpoint in output_dir')
	parser.add_argument('--do_train',                 action = 'store_true', help = 'Whether to run training.')
	parser.add_argument('--do_eval',                  action = 'store_true', help = 'Whether to run eval on the dev set.')
	parser.add_argument('--do_predict',               action = 'store_true', help = 'Whether to do prediction on the given dataset.')
	parser.add_argument('--do_visualize',             action = 'store_true', help = 'Whether to calculate attention score.')
	parser.add_argument('--visualize_train',          action = 'store_true', help = 'Whether to visualize train.tsv or dev.tsv.')
	parser.add_argument('--do_ensemble_pred',         action = 'store_true', help = 'Whether to do ensemble prediction with kmer 3456.')
	parser.add_argument('--evaluate_during_training', action = 'store_true', help = 'Run evaluation during training at each logging step.')
	parser.add_argument('--do_lower_case',            action = 'store_true', help = 'Set this flag if you are using an uncased model.')
	parser.add_argument('--eval_all_checkpoints',     action = 'store_true', help = 'Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number')
	parser.add_argument('--no_cuda',                  action = 'store_true', help = 'Avoid using CUDA when available')
	parser.add_argument('--overwrite_output_dir',     action = 'store_true', help = 'Overwrite the content of the output directory')
	parser.add_argument('--overwrite_cache',          action = 'store_true', help = 'Overwrite the cached training and evaluation sets')
	parser.add_argument('--fp16',                     action = 'store_true', help = 'Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit')

	# Added parameters
	parser.add_argument('--optimizer',     default = 'adamw', type = str)
	parser.add_argument('--freeze_layers', default = 12,      type = int)
	parser.add_argument('--num_features',  default = 72,      type = int)

	return parser

def main (argv = None) :
	"""
	Parse the command line and run the bert pipeline stages in order : argument
	and class initialization, dump of the arguments to <output_dir>/args.json,
	then training, evaluation, prediction, visualization and ensemble.

	Every stage function is called unconditionally from here.
	NOTE(review): presumably each stage checks its own --do_* flag internally -
	confirm in source/python/bert/bert_main.

	:param argv: optional list of command line strings; when None argparse
		reads the arguments from sys.argv
	"""

	args   = _build_parser().parse_args(argv)
	logger = logging.getLogger(__name__)

	#
	# Features
	#

	# Read before bert_init_args() replaces the args object
	use_features = args.num_features > 0
	num_features = args.num_features

	#
	# Init arguments
	#

	args = bert_init_args(
		args   = args,
		logger = logger
	)

	#
	# Init classes
	#

	output = bert_init_classes(
		args         = args,
		logger       = logger,
		use_features = use_features,
		num_features = num_features
	)

	model         = output['model']
	tokenizer     = output['tokenizer']
	model_cls     = output['model_cls']
	tokenizer_cls = output['tokenizer_cls']
	config_cls    = output['config_cls']
	num_labels    = output['num_labels']

	#
	# Save args
	#

	os.makedirs(args.output_dir, exist_ok = True)

	with open(os.path.join(args.output_dir, 'args.json'), mode = 'w') as handle :
		# Values json can not encode are replaced by a placeholder string
		# instead of aborting the run.
		json.dump(
			vars(args),
			handle,
			indent     = 4,
			separators = (',', ' : '),
			sort_keys  = True,
			default    = lambda o : '<not serializable>'
		)

	#
	# Training
	#

	bert_train(
		args          = args,
		model         = model,
		tokenizer     = tokenizer,
		model_cls     = model_cls,
		tokenizer_cls = tokenizer_cls,
		logger        = logger,
		use_features  = use_features
	)

	#
	# Evaluation
	#

	bert_evaluate(
		args          = args,
		model_cls     = model_cls,
		tokenizer_cls = tokenizer_cls,
		logger        = logger,
		use_features  = use_features
	)

	#
	# Prediction
	#

	bert_predict(
		args          = args,
		model_cls     = model_cls,
		tokenizer_cls = tokenizer_cls,
		logger        = logger,
		use_features  = use_features
	)

	#
	# Visualize
	#

	bert_visualize(
		args          = args,
		model_cls     = model_cls,
		tokenizer_cls = tokenizer_cls,
		config_cls    = config_cls,
		num_labels    = num_labels,
		logger        = logger,
		use_features  = use_features
	)

	#
	# Ensemble
	#

	bert_ensamble(
		args          = args,
		model_cls     = model_cls,
		tokenizer_cls = tokenizer_cls,
		config_cls    = config_cls,
		num_labels    = num_labels,
		logger        = logger,
		use_features  = use_features
	)

# 3. Testing

In [8]:
# Testing RegressionProcessor vs DnaPromProcessor

def test_processor () :
	"""
	Load the train examples of the bundled dnabert sample data with
	DnaPromProcessor and with RegressionProcessor, and print the guid, the
	first and last 30 characters of the text and the label of the first
	example of each. For RegressionProcessor the `feature` attribute of the
	example is printed as well.
	"""

	# Imported here and not at the top of the notebook; both modules come from
	# the project tree that the path setup cell puts on sys.path.
	from transformers.data.processors.glue  import DnaPromProcessor
	from source.python.bert.bert_processors import RegressionProcessor

	sample_dir = os.path.join(ROOT, 'source', 'python', 'bert', 'dnabert', 'examples', 'sample_data', 'ft', '6')

	for processor_cls in (DnaPromProcessor, RegressionProcessor) :
		cls_name = processor_cls.__name__
		loaded   = processor_cls().get_train_examples(sample_dir)

		print()
		print(cls_name)

		first = loaded[0]
		head  = first.text_a[:30]
		tail  = first.text_a[-30:]

		print(f'GUID  : {first.guid}')
		print(f'Text  : {head} ... {tail}')
		print(f'Label : {first.label}')

		if cls_name == 'RegressionProcessor' :
			print(f'Feats : {loaded[0].feature}')

# 4. Launcher

In [9]:
if __name__ == '__main__' :
	# `get_ipython` is only present in the namespace when the code is executed
	# by IPython / Jupyter, which tells a notebook run from a plain script run.
	if 'get_ipython' in dir() :
		print('Running as .ipynb')

		# Notebook run : show the available options and run the processor check
		print_possible_models()
		print_possible_processors()
		print_possible_output_modes()
		test_processor()
	else :
		print('Running as .py')

		# Script run : parse the command line and execute the pipeline
		main()

Running as .ipynb
dna          :       BertConfig        BertForSequenceClassification         DNATokenizer
dnalong      :       BertConfig    BertForLongSequenceClassification         DNATokenizer
dnalongcat   :       BertConfig    BertForLongSequenceClassification         DNATokenizer
bert         :       BertConfig        BertForSequenceClassification        BertTokenizer
xlnet        :      XLNetConfig       XLNetForSequenceClassification       XLNetTokenizer
xlm          :        XLMConfig         XLMForSequenceClassification         XLMTokenizer
roberta      :    RobertaConfig     RobertaForSequenceClassification     RobertaTokenizer
distilbert   : DistilBertConfig  DistilBertForSequenceClassification  DistilBertTokenizer
albert       :     AlbertConfig      AlbertForSequenceClassification      AlbertTokenizer
xlmroberta   : XLMRobertaConfig  XLMRobertaForSequenceClassification  XLMRobertaTokenizer
flaubert     :   FlaubertConfig    FlaubertForSequenceClassification    FlaubertTo