Here we fine-tune the GPT-2 model: we first load the pretrained `gpt2` weights and then train on the `MedQuAD Cancer dataset`.

Reference:
* This Jupyter notebook is adapted from an earlier version of the [Transformers](https://github.com/huggingface/transformers/blob/master/examples/contrib/legacy/run_language_modeling.py) example script `run_language_modeling.py` (note: that file has since been deleted), with some changes to the argument parsing
* The argument parsers and their helper functions were written with the help of patil-suraj's [example](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) notebook `longformer_qa_training.ipynb`

**NOTE**: Currently, all paths for loading and saving the model, loading the data, and saving the JSON file are hardcoded, so please make sure you change them according to your requirements.

---------
## Load packages

In [1]:
import logging
import math
import os
from dataclasses import dataclass, field
from typing import Optional
import json
import torch

import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    LineByLineTextDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)
from transformers.trainer_utils import is_main_process

## Load the logger

In [2]:
# Module-level logger, named after the running module.
logger = logging.getLogger(__name__)

# Config classes of every architecture listed in the LM-head model mapping, and
# the `model_type` string of each one (used in the `model_type` help text).
MODEL_CONFIG_CLASSES = [*MODEL_WITH_LM_HEAD_MAPPING.keys()]
MODEL_TYPES = tuple(config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES)

## Argument helper `dataclass` definitions for the model and data arguments

In [3]:
@dataclass
class ModelArguments:
    """
    Options selecting the model, config and tokenizer to fine-tune or to train from scratch,
    plus the optional remote-debugger address.
    """

    # Checkpoint used to initialise the weights; None means "train from scratch".
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch."
        },
    )
    # Architecture name, only consulted when no checkpoint/config is given.
    model_type: Optional[str] = field(
        default=None,
        metadata={
            "help": f"If training from scratch, pass a model type from the list: {', '.join(MODEL_TYPES)}"
        },
    )
    # Optional overrides for the config and tokenizer sources.
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
    )
    # Directory passed as `cache_dir` to the `from_pretrained` calls.
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from s3"},
    )
    # Remote debugging is enabled only when both values below are non-empty.
    server_ip: Optional[str] = field(
        default="",
        metadata={"help": "Can be used for distant debugging."},
    )
    server_port: Optional[str] = field(
        default="",
        metadata={"help": "Can be used for distant debugging."},
    )

@dataclass
class DataTrainingArguments:
    """
    Arguments describing the data fed to the model for training and evaluation.
    """

    # Plain-text files used for training and (optionally) for evaluation.
    train_data_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a text file)."}
    )
    eval_data_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    # When set, each line of the file is treated as its own sequence.
    line_by_line: bool = field(
        default=False,
        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
    )

    # Masked-language-modeling switch and masking ratio (both forwarded to the data collator).
    mlm: bool = field(
        default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."}
    )
    mlm_probability: float = field(
        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
    )

    # A value <= 0 means "use the tokenizer's maximum length".
    block_size: int = field(
        default=-1,
        metadata={
            # Adjacent literals are concatenated, so each sentence carries its own trailing space.
            "help": "Optional input sequence length after tokenization. "
            "The training dataset will be truncated in block of this size for training. "
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )

## Function to load the data

In [4]:
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False):
    """Build the training or evaluation dataset described by ``args``.

    Args:
        args: Data arguments supplying the file paths, ``line_by_line``,
            ``block_size`` and ``overwrite_cache``.
        tokenizer: Tokenizer handed to the dataset class.
        evaluate: When true, read ``args.eval_data_file`` instead of
            ``args.train_data_file``.

    Returns:
        A ``LineByLineTextDataset`` when ``args.line_by_line`` is set,
        otherwise a ``TextDataset``.

    Raises:
        ValueError: If the selected data file path is not set.
    """
    file_path = args.eval_data_file if evaluate else args.train_data_file
    # Fail early with a clear message instead of passing None/"" down to the dataset class.
    if not file_path:
        missing = "eval_data_file" if evaluate else "train_data_file"
        raise ValueError(f"No data file configured: `{missing}` must point to a text file.")

    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=args.block_size,
        overwrite_cache=args.overwrite_cache,
    )

## Create and save the JSON dictionary of model, data and training arguments

In [5]:
# Settings for this run. Every key must be a field of ModelArguments,
# DataTrainingArguments or TrainingArguments, because the file written below is
# parsed into those three dataclasses.
# NOTE: the GPU count is not set here; `TrainingArguments.n_gpu` is derived from
# the visible devices rather than accepted as an input.
args_dict = {
    "model_type": "gpt2",
    "model_name_or_path": "gpt2",
    "do_train": True,
    "do_eval": True,
    "train_data_file": "../../dataset/new-prepared-data/text-generation/train_text-generation.txt",
    "eval_data_file": "../../dataset/new-prepared-data/text-generation/dev_text-generation.txt",
    "learning_rate": 5e-5,
    "num_train_epochs": 20,
    "output_dir": "./GPT2_text_generator",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 1,
    "per_device_train_batch_size": 1,
    "save_steps": 100,
    "eval_steps": 100,
    "logging_steps": 100,
    "line_by_line": True,
    # Uncomment to force CPU training:
    # "no_cuda": True,
}

# Persist the settings in the working directory as `args.json`.
with open("args.json", "w", encoding="utf-8") as f:
    json.dump(args_dict, f, indent=2)

## Run function for model

In [6]:
def main():
    """Fine-tune and/or evaluate a language model using the settings stored in ``args.json``.

    The function parses the three argument groups from ``args.json``, loads the
    config, tokenizer and model, registers the ``<BOS>``/``<EOS>``/``<PAD>``
    special tokens, builds the datasets and runs a ``Trainer``.

    Returns:
        dict: ``{"perplexity": <float>}`` when evaluation ran, otherwise an empty dict.

    Raises:
        ValueError: If evaluation is requested without an evaluation file, if the
            output directory is non-empty and may not be overwritten, if no
            tokenizer source is given, or if a BERT-like model is used without
            ``mlm``.
    """
    # See all possible arguments in src/transformers/training_args.py.
    # We keep distinct sets of args, for a cleaner separation of concerns.
    # The arguments are read from the JSON file rather than from the command line.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath('args.json'))

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup distant debugging if needed
    if model_args.server_ip and model_args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        # The port is declared as a string in ModelArguments; the address tuple takes an integer port.
        ptvsd.enable_attach(address=(model_args.server_ip, int(model_args.server_port)), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training.
    # TrainingArguments resolves the device itself when `device`/`n_gpu` is first read and,
    # for local_rank != -1, initialises the torch.distributed process group at that point.
    # Initialising the group here as well would make that second initialisation fail, so the
    # values are only read and mirrored onto model_args.
    model_args.device = training_args.device
    model_args.n_gpu = training_args.n_gpu

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    if training_args.local_rank not in [-1, 0]:
        # Non-zero ranks wait here so that only rank 0 downloads the model & vocab.
        torch.distributed.barrier()

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    if training_args.local_rank == 0:
        # Rank 0 has finished loading; joining the barrier releases the waiting ranks.
        torch.distributed.barrier()

    # Added support for <BOS> and <EOS> tags (plus a <PAD> token); the embedding
    # matrix is resized to match the tokenizer's new length.
    special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
    tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model
        data_args.block_size = tokenizer.model_max_length
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Get datasets
    train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    if training_args.do_train:
        # A local checkpoint directory is passed on to `train`; a hub name such as "gpt2" is not.
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        # exp() overflows for very large losses; report an infinite perplexity instead of crashing.
        try:
            perplexity = math.exp(eval_output["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        # Only the main process writes the file so that distributed runs do not clobber it.
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results

## Start training

In [7]:
# Run fine-tuning and evaluation; the returned results dict is shown as the cell output.
main()

11/29/2020 04:34:05 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='./GPT2_text_generator', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=False, model_parallel=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=20, max_steps=-1, warmup_steps=0, logging_dir='runs/Nov29_04-34-05_c0109', logging_first_step=False, logging_steps=100, save_steps=100, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=100, dataloader_num_workers=0, past_index=-1, run_name='./

Step,Training Loss
100,4.72681
200,1.689075
300,1.414264
400,1.235811
500,1.11665
600,1.054854
700,0.969745
800,0.882694
900,0.847991
1000,0.810387


[INFO|trainer.py:1168] 2020-11-29 04:35:07,230 >> Saving model checkpoint to ./GPT2_text_generator/checkpoint-100
[INFO|configuration_utils.py:284] 2020-11-29 04:35:07,235 >> Configuration saved in ./GPT2_text_generator/checkpoint-100/config.json
[INFO|modeling_utils.py:741] 2020-11-29 04:35:08,044 >> Model weights saved in ./GPT2_text_generator/checkpoint-100/pytorch_model.bin
[INFO|trainer.py:1168] 2020-11-29 04:35:47,817 >> Saving model checkpoint to ./GPT2_text_generator/checkpoint-200
[INFO|configuration_utils.py:284] 2020-11-29 04:35:47,820 >> Configuration saved in ./GPT2_text_generator/checkpoint-200/config.json
[INFO|modeling_utils.py:741] 2020-11-29 04:35:48,622 >> Model weights saved in ./GPT2_text_generator/checkpoint-200/pytorch_model.bin
[INFO|trainer.py:1168] 2020-11-29 04:36:26,984 >> Saving model checkpoint to ./GPT2_text_generator/checkpoint-300
[INFO|configuration_utils.py:284] 2020-11-29 04:36:26,987 >> Configuration saved in ./GPT2_text_generator/checkpoint-300/con

[INFO|trainer.py:1168] 2020-11-29 04:43:08,282 >> Saving model checkpoint to ./GPT2_text_generator/checkpoint-1300
[INFO|configuration_utils.py:284] 2020-11-29 04:43:08,286 >> Configuration saved in ./GPT2_text_generator/checkpoint-1300/config.json
[INFO|modeling_utils.py:741] 2020-11-29 04:43:09,026 >> Model weights saved in ./GPT2_text_generator/checkpoint-1300/pytorch_model.bin
[INFO|trainer.py:1168] 2020-11-29 04:43:48,671 >> Saving model checkpoint to ./GPT2_text_generator/checkpoint-1400
[INFO|configuration_utils.py:284] 2020-11-29 04:43:48,675 >> Configuration saved in ./GPT2_text_generator/checkpoint-1400/config.json
[INFO|modeling_utils.py:741] 2020-11-29 04:43:49,534 >> Model weights saved in ./GPT2_text_generator/checkpoint-1400/pytorch_model.bin
[INFO|trainer.py:1168] 2020-11-29 04:44:28,067 >> Saving model checkpoint to ./GPT2_text_generator/checkpoint-1500
[INFO|configuration_utils.py:284] 2020-11-29 04:44:28,071 >> Configuration saved in ./GPT2_text_generator/checkpoint-

[INFO|trainer.py:1168] 2020-11-29 04:51:07,932 >> Saving model checkpoint to ./GPT2_text_generator/checkpoint-2500
[INFO|configuration_utils.py:284] 2020-11-29 04:51:07,936 >> Configuration saved in ./GPT2_text_generator/checkpoint-2500/config.json
[INFO|modeling_utils.py:741] 2020-11-29 04:51:08,671 >> Model weights saved in ./GPT2_text_generator/checkpoint-2500/pytorch_model.bin
[INFO|trainer.py:1168] 2020-11-29 04:51:48,169 >> Saving model checkpoint to ./GPT2_text_generator/checkpoint-2600
[INFO|configuration_utils.py:284] 2020-11-29 04:51:48,172 >> Configuration saved in ./GPT2_text_generator/checkpoint-2600/config.json
[INFO|modeling_utils.py:741] 2020-11-29 04:51:48,942 >> Model weights saved in ./GPT2_text_generator/checkpoint-2600/pytorch_model.bin
[INFO|trainer.py:1168] 2020-11-29 04:52:28,626 >> Saving model checkpoint to ./GPT2_text_generator/checkpoint-2700
[INFO|configuration_utils.py:284] 2020-11-29 04:52:28,630 >> Configuration saved in ./GPT2_text_generator/checkpoint-

11/29/2020 04:54:03 - INFO - __main__ -   ***** Eval results *****
11/29/2020 04:54:03 - INFO - __main__ -     perplexity = 2.5901412982272793


{'perplexity': 2.5901412982272793}