<a href="https://colab.research.google.com/github/xSakix/AI_colab_notebooks/blob/master/gpt2v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone the transformers repository (used below only for examples/requirements.txt)
# and install the library itself from PyPI.
# NOTE(review): nothing here is version-pinned, so the installed release and the
# cloned examples can drift apart between runs — consider pinning both.
!git clone https://github.com/huggingface/transformers
!pip install transformers
!pip install -r transformers/examples/requirements.txt

Cloning into 'transformers'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects:   2% (1/40)[Kremote: Counting objects:   5% (2/40)[Kremote: Counting objects:   7% (3/40)[Kremote: Counting objects:  10% (4/40)[Kremote: Counting objects:  12% (5/40)[Kremote: Counting objects:  15% (6/40)[Kremote: Counting objects:  17% (7/40)[Kremote: Counting objects:  20% (8/40)[Kremote: Counting objects:  22% (9/40)[Kremote: Counting objects:  25% (10/40)[Kremote: Counting objects:  27% (11/40)[Kremote: Counting objects:  30% (12/40)[Kremote: Counting objects:  32% (13/40)[Kremote: Counting objects:  35% (14/40)[Kremote: Counting objects:  37% (15/40)[Kremote: Counting objects:  40% (16/40)[Kremote: Counting objects:  42% (17/40)[Kremote: Counting objects:  45% (18/40)[Kremote: Counting objects:  47% (19/40)[Kremote: Counting objects:  50% (20/40)[Kremote: Counting objects:  52% (21/40)[Kremote: Counting objects:  55% (22/40)[Kremote: Coun

In [2]:
# Mount Google Drive at /content/drive; the data files and the model directory
# passed to main() further down all live under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""


import argparse
import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import numpy as np
import torch
import transformers
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import trange, tqdm

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    # AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

from torch.optim import AdamW, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau,CyclicLR,OneCycleLR


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


# Module-level logger; its level and format are set by logging.basicConfig() inside main().
logger = logging.getLogger(__name__)


# Config classes that are keys of MODEL_WITH_LM_HEAD_MAPPING, and the `model_type` attribute of each.
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)


class TextDataset(Dataset):
    """Fixed-length token blocks cut from one text file.

    The whole file is read, tokenized once and sliced into consecutive,
    non-overlapping blocks. The resulting id lists are pickled next to the
    source file and reused on later runs unless ``args.overwrite_cache`` is set.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)

        # Shrink the block by the difference between the tokenizer's full and
        # single-sentence maximum lengths (presumably the special-token budget).
        reserved = tokenizer.max_len - tokenizer.max_len_single_sentence
        block_size = block_size - reserved

        directory, filename = os.path.split(file_path)
        cache_name = "".join([args.model_type, "_cached_lm_", str(block_size), "_", filename])
        cached_features_file = os.path.join(directory, cache_name)

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE: unpickling runs arbitrary code from the file; only reuse caches you created yourself.
            with open(cached_features_file, "rb") as cache_in:
                self.examples = pickle.load(cache_in)
            return

        logger.info("Creating features from dataset file at %s", directory)

        self.examples = []
        with open(file_path, encoding="utf-8") as source:
            raw_text = source.read()

        token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_text))

        # Only full blocks are kept: the trailing remainder shorter than block_size
        # is dropped (no padding), so a file shorter than one block yields no examples.
        block_starts = range(0, len(token_ids) - block_size + 1, block_size)
        self.examples = [
            tokenizer.build_inputs_with_special_tokens(token_ids[start : start + block_size])
            for start in block_starts
        ]

        logger.info("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, "wb") as cache_out:
            pickle.dump(self.examples, cache_out, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Number of token blocks."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return block ``item`` as a 1-D ``torch.long`` tensor."""
        block = self.examples[item]
        return torch.tensor(block, dtype=torch.long)


class LineByLineTextDataset(Dataset):
    """Dataset in which every non-blank line of the file is one example.

    Nothing is cached: the file is re-read and re-tokenized on every construction.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as source:
            candidate_lines = source.read().splitlines()

        # Skip empty lines and lines made up of whitespace only.
        lines = []
        for line in candidate_lines:
            if len(line) > 0 and not line.isspace():
                lines.append(line)

        encoded = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
        self.examples = encoded["input_ids"]

    def __len__(self):
        """Number of encoded lines."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return line ``i`` as a 1-D ``torch.long`` tensor."""
        ids = self.examples[i]
        return torch.tensor(ids, dtype=torch.long)


def load_and_cache_examples(args, tokenizer, evaluate=False):
    """Build the dataset for training (default) or evaluation.

    Reads ``args.eval_data_file`` when ``evaluate`` is true, otherwise
    ``args.train_data_file``; ``args.line_by_line`` selects
    ``LineByLineTextDataset`` over ``TextDataset``.
    """
    dataset_cls = LineByLineTextDataset if args.line_by_line else TextDataset
    file_path = args.eval_data_file if evaluate else args.train_data_file
    return dataset_cls(tokenizer, args, file_path=file_path, block_size=args.block_size)


def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch (plus every CUDA device when ``args.n_gpu > 0``) with ``args.seed``."""
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    """Delete the oldest checkpoint directories so at most ``args.save_total_limit`` remain.

    Does nothing when the limit is unset, zero or negative. "Oldest" follows
    the ordering returned by ``_sorted_checkpoints``.
    """
    limit = args.save_total_limit
    if not limit or limit <= 0:
        return

    checkpoints = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    surplus = len(checkpoints) - limit
    if surplus <= 0:
        return

    # The list is sorted oldest-first, so the surplus sits at the front.
    for stale in checkpoints[:surplus]:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(stale))
        shutil.rmtree(stale)


def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """Corrupt ``inputs`` for masked language modeling and build the matching labels.

    Each non-special, non-padding position is selected with probability
    ``args.mlm_probability``. Of the selected positions about 80% become the
    mask token, about 10% a random vocabulary id and the rest stay unchanged.
    ``inputs`` is modified in place and returned; labels hold the original id
    at selected positions and -100 everywhere else.

    Raises:
        ValueError: if the tokenizer has no mask token.
    """
    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
        )

    labels = inputs.clone()

    # Positions that may never be selected: special tokens and, when a pad token exists, padding.
    never_selected = torch.tensor(
        [tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True) for row in labels.tolist()],
        dtype=torch.bool,
    )
    if tokenizer._pad_token is not None:
        never_selected = never_selected | labels.eq(tokenizer.pad_token_id)

    selection_prob = torch.full(labels.shape, args.mlm_probability)
    selection_prob.masked_fill_(never_selected, value=0.0)
    masked_indices = torch.bernoulli(selection_prob).bool()
    labels.masked_fill_(~masked_indices, -100)  # loss is computed on selected positions only

    # 80% of the selected positions are replaced by the mask token.
    to_mask_token = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[to_mask_token] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Half of the remaining selected positions (10% overall) get a random vocabulary id.
    to_random_token = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~to_mask_token
    replacement_ids = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[to_random_token] = replacement_ids[to_random_token]

    # Whatever is left of the selected positions (10% overall) keeps its original id.
    return inputs, labels


def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """Fine-tune ``model`` on ``train_dataset`` with a causal language-modeling loss.

    Every batch is used as both input and labels. Optimisation uses AdamW with a
    linear warm-up/decay schedule (warm-up = 20% of the planned steps) and
    gradient clipping at norm 1.0. The optimiser hyper-parameters, the progress
    print-out interval and the intermediate save interval are fixed in this
    function; ``args.learning_rate``, ``args.weight_decay``, ``args.adam_epsilon``,
    ``args.max_grad_norm``, ``args.logging_steps`` and ``args.save_steps`` are
    NOT consulted.

    Model and tokenizer are written to ``args.output_dir`` every ``save_every``
    batches of an epoch and once more at the end; optimiser and scheduler state
    are not persisted.

    NOTE(review): ``args.gradient_accumulation_steps`` only enters the
    ``t_total`` estimate; the loop takes an optimiser step on every batch, so
    values other than 1 shorten the schedule without accumulating gradients.

    Args:
        args: parsed options; sets ``args.train_batch_size`` as a side effect.
        train_dataset: dataset yielding 1-D ``torch.long`` tensors.
        model: model to train; must already live on ``args.device``.
        tokenizer: tokenizer matching ``model``; used for padding and saved with it.

    Returns:
        ``(global_step, mean_loss)`` where ``mean_loss`` is the average of the
        per-epoch mean losses (``0.0`` if no batch was processed).
    """
    log_every = 500  # batches between progress print-outs
    save_every = 5000  # batches between intermediate saves

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Right-pad to the longest example; falls back to 0 when the tokenizer has no pad token.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    def save_model():
        # Unwrap DataParallel/DistributedDataParallel before saving.
        os.makedirs(args.output_dir, exist_ok=True)
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )

    # num_train_epochs is parsed as a float, so t_total is a float as well.
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model.resize_token_embeddings(len(tokenizer))

    optimizer = torch.optim.AdamW(model.parameters(), lr=6.25e-5, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-6)
    scheduler = transformers.get_linear_schedule_with_warmup(optimizer, 0.2 * t_total, t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epoch_losses = []
    max_steps_reached = False

    set_seed(args)  # Added here for reproducibility
    for epoch in range(int(args.num_train_epochs)):
        tr_loss = []
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Causal LM: the model shifts the labels internally, so inputs double as labels.
            inputs = batch.to(args.device)
            labels = inputs
            outputs = model(inputs, labels=labels)
            loss = outputs[0]
            loss.backward()
            tr_loss.append(loss.item())

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if (step+1) % log_every == 0:
                print(f'Epoch: {epoch} | Step: {step} | Loss: {np.mean(tr_loss)} | Perplexity: {np.exp(np.mean(tr_loss))}')

            if (step+1) % save_every == 0:
                logger.info("Saving model checkpoint to %s", args.output_dir)
                save_model()

            if args.max_steps > 0 and global_step > args.max_steps:
                # Plain break: neither a DataLoader nor a range has a close() method.
                max_steps_reached = True
                break

        if tr_loss:
            epoch_losses.append(np.mean(tr_loss))
        if max_steps_reached:
            break

    logger.info("Training done, saving model to %s",args.output_dir)
    save_model()
    torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    print(optimizer)

    # Average over the epochs that actually ran; avoids dividing by int(num_train_epochs) == 0.
    mean_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else 0.0
    return global_step, mean_loss


def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    """Compute perplexity of ``model`` on ``args.eval_data_file``.

    The loss is averaged over batches (not over tokens) and exponentiated. The
    result is logged and written to ``<args.output_dir>/<prefix>/eval_results.txt``.

    Args:
        args: parsed options; sets ``args.eval_batch_size`` as a side effect.
        model: model to evaluate; wrapped in ``DataParallel`` when ``args.n_gpu > 1``.
        tokenizer: tokenizer used to build and pad the evaluation examples.
        prefix: optional sub-directory / label for the results file.

    Returns:
        ``{"perplexity": <0-dim torch tensor>}``.

    Raises:
        ValueError: if the evaluation file produces no examples.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    if len(eval_dataset) == 0:
        # Without this the batch average below would divide by zero.
        raise ValueError(
            "Evaluation dataset built from {} is empty; the file may be shorter than one block.".format(
                args.eval_data_file
            )
        )

    if args.local_rank in [-1, 0]:
        # Include the prefix so the results file can be opened when prefix is non-empty.
        os.makedirs(os.path.join(eval_output_dir, prefix), exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Right-pad to the longest example; falls back to 0 when the tokenizer has no pad token.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            # .mean() collapses the per-replica losses returned under DataParallel.
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result


def _build_arg_parser():
    """Create the command-line parser with every option understood by ``main``."""
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--model_type", type=str, required=True, help="The model architecture to be trained or fine-tuned.",
    )

    # Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
    )
    parser.add_argument(
        "--line_by_line",
        action="store_true",
        help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
    )
    parser.add_argument(
        "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
    )

    parser.add_argument(
        "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling."
    )
    parser.add_argument(
        "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
    )

    parser.add_argument(
        "--config_name",
        default=None,
        type=str,
        help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
    )
    parser.add_argument(
        "--tokenizer_name",
        default=None,
        type=str,
        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
    )
    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens).",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--save_total_limit",
        type=int,
        default=None,
        help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    return parser


def main(args):
    """Parse ``args`` (a list of command-line style strings), then train and/or evaluate.

    Steps: validate the option combination, pick the device, configure logging,
    seed the RNGs, load config/tokenizer/model, run ``train`` when ``--do_train``
    is given and ``evaluate`` when ``--do_eval`` is given.

    Raises:
        ValueError: for an MLM-only model type without ``--mlm``, ``--do_eval``
            without ``--eval_data_file``, ``--should_continue`` without any
            checkpoint, a non-empty output directory without
            ``--overwrite_output_dir``, or a missing config/tokenizer source.
    """
    args = _build_arg_parser().parse_args(args)

    if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling)."
        )
    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            # Resume from the checkpoint with the highest step number.
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab

    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        # When we release a pip version exposing CONFIG_MAPPING,
        # we can do `config = CONFIG_MAPPING[args.model_type]()`.
        raise ValueError(
            "You are instantiating a new config instance from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --config_name"
        )

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    if args.block_size <= 0:
        args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        args.block_size = min(args.block_size, tokenizer.max_len)

    if args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.to(args.device)

    if args.local_rank == 0:
        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Evaluation runs only on request; --do_eval has already been checked to come with --eval_data_file.
    if args.do_eval:
        evaluate(args, model, tokenizer)

In [4]:
# Continue fine-tuning the GPT-2 model stored in the Drive "output" folder for 4 epochs.
# The same directory serves as model, tokenizer and config source AND as output_dir,
# so every save overwrites the starting weights (hence --overwrite_output_dir).
# NOTE(review): --save_steps=0 is parsed but train() saves on its own fixed interval.
main([
      "--output_dir=/content/drive/My Drive/gpt2v3/output",
      "--model_type=gpt2",
      "--model_name_or_path=/content/drive/My Drive/gpt2v3/output", 
      "--block_size=512",
      "--tokenizer_name=/content/drive/My Drive/gpt2v3/output",
      "--config_name=/content/drive/My Drive/gpt2v3/output",
      "--do_train",
      "--train_data_file=/content/drive/My Drive/model_data/merged.txt",
      "--num_train_epochs=4",
      "--overwrite_output_dir",
      "--save_steps=0",
      "--do_eval",
      "--eval_data_file=/content/drive/My Drive/model_data/eval-text-u.txt"])


07/20/2020 08:29:01 - INFO - transformers.configuration_utils -   loading configuration file /content/drive/My Drive/gpt2v3/output/config.json
07/20/2020 08:29:01 - INFO - transformers.configuration_utils -   Model config GPT2Config {
  "_num_labels": 2,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "eos_token_ids": null,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 512,
  "output_past": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "vocab_size": 30000
}

07/20/2020 08:29:01 - INFO - transformers.configuration_utils -   loading configuration file /content/drive/My Drive/gpt2v3/output/config.json
07/

Epoch: 0 | Step: 499 | Loss: 1.7934464187026025 | Perplexity: 6.010130239045617
Epoch: 0 | Step: 999 | Loss: 1.6754935744442045 | Perplexity: 5.341430892335851
Epoch: 0 | Step: 1499 | Loss: 1.6095699962507934 | Perplexity: 5.000660462700722
Epoch: 0 | Step: 1999 | Loss: 1.5538989322832786 | Perplexity: 4.729875743488867
Epoch: 0 | Step: 2499 | Loss: 1.5223749769194053 | Perplexity: 4.583097029518575
Epoch: 0 | Step: 2999 | Loss: 1.4853584946386205 | Perplexity: 4.416548437944817
Epoch: 0 | Step: 3499 | Loss: 1.450120474216129 | Perplexity: 4.263628141487044
Epoch: 0 | Step: 3999 | Loss: 1.4126024010956753 | Perplexity: 4.106628604853594
Epoch: 0 | Step: 4499 | Loss: 1.380707898904259 | Perplexity: 3.9777164521895


07/20/2020 08:55:10 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 08:55:10 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 0 | Step: 4999 | Loss: 1.3497290091144851 | Perplexity: 3.856380345160876


07/20/2020 08:55:11 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 0 | Step: 5499 | Loss: 1.3196117190201766 | Perplexity: 3.7419681600871595
Epoch: 0 | Step: 5999 | Loss: 1.2916976538900828 | Perplexity: 3.6389590078422422
Epoch: 0 | Step: 6499 | Loss: 1.2645619342496548 | Perplexity: 3.5415409684439445
Epoch: 0 | Step: 6999 | Loss: 1.2391855192341443 | Perplexity: 3.452800079940835
Epoch: 0 | Step: 7499 | Loss: 1.2112476222133886 | Perplexity: 3.3576711455315777
Epoch: 0 | Step: 7999 | Loss: 1.1849120104298345 | Perplexity: 3.2703990477991116
Epoch: 0 | Step: 8499 | Loss: 1.1591291735870215 | Perplexity: 3.1871566071375184
Epoch: 0 | Step: 8999 | Loss: 1.1312357658043555 | Perplexity: 3.099484371258588
Epoch: 0 | Step: 9499 | Loss: 1.1077309349038589 | Perplexity: 3.0274810432432107


07/20/2020 09:20:49 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 09:20:49 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 0 | Step: 9999 | Loss: 1.083050413750275 | Perplexity: 2.953675756156278


07/20/2020 09:20:50 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 0 | Step: 10499 | Loss: 1.05986574081092 | Perplexity: 2.8859834934525743
Epoch: 0 | Step: 10999 | Loss: 1.0379588272480336 | Perplexity: 2.8234479834697868
Epoch: 0 | Step: 11499 | Loss: 1.0165738665391892 | Perplexity: 2.7637096864391717
Epoch: 0 | Step: 11999 | Loss: 0.9960155736019175 | Perplexity: 2.707472583205686
Epoch: 0 | Step: 12499 | Loss: 0.9758123868478462 | Perplexity: 2.6533218594417844
Epoch: 0 | Step: 12999 | Loss: 0.9558166816853112 | Perplexity: 2.6007937364430416
Epoch: 0 | Step: 13499 | Loss: 0.938163327712113 | Perplexity: 2.555283886634404
Epoch: 0 | Step: 13999 | Loss: 0.9195001635625252 | Perplexity: 2.5080364685711025
Epoch: 0 | Step: 14499 | Loss: 0.900552339137869 | Perplexity: 2.460962021473995


07/20/2020 09:46:28 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 09:46:28 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 0 | Step: 14999 | Loss: 0.8832534561485362 | Perplexity: 2.418756236398346


07/20/2020 09:46:30 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 0 | Step: 15499 | Loss: 0.8666070024070983 | Perplexity: 2.3788257947927614
Epoch: 0 | Step: 15999 | Loss: 0.850903710849514 | Perplexity: 2.341762171844924
Epoch: 0 | Step: 16499 | Loss: 0.8358503895286492 | Perplexity: 2.306774871632814
Epoch: 0 | Step: 16999 | Loss: 0.8217232754293329 | Perplexity: 2.2744159073790806
Epoch: 0 | Step: 17499 | Loss: 0.8075530376479662 | Perplexity: 2.242414164791144
Epoch: 0 | Step: 17999 | Loss: 0.7938618184298442 | Perplexity: 2.2119219946861457
Epoch: 0 | Step: 18499 | Loss: 0.7812835613204704 | Perplexity: 2.184274116709116
Epoch: 0 | Step: 18999 | Loss: 0.7690079215077246 | Perplexity: 2.1576246586270904
Epoch: 0 | Step: 19499 | Loss: 0.7565071767766838 | Perplexity: 2.130820627769449


07/20/2020 10:12:06 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 10:12:06 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 0 | Step: 19999 | Loss: 0.7451455863295283 | Perplexity: 2.1067481263821084


07/20/2020 10:12:07 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 0 | Step: 20499 | Loss: 0.7350973266904617 | Perplexity: 2.085684975443044
Epoch: 0 | Step: 20999 | Loss: 0.7247248774192828 | Perplexity: 2.0641631239528726
Epoch: 0 | Step: 21499 | Loss: 0.715451479046703 | Perplexity: 2.045109798081192
Epoch: 0 | Step: 21999 | Loss: 0.706765091507476 | Perplexity: 2.0274221140698643
Epoch: 0 | Step: 22499 | Loss: 0.6993885063497376 | Perplexity: 2.012521686895651
Epoch: 0 | Step: 22999 | Loss: 0.6918444573636415 | Perplexity: 1.9973962499584135
Epoch: 0 | Step: 23499 | Loss: 0.6847423418667812 | Perplexity: 1.9832607664326665
Epoch: 0 | Step: 23999 | Loss: 0.6776003458261645 | Perplexity: 1.9691467868782393
Epoch: 1 | Step: 499 | Loss: 0.34312612140458076 | Perplexity: 1.4093464994780156
Epoch: 1 | Step: 999 | Loss: 0.3531309092141164 | Perplexity: 1.4235174827915196
Epoch: 1 | Step: 1499 | Loss: 0.3607403464782013 | Perplexity: 1.4343909678546463
Epoch: 1 | Step: 1999 | Loss: 0.3729864506690501 | Perplexity: 1.452064665194777
Epoch: 1 | Step

07/20/2020 10:59:17 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 10:59:17 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 1 | Step: 4999 | Loss: 0.3954604869414237 | Perplexity: 1.4850678878615642


07/20/2020 10:59:18 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 1 | Step: 5499 | Loss: 0.39719444087453565 | Perplexity: 1.4876451609573356
Epoch: 1 | Step: 5999 | Loss: 0.40084415871218276 | Perplexity: 1.4930845661465055
Epoch: 1 | Step: 6499 | Loss: 0.40464684815311697 | Perplexity: 1.498773112092506
Epoch: 1 | Step: 6999 | Loss: 0.407930691720211 | Perplexity: 1.5037029384944256
Epoch: 1 | Step: 7499 | Loss: 0.4106383698262507 | Perplexity: 1.5077799992120746
Epoch: 1 | Step: 7999 | Loss: 0.41385905358296804 | Perplexity: 1.5126439101193014
Epoch: 1 | Step: 8499 | Loss: 0.4182411970943452 | Perplexity: 1.5192870778413927
Epoch: 1 | Step: 8999 | Loss: 0.41976269162478275 | Perplexity: 1.5216004242461711
Epoch: 1 | Step: 9499 | Loss: 0.42168571051651627 | Perplexity: 1.5245293058524125


07/20/2020 11:24:55 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 11:24:55 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 1 | Step: 9999 | Loss: 0.4241480126896029 | Perplexity: 1.5282877830294583


07/20/2020 11:24:56 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 1 | Step: 10499 | Loss: 0.42828741785281593 | Perplexity: 1.5346270968116689
Epoch: 1 | Step: 10999 | Loss: 0.43136672494554923 | Perplexity: 1.539359968157926
Epoch: 1 | Step: 11499 | Loss: 0.43455639219638115 | Perplexity: 1.544277853274614
Epoch: 1 | Step: 11999 | Loss: 0.43613128192368456 | Perplexity: 1.5467118367265262
Epoch: 1 | Step: 12499 | Loss: 0.4383401654786081 | Perplexity: 1.5501321191790312
Epoch: 1 | Step: 12999 | Loss: 0.44081088565218557 | Perplexity: 1.5539667971349707
Epoch: 1 | Step: 13499 | Loss: 0.44252938226827515 | Perplexity: 1.5566395797435182
Epoch: 1 | Step: 13999 | Loss: 0.4450926462016297 | Perplexity: 1.5606347760192902
Epoch: 1 | Step: 14499 | Loss: 0.4479206822952878 | Perplexity: 1.565054554195291


07/20/2020 11:50:33 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 11:50:33 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 1 | Step: 14999 | Loss: 0.44960983686917266 | Perplexity: 1.5677004072521805


07/20/2020 11:50:34 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 1 | Step: 15499 | Loss: 0.45203108683461135 | Perplexity: 1.5715008008038007
Epoch: 1 | Step: 15999 | Loss: 0.4549098909756849 | Perplexity: 1.5760313619870796
Epoch: 1 | Step: 16499 | Loss: 0.4584384248939071 | Perplexity: 1.5816022648838797
Epoch: 1 | Step: 16999 | Loss: 0.4608589484117333 | Perplexity: 1.585435207354161
Epoch: 1 | Step: 17499 | Loss: 0.4633961842716921 | Perplexity: 1.5894629379063656
Epoch: 1 | Step: 17999 | Loss: 0.4671816741672794 | Perplexity: 1.595491236630416
Epoch: 1 | Step: 18499 | Loss: 0.47014704857388906 | Perplexity: 1.6002294873811151
Epoch: 1 | Step: 18999 | Loss: 0.4737448576685626 | Perplexity: 1.6059971768857335
Epoch: 1 | Step: 19499 | Loss: 0.4767484148039097 | Perplexity: 1.610828132559595


07/20/2020 12:16:11 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 12:16:11 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 1 | Step: 19999 | Loss: 0.48122398044805015 | Perplexity: 1.6180536567010837


07/20/2020 12:16:13 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 1 | Step: 20499 | Loss: 0.488175027523194 | Perplexity: 1.6293400043623174
Epoch: 1 | Step: 20999 | Loss: 0.49427378067321426 | Perplexity: 1.6393073100204651
Epoch: 1 | Step: 21499 | Loss: 0.5009856811642233 | Perplexity: 1.6503471853868117
Epoch: 1 | Step: 21999 | Loss: 0.5061889979386086 | Perplexity: 1.6589568445468792
Epoch: 1 | Step: 22499 | Loss: 0.5120019797530996 | Perplexity: 1.6686284136086706
Epoch: 1 | Step: 22999 | Loss: 0.5174884345142674 | Perplexity: 1.6778084277518546
Epoch: 1 | Step: 23499 | Loss: 0.5231624862839643 | Perplexity: 1.6873554591921045
Epoch: 1 | Step: 23999 | Loss: 0.5290584696880093 | Perplexity: 1.6973334651498726
Epoch: 2 | Step: 499 | Loss: 0.6554406569697894 | Perplexity: 1.9259910317826456
Epoch: 2 | Step: 999 | Loss: 0.6631695769545622 | Perplexity: 1.940934536546041
Epoch: 2 | Step: 1499 | Loss: 0.6585243592678259 | Perplexity: 1.9319393815095331
Epoch: 2 | Step: 1999 | Loss: 0.6693927768045105 | Perplexity: 1.9530510226156463
Epoch: 2 | 

07/20/2020 13:03:21 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 13:03:21 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 2 | Step: 4999 | Loss: 0.7013774987559067 | Perplexity: 2.016528560748007


07/20/2020 13:03:23 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 2 | Step: 5499 | Loss: 0.7036880926613865 | Perplexity: 2.0211933264649438
Epoch: 2 | Step: 5999 | Loss: 0.7047548516420259 | Perplexity: 2.0233506030400616
Epoch: 2 | Step: 6499 | Loss: 0.7081996576871651 | Perplexity: 2.0303326724618236
Epoch: 2 | Step: 6999 | Loss: 0.7100455150611898 | Perplexity: 2.0340838379902086
Epoch: 2 | Step: 7499 | Loss: 0.7158803983920564 | Perplexity: 2.04598717338513
Epoch: 2 | Step: 7999 | Loss: 0.7200641816748248 | Perplexity: 2.0545650718396535
Epoch: 2 | Step: 8499 | Loss: 0.7247625978206393 | Perplexity: 2.0642409864828672
Epoch: 2 | Step: 8999 | Loss: 0.7283388794674021 | Perplexity: 2.0716365099925547
Epoch: 2 | Step: 9499 | Loss: 0.7338254984407195 | Perplexity: 2.0830340285031523


07/20/2020 13:29:00 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 13:29:00 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 2 | Step: 9999 | Loss: 0.7364672767765355 | Perplexity: 2.0885442178171667


07/20/2020 13:29:02 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 2 | Step: 10499 | Loss: 0.7396962864300946 | Perplexity: 2.0952990470937514
Epoch: 2 | Step: 10999 | Loss: 0.7416000719011169 | Perplexity: 2.099291846487835
Epoch: 2 | Step: 11499 | Loss: 0.7443213394976869 | Perplexity: 2.1050123613611302
Epoch: 2 | Step: 11999 | Loss: 0.747830125896008 | Perplexity: 2.11241137328805
Epoch: 2 | Step: 12499 | Loss: 0.7506278761444054 | Perplexity: 2.1183296477989813
Epoch: 2 | Step: 12999 | Loss: 0.7532729500474301 | Perplexity: 2.123940203164019
Epoch: 2 | Step: 13499 | Loss: 0.7562467641322184 | Perplexity: 2.130265807379148
Epoch: 2 | Step: 13999 | Loss: 0.7583989945537843 | Perplexity: 2.1348555675949092
Epoch: 2 | Step: 14499 | Loss: 0.7619754387874413 | Perplexity: 2.1425044291897795


07/20/2020 13:54:39 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 13:54:39 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 2 | Step: 14999 | Loss: 0.76536803676545 | Perplexity: 2.1497854291534786


07/20/2020 13:54:40 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 2 | Step: 15499 | Loss: 0.7687924586173387 | Perplexity: 2.157159820661239
Epoch: 2 | Step: 15999 | Loss: 0.7725439575841301 | Perplexity: 2.1652676021577055
Epoch: 2 | Step: 16499 | Loss: 0.7772943817499645 | Perplexity: 2.1755780117206944
Epoch: 2 | Step: 16999 | Loss: 0.7814390849295305 | Perplexity: 2.184613849320459
Epoch: 2 | Step: 17499 | Loss: 0.785512476106926 | Perplexity: 2.1935307848540337
Epoch: 2 | Step: 17999 | Loss: 0.7899044050129661 | Perplexity: 2.203185802670578
Epoch: 2 | Step: 18499 | Loss: 0.7934990622147075 | Perplexity: 2.211119751753278
Epoch: 2 | Step: 18999 | Loss: 0.7976828778944113 | Perplexity: 2.220390048323929
Epoch: 2 | Step: 19499 | Loss: 0.8020365797258997 | Perplexity: 2.2300780385515777


07/20/2020 14:20:18 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 14:20:18 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 2 | Step: 19999 | Loss: 0.8061718268357043 | Perplexity: 2.2393190560920218


07/20/2020 14:20:19 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 2 | Step: 20499 | Loss: 0.8108056923261242 | Perplexity: 2.24971983869079
Epoch: 2 | Step: 20999 | Loss: 0.8144561296924339 | Perplexity: 2.257947307847703
Epoch: 2 | Step: 21499 | Loss: 0.8180975269995641 | Perplexity: 2.266184379217086
Epoch: 2 | Step: 21999 | Loss: 0.8216929051309284 | Perplexity: 2.2743468337381767
Epoch: 2 | Step: 22499 | Loss: 0.8259558585816218 | Perplexity: 2.2840629634113134
Epoch: 2 | Step: 22999 | Loss: 0.83043203542133 | Perplexity: 2.2943097492524553
Epoch: 2 | Step: 23499 | Loss: 0.835523417831998 | Perplexity: 2.3060207448350214
Epoch: 2 | Step: 23999 | Loss: 0.8394410313891082 | Perplexity: 2.315072562152041
Epoch: 3 | Step: 499 | Loss: 0.9113834680144209 | Perplexity: 2.487761892720502
Epoch: 3 | Step: 999 | Loss: 0.9141004349790747 | Perplexity: 2.4945302501370845
Epoch: 3 | Step: 1499 | Loss: 0.9226079494454121 | Perplexity: 2.5158430331843435
Epoch: 3 | Step: 1999 | Loss: 0.9257223796318867 | Perplexity: 2.5236906647784005
Epoch: 3 | Step: 24

07/20/2020 15:07:29 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 15:07:29 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 3 | Step: 4999 | Loss: 0.9625848020788516 | Perplexity: 2.6184559240545355


07/20/2020 15:07:31 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 3 | Step: 5499 | Loss: 0.9666802681096491 | Perplexity: 2.629201710827215
Epoch: 3 | Step: 5999 | Loss: 0.9666479842854897 | Perplexity: 2.6291168315116247
Epoch: 3 | Step: 6499 | Loss: 0.972268587338559 | Perplexity: 2.6439356599659205
Epoch: 3 | Step: 6999 | Loss: 0.9732910047915565 | Perplexity: 2.646640248302831
Epoch: 3 | Step: 7499 | Loss: 0.9773770481394294 | Perplexity: 2.6574766590287724
Epoch: 3 | Step: 7999 | Loss: 0.9862396381364379 | Perplexity: 2.6811334605060226
Epoch: 3 | Step: 8499 | Loss: 0.9892533387176637 | Perplexity: 2.6892257817661345
Epoch: 3 | Step: 8999 | Loss: 0.9949422091688288 | Perplexity: 2.704568037528186
Epoch: 3 | Step: 9499 | Loss: 1.0017202449486786 | Perplexity: 2.7229619633786264


07/20/2020 15:33:10 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 15:33:10 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 3 | Step: 9999 | Loss: 1.0066087417190661 | Perplexity: 2.7363057430810205


07/20/2020 15:33:11 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 3 | Step: 10499 | Loss: 1.0102369986614435 | Perplexity: 2.7462517958963857
Epoch: 3 | Step: 10999 | Loss: 1.0148662397203416 | Perplexity: 2.758994328840475
Epoch: 3 | Step: 11499 | Loss: 1.0188970137729876 | Perplexity: 2.7701376546195076
Epoch: 3 | Step: 11999 | Loss: 1.0223528231451637 | Perplexity: 2.7797272827081017
Epoch: 3 | Step: 12499 | Loss: 1.0266681887495703 | Perplexity: 2.791748742053942
Epoch: 3 | Step: 12999 | Loss: 1.030086645517337 | Perplexity: 2.80130854501215
Epoch: 3 | Step: 13499 | Loss: 1.0351819068431267 | Perplexity: 2.8156183692913497
Epoch: 3 | Step: 13999 | Loss: 1.0406469450650704 | Perplexity: 2.831047954532005
Epoch: 3 | Step: 14499 | Loss: 1.0463661653293328 | Perplexity: 2.8472857308114903


07/20/2020 15:58:50 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 15:58:50 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 3 | Step: 14999 | Loss: 1.0517789626537357 | Perplexity: 2.8627392971780314


07/20/2020 15:58:51 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 3 | Step: 15499 | Loss: 1.0564569845466214 | Perplexity: 2.8761626271199803
Epoch: 3 | Step: 15999 | Loss: 1.0611164260976802 | Perplexity: 2.889595208634396
Epoch: 3 | Step: 16499 | Loss: 1.065129903734563 | Perplexity: 2.901215838353889
Epoch: 3 | Step: 16999 | Loss: 1.0680679932886816 | Perplexity: 2.9097524047601944
Epoch: 3 | Step: 17499 | Loss: 1.0735152810487762 | Perplexity: 2.9256459123737173
Epoch: 3 | Step: 17999 | Loss: 1.0781717937028217 | Perplexity: 2.939300987427132
Epoch: 3 | Step: 18499 | Loss: 1.0819713546380627 | Perplexity: 2.950490284382338
Epoch: 3 | Step: 18999 | Loss: 1.0872377518964564 | Perplexity: 2.9660697260835676
Epoch: 3 | Step: 19499 | Loss: 1.0919468354833886 | Perplexity: 2.9800701350242145


07/20/2020 16:24:29 - INFO - __main__ -   Saving model checkpoint to /content/drive/My Drive/gpt2v3/output
07/20/2020 16:24:29 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json


Epoch: 3 | Step: 19999 | Loss: 1.0960138056511874 | Perplexity: 2.9922146703532864


07/20/2020 16:24:31 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin


Epoch: 3 | Step: 20499 | Loss: 1.1008803190662435 | Perplexity: 3.0068118129738717
Epoch: 3 | Step: 20999 | Loss: 1.1054203768694617 | Perplexity: 3.0204939477480646
Epoch: 3 | Step: 21499 | Loss: 1.1102199254880747 | Perplexity: 3.0350258005736857
Epoch: 3 | Step: 21999 | Loss: 1.113877247860052 | Perplexity: 3.046146191365133
Epoch: 3 | Step: 22499 | Loss: 1.1188365068035304 | Perplexity: 3.061290339946716
Epoch: 3 | Step: 22999 | Loss: 1.1236603474428208 | Perplexity: 3.076093191290671
Epoch: 3 | Step: 23499 | Loss: 1.1273095086829519 | Perplexity: 3.087338857480658
Epoch: 3 | Step: 23999 | Loss: 1.1320722718915883 | Perplexity: 3.1020781935252355


07/20/2020 16:46:05 - INFO - __main__ -   Training done, saving model to /content/drive/My Drive/gpt2v3/output
07/20/2020 16:46:05 - INFO - transformers.configuration_utils -   Configuration saved in /content/drive/My Drive/gpt2v3/output/config.json
07/20/2020 16:46:06 - INFO - transformers.modeling_utils -   Model weights saved in /content/drive/My Drive/gpt2v3/output/pytorch_model.bin
07/20/2020 16:46:07 - INFO - __main__ -    global_step = 96812, average loss = 0.7952666567900588
07/20/2020 16:46:07 - INFO - __main__ -   Loading features from cached file /content/drive/My Drive/model_data/gpt2_cached_lm_512_eval-text-u.txt


AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    initial_lr: 6.25e-05
    lr: 0.0
    weight_decay: 1e-06
)


07/20/2020 16:46:07 - INFO - __main__ -   ***** Running evaluation  *****
07/20/2020 16:46:07 - INFO - __main__ -     Num examples = 13
07/20/2020 16:46:07 - INFO - __main__ -     Batch size = 4


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=4.0, style=ProgressStyle(description_wid…




07/20/2020 16:46:08 - INFO - __main__ -   ***** Eval results  *****
07/20/2020 16:46:08 - INFO - __main__ -     perplexity = tensor(388.6265)
