In [25]:
import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

In [26]:
import pandas as pd
import numpy as np
import torch

In [27]:
from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

In [28]:
!pip install transformers



In [29]:
from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


In [30]:
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

In [31]:
class Args():
    def __init__(self):
        self.output_dir = 'output-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.cache_dir = 'cached'
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 3
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'

args = Args()

In [32]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
rick = pd.read_csv("/content/drive/My Drive/dataset/RickAndMortyScripts.csv")

In [34]:
rick.head(10)

Unnamed: 0,index,season no.,episode no.,episode name,name,line
0,0,1,1,Pilot,Rick,Morty! You gotta come on. Jus'... you gotta co...
1,1,1,1,Pilot,Morty,"What, Rick? What’s going on?"
2,2,1,1,Pilot,Rick,"I got a surprise for you, Morty."
3,3,1,1,Pilot,Morty,It's the middle of the night. What are you tal...
4,4,1,1,Pilot,Rick,"Come on, I got a surprise for you. Come on, h..."
5,5,1,1,Pilot,Morty,Ow! Ow! You're tugging me too hard!
6,6,1,1,Pilot,Rick,"We gotta go, gotta get outta here, come on. Go..."
7,7,1,1,Pilot,Rick,"What do you think of this... flying vehicle, M..."
8,8,1,1,Pilot,Morty,"Yeah, Rick... I-it's great. Is this the surprise?"
9,9,1,1,Pilot,Rick,Morty. I had to... I had to do it. I had— I ha...


In [35]:
cont = []
n = 8
for i in range(n, len(rick['line'])):
  row = []
  prev = i - 1 - n # subtract 1 prev, row contain 8 responses  
  for j in range(i, prev, -1):
    row.append(rick['line'][j])
  cont.append(row)
columns = ['response', 'context'] 
columns = columns + ['context/'+str(i) for i in range(n-1)]
df = pd.DataFrame.from_records(cont, columns=columns)
df.head(10)

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5,context/6
0,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...,"I got a surprise for you, Morty.","What, Rick? What’s going on?",Morty! You gotta come on. Jus'... you gotta co...
1,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...,"I got a surprise for you, Morty.","What, Rick? What’s going on?"
2,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...,"I got a surprise for you, Morty."
3,We're gonna drop it down there just get a whol...,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h...",It's the middle of the night. What are you tal...
4,T-t-that's absolutely crazy!,We're gonna drop it down there just get a whol...,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!,"Come on, I got a surprise for you. Come on, h..."
5,"Come on, Morty. Just take it easy, Morty. It's...",T-t-that's absolutely crazy!,We're gonna drop it down there just get a whol...,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go...",Ow! Ow! You're tugging me too hard!
6,Jessica? From my math class?,"Come on, Morty. Just take it easy, Morty. It's...",T-t-that's absolutely crazy!,We're gonna drop it down there just get a whol...,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M...","We gotta go, gotta get outta here, come on. Go..."
7,"When I drop the bomb you know, I want you to h...",Jessica? From my math class?,"Come on, Morty. Just take it easy, Morty. It's...",T-t-that's absolutely crazy!,We're gonna drop it down there just get a whol...,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?","What do you think of this... flying vehicle, M..."
8,Ohh...,"When I drop the bomb you know, I want you to h...",Jessica? From my math class?,"Come on, Morty. Just take it easy, Morty. It's...",T-t-that's absolutely crazy!,We're gonna drop it down there just get a whol...,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...,"Yeah, Rick... I-it's great. Is this the surprise?"
9,And Jessica's gonna be Eve.,Ohh...,"When I drop the bomb you know, I want you to h...",Jessica? From my math class?,"Come on, Morty. Just take it easy, Morty. It's...",T-t-that's absolutely crazy!,We're gonna drop it down there just get a whol...,What?! A bomb?!,Morty. I had to... I had to do it. I had— I ha...


In [36]:
train_df, val_df = train_test_split(df, test_size = 0.1)

In [42]:
def make_convo(row, tokenizer, eos = True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    convo = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
    convo = flatten(convo)
    return convo

class ConvoDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        cached_features = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features)
            with open(cached_features, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                convo = make_convo(row, tokenizer)
                self.examples.append(convo)

            logger.info("Save to cached file %s", cached_features)
            with open(cached_features, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)
      
# Cache and store data & checkpoints

def load_and_cache(args, tokenizer, df_trn, df_val, evaluate=False):
    return ConvoDataset(tokenizer, args, df_val if evaluate else df_trn)

def setseed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

def sort_the_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint = []

    globcheckpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in globcheckpoints:
        if use_mtime:
            ordering_and_checkpoint.append((os.path.getmtime(path), path))
        else:
            reg_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if reg_match and reg_match.groups():
                ordering_and_checkpoint.append((int(reg_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted

def turn_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Delete older checkpoint(s) or not :
    checkpoints_sorted = sort_the_checkpoints(args, checkpoint_prefix, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_deleted = checkpoints_sorted[:checkpoints_to_delete]
    for checkpoint in checkpoints_deleted:
        logger.info("Delete older checkpoint [{}] : args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

In [38]:
# Train & evaluation

In [43]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    # Model train
    if args.local_rank in [-1, 0]:
        writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # distributed & parallel training
    model = model.module if hasattr(model, "module") else model
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)


    # Prepare optimizer and schedule (linear warmup, decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=total
    )

    # Check for saved optimizer or scheduler states exist :
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load optimizer and scheduler states :
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train
    logger.info("***** now training *****")
    logger.info("  Number examples = %d", len(train_dataset))
    logger.info("  Number epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    # Check if training continued from a checkpoint :
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
      
            # set global_step to gobal_step of last saved checkpoint from model path :
            checkpoint_add = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_add)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Fine-tuning start.")

    train_loss, log_loss = 0.0, 0.0

    model.zero_grad()
    train_iterate = trange(
        epochs_trained, int(args.num_train_epochs), desc="epoch", disable=args.local_rank not in [-1, 0]
    )
    setseed(args)  # for reproducibility
    for _ in train_iterate:
        epoch_iterate = tqdm(train_dataloader, desc="iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterate):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # outputs are always tuple in transformers

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            train_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            writer.add_scalar("eval_{}".format(key), value, global_step)
                    writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    writer.add_scalar("loss", (train_loss - log_loss) / args.logging_steps, global_step)
                    log_loss = train_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Distributed and parallel training
                    model_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    turn_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterate.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterate.close()
            break

    if args.local_rank in [-1, 0]:
        writer.close()

    return global_step, train_loss / global_step

# Evaluate model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_train, df_val, prefix="") -> Dict:
    # Loop : MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache(args, tokenizer, df_train, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        eval_steps += 1

    eval_loss = eval_loss / eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [44]:
def main(df_trn, df_val):
    args = Args()

    if args.should_continue:
        sorted_checkpoints = sort_the_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # CUDA, GPU, distributed training
    device = torch.device("cuda")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Set up logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    setseed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelWithLMHead.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)

    logger.info("Training, evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, train_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, train_loss)

    # Save best-practices: if use save_pretrained for the model and tokenizer, reload them from from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # Reload using `from_pretrained()`

        model_save = (
            model.module if hasattr(model, "module") else model
        )  # distributed and parallel training
        model_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Save training arguments with the trained model!!!
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that was fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

In [45]:
main(train_df, val_df)

04/16/2022 06:32:32 - INFO - __main__ -   Training, evaluation parameters <__main__.Args object at 0x7f67d77d37d0>
04/16/2022 06:32:32 - INFO - __main__ -   Creating features from dataset file at cached
04/16/2022 06:32:36 - INFO - __main__ -   Save to cached file cached/gpt2_cached_lm_512
04/16/2022 06:32:36 - INFO - __main__ -   ***** now training *****
04/16/2022 06:32:36 - INFO - __main__ -     Number examples = 1707
04/16/2022 06:32:36 - INFO - __main__ -     Number epochs = 3
04/16/2022 06:32:36 - INFO - __main__ -     Instantaneous batch size per GPU = 4
04/16/2022 06:32:36 - INFO - __main__ -     Total train batch size (parallel, distributed & accumulation) = 4
04/16/2022 06:32:36 - INFO - __main__ -     Gradient accumulation steps = 1
04/16/2022 06:32:36 - INFO - __main__ -     Total optimization steps = 1278


epoch:   0%|          | 0/3 [00:00<?, ?it/s]

iteration:   0%|          | 0/426 [00:00<?, ?it/s]

iteration:   0%|          | 0/426 [00:00<?, ?it/s]

iteration:   0%|          | 0/426 [00:00<?, ?it/s]

04/16/2022 06:46:04 - INFO - __main__ -    global_step = 1278, average loss = 1.9912802111095107
04/16/2022 06:46:04 - INFO - __main__ -   Saving model checkpoint to output-small
04/16/2022 06:46:08 - INFO - __main__ -   Evaluate the following checkpoints: ['output-small']
04/16/2022 06:46:10 - INFO - __main__ -   Creating features from dataset file at cached
04/16/2022 06:46:10 - INFO - __main__ -   Save to cached file cached/gpt2_cached_lm_512
04/16/2022 06:46:10 - INFO - __main__ -   ***** Running evaluation  *****
04/16/2022 06:46:10 - INFO - __main__ -     Num examples = 190
04/16/2022 06:46:10 - INFO - __main__ -     Batch size = 4


Evaluating:   0%|          | 0/47 [00:00<?, ?it/s]

04/16/2022 06:46:18 - INFO - __main__ -   ***** Eval results  *****
04/16/2022 06:46:18 - INFO - __main__ -     perplexity = tensor(3.9619)


{'perplexity_': tensor(3.9619)}

In [46]:
# chat with Rick Sanchez

In [48]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelWithLMHead.from_pretrained('output-small')
# 8 lines with Rick Sanchez
for step in range(8):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input)
# append the new user input tokens to the chat history
    bot_input = torch.cat([chat_history, new_user_input], dim=-1) if step > 0 else new_user_input
# generated a response (limiting total chat history to 1000 tokens)
    chat_history = model.generate(
        bot_input, max_length=200,
        pad_token_id=tokenizer.eos_token_id,  
        no_repeat_ngram_size=3,       
        do_sample=True, 
        top_k=100, 
        top_p=0.7,
        temperature = 0.8
    )
    
    # Last output token from bot print :
    print("RickSanchezBot: {}".format(tokenizer.decode(chat_history[:, bot_input.shape[-1]:][0], skip_special_tokens=True)))



>> User:yo
RickSanchezBot: You're a real pickle, Morty.
>> User:pickle rick
RickSanchezBot: Rick, you're a pickle!
>> User:certainly you are
RickSanchezBot: You are a pickles.
>> User:a mad pickles
RickSanchezBot: You think I'm a picklet?
>> User:picklest picklet
RickSanchezBot: !!!
>> User:Eat pickles
RickSanchezBot: !
>> User:Where is Rick?
RickSanchezBot: !!!!?!!!
>> User:Say something
RickSanchezBot: !?
