In [2]:
import os
import math
import logging
from pprint import pformat
from argparse import ArgumentParser
from collections import defaultdict
from itertools import chain

import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset
#from ignite.engine import Engine, Events
#from ignite.handlers import ModelCheckpoint, global_step_from_engine
#from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
#from ignite.contrib.handlers import ProgressBar, PiecewiseLinear
#from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler
#from transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer
from optim import AdamW
#from utils import get_dataset, make_logdir

In [3]:
# Special tokens registered on the tokenizer/model. The order matters:
# build_input_from_segments unpacks the first four positionally as
# bos, eos, speaker1, speaker2.
SPECIAL_TOKENS = [
    "<bos>",
    "<eos>",
    "<speaker1>",
    "<speaker2>",
    "<pad>",
]

# Attribute-name -> token mapping; in the visible notebook it is only
# referenced from commented-out code.
ATTR_TO_SPECIAL_TOKEN = dict(
    bos_token='<bos>',
    eos_token='<eos>',
    pad_token='<pad>',
    additional_special_tokens=['<speaker1>', '<speaker2>'],
)

# Names of the per-example fields, in the order the tensors are stacked.
MODEL_INPUTS = [
    "input_ids",
    "mc_token_ids",
    "lm_labels",
    "mc_labels",
    "token_type_ids",
]

# Subset of the fields that are sequences and therefore get padded.
PADDED_INPUTS = [
    "input_ids",
    "lm_labels",
    "token_type_ids",
]

def add_special_tokens_(model, tokenizer):
    """Register SPECIAL_TOKENS on the tokenizer, then tell the model how many there are.

    Args:
        model: object exposing ``set_num_special_tokens(int)``.
        tokenizer: object exposing ``set_special_tokens(list_of_str)``.

    Returns:
        None. Both arguments are modified through their own setter methods.
    """
    # The original author notes that the tokenizer does not re-add tokens it
    # already has -- NOTE(review): not verifiable from this notebook alone.
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    special_count = len(SPECIAL_TOKENS)
    model.set_num_special_tokens(special_count)

# Load the pretrained OpenAI GPT tokenizer and double-heads model
# (the GPT-2 alternatives are not wired up in this notebook).
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')

# Register SPECIAL_TOKENS on both objects before any inputs are built.
add_special_tokens_(model=model, tokenizer=tokenizer)

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [4]:
def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
    """Assemble one model input from persona sentences, dialog history and a reply.

    Layout of the flattened sequence:
        [bos + all persona tokens] [speaker tag + turn] ... [speaker tag + reply (+ eos)]
    The speaker tags alternate so that the reply segment always starts with
    speaker2.

    Args:
        persona: list of token lists, concatenated into the first segment.
        history: list of token lists, one per previous turn.
        reply: token list of the candidate reply.
        tokenizer: object whose ``convert_tokens_to_ids`` maps the first four
            SPECIAL_TOKENS to ids.
        lm_labels: when true, the reply tokens (excluding its speaker tag) are
            used as language-modeling targets; otherwise every label is -1.
        with_eos: when true, the eos id is appended to the reply.

    Returns:
        dict with keys "input_ids", "token_type_ids", "mc_token_ids"
        (index of the last token) and "lm_labels".
    """
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])

    # First segment: bos followed by the flattened persona.
    header = [bos] + list(chain(*persona))
    closing = [eos] if with_eos else []
    turns = history + [reply + closing]

    # Tag every turn with a speaker id; parity is computed against the total
    # segment count so the last segment (the reply) gets speaker2.
    n_segments = 1 + len(turns)
    segments = [header]
    for pos, turn in enumerate(turns):
        tag = speaker2 if (n_segments - pos) % 2 else speaker1
        segments.append([tag] + turn)

    input_ids = []
    token_type_ids = []
    for idx, seg in enumerate(segments):
        input_ids.extend(seg)
        token_type_ids.extend([speaker2 if idx % 2 else speaker1] * len(seg))

    if lm_labels:
        # Mask everything before the reply, plus the reply's own speaker tag.
        masked_prefix = len(input_ids) - len(segments[-1])
        labels = [-1] * masked_prefix + [-1] + segments[-1][1:]
    else:
        labels = [-1] * len(input_ids)

    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "mc_token_ids": len(input_ids) - 1,
        "lm_labels": labels,
    }

# Toy example: word strings stand in for token ids, which is enough to check
# the segment layout and the label mask.
persona = [
    ["i", "like", "playing", "football", "."],
    ["i", "am", "from", "NYC", "."],
]
history = [
    ["hello", "how", "are", "you", "?"],
    ["i", "am", "fine", "thanks", "."],
]
reply = ["great", "to", "hear"]

I = build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False)
(len(I['input_ids']), I['lm_labels'])

(28,
 [-1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1])

In [5]:
def pad_dataset(dataset, pd=0):
    """Right-pad the sequence fields of `dataset` in place and return it.

    The target length is the longest entry of ``dataset["input_ids"]``.
    "input_ids" and "token_type_ids" are filled with `pd`; "lm_labels" is
    filled with -1. Other keys are left untouched.
    """
    target_len = max(map(len, dataset["input_ids"]))
    fill_values = (("input_ids", pd), ("token_type_ids", pd), ("lm_labels", -1))
    for key, filler in fill_values:
        padded_rows = []
        for row in dataset[key]:
            padded_rows.append(row + [filler] * (target_len - len(row)))
        dataset[key] = padded_rows
    return dataset

def pad_list(LIST, pad):
    """Right-pad every list in LIST with `pad` up to the length of the longest one.

    Args:
        LIST: sequence of lists (iterated twice, so it must not be a generator).
        pad: value appended to the shorter lists.

    Returns:
        A new list of new lists; the inputs are not modified. An empty LIST
        yields ``[]`` instead of raising ValueError from ``max()``.
    """
    longest = max((len(x) for x in LIST), default=0)
    return [x + (longest - len(x)) * [pad] for x in LIST]

In [6]:
from collections import defaultdict

# Upper bound on the number of candidates kept per training utterance.
NUM_CANDIDATES = 2
# Number of permutations of the personality sentences (the permutation loop
# in get_data_loaders is currently commented out).
PERSONALITY_PERM = 1
MXHST_K = 2
# History is sliced with [-MAX_HISTORY:], i.e. at most this many past turns are kept.
MAX_HISTORY = 2 * MXHST_K + 1

def get_data_loaders(tokenizer):
    """Build the padded, candidate-grouped training TensorDataset from 'convai_data.tkn'.

    For every utterance of every dialog, one model input is built per kept
    candidate (the last ``num_candidates`` candidates). Only the last
    candidate gets language-modeling labels, and its index
    (``num_candidates - 1``) is recorded as the multiple-choice label.

    Previously a leftover ``return personachat`` ended the function right
    after the loop, so the work was discarded and the tensor-building code
    below was unreachable. The function now returns the dataset that the
    later cells load from 'train_dataset.pyobj'.

    Args:
        tokenizer: tokenizer passed to build_input_from_segments and used to
            look up the id of '<pad>'.

    Returns:
        TensorDataset over the "train" split whose tensors are ordered
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids.
        Every tensor except mc_labels is reshaped so that its second
        dimension is the number of candidates.
    """
    personachat = torch.load('convai_data.tkn')

    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for dataset_name, dataset in personachat.items():

        num_candidates = 20  # MAXIMUM NUM OF DISTRACTOR+GT_REPLY in our dataset
        if dataset_name == 'train':
            # Cap the number of candidates used for training.
            num_candidates = min(NUM_CANDIDATES, num_candidates)
        datasets[dataset_name]["n_candidates"] = num_candidates

        # dialog = {personality: [...], utterances: [{history: [...], candidates: [...]}]}
        for dialog in dataset:
            persona = dialog["personality"].copy()
            for utterance in dialog["utterances"]:
                # Keep only the last MAX_HISTORY history entries.
                history = utterance["history"][-MAX_HISTORY:]
                for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                    # Only the last kept candidate carries LM labels.
                    lm_labels = bool(j == num_candidates-1)
                    instance = build_input_from_segments(persona, history, candidate, tokenizer, lm_labels)
                    for input_name in ("input_ids", "token_type_ids", "mc_token_ids", "lm_labels"):
                        datasets[dataset_name][input_name].append(instance[input_name])

                datasets[dataset_name]["mc_labels"].append(num_candidates-1)  # TODO: make this 0

    # The raw data is no longer needed once the per-field lists are built.
    del personachat

    # datasets['train'/'valid'] now hold flat lists in which every run of
    # n_candidates consecutive entries belongs to the same utterance:
    #   input_ids:      sequence of token ids
    #   lm_labels:      -1 everywhere except the reply tokens of the labeled candidate
    #   token_type_ids: speaker annotation for each token
    #   mc_token_ids:   index of the last token before padding
    #   mc_labels:      index of the ground-truth candidate (one per utterance)
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():

        pad = tokenizer.convert_tokens_to_ids('<pad>')
        dataset['input_ids'] = pad_list(dataset['input_ids'], pad)
        dataset['token_type_ids'] = pad_list(dataset['token_type_ids'], pad)
        dataset['lm_labels'] = pad_list(dataset['lm_labels'], -1)

        for input_name in ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]:
            tensor = torch.tensor(dataset[input_name])
            if input_name != "mc_labels":
                # Regroup the flat first dimension into (utterances, candidates, ...).
                N, L = datasets[dataset_name]["n_candidates"], tensor.shape[1:]
                tensor = tensor.view((-1, N) + L)

            tensor_datasets[dataset_name].append(tensor)

    train_dataset = TensorDataset(*tensor_datasets["train"])
    return train_dataset


# Build the training set once and cache it; the inspection and training cells
# load it back from 'train_dataset.pyobj'.
train_dataset = get_data_loaders(tokenizer)
torch.save(train_dataset, 'train_dataset.pyobj')

<torch.utils.data.dataloader.DataLoader at 0x7f623dd4fc88>

In [None]:
# INSPECT DATA

# Defined here as well because the training-config cell that sets it comes
# later in the notebook; without it a fresh-kernel run fails with NameError.
train_batch_size = 4

train_dataset = torch.load('train_dataset.pyobj')
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)

# Each batch is a 5-tuple, in this order:
#   input_ids:      sequence of token ids
#   mc_token_ids:   index of the last token before padding (len(input_ids) - 1)
#   lm_labels:      token ids with the reply highlighted, -1 elsewhere (language modeling)
#   mc_labels:      index of the ground-truth candidate (multiple choice)
#   token_type_ids: speaker annotation for each token
D = next( iter(train_loader) )
input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = D
lm_labels[0]

In [3]:
from apex import amp
# Weights of the language-modeling and multiple-choice losses in the total loss.
LM_COEF = 1.0
MC_COEF = 1.0

# Device index that tensors and the model are moved to.
DEVICE = 0
# Whether to train through apex amp.
FP16 = True
# Gradient-norm clipping threshold.
MAX_NORM = 1.0
# Number of batches whose gradients are accumulated per optimizer step.
GRAD_ACCUM_STEPS = 4
train_batch_size = 4

def update(b, batch):
    """Run one forward/backward pass and step the optimizer every GRAD_ACCUM_STEPS batches.

    Args:
        b: 0-based index of the batch within the epoch (as produced by ``enumerate``).
        batch: sequence of tensors ordered as
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids.

    Returns:
        float: the weighted loss, already divided by GRAD_ACCUM_STEPS.
    """
    # inference() puts the model in eval mode, so always switch back before training.
    model.train()

    batch = [input_tensor.to(DEVICE) for input_tensor in batch]
    input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch

    # The call is unpacked into exactly two values: the LM loss and the MC loss.
    lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids)

    # Divide so that the accumulated gradient matches one full-size step.
    loss = (lm_loss * LM_COEF + mc_loss * MC_COEF) / GRAD_ACCUM_STEPS

    if FP16:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), MAX_NORM)
    else:
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_NORM)

    # b is 0-based, so step only after GRAD_ACCUM_STEPS batches have been
    # accumulated (b = GRAD_ACCUM_STEPS-1, 2*GRAD_ACCUM_STEPS-1, ...). Testing
    # `b % GRAD_ACCUM_STEPS == 0` would step after a single batch at b == 0.
    if (b + 1) % GRAD_ACCUM_STEPS == 0:
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()

def inference(engine, batch):
    """Evaluate one batch without gradients and return flattened predictions and targets.

    Args:
        engine: unused; kept so the function keeps its two-argument handler signature.
        batch: sequence of tensors ordered as
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids.

    Returns:
        ``((lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels))``
        where the LM logits drop the last position and the LM labels drop the
        first, so that position t is scored against token t+1.
    """
    model.eval()
    with torch.no_grad():
        batch = tuple(input_tensor.to(DEVICE) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        # No module-level `logger` is defined in this notebook, so obtain one
        # here instead of failing with NameError.
        logging.getLogger(__name__).info(tokenizer.decode(input_ids[0, -1, :].tolist()))
        # if we dont send labels to model, it doesnt return losses
        lm_logits, mc_logits, *_ = model(
            input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
        )
        lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
        lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
        return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)

In [4]:
import torch

# Move the model to the GPU, create the optimizer and load the cached training set.
model = model.cuda(DEVICE)
optimizer = AdamW(model.parameters(), lr=6.25e-5, correct_bias=True)
train_dataset = torch.load('train_dataset.pyobj')
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)

if FP16:
    # O1/O2, see https://nvidia.github.io/apex/amp.html
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

EPOCHS = 100
B = len(train_loader)
# Roughly 300 progress lines per epoch; max() keeps the modulus non-zero when
# the loader has fewer than 300 batches (B // 300 would be 0).
LOG_EVERY = max(1, B // 300)
for e in range(EPOCHS):
    for b, batch in enumerate(train_loader):
        loss = update(b, batch)
        if b % LOG_EVERY == 0:
            print(e, str(b)+'/'+str(B), loss)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
keep_batchnorm_fp32    : None
cast_model_type        : None
opt_level              : O1
loss_scale             : dynamic
patch_torch_functions  : True
enabled                : True
master_weights         : None
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
keep_batchnorm_fp32    : None
cast_model_type        : None
opt_level              : O1
loss_scale             : dynamic
patch_torch_functions  : True
enabled                : True
master_weights         : None
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
0 0/32860 1.2796852588653564
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0


KeyboardInterrupt: 

In [None]:
import torch
# Scratch check that CUDA allocation works: a 10x2 float tensor moved to the GPU.
# The values are never assigned (the displayed output shows arbitrary numbers).
a = torch.FloatTensor(10,2).cuda()

In [12]:
# Display the scratch tensor `a`.
a

tensor([[ 4.7370e+27,  4.5569e-41],
        [ 1.6375e+22,  4.5569e-41],
        [ 4.5113e+27,  4.5569e-41],
        [ 4.6717e+27,  4.5569e-41],
        [ 4.7370e+27,  4.5569e-41],
        [-1.7097e+16,  4.5567e-41],
        [ 4.5113e+27,  4.5569e-41],
        [ 8.7380e+24,  4.5569e-41],
        [ 4.7370e+27,  4.5569e-41],
        [-1.3176e+17,  4.5567e-41]], device='cuda:0')

In [1]:
import torch
# Load a previously saved object from 'dataloader.pyobj'; the next cell iterates
# it and gets batches of five tensors.
D = torch.load('dataloader.pyobj')

In [7]:
# Show the first batch yielded by D.
next( iter(D) )

[tensor([[[40478,   249,   649,  ..., 40482, 40482, 40482],
          [40478,   249,   649,  ..., 40482, 40482, 40482]],
 
         [[40478,   249,   649,  ..., 40482, 40482, 40482],
          [40478,   249,   649,  ..., 40482, 40482, 40482]]]),
 tensor([[100,  93],
         [126, 123]]),
 tensor([[[-100, -100, -100,  ..., -100, -100, -100],
          [-100, -100, -100,  ..., -100, -100, -100]],
 
         [[-100, -100, -100,  ..., -100, -100, -100],
          [-100, -100, -100,  ..., -100, -100, -100]]]),
 tensor([1, 1]),
 tensor([[[40480, 40480, 40480,  ..., 40482, 40482, 40482],
          [40480, 40480, 40480,  ..., 40482, 40482, 40482]],
 
         [[40480, 40480, 40480,  ..., 40482, 40482, 40482],
          [40480, 40480, 40480,  ..., 40482, 40482, 40482]]])]