In [15]:
import transformers
from transformers import BertModel,BertTokenizer,AlbertTokenizer,RobertaTokenizer,XLNetTokenizer
from transformers import AdamW,get_linear_schedule_with_warmup
from transformers.modeling_utils import SequenceSummary

In [2]:
from processor import *

In [21]:
import argparse
parser = argparse.ArgumentParser()
# data arguments
parser.add_argument("--data_dir",type = str,default = "dataset/CSQA")
parser.add_argument("--cs_dir",type = str, default = "OMCS")
parser.add_argument("--test_file",type= str,default = "test_rand_split_no_answers.jsonl")
parser.add_argument("--dev_file",type= str,default = "dev_rand_split.jsonl")
parser.add_argument("--train_file",type = str,default = "train_rand_split.jsonl")
parser.add_argument("--output_dir",type = str,default = "model")
parser.add_argument("--save_model_name",type = str,default = "bert_csqa_2e-5_wholeQA-Match_cslen5")
parser.add_argument("--tokenizer_name_or_path",type = str,default = "bert-base-cased")
parser.add_argument("--origin_model",type = str,default = "bert-base-cased", help = "origin model dir for training")
parser.add_argument("--omcs_file",type=str,default = "omcs-free-origin.json")
# hyper parameters
parser.add_argument("--max_length",type=int,default = 80 )
parser.add_argument("--gradient_accumulation_steps",type=int,default=1,help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--num_train_epochs",default=5,type=int)
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument("--train_batch_size", default=15, type=int, help="Batch size for training.")
parser.add_argument("--eval_batch_size", default=6, type=int, help="Batch size for eval.")
parser.add_argument("--check_loss_step",default = 400,type = int,help = "output current average loss of training")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument("--cs_len",type = int, default = 5)
# settings
parser.add_argument("--n_gpu",type=int , default = 1)
parser.add_argument("--fp16",action = "store_true")
parser.add_argument("--save_method",type = str,default = "Best_Current")
parser.add_argument("--do_finetune",action = "store_true",default = False)
parser.add_argument("--cs_mode",type = str,default = "QAconcept-Match")
parser.add_argument("--cs_save_mode",type = str,default = "id")
parser.add_argument("--seed",type = int,default = None,help = "freeze seed")
parser.add_argument('--tpu',action = "store_true")
parser.add_argument('--task_name',type = str, default = "baseline")
parser.add_argument("--test",action = "store_true")
parser.add_argument("--dev",action = "store_true")

args = parser.parse_args([])

In [9]:
args.__dict__

{'data_dir': 'dataset/CSQA',
 'cs_dir': 'OMCS',
 'test_file': 'test_rand_split_no_answers.jsonl',
 'dev_file': 'dev_rand_split.jsonl',
 'train_file': 'train_rand_split.jsonl',
 'output_dir': 'model',
 'save_model_name': 'bert_csqa_2e-5_wholeQA-Match_cslen5',
 'tokenizer_name_or_path': 'bert-base-cased',
 'origin_model': 'bert-base-cased',
 'omcs_file': 'omcs-free-origin.json',
 'max_length': 80,
 'gradient_accumulation_steps': 1,
 'num_train_epochs': 5,
 'weight_decay': 0.0,
 'adam_epsilon': 1e-08,
 'learning_rate': 2e-05,
 'warmup_steps': 0,
 'train_batch_size': 15,
 'eval_batch_size': 6,
 'check_loss_step': 400,
 'max_grad_norm': 1.0,
 'cs_len': 5,
 'n_gpu': 1,
 'fp16': False,
 'save_method': 'Best_Current',
 'do_finetune': False,
 'cs_mode': 'wholeQA-Match',
 'cs_save_mode': 'id',
 'seed': None,
 'tpu': False,
 'task_name': 'baseline',
 'test': False,
 'dev': False}

In [22]:
omcs_corpus = load_omcs(args)

In [25]:
type(omcs_corpus)

list

In [17]:
def select_tokenizer(args):
    if "albert" in args.origin_model:
        return AlbertTokenizer.from_pretrained(args.origin_model)
    elif "roberta" in args.origin_model:
        return RobertaTokenizer.from_pretrained(args.origin_model)
    elif "bert" in args.origin_model:
        return BertTokenizer.from_pretrained(args.origin_model)
    elif "xlnet" in args.origin_model:
        return XLNetTokenizer.from_pretrained(args.origin_model)

In [18]:
def select_model(args,model_name = None):
    if not model_name:
        model_name = args.origin_model
        cache = os.path.join(args.output_dir,"cache")
    else:
        cache = model_name
    if args.task_name == "rerank_csqa":
        if "albert" in model_name:
            return AlbertAttRanker.from_pretrained(model_name,cache_dir = cache,cs_len = args.cs_len)
        elif "roberta" in model_name:
            return RobertaAttRanker.from_pretrained(model_name,cache_dir = cache,cs_len = args.cs_len)
        elif "bert" in model_name:
            return BertAttRanker.from_pretrained(model_name,cache_dir = cache,cs_len = args.cs_len)
        elif "xlnet" in model_name:
            return XLNetAttRanker.from_pretrained(model_name,cache_dir = cache,cs_len = args.cs_len)
    elif args.task_name == "rerank_csqa_without_rerank":
        if "bert" in model_name:
            return BertAttRankerDontRank.from_pretrained(model_name,cache_dir = cache,cs_len = args.cs_len)
    else:
        if "albert" in model_name:
            return AlbertForMultipleChoice.from_pretrained(model_name,cache_dir = cache)
        elif "roberta" in model_name:
            return RobertaForMultipleChoice.from_pretrained(model_name,cache_dir = cache)
        elif "bert" in model_name:
            return BertForMultipleChoice.from_pretrained(model_name,cache_dir = cache)
        elif "xlnet" in model_name:
            return XLNetForMultipleChoice.from_pretrained(model_name,cache_dir = cache)
        

In [26]:
tokenizer = select_tokenizer(args)

In [27]:
_,_,train_dataset= load_csqa_omcs_dataset(tokenizer,args,omcs_corpus,"train")

model\feature_cache\cached_train_QAconcept-Match_baseline_5


puting commonsencs into examples: 100%|█████████████████████████████████████████| 9741/9741 [00:00<00:00, 30248.95it/s]
CSQA processing: 100%|████████████████████████████████████████████████████████████| 9741/9741 [00:39<00:00, 247.16it/s]


In [31]:
file_name = os.path.join(args.data_dir,args.train_file) 
file_name

'dataset/CSQA\\train_rand_split.jsonl'

In [1]:
import torch
import torch.nn.functional as F

In [2]:
F.one_hot(torch.tensor([0,1,2,1], dtype=torch.int64), num_classes=5)

tensor([[1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0]])

In [3]:
labels = torch.arange(1,4, dtype=torch.long).reshape(3, 1)
labels

tensor([[1],
        [2],
        [3]])

In [4]:
oh_labels = F.one_hot(labels, num_classes=5) # [3,1] -> [3,1,5]
# oh_labels
oh_labels = oh_labels.squeeze(1)  # [3,1,5] -> [3,5]
oh_labels

tensor([[0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0]])

In [49]:
reshaped_logits = torch.randn(3,5)
reshaped_logits

tensor([[-1.5587,  0.8788,  0.4759, -0.3196, -0.5946],
        [ 0.1233, -0.5129,  0.8072,  0.4943,  0.3895],
        [ 0.2659, -0.5676,  0.6588, -1.4019, -0.3201]])

In [82]:
# 过 softmax
reshaped_logits = F.softmax(reshaped_logits, dim=1)
print(reshaped_logits)
# mask 负例
mask = oh_labels.bool()
print(mask)
# 仅留下正例
logits_p = reshaped_logits.masked_select(mask).reshape(-1, 1)
print(logits_pr)
# 重复
logits_p.repeat(1, 5)
print(logits_pr)

tensor([[0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]])
tensor([[False,  True, False, False, False],
        [False, False,  True, False, False],
        [False, False, False,  True, False]])
tensor([[0.0225],
        [0.1821],
        [0.0409]])
tensor([[0.0225],
        [0.1821],
        [0.0409]])


In [61]:
logits_n = reshaped_logits
logits_n

tensor([[0.0382, 0.4373, 0.2923, 0.1319, 0.1002],
        [0.1596, 0.0845, 0.3163, 0.2313, 0.2083],
        [0.2732, 0.1187, 0.4046, 0.0515, 0.1520]])

In [68]:
logits_np = logits_p - logits_n
logits_np

tensor([[ 0.3991,  0.0000,  0.1450,  0.3054,  0.3371],
        [ 0.1567,  0.2318,  0.0000,  0.0850,  0.1080],
        [-0.2216, -0.0672, -0.3531,  0.0000, -0.1005]])

In [70]:
import torch.nn as nn

In [71]:
Loss = nn.HingeEmbeddingLoss()

In [72]:
labels_oh2 = oh_labels * 2 -1
labels_oh2

tensor([[-1,  1, -1, -1, -1],
        [-1, -1,  1, -1, -1],
        [-1, -1, -1,  1, -1]])

In [76]:
Loss(logits_np, labels_oh2)

tensor(0.7316)

In [129]:
help(nn.HingeEmbeddingLoss)

Help on class HingeEmbeddingLoss in module torch.nn.modules.loss:

class HingeEmbeddingLoss(_Loss)
 |  HingeEmbeddingLoss(margin=1.0, size_average=None, reduce=None, reduction='mean')
 |  
 |  Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`
 |  (containing 1 or -1).
 |  This is usually used for measuring whether two inputs are similar or
 |  dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically
 |  used for learning nonlinear embeddings or semi-supervised learning.
 |  
 |  The loss function for :math:`n`-th sample in the mini-batch is
 |  
 |  .. math::
 |      l_n = \begin{cases}
 |          x_n, & \text{if}\; y_n = 1,\\
 |          \max \{0, \Delta - x_n\}, & \text{if}\; y_n = -1,
 |      \end{cases}
 |  
 |  and the total loss functions is
 |  
 |  .. math::
 |      \ell(x, y) = \begin{cases}
 |          \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
 |          \operatorname{sum}(L),  & \text{if reduction} 

In [None]:
reshaped_logits  # batch_size, choice_num
labels       # btach_size, 1