In [1]:
import argparse
import glob
import json
import logging
import os
import re
import shutil
import random
from multiprocessing import Pool
from typing import Dict, List, Tuple
from copy import deepcopy

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AlbertConfig,
    AlbertForSequenceClassification,
    AlbertTokenizer,
    BertConfig,
    BertForSequenceClassification,
    BertForLongSequenceClassification,
    BertForLongSequenceClassificationCat,
    BertTokenizer,
    DNATokenizer,
    DistilBertConfig,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    FlaubertConfig,
    FlaubertForSequenceClassification,
    FlaubertTokenizer,
    RobertaConfig,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    XLMConfig,
    XLMForSequenceClassification,
    XLMRobertaConfig,
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizer,
    XLMTokenizer,
    XLNetConfig,
    XLNetForSequenceClassification,
    XLNetTokenizer,
    get_linear_schedule_with_warmup,
)
from transformers import glue_compute_metrics as compute_metrics
from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter


logger = logging.getLogger(__name__)

ALL_MODELS = sum(
    (
        tuple(conf.pretrained_config_archive_map.keys())
        for conf in (
            BertConfig,
            XLNetConfig,
            XLMConfig,
            RobertaConfig,
            DistilBertConfig,
            AlbertConfig,
            XLMRobertaConfig,
            FlaubertConfig,
        )
    ),
    (),
)

MODEL_CLASSES = {
    "dna": (BertConfig, BertForSequenceClassification, DNATokenizer),
    "dnalong": (BertConfig, BertForLongSequenceClassification, DNATokenizer),
    "dnalongcat": (BertConfig, BertForLongSequenceClassificationCat, DNATokenizer),
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
    "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
    "xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
    "flaubert": (FlaubertConfig, FlaubertForSequenceClassification, FlaubertTokenizer),
}
                    
TOKEN_ID_GROUP = ["bert", "dnalong", "dnalongcat", "xlnet", "albert"] 

In [2]:
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)


In [3]:
class ss():
    def __init__(self):
        self.adam_epsilon=1e-08
        self.attention_probs_dropout_prob=0.1
        self.beta1=0.9
        self.beta2=0.999
        self.cache_dir=''
        self.config_name=''
        self.data_dir='../examples/sample_data/ft/6/'
        self.do_ensemble_pred=False
        self.do_eval=False
        self.do_lower_case=False
        self.do_predict=False
        self.do_train=False
        self.do_visualize=False
        self.early_stop=0
        self.eval_all_checkpoints=False
        self.evaluate_during_training=False
        self.fp16=False
        self.fp16_opt_level='O1'
        self.gradient_accumulation_steps=1
        self.hidden_dropout_prob=0.1
        self.learning_rate=5e-05
        self.local_rank=-1
        self.logging_steps=500
        self.max_grad_norm=1.0
        self.max_seq_length=128
        self.max_steps=-1
        self.model_name_or_path='../examples/6-new-12w-0'
        self.model_type='dna'
        self.n_process=2
        self.no_cuda=False
        self.num_rnn_layer=2
        self.num_train_epochs=3.0
        self.output_dir='../ft/6/'
        self.overwrite_cache=False
        self.overwrite_output_dir=False
        self.per_gpu_eval_batch_size=8
        self.per_gpu_pred_batch_size=8
        self.per_gpu_train_batch_size=8
        self.predict_dir=None
        self.predict_scan_size=1
        self.result_dir=None
        self.rnn='lstm'
        self.rnn_dropout=0.0
        self.rnn_hidden=768
        self.save_steps=500
        self.save_total_limit=None
        self.seed=42
        self.server_ip=''
        self.server_port=''
        self.should_continue=False
        self.task_name='dnaprom'
        self.tokenizer_name=''
        self.visualize_data_dir=None
        self.visualize_models=None
        self.visualize_train=False
        self.warmup_percent=0
        self.warmup_steps=0
        self.weight_decay=0.0

In [4]:
args = ss()

In [5]:
args.task_name = args.task_name.lower()
processor = processors[args.task_name]()
args.output_mode = output_modes[args.task_name]
label_list = processor.get_labels()
num_labels = len(label_list)

In [6]:
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

In [7]:
config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=args.task_name,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )

In [8]:
config.hidden_dropout_prob = args.hidden_dropout_prob
config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
if args.model_type in ["dnalong", "dnalongcat"]:
    assert args.max_seq_length % 512 == 0
config.split = int(args.max_seq_length/512)
config.rnn = args.rnn
config.num_rnn_layer = args.num_rnn_layer
config.rnn_dropout = args.rnn_dropout
config.rnn_hidden = args.rnn_hidden


In [9]:
tokenizer = tokenizer_class.from_pretrained(
    args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    do_lower_case=args.do_lower_case,
    cache_dir=args.cache_dir if args.cache_dir else None,
)
model = model_class.from_pretrained(
    args.model_name_or_path,
    from_tf=bool(".ckpt" in args.model_name_or_path),
    config=config,
    cache_dir=args.cache_dir if args.cache_dir else None,
)

<class 'transformers.tokenization_dna.DNATokenizer'>


In [10]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4101, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [11]:
task = args.task_name
evaluate=False

In [12]:
if args.local_rank not in [-1, 0] and not evaluate:
    torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

processor = processors[task]()
output_mode = output_modes[task]
# Load data features from cache or dataset file
cached_features_file = os.path.join(
    args.data_dir,
    "cached_{}_{}_{}_{}".format(
        "dev" if evaluate else "train",
        list(filter(None, args.model_name_or_path.split("/"))).pop(),
        str(args.max_seq_length),
        str(task),
    ),
)
if args.do_predict:
    cached_features_file = os.path.join(
    args.data_dir,
    "cached_{}_{}_{}".format(
        "dev" if evaluate else "train",
        str(args.max_seq_length),
        str(task),
    ),
)
if os.path.exists(cached_features_file) and not args.overwrite_cache:
    logger.info("Loading features from cached file %s", cached_features_file)
    features = torch.load(cached_features_file)
else:
    logger.info("Creating features from dataset file at %s", args.data_dir)
    label_list = processor.get_labels()
    if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
        # HACK(label indices are swapped in RoBERTa pretrained model)
        label_list[1], label_list[2] = label_list[2], label_list[1]
    examples = (
        processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
    )   


    print("finish loading examples")

    # params for convert_examples_to_features
    max_length = args.max_seq_length
    pad_on_left = bool(args.model_type in ["xlnet"])
    pad_token = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    pad_token_segment_id = 4 if args.model_type in ["xlnet"] else 0


    if args.n_process == 1:
        features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=max_length,
        output_mode=output_mode,
        pad_on_left=pad_on_left,  # pad on the left for xlnet
        pad_token=pad_token,
        pad_token_segment_id=pad_token_segment_id,)

    else:
        n_proc = int(args.n_process)
        if evaluate:
            n_proc = max(int(n_proc/4),1)
        print("number of processes for converting feature: " + str(n_proc))
        p = Pool(n_proc)
        indexes = [0]
        len_slice = int(len(examples)/n_proc)
        for i in range(1, n_proc+1):
            if i != n_proc:
                indexes.append(len_slice*(i))
            else:
                indexes.append(len(examples))

        results = []

        for i in range(n_proc):
            results.append(p.apply_async(convert_examples_to_features, args=(examples[indexes[i]:indexes[i+1]], tokenizer, max_length, None, label_list, output_mode, pad_on_left, pad_token, pad_token_segment_id, True,  )))
            print(str(i+1) + ' processor started !')

        p.close()
        p.join()

        features = []
        for result in results:
            features.extend(result.get())


    if args.local_rank in [-1, 0]:
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

if args.local_rank == 0 and not evaluate:
    torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
if output_mode == "classification":
#         all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
elif output_mode == "regression":
    all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

In [13]:
train_dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

In [14]:
train_dataset

<torch.utils.data.dataset.TensorDataset at 0x7fa466c89588>

In [15]:
for i,x in enumerate(train_dataset):
    print(x[3])
    if i==20:
        break

tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.0200)
tensor(0.0200)
tensor(0.9800)
tensor(0.0200)


In [16]:
args.n_gpu=2
if args.local_rank in [-1, 0]:
    tb_writer = SummaryWriter()

args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

if args.max_steps > 0:
    t_total = args.max_steps
    args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": args.weight_decay,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]

warmup_steps = args.warmup_steps if args.warmup_percent == 0 else int(args.warmup_percent*t_total)

optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(args.beta1,args.beta2))
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
)

# Check if saved optimizer or scheduler states exist
if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
    os.path.join(args.model_name_or_path, "scheduler.pt")
):
    # Load in optimizer and scheduler states
    optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
    scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

if args.fp16:
    try:
        from apex import amp
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
    )

# Train!
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_dataset))
logger.info("  Num Epochs = %d", args.num_train_epochs)
logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
logger.info(
    "  Total train batch size (w. parallel, distributed & accumulation) = %d",
    args.train_batch_size
    * args.gradient_accumulation_steps
    * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
)
logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info("  Total optimization steps = %d", t_total)

global_step = 0
epochs_trained = 0
steps_trained_in_current_epoch = 0
# Check if continuing training from a checkpoint
if os.path.exists(args.model_name_or_path):
    # set global_step to gobal_step of last saved checkpoint from model path
    try:
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
    except:
        global_step = 0
    epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
    steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

    logger.info("  Continuing training from checkpoint, will skip to saved global_step")
    logger.info("  Continuing training from epoch %d", epochs_trained)
    logger.info("  Continuing training from global step %d", global_step)
    logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)

tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(
    epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
)
set_seed(args)  # Added here for reproductibility

best_auc = 0
last_auc = 0
stop_count = 0

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

In [17]:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
    break


Iteration:   0%|          | 0/2023 [00:00<?, ?it/s][A


In [18]:
batch

[tensor([[   2, 2301,  999,  ...,    0,    0,    0],
         [   2,  175,  685,  ...,    0,    0,    0],
         [   2,  569, 2263,  ...,    0,    0,    0],
         ...,
         [   2,  167,  655,  ...,    0,    0,    0],
         [   2, 1579, 2205,  ...,    0,    0,    0],
         [   2, 1405, 1510,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([0.0200, 0.0200, 0.9800, 0.9800, 0.0200, 0.9800, 0.0200, 0.9800, 0.9800,
         0.9800, 0.0200, 0.0200, 0.0200, 0.9800, 0.9800, 0.0200])]

In [17]:
for i,x in enumerate(dataset):
    print(x[3])
    if i==20:
        break

tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.0200)
tensor(0.0200)
tensor(0.9800)
tensor(0.0200)


In [15]:
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)

In [16]:
for i,x in enumerate(train_dataset):
    print(x[3])
    if i==20:
        break

tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)


In [19]:
features = torch.load(cached_features_file)

In [22]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
if output_mode == "classification":
#         all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
elif output_mode == "regression":
    all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

In [27]:
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

In [28]:
for i,x in enumerate(dataset):
    print(x[3])
    if i==20:
        break

tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.0200)
tensor(0.9800)
tensor(0.9800)
tensor(0.0200)
tensor(0.0200)
tensor(0.0200)
tensor(0.9800)
tensor(0.0200)


In [26]:
all_labels

tensor([0.0200, 0.9800, 0.9800,  ..., 0.9800, 0.9800, 0.0200])

In [13]:
train_dataset

NameError: name 'train_dataset' is not defined

In [35]:
all_labels

tensor([0.0200, 2.3400, 2.3400,  ..., 2.3400, 2.3400, 0.0200])

In [32]:
for i,x in enumerate(train_dataset):
    print(x[3])
    if i==20:
        break

tensor(0)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(0)
tensor(2)
tensor(2)
tensor(2)
tensor(0)
tensor(2)
tensor(2)
tensor(0)
tensor(0)
tensor(2)
tensor(2)
tensor(0)
tensor(0)
tensor(0)
tensor(2)
tensor(0)


In [30]:
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)

finish loading examples
number of processes for converting feature: 2
1 processor started !
2 processor started !


In [14]:
for i,x in enumerate(train_dataset):
    print(x[3])
    if i==100:
        break

tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)
tensor(0)


In [15]:
x

(tensor([   2, 2274,  892, 3556, 1921, 3576, 2003, 3901, 3304,  916, 3652, 2308,
         1025, 4087, 4045, 3879, 3214,  555, 2206,  618, 2459, 1631, 2415, 1454,
         1708, 2724, 2691, 2559, 2031, 4014, 3755, 2719, 2670, 2476, 1699, 2687,
         2542, 1962, 3738, 2650, 2395, 1374, 1386, 1434, 1625, 2390, 1355, 1311,
         1135,  429, 1701, 2694, 2571, 2077,  102,  396, 1570, 2172,  483, 1919,
         3566, 1962, 3739, 2653, 2408, 1425, 1591, 2254,  811, 3229,  614, 2443,
         1567, 2159,  430, 1707, 2718, 2667, 2461, 1637, 2439, 1549, 2088,  145,
          565, 2245,  773, 3077,    6,   10,   26,   92,  355, 1407, 1518, 1964,
         3746,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [14]:
evaluate=False
processor = processors[args.task_name]()
output_mode = output_modes[args.task_name]
# Load data features from cache or dataset file
cached_features_file = os.path.join(
    args.data_dir,
    "cached_{}_{}_{}_{}".format(
        "dev" if evaluate else "train",
        list(filter(None, args.model_name_or_path.split("/"))).pop(),
        str(args.max_seq_length),
        str(args.task_name),
    ),
)
if args.do_predict:
    cached_features_file = os.path.join(
    args.data_dir,
    "cached_{}_{}_{}".format(
        "dev" if evaluate else "train",
        str(args.max_seq_length),
        str(task),
    ),
)

In [15]:
label_list = processor.get_labels()

In [16]:
label_list

['0', '1']

In [17]:
processor

<transformers.data.processors.glue.DnaPromProcessor at 0x7f271f107da0>

In [18]:
examples = (
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )

In [19]:
examples

[{
   "guid": "train-1",
   "label": "0.02",
   "text_a": "CACAGC ACAGCC CAGCCA AGCCAG GCCAGC CCAGCC CAGCCA AGCCAC GCCACT CCACTA CACTAG ACTAGG CTAGGC TAGGCC AGGCCC GGCCCT GCCCTG CCCTGT CCTGTC CTGTCC TGTCCT GTCCTG TCCTGC CCTGCA CTGCAG TGCAGC GCAGCC CAGCCC AGCCCC GCCCCC CCCCCT CCCCTG CCCTGT CCTGTA CTGTAG TGTAGG GTAGGG TAGGGG AGGGGT GGGGTC GGGTCT GGTCTG GTCTGG TCTGGA CTGGAA TGGAAC GGAACA GAACAG AACAGC ACAGCC CAGCCA AGCCAG GCCAGG CCAGGA CAGGAG AGGAGT GGAGTG GAGTGG AGTGGT GTGGTT TGGTTT GGTTTA GTTTAA TTTAAG TTAAGA TAAGAG AAGAGG AGAGGC GAGGCA AGGCAG GGCAGG GCAGGG CAGGGG AGGGGA GGGGAG GGGAGT GGAGTC GAGTCG AGTCGC GTCGCC TCGCCT CGCCTT GCCTTG CCTTGC CTTGCC TTGCCC TGCCCT GCCCTG CCCTGT CCTGTG CTGTGC TGTGCC GTGCCA TGCCAC GCCACA CCACAC",
   "text_b": null
 },
 {
   "guid": "train-2",
   "label": "2.34",
   "text_a": "CTAATC TAATCT AATCTA ATCTAG TCTAGT CTAGTA TAGTAA AGTAAT GTAATG TAATGC AATGCC ATGCCG TGCCGC GCCGCG CCGCGT CGCGTT GCGTTG CGTTGG GTTGGT TTGGTG TGGTGG GGTGGA GTGGAA TGGAAA GGAAAG GAAAGA AAAG

In [20]:
max_length = args.max_seq_length
pad_on_left = bool(args.model_type in ["xlnet"])
pad_token = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
pad_token_segment_id = 4 if args.model_type in ["xlnet"] else 0

In [21]:
features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=max_length,
            output_mode=output_mode,
            pad_on_left=pad_on_left,  # pad on the left for xlnet
            pad_token=pad_token,
            pad_token_segment_id=pad_token_segment_id,)

In [22]:
features[2].label

2.34

In [23]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
if output_mode == "classification":
#         all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
elif output_mode == "regression":
    all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

In [24]:
dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

In [28]:
c=0
for x in dataset:
    print(x[3])
    c+=1
    if c==20:
        break

tensor(0.0200)
tensor(2.3400)
tensor(2.3400)
tensor(2.3400)
tensor(2.3400)
tensor(0.0200)
tensor(2.3400)
tensor(2.3400)
tensor(2.3400)
tensor(0.0200)
tensor(2.3400)
tensor(2.3400)
tensor(0.0200)
tensor(0.0200)
tensor(2.3400)
tensor(2.3400)
tensor(0.0200)
tensor(0.0200)
tensor(0.0200)
tensor(2.3400)


In [29]:
x

(tensor([   2,  218,  858, 3418, 1371, 1375, 1389, 1447, 1679, 2606, 2220,  675,
         2687, 2541, 1959, 3726, 2602, 2203,  605, 2407, 1421, 1573, 2182,  524,
         2081,  119,  463, 1839, 3246,  683, 2719, 2669, 2472, 1683, 2624, 2291,
          959, 3823, 2990, 3755, 2718, 2665, 2455, 1614, 2348, 1188,  643, 2557,
         2024, 3985, 3640, 2257,  823, 3278,  809, 3224,  595, 2365, 1254,  907,
         3613, 2149,  392, 1555, 2111,  237,  936, 3731, 2622, 2284,  932, 3715,
         2557, 2021, 3973, 3591, 2063,   45,  168,  657, 2616, 2260,  835, 3328,
         1010, 4027, 3807, 2926, 3498, 1690, 2649, 2389, 1352, 1300, 1090,  251,
          991,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [21]:
args.n_gpu=2
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

if args.max_steps > 0:
    t_total = args.max_steps
    args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": args.weight_decay,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]

warmup_steps = args.warmup_steps if args.warmup_percent == 0 else int(args.warmup_percent*t_total)

optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(args.beta1,args.beta2))
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
)

# Check if saved optimizer or scheduler states exist
if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
    os.path.join(args.model_name_or_path, "scheduler.pt")
):
    # Load in optimizer and scheduler states
    optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
    scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

if args.fp16:
    try:
        from apex import amp
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
    )

In [24]:
train_iterator = trange(
    0, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
)
set_seed(args)  # Added here for reproductibility

best_auc = 0
last_auc = 0
stop_count = 0

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])


Iteration:   0%|          | 0/2023 [00:00<?, ?it/s][A

In [27]:
for step, batch in enumerate(epoch_iterator):
    break

Iteration:   0%|          | 0/2023 [00:09<?, ?it/s]


In [28]:
batch

[tensor([[   2, 2301,  999,  ...,    0,    0,    0],
         [   2,  175,  685,  ...,    0,    0,    0],
         [   2,  569, 2263,  ...,    0,    0,    0],
         ...,
         [   2,  167,  655,  ...,    0,    0,    0],
         [   2, 1579, 2205,  ...,    0,    0,    0],
         [   2, 1405, 1510,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([0.0200, 0.0200, 0.9800, 0.9800, 0.0200, 0.9800, 0.0200, 0.9800, 0.9800,
         0.9800, 0.0200, 0.0200, 0.0200, 0.9800, 0.9800, 0.0200])]

In [37]:
args.train_batch_size = args.per_gpu_train_batch_size * max(1, 2)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

In [38]:
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1

In [39]:
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": args.weight_decay,
    },
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]

In [40]:
warmup_steps = args.warmup_steps if args.warmup_percent == 0 else int(args.warmup_percent*t_total)

optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(args.beta1,args.beta2))
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
)

In [41]:
global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])

In [42]:
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)


train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
)

Epoch: 0it [00:00, ?it/s]

In [43]:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])


Iteration:   0%|          | 0/2023 [00:00<?, ?it/s][A

In [50]:
for step, batch in enumerate(epoch_iterator):
    break

In [51]:
batch[0][0]

tensor([   2, 1895, 3471, 1582, 2217,  661, 2629, 2310, 1034,   25,   88,  339,
        1343, 1262,  939, 3743, 2670, 2473, 1685, 2629, 2311, 1037,   39,  142,
         553, 2198,  588, 2339, 1151,  494, 1964, 3747, 2685, 2534, 1930, 3609,
        2135,  335, 1325, 1192,  657, 2616, 2260,  835, 3326, 1004, 4003, 3711,
        2541, 1959, 3727, 2605, 2216,  660, 2628, 2308, 1027, 4094, 4073, 3991,
        3662, 2347, 1182,  618, 2458, 1625, 2390, 1356, 1316, 1156,  515, 2048,
        4084, 4034, 3834, 3034, 3929, 3415, 1358, 1324, 1185,  632, 2513, 1845,
        3269,  773, 3080,   18,   60,  228,  899, 3582, 2025, 3990, 3658, 2331,
        1117,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

In [52]:
batch[1][0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [53]:
batch[2][0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [54]:
batch[3].view(-1)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [49]:
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}

In [27]:
outputs = model(**inputs)

In [28]:
outputs[1]

tensor([[-0.3907, -0.2003],
        [ 0.1935, -0.0373],
        [-0.2498, -0.3836],
        [-0.0074, -0.3486],
        [-0.2061, -0.4780],
        [-0.0176, -0.1627],
        [ 0.3110, -0.1996],
        [-0.0390, -0.0384],
        [ 0.0604,  0.0103],
        [-0.0195,  0.0687],
        [ 0.0381, -0.3179],
        [ 0.4512,  0.0862],
        [ 0.2045, -0.3010],
        [ 0.0569, -0.1780],
        [ 0.1077, -0.2808],
        [-0.2997,  0.0672]], grad_fn=<AddmmBackward>)

In [29]:
sm = torch.nn.Softmax(dim=1)

In [34]:
sm(outputs[1])

tensor([[0.4525, 0.5475],
        [0.5574, 0.4426],
        [0.5334, 0.4666],
        [0.5845, 0.4155],
        [0.5676, 0.4324],
        [0.5362, 0.4638],
        [0.6249, 0.3751],
        [0.4999, 0.5001],
        [0.5125, 0.4875],
        [0.4780, 0.5220],
        [0.5881, 0.4119],
        [0.5902, 0.4098],
        [0.6238, 0.3762],
        [0.5585, 0.4415],
        [0.5959, 0.4041],
        [0.4093, 0.5907]], grad_fn=<SoftmaxBackward>)

In [31]:
from torch import nn

In [96]:
in1=outputs[1].view(-1, num_labels)

In [117]:
# out1=batch[3].view(-1)
out1=torch.tensor([0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0])

In [142]:
loss_fct = CrossEntropyLoss()
loss = loss_fct(in1,out1)

In [145]:
labels

NameError: name 'labels' is not defined

In [138]:
torch.mean(-(1-out1)*torch.log(in2[:,0]+1e-10)-(out1)*torch.log(in2[:,1]+1e-10))

tensor(0.6982, grad_fn=<MeanBackward0>)

In [128]:
in2=torch.nn.Softmax(dim=1)(in1)
in2

tensor([[0.5373, 0.4627],
        [0.5320, 0.4680],
        [0.4909, 0.5091],
        [0.4703, 0.5297],
        [0.5104, 0.4896],
        [0.5144, 0.4856],
        [0.4700, 0.5300],
        [0.5361, 0.4639],
        [0.4370, 0.5630],
        [0.5230, 0.4770],
        [0.4976, 0.5024],
        [0.4127, 0.5873],
        [0.5222, 0.4778],
        [0.5652, 0.4348],
        [0.5377, 0.4623],
        [0.3710, 0.6290]], grad_fn=<SoftmaxBackward>)

In [133]:
in2[:,1]

tensor([0.4627, 0.4680, 0.5091, 0.5297, 0.4896, 0.4856, 0.5300, 0.4639, 0.5630,
        0.4770, 0.5024, 0.5873, 0.4778, 0.4348, 0.4623, 0.6290],
       grad_fn=<SelectBackward>)

In [131]:
outputs

(tensor(0.7263, grad_fn=<NllLossBackward>),
 tensor([[ 8.0712e-02, -6.8817e-02],
         [ 9.4218e-03, -1.1881e-01],
         [ 5.9929e-02,  9.6341e-02],
         [ 1.3406e-01,  2.5283e-01],
         [ 1.4132e-01,  9.9617e-02],
         [ 1.7507e-01,  1.1730e-01],
         [-1.2221e-01, -2.2399e-03],
         [ 2.4637e-01,  1.0175e-01],
         [ 1.4240e-01,  3.9558e-01],
         [ 9.2061e-02, -4.7039e-05],
         [-3.7199e-04,  9.3512e-03],
         [ 3.3749e-02,  3.8644e-01],
         [ 1.8809e-01,  9.9111e-02],
         [ 1.4139e-01, -1.2084e-01],
         [ 6.7679e-02, -8.3590e-02],
         [-2.3591e-01,  2.9191e-01]], grad_fn=<AddmmBackward>))

In [105]:
loss

tensor(0.7263, grad_fn=<NllLossBackward>)

In [45]:
inputs['labels']=torch.tensor([0.4627, 0.4680, 0.5091, 0.5297, 0.4896, 0.4856, 0.5300, 0.4639, 0.5630,0.4770, 0.5024, 0.5873, 0.4778, 0.4348, 0.4623, 0.6290])
inputs['labels']

tensor([0.4627, 0.4680, 0.5091, 0.5297, 0.4896, 0.4856, 0.5300, 0.4639, 0.5630,
        0.4770, 0.5024, 0.5873, 0.4778, 0.4348, 0.4623, 0.6290])

In [38]:
sm = torch.nn.Softmax(dim=1)

In [41]:
sm(outputs[1])[:,1]

tensor([0.4627, 0.4680, 0.5091, 0.5297, 0.4896, 0.4856, 0.5300, 0.4639, 0.5630,
        0.4770, 0.5024, 0.5873, 0.4778, 0.4348, 0.4623, 0.6290],
       grad_fn=<SelectBackward>)

In [47]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4101, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [59]:
from torch.nn import CrossEntropyLoss, MSELoss
loss = CrossEntropyLoss()

In [52]:
loss_fct(
    torch.tensor([0.8, 0.2, 0.6, 0.4]),
    torch.tensor([1, 0, 1, 0])

)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [53]:
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)

In [54]:
input

tensor([[-0.8019, -0.7606,  0.1775,  1.2971, -1.0675],
        [ 1.7152, -1.4885,  0.1433, -0.7590, -0.0808],
        [-0.0602, -0.2995, -0.4844, -0.9050, -0.1346]], requires_grad=True)

In [63]:
torch.randn(3, 5).softmax(dim=1)

tensor([[0.2720, 0.2544, 0.1930, 0.1541, 0.1265],
        [0.0282, 0.1042, 0.6981, 0.0518, 0.1177],
        [0.0465, 0.4829, 0.1023, 0.2369, 0.1314]])

In [68]:
inn= torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)

tensor([[-1.1311,  0.3460,  0.9342, -0.0816,  1.3712],
        [ 2.2268,  0.2388,  0.0550, -0.0937, -0.2281],
        [-0.5459,  1.0565,  1.8346,  1.0831, -0.4669]], requires_grad=True)

In [70]:
output = loss(inn, torch.tensor([2,0,2]))
output

tensor(0.7977, grad_fn=<NllLossBackward>)

In [69]:
inn.softmax(dim=1)

tensor([[0.0353, 0.1546, 0.2784, 0.1008, 0.4309],
        [0.6968, 0.0954, 0.0794, 0.0684, 0.0598],
        [0.0436, 0.2163, 0.4709, 0.2221, 0.0471]], grad_fn=<SoftmaxBackward>)

In [74]:
output = loss(inn, torch.tensor([0,4,3]))
output

tensor(2.5549, grad_fn=<NllLossBackward>)

In [91]:
inn

tensor([[-1.1311,  0.3460,  0.9342, -0.0816,  1.3712],
        [ 2.2268,  0.2388,  0.0550, -0.0937, -0.2281],
        [-0.5459,  1.0565,  1.8346,  1.0831, -0.4669]], requires_grad=True)