In [1]:
import torch
import numpy as np
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

In [2]:
from datasets import load_dataset

# Download and cache the dataset
raw_datasets = load_dataset("glue", "mrpc")

Found cached dataset glue (/home/mist/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
raw_train_dataset = raw_datasets["train"]

In [4]:
checkpoint = 'bert-base-uncased'

In [5]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [6]:
def tokenize_function(example):
    """
    This function takes a dictionary (like the items of our dataset) and returns a new dictionary 
    with the keys input_ids, attention_mask, and token_type_ids. 
        - Note that it also works if the example dictionary contains several samples 
          (each key as a list of sentences) since the tokenizer works on lists 
          of pairs of sentences, as seen before. 
    
    This will allow us to use the option batched=True in our call to map(), 
    which will greatly speed up the tokenization. 
        - The tokenizer is backed by a tokenizer written in Rust from the 
          🤗 Tokenizers library. 
        - This tokenizer can be very fast, but only if we give it lots of 
          inputs at once.
    
    Note that we’ve left the padding argument out in our tokenization function for now. 
        - This is because padding all the samples to the maximum length is not efficient
        - It’s better to pad the samples when we’re building a batch, as then we only need to 
          pad to the maximum length in that batch, and not the maximum length in the entire dataset. 
        - This can save a lot of time and processing power when the inputs have variable lengths.
    
    Args:
        example (Dict): A dictionary containing the items of the dataset.
    
    Returns:
        A new dictionary with the keys input_ids, attention_mask, and token_type_ids.
    
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# Here is how we apply the tokenization function on all our datasets at once. 
# 
# We’re using batched=True in our call to map so the function is applied to 
# multiple elements of our dataset at once, and not on each element separately. 
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Loading cached processed dataset at /home/mist/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-6fd6978f4d0e1f7f.arrow
Loading cached processed dataset at /home/mist/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-e21445cecc3a2248.arrow
Loading cached processed dataset at /home/mist/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-74cc38742f918009.arrow


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [7]:
#  Remove the columns corresponding to values the model does not expect
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])

# Rename the column label to labels
#      - Because the model expects the argument to be named `labels`
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Set the format of the datasets so they return PyTorch tensors
tokenized_datasets.set_format("torch")

In [8]:
import random
l_len = len(tokenized_datasets["train"])
random.seed(1234)
line_r = [random.randint(0,l_len-1) for _ in range(int(l_len*0.01))]
tokenized_datasets["train"] = [tokenized_datasets["train"][i] for i in line_r]

In [9]:
import random
l_len = len(tokenized_datasets["validation"])
random.seed(1234)
line_r = [random.randint(0,l_len-1) for _ in range(int(l_len*0.01))]
tokenized_datasets["validation"] = [tokenized_datasets["validation"][i] for i in line_r]

In [14]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [10]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=1, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=1, collate_fn=data_collator
)

In [11]:
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='tf')

In [12]:
for batch in train_dataloader:
    for k,v in batch.items(): print(f"\t{k:<15} --> {v.shape}")
    break

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


	labels          --> (1,)
	input_ids       --> (1, 58)
	token_type_ids  --> (1, 58)
	attention_mask  --> (1, 58)


2022-10-14 15:41:50.374502: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-14 15:41:52.336235: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30527 MB memory:  -> device: 0, name: Tesla PG500-216, pci bus id: 0000:83:00.0, compute capability: 7.0


In [13]:
batch

{'labels': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>, 'input_ids': <tf.Tensor: shape=(1, 58), dtype=int32, numpy=
array([[  101,  1996,  6745,  5008,  5799,  2000,  1996, 11867,  9910,
         2001,  1037,  2281,  2340,  5008,  2012,  5226,  2430,  4732,
         2082,  1999, 15578,  5753,  1010,  2055,  1017, 22287,  2013,
         1996, 10846,  1012,   102,  2178,  5008,  5799,  2000,  1996,
        11867,  9910,  4158, 13292,  1012,  2340,  2012,  5226,  2430,
         4732,  1999, 15578,  5753,  1010,  2055,  2048,  2661,  2013,
         1996, 10846,  1012,   102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 58), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 58), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [14]:
%pwd

'/home/mist/My Code/modeling'

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
from typing import Optional
from sklearn import metrics
from torch.nn.utils.rnn import pad_sequence
import tensorflow as tf

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from transformers.file_utils import ModelOutput
from transformers import (
    AutoModelForSequenceClassification,
    BertModel,
    BertPreTrainedModel,
    )

from connected_shapley import explain_shapley, construct_positions_connectedshapley
from continuity1 import  onehat_pre,continuity,rl_cd_ss
from logical_fluency import newset

from transformers import AutoConfig

config = AutoConfig.from_pretrained(
   'bert-base-uncased',
    num_labels=2,

)

def compute_metrics_fn(eval_prediction, tokenizer=None, return_text=False):
    # full_logits = eval_prediction.predictions[0]
    masked_logits = eval_prediction.predictions[0]
    mask = eval_prediction.predictions[1]
    labels = eval_prediction.predictions[2]
    mask_labels = eval_prediction.predictions[3]
    mask_labels = np.maximum(mask_labels, 0)

    s_loss = eval_prediction.predictions[5]
    cph_loss = eval_prediction.predictions[6]
    cpt_loss = eval_prediction.predictions[7]
    lf_loss = eval_prediction.predictions[8]

    # full_preds = np.argmax(full_logits, axis=1)
    masked_preds = np.argmax(masked_logits, axis=1)
    is_masked = np.greater(mask, 0.5).astype(np.int)

    results = {}
    # results["full_acc"] = metrics.accuracy_score(labels, full_preds)
    results["masked_acc"] = metrics.accuracy_score(labels, masked_preds)
    results["mask_f1"] = metrics.f1_score(mask_labels, is_masked, average="micro", zero_division=1)
    results["mask_recall"] = metrics.recall_score(mask_labels, is_masked, average="micro", zero_division=1)
    results["mask_precision"] = metrics.precision_score(mask_labels, is_masked, average="micro", zero_division=1)

    results["suffiency_loss"] = s_loss.mean()
    results["comprehensiveness_loss"] = cph_loss.mean()
    results["compactness_loss"] = cpt_loss.mean()
    results["logical_fluence_loss"] = lf_loss.mean()

    if return_text:
        input_ids = eval_prediction.predictions[4]
        examples = []
        for i in range(len(input_ids)):
            import pdb; pdb.set_trace()
            row = tokenizer.convert_ids_to_tokens(input_ids[i])
            original = []
            masked = []
            for j in range(len(row)):
                if row[j] in ["[CLS]", "[SEP]", "<pad>"]:
                    continue
                original.append(row[j])
                if is_masked[i][j]:
                    masked.append("▁<mask>")
                else:
                    masked.append(row[j])
            original = tokenizer.convert_tokens_to_string(original)
            masked = tokenizer.convert_tokens_to_string(masked)
            combined = f"{original} OLD: {masked}"
            examples.append(combined)
        results["text"] = examples

    return results


class MaskOutput(ModelOutput):
    rationale: torch.FloatTensor
    noise: torch.FloatTensor
    r_ind: torch.FloatTensor



class ClassifyOutput(ModelOutput):
    logits: torch.FloatTensor
    loss: torch.FloatTensor


class TokenTaggingRationaleOutput(ModelOutput):
    loss: torch.FloatTensor
    # full_logits: torch.FloatTensor
    masked_logits: torch.FloatTensor
    mask: torch.FloatTensor
    labels: Optional[torch.FloatTensor] = None
    mask_labels: Optional[torch.FloatTensor] = None
    # is_unsupervised: Optional[torch.FloatTensor] = None
    input_ids: Optional[torch.LongTensor] = None
    s_loss: torch.FloatTensor = None
    cph_loss:torch.FloatTensor = None
    cpt_loss:torch.FloatTensor = None
    lf_loss:torch.FloatTensor = None


class BertForTokenRationale(BertPreTrainedModel):

    def __init__(self, config):
        super(BertForTokenRationale, self).__init__(config)
        self.config = config
        self.num_labels = 2
        # self.nei_index = config.nei_index
        self.mask_token_id = tokenizer.mask_token_id
        self.bert = BertModel(config).to(device)
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # # nn.Dropout: 为了防止或减轻过拟合而使用的函数
        # self.masker = nn.Linear(config.hidden_size, 2)
        # # 是用来设置网络中的全连接层的，而在全连接层中的输入与输出都是二维张量，一般形状为[batch_size, size]


        # self.sufficiency_weight = config.sufficiency_weight # sufficiency 好像不需要weight
        self.config.cph_weight = 0.3
        self.config.cpt_weight = 0.2
        self.config.lf_weight = 0.2
        # self.batch_size = config.per_device_train_batch_size

        # self.train_size=config.per_device_train_batch_size
        # self.max_seq_length = config.max_seq_length

        # shapley
        # self.max_order = config.max_order
        self.num_neighbors = 2
        self.top_percentage = 0.2
        self.con_count =3

        self.cph_margin =0.05
        self.lf_seed=1234
        self.lf_num = 1


        self.init_weights()
        self.classifier = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english').to(device)
        for p in self.classifier.parameters():
            p.requires_grad = False

    def mask(
        self,
        input_ids=None,
        attention_mask=None,
        # token_type_ids=None,
        # position_ids=None,
        evidence_mask=None,
        labels=None,
    ):
        # print(type(attention_mask))
        # print(attention_mask)
        d = tf.reduce_sum(attention_mask, 1).numpy().tolist()
        # attention_mask.tolist().sum(axis=1)
        rationale_weights = []
        noise_weights = []
        r_d = []
        for i in range(len(input_ids)):
            positions_dict, key_to_idx, positions, coefficients, unique_inverse = construct_positions_connectedshapley(int(d[i]),k=2)


            mask_weights = torch.Tensor(positions).unsqueeze(0).to(device)
            mask_ids = torch.Tensor(input_ids[i].numpy())[:d[i]].clone().fill_(self.mask_token_id).to(device)
            input_id = torch.Tensor(input_ids[i].numpy())[:d[i]].to(device)
            mix_inputs = (1-mask_weights) * mask_ids +  mask_weights * input_id
            mix_inputs = mix_inputs.long().squeeze().to(device)
            
            
            score = explain_shapley(self.bert,int(d[i]), self.num_neighbors, key_to_idx,mix_inputs, input_id.long(),
                                    coefficients, unique_inverse)

            # score = torch.tensor(np.array(score))
            r_len = math.ceil(int(d[i])*self.top_percentage)
            # score_ind_order = torch.sort(score).indices
            # score_val_order = torch.sort(score).values

            one_hot = [onehat_pre(score,r_len)]
            ra_can = continuity(score, one_hot, self.con_count)
            s_d_r = rl_cd_ss(ra_can,score)

            max_sdr_ind = s_d_r.index(max(s_d_r))


            score_ind_order_d = torch.sort(torch.Tensor(np.array(score)),descending=False).indices[:r_len]

            nw = torch.Tensor([1 if i in score_ind_order_d else 0 for i in range(int(d[i]))]).to(device)
            nw = pad_sequence([nw,input_id]).T[0].tolist()

            noise_weights.append(nw)


            rw = torch.tensor(ra_can[max_sdr_ind]).to(device)
            rw = pad_sequence([rw,input_id]).T[0].tolist()
            rationale_weights.append(rw)
            r_d.append([i for i, x in enumerate(ra_can[max_sdr_ind]) if x == 1])

        rationale_weights = torch.Tensor(rationale_weights)
        noise_weights = torch.Tensor(noise_weights)


        rationale_weights = F.gumbel_softmax(rationale_weights, tau=0.1)
        noise_weights = F.gumbel_softmax(noise_weights, tau=0.1)

        return MaskOutput(
            rationale=rationale_weights,
            noise = noise_weights,
            r_ind=r_d,

        )

    def classify(
        self,
        input_ids=None,
        mask_weights=None,
        attention_mask=None,
        # token_type_ids=None,
        # position_ids=None,
        labels=None,
    ):
        def embed(ids):
            embeds = self.classifier.get_input_embeddings()(ids)
            return embeds

        # Embed inputs.
        input_ids = input_ids.long().to(device)
        input_embeds = embed(input_ids)

        # Targeted mask.
        mask_ids = input_ids.clone().fill_(self.mask_token_id).to(device)
        mask_embeds = embed(mask_ids).to(device)
        mask_weights = mask_weights.unsqueeze(-1).to(device)

        # Mix embeddings.
        # 我们直接用0mask还是像这样用 additive embedding呢
        mix_embeds = (1-mask_weights) * mask_embeds +  mask_weights * input_embeds
        
        # print(type(mix_embeds))
        # print(type(attention_mask))
        attention_mask = torch.Tensor(attention_mask.numpy()).long().to(device)
        labels= torch.Tensor(labels.numpy()).long().to(device)
        # Run model.
        outputs = self.classifier(
            inputs_embeds=mix_embeds,
            # attention_mask=attention_mask,
            # token_type_ids=token_type_ids,
            # position_ids=position_ids,
            return_dict=True
        )
        logits = outputs.logits

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(reduction="none")
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return ClassifyOutput(
            logits=logits,
            loss=loss,
        )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        # token_type_ids=None,
        # position_ids=None,
        evidence_mask=None,
        labels=None,
        mask_labels=None,
        # is_unsupervised=None,
        **kwargs,
    ):

        # Encode the inputs and predict token-level masking scores.
        mask_output = self.mask(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # token_type_ids=token_type_ids,
            # position_ids=position_ids,
            evidence_mask=evidence_mask,
            labels=None)
        rationale_weights = mask_output.rationale
        noise_weights = mask_output.noise
        rationale_index = mask_output.r_ind


        # Get the output with targeted masking.
        masked_cls_output = self.classify(
            input_ids=torch.Tensor(input_ids.numpy()),
            mask_weights=rationale_weights,
            attention_mask=attention_mask,
            # token_type_ids=token_type_ids,
            # position_ids=position_ids,
            labels=labels)
        # Suffiency
        suffiency_loss = masked_cls_output.loss if labels is not None else 0

        noise_cls_output = self.classify(
            input_ids=torch.Tensor(input_ids.numpy()),
            mask_weights=noise_weights,
            attention_mask=attention_mask,
            # token_type_ids=token_type_ids,
            # position_ids=position_ids,
            labels=labels)
        # Noise
        noise_loss = noise_cls_output.loss if labels is not None else 0

        # Comprehensive
        cph_loss = []
        for i in range(len(suffiency_loss)):
          cph_loss.append(max(suffiency_loss[i]-noise_loss[i]+self.cph_margin,0+1e-12))
        cph_loss = torch.Tensor(cph_loss).to(device)

        # Compute (soft) number of masked tokens
        mask_w = mask_output.rationale.sum(dim=1).tolist()
        t = tf.reduce_sum(attention_mask, 1).numpy().tolist()
        cpt_loss = []
        for i in range(len(input_ids)):
            total = t[i]
            num_masked = len(rationale_index[i])
            cpt_loss.append(max((num_masked/total)-self.top_percentage,0+1e-12))
        # Compactness
        cpt_loss = torch.Tensor(cpt_loss).to(device)

        # logical fluency
        one_hot_o = rationale_weights.tolist()
        new_tokens = []
        for i in range(len(one_hot_o)):
            for j in range(self.lf_num):
                new_order = newset(rationale_index[i],one_hot_o[i],self.lf_seed,self.lf_num)[j]
                # print(new_order)
            new_tokens.append([input_ids[i].numpy()[k] for k in new_order])
        new_tokens = torch.Tensor(new_tokens).to(device)
        lf_cls_output = self.classify(
            input_ids=new_tokens,
            mask_weights=new_tokens.clone().fill_(1),
            attention_mask=attention_mask,
            # token_type_ids=token_type_ids,
            # position_ids=position_ids,
            labels=labels)

        full_cls_output = self.classify(
            input_ids=torch.Tensor(input_ids.numpy()),
            mask_weights=torch.Tensor(input_ids.numpy()).clone().fill_(1),
            attention_mask=attention_mask,
            # token_type_ids=token_type_ids,
            # position_ids=position_ids,
            labels=labels)
        # logical fluency
        l_loss = lf_cls_output.loss if labels is not None else 0
        f_loss = full_cls_output.loss if labels is not None else 0
        lf_loss= []
        for i in range(len(l_loss)):
          lf_loss.append(max((l_loss[i]-f_loss[i]),0+1e-12))
        lf_loss = torch.Tensor(lf_loss).to(device)

        # Add loss components.
        loss = ( suffiency_loss +
                self.config.cph_weight * cph_loss +
                self.config.cpt_weight * cpt_loss +
                self.config.lf_weight * lf_loss) 
        loss = loss.mean()
        loss.requires_grad = True

        # print(masked_cls_output.logits.shape)
        # print(mask_output.rationale.shape)
        # print(labels.shape)
        # print(mask_labels)
        # print(mask_labels.shape)
        # print(input_ids.shape)
        return TokenTaggingRationaleOutput(
            loss=loss,
            # full_logits=full_cls_output.logits,
            masked_logits=masked_cls_output.logits,
            rationale=mask_output.rationale,
            labels=labels,
            mask_labels=mask_labels,
            # is_unsupervised=is_unsupervised,
            input_ids=input_ids,
            s_loss=suffiency_loss.mean(),
            cph_loss=cph_loss.mean(),
            cpt_loss=cpt_loss.mean(),
            lf_loss=lf_loss.mean(),
        )

In [16]:

from transformers import AdamW
from transformers import get_scheduler

# Instantiate the model using a checkpoint and define the number of label categories
model = BertForTokenRationale.from_pretrained(checkpoint, num_labels=2)

# Test the model
outputs = model(**batch)
print(f"\n\n\n... MODEL OUTPUT SHAPE : {outputs.masked_logits.shape} ")
print(f"... MODEL OUTPUT LOSS  : {outputs.loss} ")

# Instantiate our optimizer using the defaults
optimizer = AdamW(model.parameters(), lr=5e-5)
print(f"\n\n\n... ADAMW OPTIMIZER OBJECT : {optimizer} ")

# Instantiate our learning rate scheduler using defaults
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(f"\n\n\n... LEARNING RATE SCHEDULER OBJECT   : {lr_scheduler} ")
print(f"... LEARNING RATE SCHEDULE # OF STEPS : {num_training_steps} ")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenRationale: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenRationale from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenRationale from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenRationale were not initialized from the model checkpoint at bert-base-uncased and are newly




... MODEL OUTPUT SHAPE : torch.Size([1, 2]) 
... MODEL OUTPUT LOSS  : 0.6406993269920349 



... ADAMW OPTIMIZER OBJECT : AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    lr: 5e-05
    weight_decay: 0.0
) 



... LEARNING RATE SCHEDULER OBJECT   : <torch.optim.lr_scheduler.LambdaLR object at 0x7fa39c760820> 
... LEARNING RATE SCHEDULE # OF STEPS : 180 




In [17]:
import torch
from tqdm.auto import tqdm

# If GPU available ensure training occurs on it... otherwise fallback to CPU
#      - Get device
#      - Push model to device (CPU or GPU)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Check which device we are using
print(f"\n\n\n... TRAINING WILL OCCUR USING {device} ...\n")

# Define the progress bar for training based on # of steps
progress_bar = tqdm(range(num_training_steps))

# ----------------------------------------------------------
# --------------------- TRAINING LOOP ---------------------
# ----------------------------------------------------------
model.train()
for epoch in range(num_epochs):
    loss_s = []
    for batch in train_dataloader:
        # batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # for name, parms in model.named_parameters():
        #         print('-->name:', name)
        #         # print('-->para:', parms)
        #         print('-->grad_requirs:',parms.requires_grad)
        #         print('-->grad_value:',parms.grad)
        #         print("===")
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        loss_s.append(loss.tolist())
    # print(loss_s)
    print("=============更新之后===========")
    print(sum(loss_s))
    print(np.mean(loss_s))
    
    # for name, parms in model.named_parameters():	
    #     print('-->name:', name)
    #     # print('-->para:', parms)
    #     print('-->grad_requirs:',parms.requires_grad)
    #     print('-->grad_value:',parms.grad)
    print(optimizer)
    print("===")

# ----------------------------------------------------------




... TRAINING WILL OCCUR USING cuda ...



  0%|          | 0/180 [00:00<?, ?it/s]

25.684633135795593
0.7134620315498776
AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    initial_lr: 5e-05
    lr: 4e-05
    weight_decay: 0.0
)
===
24.591809034347534
0.6831058065096537
AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    initial_lr: 5e-05
    lr: 3e-05
    weight_decay: 0.0
)
===
24.801637142896652
0.6889343650804626
AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    initial_lr: 5e-05
    lr: 2e-05
    weight_decay: 0.0
)
===
25.375915974378586
0.7048865548438497
AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    initial_lr: 5e-05
    lr: 1e-05
    weight_decay: 0.0
)
===
24.74106204509735
0.6872517234749265
AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    initial_lr: 5e-05
    lr: 0.0
    weight_decay: 0.0
)
===
