In [1]:
## the HuggingFace libraries are not already installed in Colab
!pip install transformers==4.17 # this will also install tokenizers 
!pip install datasets
!pip install optuna
!pip install ray

Collecting transformers==4.17
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 7.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 64.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 84.6 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 56.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: p

In [2]:
import torch
import torch.nn as nn 
import torch.optim as optim 
import numpy as np 
import matplotlib.pyplot as plt
import nltk
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, roc_curve, auc
from nltk.tokenize import sent_tokenize
import pandas as pd
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer, RobertaForSequenceClassification,RobertaModel, Trainer, TrainingArguments
from transformers import AdamW, get_linear_schedule_with_warmup
import datasets
from ray.tune.schedulers import PopulationBasedTraining,ASHAScheduler
from ray import tune
from ray.tune import uniform, CLIReporter
import optuna
from random import randint
from sklearn.metrics import accuracy_score
from ray.tune.stopper import TrialPlateauStopper
import os

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


Load Data

In [3]:
data = datasets.load_dataset("tweet_eval", "emotion")

Downloading builder script:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Downloading and preparing dataset tweet_eval/emotion (download: 472.47 KiB, generated: 511.52 KiB, post-processed: Unknown size, total: 984.00 KiB) to /root/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343...


Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/134k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183 [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

Dataset tweet_eval downloaded and prepared to /root/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
### number of labels
num_labels = len(np.unique(data['train']['label']))
print (num_labels)

4


Hyperparameters for model

In [5]:
# Hyper-parameters
pretrained_model_name = "roberta-base" # we can change this, and play with any model from HuggingFace! 
# BERT-base: "bert-base-cased" #Sensitive to lower and upper case
# BERT-large: "bert-large-cased"
# RoBERTa-base: "roberta-base"
# RoBERTa-large: "roberta-large"
# etc ... 
max_length = 50
batch_size = 32
d_in = 768 # bert-base / roberta-base
d_h = 512 # 512 
d_out = num_labels # 2 classes so 1 is enough
freeze_pretrained = False

Tokenizer

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name,
    cache_dir = "."
)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Hyperparameters

In [7]:
# Hyper-parameters
n_epochs = 10
lr = 3e-5
warmup = 0.06
max_gradient_norm = 10
patience = 3

Fine-tune model class

In [8]:
class FinetuneModel(nn.Module):
    """Pretrained Model + classifier
    """
    def __init__(self, pretrained_model, d_in, d_h, d_out, dropout, freeze_pretrained=False):
        super(FinetuneModel, self).__init__()

        # Instantiate BERT model
        self.pretrained_model = pretrained_model

        # Instantiate a 2-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(d_in, d_h),
            nn.ReLU(),
            nn.Dropout(float(dropout)),
            nn.Linear(d_h, d_out)
        )

        # Freeze the encoder model
        if freeze_pretrained:
            for param in self.pretrained_model.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        # Feed input to pretrained model
        pre_outputs = self.pretrained_model(input_ids = input_ids,attention_mask = attention_mask)
        # print (pre_outputs[0].shape,pre_outputs[1].shape)
        # print (pre_outputs.keys())
        hidden_state = pre_outputs[0]
        hidden_state = hidden_state[:,0,:]
        logits = self.classifier(hidden_state)
        # print(outputs.keys()) # ["last_hidden_state", "pooler_output", "hidden_states", "attention"] last hidden state 才是最后的hidden
        # print (outputs[0].shape)
        # Extract the last hidden state
        # last_hidden = outputs["last_hidden_state"] # shape: (batch_size, sequence_length, d_in)
        
        # # Use the `[CLS]` token of the last hidden state for classification task
        # last_hidden_state_cls = last_hidden[:, 0, :] # shape: (batch_size, d_in)

        # Feed input to classifier to compute logits
        # logits = self.classifier(outputs[0]) # shape: (batch_size, 1)

        return logits

Dataset class

In [9]:
class EmotionsDataset():

    def __init__(self, tokenizer, df,max_length):
        super(EmotionsDataset, self).__init__()

        self.tokenizer = tokenizer
        self.max_seq_len = max_length
        self.input_ids, self.attention_mask,self.token_type_ids,self.label = self.get_input(df)
        

    def __len__(self):
        return len(self.label)
    
    def trunate_and_pad(self, tokens_seq):
        
        # Concat '[CLS]' at the beginning
        tokens_seq = ['[CLS]'] + tokens_seq     
        # Truncate sequences of which the lengths exceed the max_seq_len
        if len(tokens_seq) > self.max_seq_len:
            tokens_seq = tokens_seq[0 : self.max_seq_len]           
        # Generate padding
        padding = [0] * (self.max_seq_len - len(tokens_seq))       
        # Convert tokens_seq to token_ids
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens_seq) # convert to id
        input_ids += padding   # add padding
        # Create attention_mask
        attention_mask = [1] * len(tokens_seq) + padding     
        # Create token_type_ids
        token_type_ids = [0] * (self.max_seq_len) # for identifying next sentence prediction
        
        assert len(input_ids) == self.max_seq_len
        assert len(attention_mask) == self.max_seq_len
        assert len(token_type_ids) == self.max_seq_len
        
        return input_ids, attention_mask, token_type_ids 

    def get_input(self,df):          
        tweet = df['text'].values
        label = df['label'].values
        tokens_seq = list(map(self.tokenizer.tokenize,tweet))
        result = list(map(self.trunate_and_pad, tokens_seq))
        input_ids = [i[0] for i in result]
        attention_mask = [i[1] for i in result]
        token_type_ids = [i[2] for i in result]

        return (
               torch.Tensor(input_ids).type(torch.long), 
               torch.Tensor(attention_mask).type(torch.long),
               torch.Tensor(token_type_ids).type(torch.long), 
               torch.Tensor(label).type(torch.long)
               )

    def __getitem__(self, item):
        return self.input_ids[item], self.attention_mask[item],self.token_type_ids[item],self.label[item]

Function to load data

In [10]:
def load_data(data_dir = None):
    if load_data  == None:
        datasets.load_dataset('tweet_eval','emotion')
    data_train = datasets.Dataset.to_pandas(data["train"])
    data_val = datasets.Dataset.to_pandas(data["validation"])
    data_test = datasets.Dataset.to_pandas(data["test"])
    ## Emotion dataset
    train_dataset = EmotionsDataset(tokenizer, data_train, max_length)
    val_dataset = EmotionsDataset(tokenizer, data_val, max_length)
    test_dataset = EmotionsDataset(tokenizer, data_test, max_length)
    ## dataloader
    # train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True) # shuffle the training set
    # val_loader = torch.utils.data.DataLoader(val_dataset, batch_size = batch_size, shuffle = False)
    # test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = batch_size, shuffle = False)
    return train_dataset,val_dataset,test_dataset

Config 

In [11]:
### config_model
config_model = AutoConfig.from_pretrained(
    pretrained_model_name, 
    output_hidden_states = True,
    output_attention = False
)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Train function

In [12]:
def set_seed(seed_value = 0):
    """Set seed for reproducibility.
    """
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(train_loader, model, optimizer, scheduler,loss_fct):
    model.train()
    train_losses = []
    correct_preds = 0
    pred_class = []
    actual_class = []
    tqdm_loader = tqdm(train_loader)
    for batch_index, (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in enumerate (tqdm_loader):
        model.zero_grad()
  

        input_ids = batch_seqs
        attention_mask = batch_seq_masks
        segments = batch_seq_segments
        labels = batch_labels

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        segments = segments.to(device)
        labels = labels.to(device)

        logits = model(input_ids, attention_mask) # forward pass (batch_size, 1)
        loss = loss_fct(logits,labels)
        probs = nn.functional.softmax(logits,dim = -1)
        _,out_classes = torch.max(logits,dim = 1)
        ## for weighted accuracy
        pred_class.extend(out_classes.cpu().numpy())
        actual_class.extend(labels.cpu().numpy())

        correct = (out_classes == labels).sum()
        correct_preds += correct.item()
        train_losses.append(loss.item())

        loss.backward() # backward pass
        ## clip grad
        nn.utils.clip_grad_norm_(model.parameters(), max_gradient_norm)
        optimizer.step() # weights update
        optimizer.zero_grad()
        
    train_loss = np.mean(train_losses)
    epoch_accuracy = correct_preds /len(train_loader.dataset)
    # epoch_accuracy = balanced_accuracy_score(actual_class,pred_class)

    return train_loss,epoch_accuracy

from sklearn.metrics import roc_auc_score, f1_score
def eval_multiclass(loader, model,loss_fct):
    model.eval()
    val_losses = []
    val_probs = []
    acc_correct = 0
    iter = 0
    log_interval = 100
    sigmoid = nn.Sigmoid()
    log_interval = 50
    count = 0
    val_labels = []
    val_targets = []
    pred_class = []
    actual_class = []
  
    # threshold = torch.tensor([0.5]).to(device)
    # loader = tqdm(loader)
    for (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in loader:
        input_ids = batch_seqs
        attention_mask = batch_seq_masks
        segments = batch_seq_segments
        labels = batch_labels

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        segments = segments.to(device)
        # print (attention_mask.shape,labels.shape)
        # print (segments.shape)
        
        with torch.no_grad():
            valid_logits = model(input_ids, attention_mask) # raw logits
            # print (f'logits = {valid_logits}')
            # print(f'labels = {labels}')
            valid_loss = loss_fct(valid_logits,labels) # loss
            valid_probs = nn.functional.softmax(valid_logits,dim = -1)
            _,out_classes = torch.max(valid_logits,dim = 1)
            # if count % log_interval == 0:
            #   print (f'labels = {labels}')
            #   print (f'predicted labels = {out_classes}')
            f1_target = torch.argmax(valid_probs,dim= 1)
            # print (f1_target)
            correct = (out_classes == labels).sum()
            acc_correct += correct.item()
            # weighted accuracy

            pred_class.extend(out_classes.cpu().numpy())
            actual_class.extend(labels.cpu().numpy())


            # print (valid_probs.shape)
            val_probs.extend(valid_probs[:,:].cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
            val_losses.append(valid_loss.item())
            val_targets.extend(f1_target.cpu().numpy())
            count +=1
            # print (count)
    val_loss = np.mean(val_losses)
    val_probs = np.array(val_probs)
    val_labels = np.array(val_labels)
    val_targets = np.array(val_targets)
    valid_accuracy = acc_correct/(len(loader.dataset))
    # valid_accuracy = balanced_accuracy_score(actual_class,pred_class)

    
    val_auc = 100* roc_auc_score(val_labels, val_probs, multi_class='ovr')
    val_f1_score = f1_score(val_labels,val_targets,average = 'weighted')

    return val_loss, val_auc, val_targets, val_labels,valid_accuracy,val_f1_score

In [13]:
def train_function(config, checkpoint_dir='fine-tuned_model', data_dir=None):

    # if checkpoint_dir == None:
    #   checkpoint_dir = 'fine-tuned_model'
    ## define model
    model = RobertaModel.from_pretrained("roberta-base",config = config_model)
    model = FinetuneModel(model,d_in,config['d_h'],d_out,config['dropout'],freeze_pretrained) #pretrained_model, d_in, d_h, d_out, freeze_pretrained=False

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
    model.to(device)
    print (f'is GPU available? {torch.cuda.is_available()}')
    print (f'device used: {device}')
    ### Optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
            {
                    'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                    'weight_decay':config['weight_decay']
            },
            {
                    'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                    'weight_decay':0.0
            }
    ]
    # optimizer = AdamW(optimizer_grouped_parameters, lr=config['lr'],betas = (config['beta1'],config['beta2']),eps = config['eps'])
    if config["optimizer"] == "adamW":
      optimizer = AdamW(optimizer_grouped_parameters, lr=config['lr'],betas = (config['beta1'],config['beta2']),eps = config['eps'])
    elif config["optimizer"] == "adam":
      optimizer = optim.Adam(model.parameters(), lr = config['lr'])
    elif config["optimizer"] == "SGD":
      optimizer = optim.SGD(model.parameters(), lr=config["lr"], momentum=config["momentum"])
      
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=config['factor'], patience=config['patience'])
    criterion = nn.CrossEntropyLoss()

    # if checkpoint_dir:
    #     model_state, optimizer_state = torch.load(
    #         os.path.join(checkpoint_dir, "checkpoint"))
    #     model.load_state_dict(model_state)
    #     optimizer.load_state_dict(optimizer_state)

    trainset, valset,_ = load_data(data_dir)


    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=int(config["batch_size"]),
        shuffle=True)
    valloader = torch.utils.data.DataLoader(
        valset,
        batch_size=int(config["batch_size"]),
        shuffle=False)
    ## Training ####
    for epoch in range(config['num_epochs']):  # loop over the dataset multiple times
        train_loss, epoch_accuracy = train(trainloader, model, optimizer, scheduler, criterion)

        # Validation loss
        val_loss, val_auc, val_targets, val_labels,valid_accuracy,val_f1_score = eval_multiclass(valloader, model, criterion)
                
        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            print (f'checkpoint path : {path}')
            torch.save((model.state_dict(), optimizer.state_dict(),scheduler.state_dict()), path)
            print ('Model Saved!')
        tune.report(loss=val_loss, accuracy=valid_accuracy)
        ## scheduler
        scheduler.step(valid_accuracy)
    print("Finished Training")

Test accuracy function for test set

In [14]:
def test_accuracy(model, device="cpu"):
    _,_,testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False)
    criterion = nn.CrossEntropyLoss()
    correct = 0
    total = 0
    test_loss = 0.
    test_steps = 0
    with torch.no_grad():
        for batch_index, (batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels) in enumerate (testloader):
            batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels = batch_seqs.to(device), batch_seq_masks.to(device), batch_seq_segments.to(device), batch_labels.to(device)
            outputs = model(batch_seqs, batch_seq_masks)
            _, predicted = torch.max(outputs.data, 1)
            loss = criterion(outputs,batch_labels)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
            test_loss += loss.cpu().numpy()
            test_steps += 1

    return correct / total, test_loss/test_steps

In [15]:
config = {
    "num_epochs": 10,
    "lr": 1e-5,
    "batch_size": 16,
    "weight_decay": 0.05,
    "beta1" : 0.9,
    "beta2" : 0.995,
    "eps": 1e-8,
    "max_gradient_norm": 8,
    "d_h" : 512,
    "dropout": 0.1,
    "factor": 0.5,
    'patience': 2,
    "optimizer": tune.choice(["adam", "SGD", "adamW"]),
    "momentum": 0.9
}

In [16]:
def main(num_samples=10,max_num_epochs =10, gpus_per_trial=1):
    load_data()
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    
    # ASHA scheduler
    scheduler = ASHAScheduler(
        metric="accuracy",
        mode="max",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    
    result = tune.run(
        train_function,
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter = reporter,
        )

    best_trial = result.get_best_trial("accuracy", "max", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    # best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    # device = "cpu"
    model = RobertaModel.from_pretrained("roberta-base",config = config_model)
    best_trained_model = FinetuneModel(model,d_in,best_trial.config['d_h'],d_out,best_trial.config['dropout'],freeze_pretrained)
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state,scheduler_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc,test_loss = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {} , test loss : {}".format(test_acc,test_loss))
    print (f'scheduler : {scheduler_state.keys()}')



Tuning on original dataset

In [17]:
main(num_samples=10,max_num_epochs=10, gpus_per_trial=1)

2022-04-14 07:49:46,980	INFO logger.py:606 -- pip install "ray[tune]" to see TensorBoard files.


== Status ==
Current time: 2022-04-14 07:49:48 (running for 00:00:01.56)
Memory usage on this node: 2.2/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+----------------------------+----------+----------------+-------------+
| Trial name                 | status   | loc            | optimizer   |
|----------------------------+----------+----------------+-------------|
| train_function_748af_00000 | RUNNING  | 172.28.0.2:322 | adamW       |
| train_function_748af_00001 | PENDING  |                | adamW       |
| train_function_748af_00002 | PENDING  |                | adam        |
| train_function_748af_00003 | PENDING  |                | SGD         |
| train_function_748af_0

[2m[36m(train_function pid=322)[0m Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]
[2m[36m(train_function pid=322)[0m Downloading:   1%|          | 2.94M/478M [00:00<00:16, 30.8MB/s]
Downloading:   1%|▏         | 6.45M/478M [00:00<00:14, 34.3MB/s]
Downloading:   2%|▏         | 10.0M/478M [00:00<00:13, 35.9MB/s]
Downloading:   3%|▎         | 13.9M/478M [00:00<00:12, 37.7MB/s]
Downloading:   4%|▍         | 18.2M/478M [00:00<00:11, 40.5MB/s]
Downloading:   5%|▍         | 22.4M/478M [00:00<00:11, 41.7MB/s]
Downloading:   6%|▌         | 26.8M/478M [00:00<00:11, 42.9MB/s]
Downloading:   6%|▋         | 31.0M/478M [00:00<00:10, 43.3MB/s]
Downloading:   7%|▋         | 35.2M/478M [00:00<00:10, 43.6MB/s]
Downloading:   8%|▊         | 39.4M/478M [00:01<00:10, 43.1MB/s]
Downloading:   9%|▉         | 43.7M/478M [00:01<00:10, 43.8MB/s]
Downloading:  10%|█         | 47.9M/478M [00:01<00:10, 43.9MB/s]
Downloading:  11%|█         | 52.1M/478M [00:01<00:10, 43.7MB/s]
Downloading:  12%|█▏ 

== Status ==
Current time: 2022-04-14 07:49:59 (running for 00:00:12.34)
Memory usage on this node: 2.7/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+----------------------------+----------+----------------+-------------+
| Trial name                 | status   | loc            | optimizer   |
|----------------------------+----------+----------------+-------------|
| train_function_748af_00000 | RUNNING  | 172.28.0.2:322 | adamW       |
| train_function_748af_00001 | PENDING  |                | adamW       |
| train_function_748af_00002 | PENDING  |                | adam        |
| train_function_748af_00003 | PENDING  |                | SGD         |
| train_function_748af_0

[2m[36m(train_function pid=322)[0m Downloading:  46%|████▌     | 219M/478M [00:05<00:05, 47.6MB/s]
[2m[36m(train_function pid=322)[0m Downloading:  47%|████▋     | 224M/478M [00:05<00:05, 47.6MB/s]
Downloading:  48%|████▊     | 228M/478M [00:05<00:05, 46.0MB/s]
Downloading:  49%|████▊     | 233M/478M [00:05<00:05, 45.0MB/s]
Downloading:  50%|████▉     | 237M/478M [00:05<00:05, 45.8MB/s]
Downloading:  51%|█████     | 242M/478M [00:05<00:05, 46.1MB/s]
Downloading:  52%|█████▏    | 246M/478M [00:05<00:05, 46.6MB/s]
Downloading:  52%|█████▏    | 251M/478M [00:06<00:05, 46.7MB/s]
Downloading:  53%|█████▎    | 255M/478M [00:06<00:04, 46.9MB/s]
Downloading:  54%|█████▍    | 260M/478M [00:06<00:04, 46.4MB/s]
Downloading:  55%|█████▌    | 264M/478M [00:06<00:04, 46.1MB/s]
Downloading:  56%|█████▌    | 269M/478M [00:06<00:04, 46.1MB/s]
Downloading:  57%|█████▋    | 273M/478M [00:06<00:04, 46.2MB/s]
Downloading:  58%|█████▊    | 277M/478M [00:06<00:04, 46.1MB/s]
Downloading:  59%|█████▉  

== Status ==
Current time: 2022-04-14 07:50:04 (running for 00:00:17.80)
Memory usage on this node: 2.7/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+----------------------------+----------+----------------+-------------+
| Trial name                 | status   | loc            | optimizer   |
|----------------------------+----------+----------------+-------------|
| train_function_748af_00000 | RUNNING  | 172.28.0.2:322 | adamW       |
| train_function_748af_00001 | PENDING  |                | adamW       |
| train_function_748af_00002 | PENDING  |                | adam        |
| train_function_748af_00003 | PENDING  |                | SGD         |
| train_function_748af_0

[2m[36m(train_function pid=322)[0m Downloading:  96%|█████████▌| 459M/478M [00:10<00:00, 43.8MB/s]
[2m[36m(train_function pid=322)[0m Downloading:  97%|█████████▋| 463M/478M [00:10<00:00, 44.1MB/s]
Downloading:  98%|█████████▊| 468M/478M [00:10<00:00, 45.9MB/s]
Downloading:  99%|█████████▉| 473M/478M [00:11<00:00, 45.9MB/s]
Downloading: 100%|██████████| 478M/478M [00:11<00:00, 45.1MB/s]
[2m[36m(train_function pid=322)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
[2m[36m(train_function pid=322)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=322)[0m - This IS NOT

== Status ==
Current time: 2022-04-14 07:50:10 (running for 00:00:23.23)
Memory usage on this node: 4.1/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+----------------------------+----------+----------------+-------------+
| Trial name                 | status   | loc            | optimizer   |
|----------------------------+----------+----------------+-------------|
| train_function_748af_00000 | RUNNING  | 172.28.0.2:322 | adamW       |
| train_function_748af_00001 | PENDING  |                | adamW       |
| train_function_748af_00002 | PENDING  |                | adam        |
| train_function_748af_00003 | PENDING  |                | SGD         |
| train_function_748af_0



[2m[36m(train_function pid=322)[0m   0%|          | 0/204 [00:00<?, ?it/s]
== Status ==
Current time: 2022-04-14 07:50:21 (running for 00:00:34.14)
Memory usage on this node: 4.2/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+----------------------------+----------+----------------+-------------+
| Trial name                 | status   | loc            | optimizer   |
|----------------------------+----------+----------------+-------------|
| train_function_748af_00000 | RUNNING  | 172.28.0.2:322 | adamW       |
| train_function_748af_00001 | PENDING  |                | adamW       |
| train_function_748af_00002 | PENDING  |                | adam        |
| train_function_74

[2m[36m(train_function pid=323)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
[2m[36m(train_function pid=323)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=323)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[2m[36m(train_function pid=323)[0m is GPU available? True
[2m[36m(train_function pid=323)[0m device used: cuda:0
[2m[36m(train_function pid=323)[0m   0%|          | 0/204 [00:00<?, ?it/s]
== Status ==
Current time: 2022-04-14 07:55:28 (running for 00:05:41.47)
Memory usage on this node: 4.6/25.5 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 8.000: 0.7700534759358288 | Iter 4.000: 0.7834224598930482 | Iter 2.000: 0.7700534759358288 | Iter 1.000: 0.7433155080213903
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (8 PENDING, 1 RUNNING, 1 TERMINATED)
+----------------------------+------------+----------------+-------------+---------+------------+----------------------+
| Trial name                 | status     | loc            | optimizer   |    loss |   accuracy |   training_iteration |
|-------------------------

[2m[36m(train_function pid=320)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
[2m[36m(train_function pid=320)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=320)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 07:56:34 (running for 00:06:47.35)
Memory usage on this node: 3.7/25.5 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 8.000: 0.7700534759358288 | Iter 4.000: 0.7834224598930482 | Iter 2.000: 0.7687165775401069 | Iter 1.000: 0.7486631016042781
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (7 PENDING, 1 RUNNING, 2 TERMINATED)
+----------------------------+------------+----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc            | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+----------------+-------------+----------+------------+----------------------|
| train_function_748af_00002 | RUNNING    | 172.28.0.2:320 | adam        |          |            | 

[2m[36m(train_function pid=321)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
[2m[36m(train_function pid=321)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=321)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 08:01:41 (running for 00:11:54.24)
Memory usage on this node: 3.6/25.5 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 8.000: 0.7727272727272727 | Iter 4.000: 0.7847593582887701 | Iter 2.000: 0.7700534759358288 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (6 PENDING, 1 RUNNING, 3 TERMINATED)
+----------------------------+------------+----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc            | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+----------------+-------------+----------+------------+----------------------|
| train_function_748af_00003 | RUNNING    | 172.28.0.2:321 | SGD         |          |            | 

[2m[36m(train_function pid=991)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
[2m[36m(train_function pid=991)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=991)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 08:02:24 (running for 00:12:37.01)
Memory usage on this node: 4.6/25.5 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 8.000: 0.7727272727272727 | Iter 4.000: 0.7847593582887701 | Iter 2.000: 0.7700534759358288 | Iter 1.000: 0.7486631016042781
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (5 PENDING, 1 RUNNING, 4 TERMINATED)
+----------------------------+------------+----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc            | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+----------------+-------------+----------+------------+----------------------|
| train_function_748af_00004 | RUNNING    | 172.28.0.2:991 | adam        |          |            | 

[2m[36m(train_function pid=1071)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
[2m[36m(train_function pid=1071)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=1071)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 08:03:34 (running for 00:13:47.78)
Memory usage on this node: 4.1/25.5 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 8.000: 0.7727272727272727 | Iter 4.000: 0.7847593582887701 | Iter 2.000: 0.7687165775401069 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (4 PENDING, 1 RUNNING, 5 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_748af_00005 | RUNNING    | 172.28.0.2:1071 | adam        |          |          

[2m[36m(train_function pid=1185)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
[2m[36m(train_function pid=1185)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=1185)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 08:05:45 (running for 00:15:58.29)
Memory usage on this node: 3.6/25.5 GiB
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 8.000: 0.7727272727272727 | Iter 4.000: 0.7834224598930482 | Iter 2.000: 0.7700534759358288 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (3 PENDING, 1 RUNNING, 6 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_748af_00006 | RUNNING    | 172.28.0.2:1185 | adamW       |          |          



[2m[36m(train_function pid=1185)[0m   0%|          | 0/204 [00:00<?, ?it/s]
== Status ==
Current time: 2022-04-14 08:05:50 (running for 00:16:03.74)
Memory usage on this node: 4.5/25.5 GiB
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 8.000: 0.7727272727272727 | Iter 4.000: 0.7834224598930482 | Iter 2.000: 0.7700534759358288 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (3 PENDING, 1 RUNNING, 6 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_

[2m[36m(train_function pid=1252)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
[2m[36m(train_function pid=1252)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=1252)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[2m[36m(train_function pid=1252)[0m is GPU available? True
[2m[36m(train_function pid=1252)[0m device used: cuda:0




== Status ==
Current time: 2022-04-14 08:06:33 (running for 00:16:46.98)
Memory usage on this node: 4.5/25.5 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 8.000: 0.7727272727272727 | Iter 4.000: 0.7834224598930482 | Iter 2.000: 0.7700534759358288 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (2 PENDING, 1 RUNNING, 7 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_748af_00007 | RUNNING    | 172.28.0.2:1252 | adamW       |          |          

[2m[36m(train_function pid=1366)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
[2m[36m(train_function pid=1366)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=1366)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 08:08:40 (running for 00:18:54.00)
Memory usage on this node: 4.2/25.5 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 8.000: 0.7727272727272727 | Iter 4.000: 0.7807486631016043 | Iter 2.000: 0.7713903743315508 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (1 PENDING, 1 RUNNING, 8 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_748af_00008 | RUNNING    | 172.28.0.2:1366 | SGD         |          |          

[2m[36m(train_function pid=1430)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
[2m[36m(train_function pid=1430)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=1430)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 08:09:18 (running for 00:19:31.53)
Memory usage on this node: 4.1/25.5 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 8.000: 0.7727272727272727 | Iter 4.000: 0.7807486631016043 | Iter 2.000: 0.7713903743315508 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (1 RUNNING, 9 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_748af_00009 | RUNNING    | 172.28.0.2:1430 | adamW       |          |            |        



[2m[36m(train_function pid=1430)[0m   0%|          | 0/204 [00:00<?, ?it/s]
== Status ==
Current time: 2022-04-14 08:09:23 (running for 00:19:36.54)
Memory usage on this node: 4.6/25.5 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 8.000: 0.7727272727272727 | Iter 4.000: 0.7807486631016043 | Iter 2.000: 0.7713903743315508 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (1 RUNNING, 9 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_748af_00009



[2m[36m(train_function pid=1430)[0m   0%|          | 0/204 [00:00<?, ?it/s]
== Status ==
Current time: 2022-04-14 08:13:13 (running for 00:23:26.64)
Memory usage on this node: 4.0/25.5 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 8.000: 0.7727272727272727 | Iter 4.000: 0.7834224598930482 | Iter 2.000: 0.7727272727272727 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.34 GiB heap, 0.0/7.17 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_07-49-46
Number of trials: 10/10 (1 RUNNING, 9 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_748af_00009

2022-04-14 08:14:10,371	INFO tune.py:639 -- Total run time: 1463.53 seconds (1463.09 seconds for the tuning loop).


Best trial config: {'num_epochs': 10, 'lr': 1e-05, 'batch_size': 16, 'weight_decay': 0.05, 'beta1': 0.9, 'beta2': 0.995, 'eps': 1e-08, 'max_gradient_norm': 8, 'd_h': 512, 'dropout': 0.1, 'factor': 0.5, 'patience': 2, 'optimizer': 'adam', 'momentum': 0.9}
Best trial final validation loss: 1.0733110674967368
Best trial final validation accuracy: 0.786096256684492


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Best trial test set accuracy: 0.8114004222378607 , test loss : 0.9032420500912808
scheduler : dict_keys(['factor', 'min_lrs', 'patience', 'verbose', 'cooldown', 'cooldown_counter', 'mode', 'threshold', 'threshold_mode', 'best', 'num_bad_epochs', 'mode_worse', 'eps', 'last_epoch', '_last_lr'])


Tuning on cleaned **dataset**

In [17]:
from nltk.stem import WordNetLemmatizer 
!pip install emoji
import emoji
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
!pip install emot
import re
from emot.emo_unicode import UNICODE_EMOJI

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[?25l[K     |█▉                              | 10 kB 29.1 MB/s eta 0:00:01[K     |███▊                            | 20 kB 22.7 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 16.6 MB/s eta 0:00:01[K     |███████▌                        | 40 kB 14.9 MB/s eta 0:00:01[K     |█████████▍                      | 51 kB 7.7 MB/s eta 0:00:01[K     |███████████▏                    | 61 kB 9.0 MB/s eta 0:00:01[K     |█████████████                   | 71 kB 9.4 MB/s eta 0:00:01[K     |███████████████                 | 81 kB 8.8 MB/s eta 0:00:01[K     |████████████████▉               | 92 kB 9.7 MB/s eta 0:00:01[K     |██████████████████▊             | 102 kB 8.3 MB/s eta 0:00:01[K     |████████████████████▌           | 112 kB 8.3 MB/s eta 0:00:01[K     |██████████████████████▍         | 122 kB 8.3 MB/s eta 0:00:01[K     |████████████████████████▎       | 133 kB 8.3 MB/s eta 0:00:01[K     |███████

In [18]:
class CleanedEmotionsDataset():

    def __init__(self, tokenizer, df,max_length):
        super(CleanedEmotionsDataset, self).__init__()

        self.tokenizer = tokenizer
        self.max_seq_len = max_length
        self.input_ids, self.attention_mask,self.token_type_ids,self.label = self.get_input(df)
        

    def __len__(self):
        return len(self.label)
    
    def trunate_and_pad(self, tokens_seq):
        
        # Concat '[CLS]' at the beginning
        tokens_seq = ['[CLS]'] + tokens_seq     
        # Truncate sequences of which the lengths exceed the max_seq_len
        if len(tokens_seq) > self.max_seq_len:
            tokens_seq = tokens_seq[0 : self.max_seq_len]           
        # Generate padding
        padding = [0] * (self.max_seq_len - len(tokens_seq))       
        # Convert tokens_seq to token_ids
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens_seq) # convert to id
        input_ids += padding   # add padding
        # Create attention_mask
        attention_mask = [1] * len(tokens_seq) + padding     
        # Create token_type_ids
        token_type_ids = [0] * (self.max_seq_len) # for identifying next sentence prediction
        
        assert len(input_ids) == self.max_seq_len
        assert len(attention_mask) == self.max_seq_len
        assert len(token_type_ids) == self.max_seq_len
        
        return input_ids, attention_mask, token_type_ids 

    # strip  @ with words and numbers
    def remove_pattern(self,input_txt):
        r = re.findall("@[\w]*", input_txt)
        for i in r:
            input_txt = re.sub(i, '', input_txt)
        input_txt = re.sub(r"\d+", "", input_txt)
        return input_txt  

    ## strip all @ and # with words following
    def strip_links(self,text):
        link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
        links = re.findall(link_regex, text)
        for link in links:
            text = text.replace(link[0], ', ')    
        return text
        
    def strip_all_entities(self,text):
        entity_prefixes = ['@','#']
        for separator in  string.punctuation:
            if separator not in entity_prefixes :
                text = text.replace(separator,' ')
        words = []
        for word in text.split():
            word = word.strip()
            if word:
                if word[0] not in entity_prefixes:
                    words.append(word)
        return ' '.join(words)

        # removing stopwords
    def removing_stopwords(self,text):
        text = repr(text)
        No_StopWords = [word for word in word_tokenize(text) if word.lower() not in stoplist ]
        # Convert list of tokens_without_stopwords to String type.
        words_string = ' '.join(No_StopWords)    
        return words_string

    ## lemmatization
    def lemmatization(self,text):
        # Converting words to their root forms
        lemma = [lemmatizer.lemmatize(w,'v') for w in w_tokenizer.tokenize(text)]
        return ' '.join(lemma)
    
    ## convert emoji to text
    def convert_emojis(self,text):
        for emot in UNICODE_EMOJI:
            text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
            text = text.replace('_', " ")
        return text

    def get_input(self,df):
        # df_tweet = self.clean_dataset(df['text'])
        ## clean off tags

        for i in range(len(df['text'])):
            df['text'][i] = df['text'][i].lower()
            df['text'][i] = self.remove_pattern(df['text'][i])
            df['text'][i] = self.convert_emojis(df['text'][i])
            # df['text'][i] = self.lemmatization(df['text'][i])
            # df['text'][i] = self.removing_stopwords(df['text'][i])
            # df['text'][i] = self.spell(self.lemmatization(df['text'][i]))
            
        tweet = df['text'].values
        # print (tweet)
        label = df['label'].values
        tokens_seq = list(map(self.tokenizer.tokenize,tweet))
        result = list(map(self.trunate_and_pad, tokens_seq))
        input_ids = [i[0] for i in result]
        attention_mask = [i[1] for i in result]
        token_type_ids = [i[2] for i in result]

        return (
               torch.Tensor(input_ids).type(torch.long), 
               torch.Tensor(attention_mask).type(torch.long),
               torch.Tensor(token_type_ids).type(torch.long), 
               torch.Tensor(label).type(torch.long)
               )

    def __getitem__(self, item):
        return self.input_ids[item], self.attention_mask[item],self.token_type_ids[item],self.label[item]

In [19]:
def load_data(data_dir = None):
    if load_data  == None:
        datasets.load_dataset('tweet_eval','emotion')
    data_train = datasets.Dataset.to_pandas(data["train"])
    data_val = datasets.Dataset.to_pandas(data["validation"])
    data_test = datasets.Dataset.to_pandas(data["test"])
    ## Emotion dataset
    train_dataset = CleanedEmotionsDataset(tokenizer, data_train, max_length)
    val_dataset = CleanedEmotionsDataset(tokenizer, data_val, max_length)
    test_dataset = CleanedEmotionsDataset(tokenizer, data_test, max_length)
    ## dataloader
    # train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True) # shuffle the training set
    # val_loader = torch.utils.data.DataLoader(val_dataset, batch_size = batch_size, shuffle = False)
    # test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = batch_size, shuffle = False)
    return train_dataset,val_dataset,test_dataset

In [20]:
main(num_samples=10, max_num_epochs=10, gpus_per_trial=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
2022-04-14 08:52:27,108	INFO logger.py:606 -- pip install "ray[tune]" to see TensorBoard files.


== Status ==
Current time: 2022-04-14 08:52:28 (running for 00:00:00.99)
Memory usage on this node: 2.2/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+----------------------------+----------+----------------+-------------+
| Trial name                 | status   | loc            | optimizer   |
|----------------------------+----------+----------------+-------------|
| train_function_35c0f_00000 | RUNNING  | 172.28.0.2:374 | adam        |
| train_function_35c0f_00001 | PENDING  |                | adamW       |
| train_function_35c0f_00002 | PENDING  |                | SGD         |
| train_function_35c0f_00003 | PENDING  |                | adamW       |
| train_function_35c0f_0

[2m[36m(train_function pid=374)[0m Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]
[2m[36m(train_function pid=374)[0m Downloading:   1%|          | 4.50M/478M [00:00<00:10, 47.1MB/s]
Downloading:   2%|▏         | 10.0M/478M [00:00<00:09, 53.5MB/s]
Downloading:   3%|▎         | 16.2M/478M [00:00<00:08, 58.7MB/s]
Downloading:   5%|▍         | 22.9M/478M [00:00<00:07, 63.4MB/s]
Downloading:   6%|▌         | 29.7M/478M [00:00<00:07, 66.2MB/s]
Downloading:   8%|▊         | 36.5M/478M [00:00<00:06, 67.8MB/s]
Downloading:   9%|▉         | 43.1M/478M [00:00<00:06, 68.2MB/s]
Downloading:  10%|█         | 49.6M/478M [00:00<00:06, 68.4MB/s]
Downloading:  12%|█▏        | 56.3M/478M [00:00<00:06, 68.8MB/s]
Downloading:  13%|█▎        | 63.2M/478M [00:01<00:06, 69.8MB/s]


== Status ==
Current time: 2022-04-14 08:52:33 (running for 00:00:06.13)
Memory usage on this node: 2.7/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+----------------------------+----------+----------------+-------------+
| Trial name                 | status   | loc            | optimizer   |
|----------------------------+----------+----------------+-------------|
| train_function_35c0f_00000 | RUNNING  | 172.28.0.2:374 | adam        |
| train_function_35c0f_00001 | PENDING  |                | adamW       |
| train_function_35c0f_00002 | PENDING  |                | SGD         |
| train_function_35c0f_00003 | PENDING  |                | adamW       |
| train_function_35c0f_0

[2m[36m(train_function pid=374)[0m Downloading:  15%|█▍        | 69.8M/478M [00:01<00:06, 68.1MB/s]
Downloading:  16%|█▌        | 76.3M/478M [00:01<00:06, 67.3MB/s]
Downloading:  17%|█▋        | 82.8M/478M [00:01<00:06, 67.5MB/s]
Downloading:  19%|█▊        | 89.4M/478M [00:01<00:06, 67.9MB/s]
Downloading:  20%|██        | 95.8M/478M [00:01<00:05, 67.9MB/s]
Downloading:  21%|██▏       | 102M/478M [00:01<00:05, 67.2MB/s] 
Downloading:  23%|██▎       | 109M/478M [00:01<00:05, 67.9MB/s]
Downloading:  24%|██▍       | 115M/478M [00:01<00:05, 67.7MB/s]
Downloading:  26%|██▌       | 122M/478M [00:01<00:05, 67.7MB/s]
Downloading:  27%|██▋       | 128M/478M [00:02<00:05, 67.7MB/s]
Downloading:  28%|██▊       | 135M/478M [00:02<00:05, 65.2MB/s]
Downloading:  30%|██▉       | 141M/478M [00:02<00:05, 65.5MB/s]
Downloading:  31%|███       | 148M/478M [00:02<00:05, 65.9MB/s]
Downloading:  32%|███▏      | 154M/478M [00:02<00:05, 66.0MB/s]
Downloading:  34%|███▎      | 160M/478M [00:02<00:05, 64.8M

== Status ==
Current time: 2022-04-14 08:52:39 (running for 00:00:11.93)
Memory usage on this node: 2.7/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+----------------------------+----------+----------------+-------------+
| Trial name                 | status   | loc            | optimizer   |
|----------------------------+----------+----------------+-------------|
| train_function_35c0f_00000 | RUNNING  | 172.28.0.2:374 | adam        |
| train_function_35c0f_00001 | PENDING  |                | adamW       |
| train_function_35c0f_00002 | PENDING  |                | SGD         |
| train_function_35c0f_00003 | PENDING  |                | adamW       |
| train_function_35c0f_0

[2m[36m(train_function pid=374)[0m Downloading:  95%|█████████▌| 454M/478M [00:07<00:00, 67.0MB/s]
Downloading:  96%|█████████▋| 461M/478M [00:07<00:00, 67.8MB/s]
Downloading:  98%|█████████▊| 467M/478M [00:07<00:00, 67.7MB/s]
Downloading: 100%|██████████| 478M/478M [00:07<00:00, 67.6MB/s]
[2m[36m(train_function pid=374)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
[2m[36m(train_function pid=374)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=374)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exa

== Status ==
Current time: 2022-04-14 08:52:44 (running for 00:00:17.34)
Memory usage on this node: 4.1/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+----------------------------+----------+----------------+-------------+
| Trial name                 | status   | loc            | optimizer   |
|----------------------------+----------+----------------+-------------|
| train_function_35c0f_00000 | RUNNING  | 172.28.0.2:374 | adam        |
| train_function_35c0f_00001 | PENDING  |                | adamW       |
| train_function_35c0f_00002 | PENDING  |                | SGD         |
| train_function_35c0f_00003 | PENDING  |                | adamW       |
| train_function_35c0f_0

[2m[36m(train_function pid=374)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=374)[0m 
[2m[36m(train_function pid=374)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=374)[0m   default="WORKER",
[2m[36m(train_function pid=374)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=374)[0m 
[2m[36m(train_function pid=374)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=374)[0m   help="Specify the type of the worker process")
[2m[36m(train_function pid=374)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=374)[0m 
[2m[36m(train_function pid=374)[0m See the caveats in the documentati

[2m[36m(train_function pid=374)[0m is GPU available? True
[2m[36m(train_function pid=374)[0m device used: cuda:0
== Status ==
Current time: 2022-04-14 08:52:55 (running for 00:00:28.11)
Memory usage on this node: 4.2/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+----------------------------+----------+----------------+-------------+
| Trial name                 | status   | loc            | optimizer   |
|----------------------------+----------+----------------+-------------|
| train_function_35c0f_00000 | RUNNING  | 172.28.0.2:374 | adam        |
| train_function_35c0f_00001 | PENDING  |                | adamW       |
| train_function_35c0f_00002 | PENDING  |          



== Status ==
Current time: 2022-04-14 08:53:38 (running for 00:01:11.24)
Memory usage on this node: 4.1/25.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+----------------------------+----------+----------------+-------------+
| Trial name                 | status   | loc            | optimizer   |
|----------------------------+----------+----------------+-------------|
| train_function_35c0f_00000 | RUNNING  | 172.28.0.2:374 | adam        |
| train_function_35c0f_00001 | PENDING  |                | adamW       |
| train_function_35c0f_00002 | PENDING  |                | SGD         |
| train_function_35c0f_00003 | PENDING  |                | adamW       |
| train_function_35c0f_0

[2m[36m(train_function pid=375)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
[2m[36m(train_function pid=375)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=375)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_function pid=375)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=375)[0m 
[2m[36m(train_function pid=375)[0m Se

[2m[36m(train_function pid=375)[0m is GPU available? True
[2m[36m(train_function pid=375)[0m device used: cuda:0
== Status ==
Current time: 2022-04-14 08:58:14 (running for 00:05:47.42)
Memory usage on this node: 4.2/25.5 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 8.000: 0.7887700534759359 | Iter 4.000: 0.8048128342245989 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7620320855614974
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (8 PENDING, 1 RUNNING, 1 TERMINATED)
+----------------------------+------------+----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc            | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+----------------+-------------+----------+------------+----

[2m[36m(train_function pid=372)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
[2m[36m(train_function pid=372)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=372)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 08:59:09 (running for 00:06:42.09)
Memory usage on this node: 4.8/25.5 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 8.000: 0.7887700534759359 | Iter 4.000: 0.8048128342245989 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7580213903743316
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (7 PENDING, 1 RUNNING, 2 TERMINATED)
+----------------------------+------------+----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc            | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00002 | RUNNING    | 172.28.0.2:372 | SGD         |          |            | 

[2m[36m(train_function pid=372)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=372)[0m 
[2m[36m(train_function pid=372)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=372)[0m   default="WORKER",
[2m[36m(train_function pid=372)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=372)[0m 
[2m[36m(train_function pid=372)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=372)[0m   help="Specify the type of the worker process")
[2m[36m(train_function pid=372)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=372)[0m 
[2m[36m(train_function pid=372)[0m See the caveats in the documentati

== Status ==
Current time: 2022-04-14 08:59:14 (running for 00:06:47.45)
Memory usage on this node: 4.2/25.5 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 8.000: 0.7887700534759359 | Iter 4.000: 0.8048128342245989 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7580213903743316
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (7 PENDING, 1 RUNNING, 2 TERMINATED)
+----------------------------+------------+----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc            | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00002 | RUNNING    | 172.28.0.2:372 | SGD         |          |            | 

[2m[36m(train_function pid=373)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
[2m[36m(train_function pid=373)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=373)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 09:00:02 (running for 00:07:35.30)
Memory usage on this node: 4.4/25.5 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 8.000: 0.7887700534759359 | Iter 4.000: 0.8048128342245989 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (6 PENDING, 1 RUNNING, 3 TERMINATED)
+----------------------------+------------+----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc            | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00003 | RUNNING    | 172.28.0.2:373 | adamW       |          |            | 

[2m[36m(train_function pid=373)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=373)[0m 
[2m[36m(train_function pid=373)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=373)[0m   default="WORKER",
[2m[36m(train_function pid=373)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=373)[0m 
[2m[36m(train_function pid=373)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=373)[0m   help="Specify the type of the worker process")
[2m[36m(train_function pid=373)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=373)[0m 
[2m[36m(train_function pid=373)[0m See the caveats in the documentati

== Status ==
Current time: 2022-04-14 09:00:07 (running for 00:07:40.67)
Memory usage on this node: 4.2/25.5 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 8.000: 0.7887700534759359 | Iter 4.000: 0.8048128342245989 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (6 PENDING, 1 RUNNING, 3 TERMINATED)
+----------------------------+------------+----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc            | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00003 | RUNNING    | 172.28.0.2:373 | adamW       |          |            | 

[2m[36m(train_function pid=1053)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
[2m[36m(train_function pid=1053)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=1053)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 09:04:26 (running for 00:11:59.62)
Memory usage on this node: 4.7/25.5 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 8.000: 0.7834224598930482 | Iter 4.000: 0.8061497326203209 | Iter 2.000: 0.803475935828877 | Iter 1.000: 0.7566844919786097
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (5 PENDING, 1 RUNNING, 4 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00004 | RUNNING    | 172.28.0.2:1053 | adamW       |          |           

[2m[36m(train_function pid=1053)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1053)[0m 
[2m[36m(train_function pid=1053)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=1053)[0m   default="WORKER",
[2m[36m(train_function pid=1053)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1053)[0m 
[2m[36m(train_function pid=1053)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=1053)[0m   help="Specify the type of the worker process")
[2m[36m(train_function pid=1053)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1053)[0m 
[2m[36m(train_function pid=1053)[0m See the caveats in the 

== Status ==
Current time: 2022-04-14 09:04:32 (running for 00:12:05.01)
Memory usage on this node: 4.1/25.5 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 8.000: 0.7834224598930482 | Iter 4.000: 0.8061497326203209 | Iter 2.000: 0.803475935828877 | Iter 1.000: 0.7566844919786097
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (5 PENDING, 1 RUNNING, 4 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00004 | RUNNING    | 172.28.0.2:1053 | adamW       |          |           

[2m[36m(train_function pid=1126)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
[2m[36m(train_function pid=1126)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=1126)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[2m[36m(train_function pid=1126)[0m is GPU available? True
[2m[36m(train_function pid=1126)[0m device used: cuda:0


[2m[36m(train_function pid=1126)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1126)[0m 
[2m[36m(train_function pid=1126)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=1126)[0m   default="WORKER",
[2m[36m(train_function pid=1126)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1126)[0m 
[2m[36m(train_function pid=1126)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=1126)[0m   help="Specify the type of the worker process")
[2m[36m(train_function pid=1126)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1126)[0m 
[2m[36m(train_function pid=1126)[0m See the caveats in the 

== Status ==
Current time: 2022-04-14 09:05:26 (running for 00:12:59.08)
Memory usage on this node: 4.1/25.5 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 8.000: 0.7834224598930482 | Iter 4.000: 0.8061497326203209 | Iter 2.000: 0.803475935828877 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (4 PENDING, 1 RUNNING, 5 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00005 | RUNNING    | 172.28.0.2:1126 | adamW       |          |           

[2m[36m(train_function pid=1216)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
[2m[36m(train_function pid=1216)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=1216)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2m[36m(train_function pid=1216)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1216)[0m 
[2m[36m(train_function pid=1216)

[2m[36m(train_function pid=1216)[0m is GPU available? True
[2m[36m(train_function pid=1216)[0m device used: cuda:0
== Status ==
Current time: 2022-04-14 09:06:52 (running for 00:14:25.67)
Memory usage on this node: 4.1/25.5 GiB
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 8.000: 0.7834224598930482 | Iter 4.000: 0.8061497326203209 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7566844919786097
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (3 PENDING, 1 RUNNING, 6 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------

[2m[36m(train_function pid=1289)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias']
[2m[36m(train_function pid=1289)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=1289)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 09:07:46 (running for 00:15:19.39)
Memory usage on this node: 3.7/25.5 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 8.000: 0.7834224598930482 | Iter 4.000: 0.8061497326203209 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (2 PENDING, 1 RUNNING, 7 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00007 | RUNNING    | 172.28.0.2:1289 | adam        |          |          

[2m[36m(train_function pid=1289)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1289)[0m 
[2m[36m(train_function pid=1289)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=1289)[0m   default="WORKER",
[2m[36m(train_function pid=1289)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1289)[0m 
[2m[36m(train_function pid=1289)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=1289)[0m   help="Specify the type of the worker process")
[2m[36m(train_function pid=1289)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1289)[0m 
[2m[36m(train_function pid=1289)[0m See the caveats in the 

== Status ==
Current time: 2022-04-14 09:07:51 (running for 00:15:24.78)
Memory usage on this node: 4.1/25.5 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 8.000: 0.7834224598930482 | Iter 4.000: 0.8061497326203209 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7540106951871658
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (2 PENDING, 1 RUNNING, 7 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00007 | RUNNING    | 172.28.0.2:1289 | adam        |          |          

[2m[36m(train_function pid=1363)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
[2m[36m(train_function pid=1363)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=1363)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 09:08:46 (running for 00:16:19.32)
Memory usage on this node: 4.3/25.5 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 8.000: 0.7834224598930482 | Iter 4.000: 0.8061497326203209 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7526737967914439
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (1 PENDING, 1 RUNNING, 8 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00008 | RUNNING    | 172.28.0.2:1363 | adam        |          |          

[2m[36m(train_function pid=1363)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1363)[0m 
[2m[36m(train_function pid=1363)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=1363)[0m   default="WORKER",
[2m[36m(train_function pid=1363)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1363)[0m 
[2m[36m(train_function pid=1363)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=1363)[0m   help="Specify the type of the worker process")
[2m[36m(train_function pid=1363)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1363)[0m 
[2m[36m(train_function pid=1363)[0m See the caveats in the 

== Status ==
Current time: 2022-04-14 09:08:51 (running for 00:16:24.73)
Memory usage on this node: 4.1/25.5 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 8.000: 0.7834224598930482 | Iter 4.000: 0.8061497326203209 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7526737967914439
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (1 PENDING, 1 RUNNING, 8 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00008 | RUNNING    | 172.28.0.2:1363 | adam        |          |          

[2m[36m(train_function pid=1434)[0m Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
[2m[36m(train_function pid=1434)[0m - This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
[2m[36m(train_function pid=1434)[0m - This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


== Status ==
Current time: 2022-04-14 09:09:45 (running for 00:17:18.80)
Memory usage on this node: 4.7/25.5 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 8.000: 0.7834224598930482 | Iter 4.000: 0.8061497326203209 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7513368983957219
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (1 RUNNING, 9 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00009 | RUNNING    | 172.28.0.2:1434 | adam        |          |            |        

[2m[36m(train_function pid=1434)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1434)[0m 
[2m[36m(train_function pid=1434)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=1434)[0m   default="WORKER",
[2m[36m(train_function pid=1434)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1434)[0m 
[2m[36m(train_function pid=1434)[0m See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[2m[36m(train_function pid=1434)[0m   help="Specify the type of the worker process")
[2m[36m(train_function pid=1434)[0m A value is trying to be set on a copy of a slice from a DataFrame
[2m[36m(train_function pid=1434)[0m 
[2m[36m(train_function pid=1434)[0m See the caveats in the 

== Status ==
Current time: 2022-04-14 09:09:50 (running for 00:17:23.81)
Memory usage on this node: 4.1/25.5 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 8.000: 0.7834224598930482 | Iter 4.000: 0.8061497326203209 | Iter 2.000: 0.7887700534759359 | Iter 1.000: 0.7513368983957219
Resources requested: 2.0/4 CPUs, 1.0/1 GPUs, 0.0/14.28 GiB heap, 0.0/7.14 GiB objects (0.0/1.0 accelerator_type:P100)
Result logdir: /root/ray_results/train_function_2022-04-14_08-52-27
Number of trials: 10/10 (1 RUNNING, 9 TERMINATED)
+----------------------------+------------+-----------------+-------------+----------+------------+----------------------+
| Trial name                 | status     | loc             | optimizer   |     loss |   accuracy |   training_iteration |
|----------------------------+------------+-----------------+-------------+----------+------------+----------------------|
| train_function_35c0f_00009 | RUNNING    | 172.28.0.2:1434 | adam        |          |            |        

2022-04-14 09:11:01,651	INFO tune.py:639 -- Total run time: 1114.65 seconds (1114.27 seconds for the tuning loop).


Best trial config: {'num_epochs': 10, 'lr': 1e-05, 'batch_size': 16, 'weight_decay': 0.05, 'beta1': 0.9, 'beta2': 0.995, 'eps': 1e-08, 'max_gradient_norm': 8, 'd_h': 512, 'dropout': 0.1, 'factor': 0.5, 'patience': 2, 'optimizer': 'adam', 'momentum': 0.9}
Best trial final validation loss: 0.9082448921787242
Best trial final validation accuracy: 0.7941176470588235


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the d

Best trial test set accuracy: 0.8212526389866291 , test loss : 0.8298243870916174
scheduler : dict_keys(['factor', 'min_lrs', 'patience', 'verbose', 'cooldown', 'cooldown_counter', 'mode', 'threshold', 'threshold_mode', 'best', 'num_bad_epochs', 'mode_worse', 'eps', 'last_epoch', '_last_lr'])
