# Download and Import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/POCSE

/content/drive/MyDrive/POCSE


In [1]:
%pwd

'/home/LoraCSE'

In [None]:
%pip install -r requirements.txt

In [2]:
%cd data
!bash download_wiki.sh
!bash download_nli.sh
%cd ../SentEval/data/downstream
!bash download_dataset.sh

/home/LoraCSE/data
/home/LoraCSE/SentEval/data/downstream
--2023-04-13 14:03:51--  https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/senteval.tar
Resolving huggingface.co (huggingface.co)... 52.85.242.63, 52.85.242.6, 52.85.242.117, ...
Connecting to huggingface.co (huggingface.co)|52.85.242.63|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/datasets/princeton-nlp/datasets-for-simcse/bc43c148f7be97471c78fc4255399d3158cb99dfe8f2221999c918338b138c38?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27senteval.tar%3B+filename%3D%22senteval.tar%22%3B&response-content-type=application%2Fx-tar&Expires=1681653831&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly9jZG4tbGZzLmh1Z2dpbmdmYWNlLmNvL2RhdGFzZXRzL3ByaW5jZXRvbi1ubHAvZGF0YXNldHMtZm9yLXNpbWNzZS9iYzQzYzE0OGY3YmU5NzQ3MWM3OGZjNDI1NTM5OWQzMTU4Y2I5OWRmZThmMjIyMTk5OWM5MTgzMzhiMTM4YzM4P3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiZyZXNwb25zZS

In [2]:
import random
import time
from typing import Dict, List
from lion_pytorch import Lion

import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from loguru import logger
from scipy.stats import spearmanr
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModel,
    AutoTokenizer,
    AutoConfig,
    BertConfig,
    BertModel, 
    BertTokenizer,
    DistilBertTokenizer, 
    DistilBertModel, 
    DistilBertConfig,
    default_data_collator,
    get_linear_schedule_with_warmup
)
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
)

# Configuration

In [3]:
# CONFIG
EPOCHS = 4
BATCH_SIZE = 512
LR = 2e-4
DROPOUT = 0.05
MAXLEN = 32
POOLING = 'cls'   # choose in ['cls', 'pooler', 'first-last-avg', 'last-avg']
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
RAND_SIZE = 0
RAND_STD = 0.5
RANK = 2
SAVE_FREQ = 100
PEFT_CONFIG = LoraConfig(inference_mode=False, 
              r=RANK, 
              lora_alpha=RANK*2, 
              lora_dropout=0.05,
              # target_modules=["q_lin","k_lin"]
              target_modules=['value','query']
              )
HID = 768
# PEFT_CONFIG = PrefixTuningConfig(task_type="SEQ_CLS",num_virtual_tokens=20)

# Pretrain model
DISBERT = 'distilbert-base-uncased'
ROBERTA = 'roberta-base'
ROBERTA_LARGE = 'roberta-large'
BERT = 'bert-base-uncased'
T5 = "t5-small"
model_path = ROBERTA_LARGE

# where to save
# SAVE_PATH = './saved_model/simcse_unsup_qv_roberta_large.pt'
SAVE_PATH = f'./saved_model/loracse_sup_{model_path}_r{RANK}_b{BATCH_SIZE}.pt'

# dataset dir
# DATA_PATH = './data/wiki1m_for_simcse.txt'
DATA_PATH = './data/nli_for_simcse.csv'

# Prepare the dataset

In [9]:
%cd ../../../

/home/LoraCSE


In [4]:
from datasets import load_dataset
data_files = {}
data_files['train'] = DATA_PATH
extension = DATA_PATH.split(".")[-1]
if extension == "txt":
    extension = "text"
if extension == "csv":
    datasets = load_dataset(extension, data_files=data_files, cache_dir="./data/", delimiter="\t" if "tsv" in DATA_PATH else ",")
else:
    datasets = load_dataset(extension, data_files=data_files, cache_dir="./data/")

Using custom data configuration default
Reusing dataset csv (./data/csv/default-70806cc3f53ac92e/0.0.0/2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2)


In [5]:
column_names = datasets["train"].column_names
sent2_cname = None
if len(column_names) == 2:
    # Pair datasets
    sent0_cname = column_names[0]
    sent1_cname = column_names[1]
elif len(column_names) == 3:
    # Pair datasets with hard negatives
    sent0_cname = column_names[0]
    sent1_cname = column_names[1]
    sent2_cname = column_names[2]
elif len(column_names) == 1:
    # Unsupervised datasets
    sent0_cname = column_names[0]
    sent1_cname = column_names[0]
else:
    raise NotImplementedError

In [6]:
def prepare_features(examples):
        # padding = longest (default)
        #   If no sentence in the batch exceed the max length, then use
        #   the max sentence length in the batch, otherwise use the 
        #   max sentence length in the argument and truncate those that
        #   exceed the max length.
        # padding = max_length (when pad_to_max_length, for pressure test)
        #   All sentences are padded/truncated to data_args.max_seq_length.
        total = len(examples[sent0_cname])

        # Avoid "None" fields 
        for idx in range(total):
            if examples[sent0_cname][idx] is None:
                examples[sent0_cname][idx] = " "
            if examples[sent1_cname][idx] is None:
                examples[sent1_cname][idx] = " "
        
        sentences = examples[sent0_cname] + examples[sent1_cname]

        # If hard negative exists
        if sent2_cname is not None:
            for idx in range(total):
                if examples[sent2_cname][idx] is None:
                    examples[sent2_cname][idx] = " "
            sentences += examples[sent2_cname]

        sent_features = tokenizer(
            sentences,
            max_length=32, #data_args.max_seq_length,
            truncation=True,
            padding="max_length" #if data_args.pad_to_max_length else False,
        )

        features = {}
        if sent2_cname is not None:
            for key in sent_features:
                features[key] = [[sent_features[key][i], sent_features[key][i+total], sent_features[key][i+total*2]] for i in range(total)]
        else:
            for key in sent_features:
                features[key] = [[sent_features[key][i], sent_features[key][i+total]] for i in range(total)]
        return features

tokenizer = AutoTokenizer.from_pretrained(model_path)
train_dataset = datasets["train"].map(
        prepare_features,
        batched=True,
        num_proc=32,
        remove_columns=column_names,
        load_from_cache_file=False,
    )
data_collator = default_data_collator



































# Lora-CSE model

## model and loss

In [7]:
class MLPLayer(nn.Module):
    """
    Head for getting sentence representations over RoBERTa/BERT's CLS representation.
    """
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()
    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = self.activation(x)
        return x
    
class SimcseModel(nn.Module):
    """Simcse"""
    def __init__(self, pretrained_model, pooling, peft_config):
        super(SimcseModel, self).__init__()
        config = AutoConfig.from_pretrained(pretrained_model, return_dict=True)       
        config.attention_probs_dropout_prob = DROPOUT   # 修改config的dropout系数
        config.hidden_dropout_prob = DROPOUT           
        self.bert = AutoModel.from_pretrained(pretrained_model)
        self.pooling = pooling
        self.bert = get_peft_model(self.bert, peft_config)
        self.bert.print_trainable_parameters()
        self.cls = MLPLayer(config)
        
    def forward(self, input_ids, attention_mask, token_type_ids=None, test=False):
        batch_size = input_ids.size(0)
        if (len(input_ids.shape))==3:
          #num_sent = 2 pair, 3 supervised
          num_sent = input_ids.size(1)
          mlm_outputs = None
          # Flatten input for encoding
          input_ids = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len)
          attention_mask = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent len)
          if token_type_ids is not None:
              token_type_ids = token_type_ids.view((-1, token_type_ids.size(-1))) # (bs * num_sent, len)
        else:
          num_sent = 1
        x = self.bert(input_ids, attention_mask, token_type_ids, output_hidden_states=True)

        if self.pooling == 'cls':
            x = x.last_hidden_state[:, 0]
            x = x.view((batch_size, num_sent, -1))
            return self.cls(x)    # [batch, num_sent, 768]          
        
        if self.pooling == 'pooler':
            x = x.pooler_output   # [batch * num_sent, 768]
            return x.view((batch_size, num_sent, -1))     # [batch, num_sent, 768]            
        
        if self.pooling == 'last-avg':
            last = x.last_hidden_state.transpose(1, 2)    # [batch * num_sent, 768, seqlen]
            x = torch.avg_pool1d(last, kernel_size=last.shape[-1]).squeeze(-1)       # [batch * num_sent, 768]
            return x.view((batch_size, num_sent, -1))     # [batch, num_sent, 768]     

        if self.pooling == 'first-last-avg':
            first = x.hidden_states[1].transpose(1, 2)    # [batch * num_sent, 768, seqlen]
            last = x.hidden_states[-1].transpose(1, 2)    # [batch * num_sent, 768, seqlen]                   
            first_avg = torch.avg_pool1d(first, kernel_size=last.shape[-1]).squeeze(-1) # [batch * num_sent, 768]
            last_avg = torch.avg_pool1d(last, kernel_size=last.shape[-1]).squeeze(-1)   # [batch * num_sent, 768]
            avg = torch.cat((first_avg.unsqueeze(1), last_avg.unsqueeze(1)), dim=1)     # [batch * num_sent, 2, 768]
            x = torch.avg_pool1d(avg.transpose(1, 2), kernel_size=2).squeeze(-1)     # [batch * num_sent, 768]
            return x.view((batch_size, num_sent, -1))     # [batch, num_sent, 768]  

def simcse_unsup_loss(y_pred: 'tensor') -> 'tensor':
    """Infocse loss
    y_pred (tensor): the output of SIMCSE, [batch_size, num_sent, 768]
    
    """
    # get the batch size and number of sentences
    batch_size = y_pred.size(0)
    num_sent = y_pred.size(1)

    # output
    z1 = y_pred[:, 0]   # [batch_size, 768]
    z2 = y_pred[:, 1]   # [batch_size, 768]
    if num_sent >= 3:
        z3 = y_pred[:, 2]   # [batch_size, 768]
    #similarity
    cos_sim = F.cosine_similarity(z1.unsqueeze(1), z2.unsqueeze(0), dim=-1) # [batch_size, batch_size]
    if num_sent >= 3:
        z1_z3_cos = F.cosine_similarity(z1.unsqueeze(1), z3.unsqueeze(0), dim=-1) # [batch_size, batch_size]
        cos_sim = torch.cat([cos_sim, z1_z3_cos], 1) # [batch_size, 2 * batch_size]

    #gsinfonce
    if RAND_SIZE > 0:
        #import ipdb;ipdb.set_trace()
        if RAND_STD == 0.0 :
            z2_random = torch.randn(RAND_STD, z1.shape[1]).to(DEVICE)
        else:
            z2_random = torch.normal(0, RAND_STD, size=(RAND_SIZE, z1.shape[1])).to(DEVICE)
        cos_sim = torch.cat((cos_sim, F.cosine_similarity(z1.unsqueeze(1), z2_random.unsqueeze(0))),1).to(DEVICE)

    
    labels = torch.arange(cos_sim.size(0)).long().to(DEVICE) # [batch_size]
    cos_sim /= 0.05
    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(cos_sim, labels) #[batch_size]
    return loss

def mlm_loss(y_pred: 'tensor') -> 'tensor':
    """mlm loss
    y_pred (tensor): the output of SIMCSE, [batch_size, num_sent, 768]
    
    """
    pass

## Evaluation

In [8]:
# Set path to SentEval
PATH_TO_SENTEVAL = './SentEval'
PATH_TO_DATA = './SentEval/data'

# Import SentEval
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval
def evaluate(eval_senteval_transfer: bool, tokenizer, model) -> Dict[str, float]:
    # SentEval prepare and batcher
    def prepare(params, samples):
        return

    def batcher(params, batch):
        sentences = [' '.join(s) for s in batch]
        batch = tokenizer.batch_encode_plus(
            sentences,
            return_tensors='pt',
            padding=True,
        )
        for k in batch:
            batch[k] = batch[k].to(DEVICE)
        with torch.no_grad():
            pooler_output = model(**batch, test=True).view((-1,HID))
        return pooler_output.cpu()

    # Set params for SentEval (fastmode)
    params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
    params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
                                        'tenacity': 3, 'epoch_size': 2}

    se = senteval.engine.SE(params, batcher, prepare)
    tasks = ['STSBenchmark', 'SICKRelatedness']
    if eval_senteval_transfer:
        tasks = ['STSBenchmark', 'SICKRelatedness', 'MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']
    model.eval()
    results = se.eval(tasks)
    
    stsb_spearman = results['STSBenchmark']['dev']['spearman'][0]
    sickr_spearman = results['SICKRelatedness']['dev']['spearman'][0]

    metrics = {"eval_stsb_spearman": stsb_spearman, "eval_sickr_spearman": sickr_spearman, "eval_avg_sts": (stsb_spearman + sickr_spearman) / 2} 
    if eval_senteval_transfer:
        avg_transfer = 0
        for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']:
            avg_transfer += results[task]['devacc']
            metrics['eval_{}'.format(task)] = results[task]['devacc']
        avg_transfer /= 7
        metrics['eval_avg_transfer'] = avg_transfer

    logger.info(metrics)
    return metrics

from prettytable import PrettyTable

def print_table(task_names, scores):
    tb = PrettyTable()
    tb.field_names = task_names
    tb.add_row(scores)
    print(tb)

def test_evaluate(tokenizer, model) -> Dict[str, float]:
    # SentEval prepare and batcher
    def prepare(params, samples):
        return

    def batcher(params, batch):
        sentences = [' '.join(s) for s in batch]
        batch = tokenizer.batch_encode_plus(
            sentences,
            return_tensors='pt',
            padding=True,
        )
        for k in batch:
            batch[k] = batch[k].to(DEVICE)
        with torch.no_grad():
            pooler_output = model(**batch, test=True).view((-1,HID)) 
        return pooler_output.cpu()

    # Set params for SentEval (testmode)
    params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
    params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 'tenacity': 5, 'epoch_size': 4}

    se = senteval.engine.SE(params, batcher, prepare)
    tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']
    # tasks += ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC']

    model.eval()
    results = {}
    for task in tasks:
        se = senteval.engine.SE(params, batcher, prepare)
        result = se.eval(task)
        results[task] = result

    print("------ test ------")
    task_names = []
    scores = []
    for task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']:
        task_names.append(task)
        if task in results:
            if task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']:
                scores.append("%.2f" % (results[task]['all']['spearman']['all'] * 100))
            else:
                scores.append("%.2f" % (results[task]['test']['spearman'].correlation * 100))
        else:
            scores.append("0.00")
    task_names.append("Avg.")
    scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores)))
    print_table(task_names, scores)

    task_names = []
    scores = []
    for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']:
        task_names.append(task)
        if task in results:
            scores.append("%.2f" % (results[task]['acc']))    
        else:
            scores.append("0.00")
    task_names.append("Avg.")
    scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores)))
    print_table(task_names, scores)

## Trainer

In [9]:
def train(model, train_dl, optimizer, lr_scheduler, tokenizer) -> None:
    global best
    gpus = [0, 1, 2, 3]
    torch.cuda.set_device('cuda:{}'.format(gpus[0]))
    model = nn.DataParallel(model.to(DEVICE), device_ids=gpus, output_device=gpus[0])
    scaler = torch.cuda.amp.GradScaler()
    early = 0
    for epoch in range(EPOCHS):
      logger.info(f'epoch: {epoch}')
      for batch_idx, batch in enumerate(tqdm(train_dl), start=1):  
          model.train()
          batch = {k: v.to(DEVICE) for k, v in batch.items()}
          optimizer.zero_grad() 
          with torch.cuda.amp.autocast():
            out = model(**batch)
            loss = simcse_unsup_loss(out)
          scaler.scale(loss).backward()
          scaler.step(optimizer)
          scaler.update()
        
                
          if batch_idx % SAVE_FREQ == 0: 
              model.eval()
              torch.cuda.empty_cache()
              metrics = evaluate(False, tokenizer, model)
              corrcoef = metrics["eval_stsb_spearman"]
              # corrcoef = metrics["eval_avg_sts"]
              if best < corrcoef:
                  best = corrcoef
                  torch.save(model.state_dict(), SAVE_PATH)
                  logger.info(f"higher corrcoef: {best:.4f} in batch: {batch_idx}, save model")
                  early = 0
              else:
                early += 1
                if early == 4:
                  return
              torch.cuda.empty_cache()

# Experiment on q,v; batch size 512; r=4

In [None]:
model_path = ROBERTA
BATCH_SIZE = 512
RANK = 4
TARGET = ['value']
PEFT_CONFIG = LoraConfig(inference_mode=False, 
              r=RANK, 
              lora_alpha=RANK*2, 
              lora_dropout=0.05,
              # target_modules=["q_lin","k_lin"]
              target_modules=TARGET
              )
SAVE_PATH = f'./saved_model/loracse_sup_{model_path}_r{RANK}_b{BATCH_SIZE}_t{TARGET}.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=True, num_workers=4, pin_memory=True)
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE)
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

In [None]:
model_path = ROBERTA
BATCH_SIZE = 512
RANK = 4
TARGET = ['query']
PEFT_CONFIG = LoraConfig(inference_mode=False, 
              r=RANK, 
              lora_alpha=RANK*2, 
              lora_dropout=0.05,
              # target_modules=["q_lin","k_lin"]
              target_modules=TARGET
              )
SAVE_PATH = f'./saved_model/loracse_sup_{model_path}_r{RANK}_b{BATCH_SIZE}_t{TARGET}.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=True, num_workers=4, pin_memory=True)
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE)
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

In [None]:
model_path = ROBERTA
BATCH_SIZE = 512
RANK = 4
TARGET = ['value','query']
PEFT_CONFIG = LoraConfig(inference_mode=False, 
              r=RANK, 
              lora_alpha=RANK*2, 
              lora_dropout=0.05,
              # target_modules=["q_lin","k_lin"]
              target_modules=TARGET
              )
SAVE_PATH = f'./saved_model/loracse_sup_{model_path}_r{RANK}_b{BATCH_SIZE}_t{TARGET}.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=True, num_workers=4, pin_memory=True)
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE)
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

# Experiment on different rank 1 2 4 8

In [None]:
for r in [1,2,4,8]:
    model_path = ROBERTA
    BATCH_SIZE = 512
    RANK = r
    TARGET = ['value','query']
    PEFT_CONFIG = LoraConfig(inference_mode=False, 
                r=RANK, 
                lora_alpha=RANK*2, 
                lora_dropout=0.05,
                # target_modules=["q_lin","k_lin"]
                target_modules=TARGET
                )
    SAVE_PATH = f'./saved_model/loracse_sup_{model_path}_r{RANK}_b{BATCH_SIZE}_t{TARGET}.pt'
    logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
    # load model
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=True, num_workers=4, pin_memory=True)
    assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
    model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE)
    optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
    # train cls with mlp
    best=0
    train(model, train_dataloader, optimizer, None, tokenizer)
    logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
    # eval
    model.load_state_dict(torch.load(SAVE_PATH))
    test_evaluate(tokenizer, model)

# Roberta Large 

## Roberta Large batch size 452 rank 4 with Lion

In [12]:
BATCH_SIZE = 450
RANK = 4
PEFT_CONFIG = LoraConfig(inference_mode=False, 
              r=RANK, 
              lora_alpha=RANK*2, 
              lora_dropout=0.05,
              # target_modules=["q_lin","k_lin"]
              target_modules=['value','query']
              )
SAVE_PATH = f'./saved_model/loracse_sup_{model_path}_r{RANK}_b{BATCH_SIZE}.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=True, num_workers=4, pin_memory=True)
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE)
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-13 16:06:56.153[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mdevice: cuda, pooling: cls, model path: roberta-large, batch size:452[0m
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


trainable params: 393216 || all params: 355752960 || trainable%: 0.11053063339234057


[32m2023-04-13 16:07:01.559[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m6[0m - [1mepoch: 0[0m
 16%|█▌        | 99/610 [02:27<12:38,  1.49s/it][32m2023-04-13 16:10:16.366[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m50[0m - [1m{'eval_stsb_spearman': 0.8263376503356064, 'eval_sickr_spearman': 0.763433896109588, 'eval_avg_sts': 0.7948857732225971}[0m
 16%|█▌        | 99/610 [03:16<16:53,  1.98s/it]


RuntimeError: [enforce fail at inline_container.cc:325] . unexpected pos 489819456 vs 489819344

## Roberta Large batch size 452 rank 1 with Lion

In [11]:
BATCH_SIZE = 452
RANK = 1
PEFT_CONFIG = LoraConfig(inference_mode=False, 
              r=RANK, 
              lora_alpha=RANK*2, 
              lora_dropout=0.05,
              # target_modules=["q_lin","k_lin"]
              target_modules=['value','query']
              )
SAVE_PATH = f'./saved_model/loracse_sup_{model_path}_r{RANK}_b{BATCH_SIZE}.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=True, num_workers=4, pin_memory=True)
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE)
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-13 14:53:52.917[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mdevice: cuda, pooling: cls, model path: roberta-large, batch size:452[0m
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


trainable params: 98304 || all params: 355458048 || trainable%: 0.0276555842674295


[32m2023-04-13 14:53:58.900[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m6[0m - [1mepoch: 0[0m
 16%|█▌        | 99/610 [02:27<12:39,  1.49s/it][32m2023-04-13 14:57:14.216[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m50[0m - [1m{'eval_stsb_spearman': 0.7111494011716875, 'eval_sickr_spearman': 0.697080514499183, 'eval_avg_sts': 0.7041149578354352}[0m
[32m2023-04-13 14:57:15.550[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m28[0m - [1mhigher corrcoef: 0.7111 in batch: 100, save model[0m
 33%|███▎      | 199/610 [05:43<10:10,  1.48s/it]  [32m2023-04-13 15:00:29.986[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m50[0m - [1m{'eval_stsb_spearman': 0.8254029507142806, 'eval_sickr_spearman': 0.7712142140760407, 'eval_avg_sts': 0.7983085823951607}[0m
[32m2023-04-13 15:00:36.040[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m28[0m - [1mhigher corrcoef: 0.8254 in batch: 200, save model[0m

 49%|████▉     | 299/610 [09:44<10:08,  1.96s/it]
[32m2023-04-13 16:03:55.758[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m20[0m - [1mtrain is finished, best model is saved at ./saved_model/loracse_sup_roberta-large_r1_b452.pt[0m


------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 78.32 | 88.60 | 84.23 | 87.78 | 84.33 |    87.17     |      81.72      | 84.59 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


## Roberta Large batch size 452 rank 2 with Lion

In [10]:
BATCH_SIZE = 452
SAVE_PATH = f'./saved_model/loracse_sup_{model_path}_r{RANK}_b{BATCH_SIZE}.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=True, num_workers=4, pin_memory=True)
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE)
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-13 14:12:21.189[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mdevice: cuda, pooling: cls, model path: roberta-large, batch size:452[0m
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


trainable params: 196608 || all params: 355556352 || trainable%: 0.05529587613723745


[32m2023-04-13 14:12:28.471[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m6[0m - [1mepoch: 0[0m
 16%|█▌        | 99/610 [02:28<12:37,  1.48s/it][32m2023-04-13 14:15:43.822[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m50[0m - [1m{'eval_stsb_spearman': 0.8043958134785762, 'eval_sickr_spearman': 0.7508253477332719, 'eval_avg_sts': 0.7776105806059241}[0m
[32m2023-04-13 14:15:45.140[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m28[0m - [1mhigher corrcoef: 0.8044 in batch: 100, save model[0m
 33%|███▎      | 199/610 [05:44<10:11,  1.49s/it]  [32m2023-04-13 14:19:00.365[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m50[0m - [1m{'eval_stsb_spearman': 0.8513076678336058, 'eval_sickr_spearman': 0.8200548316977913, 'eval_avg_sts': 0.8356812497656985}[0m
[32m2023-04-13 14:19:01.773[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m28[0m - [1mhigher corrcoef: 0.8513 in batch: 200, save model[0

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 78.79 | 87.72 | 83.08 | 87.39 | 84.39 |    86.98     |      82.09      | 84.35 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


# Experiment

## Lora-Bert-base

In [None]:
# bert batch 512 with mlp, r=2, epoch=4
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE) 
optimizer = Lion(model.parameters(), lr=1e-4, weight_decay=1e-2)
# train cls with mlp
best = 0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-12 03:01:45.750[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 2>[0m:[36m2[0m - [1mdevice: cuda, pooling: cls, model path: bert-base-uncased, batch size:512[0m
[32m2023-04-12 03:01:45.754[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m6[0m - [1mepoch: 0[0m
 23%|██▎       | 124/539 [00:45<02:29,  2.77it/s][32m2023-04-12 03:02:48.929[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8554180626237553, 'eval_sickr_spearman': 0.8123737940180112, 'eval_avg_sts': 0.8338959283208833}[0m
[32m2023-04-12 03:02:50.074[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m29[0m - [1mhigher corrcoef: 0.8339 in batch: 125, save model[0m
 46%|████▌     | 249/539 [01:49<01:44,  2.77it/s][32m2023-04-12 03:03:52.243[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8569734045737527, 'eval_sickr_spearman': 0.8122038794740944, 'eval_a

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 75.48 | 84.04 | 79.70 | 85.61 | 81.91 |    84.00     |      79.82      | 81.51 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


In [None]:
# bert batch 256 with mlp, r=4，epoch
SAVE_PATH = f'./saved_model/loracse_sup_bert_base_b512_r4_mlp.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE) 
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best = 0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-12 00:59:39.359[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 3>[0m:[36m3[0m - [1mdevice: cuda, pooling: cls, model path: bert-base-uncased, batch size:512[0m
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Be

trainable params: 147456 || all params: 109629696 || trainable%: 0.13450370235451534


 23%|██▎       | 124/539 [00:45<02:29,  2.78it/s][32m2023-04-12 01:00:44.234[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8331040802720957, 'eval_sickr_spearman': 0.7916465485656109, 'eval_avg_sts': 0.8123753144188532}[0m
[32m2023-04-12 01:00:45.307[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m29[0m - [1mhigher corrcoef: 0.8331 in batch: 125, save model[0m
 46%|████▌     | 249/539 [01:48<01:44,  2.77it/s][32m2023-04-12 01:01:47.902[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8354498071269327, 'eval_sickr_spearman': 0.7956911035244102, 'eval_avg_sts': 0.8155704553256715}[0m
[32m2023-04-12 01:01:49.026[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m29[0m - [1mhigher corrcoef: 0.8354 in batch: 250, save model[0m
 69%|██████▉   | 374/539 [02:52<00:59,  2.78it/s][32m2023-04-12 01:02:51.458[0m | [1mINFO    [0m | [36m__main__

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 75.13 | 84.25 | 79.76 | 85.83 | 81.40 |    83.83     |      79.79      | 81.43 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


In [None]:
# bert batch 512 with mlp, r=6，epoch=4
SAVE_PATH = f'./saved_model/loracse_sup_bert_base_b512_r6_mlp.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE) 
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best = 0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-11 22:17:55.334[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 3>[0m:[36m3[0m - [1mdevice: cuda, pooling: cls, model path: bert-base-uncased, batch size:512[0m
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Be

trainable params: 221184 || all params: 109703424 || trainable%: 0.20161996037607724


 23%|██▎       | 124/539 [00:45<02:28,  2.79it/s][32m2023-04-11 22:19:00.027[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8331552601718153, 'eval_sickr_spearman': 0.7949885045725735, 'eval_avg_sts': 0.8140718823721944}[0m
[32m2023-04-11 22:19:01.075[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m29[0m - [1mhigher corrcoef: 0.8332 in batch: 125, save model[0m
 46%|████▌     | 249/539 [01:48<01:44,  2.78it/s][32m2023-04-11 22:20:03.412[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8415476796571214, 'eval_sickr_spearman': 0.7991213887269173, 'eval_avg_sts': 0.8203345341920194}[0m
[32m2023-04-11 22:20:04.433[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m29[0m - [1mhigher corrcoef: 0.8415 in batch: 250, save model[0m
 69%|██████▉   | 374/539 [02:51<00:59,  2.79it/s][32m2023-04-11 22:21:06.428[0m | [1mINFO    [0m | [36m__main__

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 75.54 | 84.42 | 79.60 | 85.97 | 80.52 |    83.58     |      80.01      | 81.38 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


In [None]:
# bert batch 512 with mlp, r=5，epoch=4
SAVE_PATH = f'./saved_model/loracse_sup_bert_base_b512_r5_mlp.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE) 
optimizer = Lion(model.parameters(), lr=3e-4, weight_decay=1e-2)
# train cls with mlp
best = 0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-11 21:59:25.514[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 3>[0m:[36m3[0m - [1mdevice: cuda, pooling: cls, model path: bert-base-uncased, batch size:512[0m
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Be

trainable params: 184320 || all params: 109666560 || trainable%: 0.16807311180363457


[32m2023-04-11 21:59:30.450[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m6[0m - [1mepoch: 0[0m
 23%|██▎       | 124/539 [00:46<02:29,  2.77it/s][32m2023-04-11 22:00:34.543[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8307225921250608, 'eval_sickr_spearman': 0.7960561398953397, 'eval_avg_sts': 0.8133893660102003}[0m
[32m2023-04-11 22:00:35.602[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m29[0m - [1mhigher corrcoef: 0.8307 in batch: 125, save model[0m
 46%|████▌     | 249/539 [01:49<01:44,  2.77it/s][32m2023-04-11 22:01:37.912[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8399285013518084, 'eval_sickr_spearman': 0.8053199464608101, 'eval_avg_sts': 0.8226242239063093}[0m
[32m2023-04-11 22:01:39.031[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m29[0m - [1mhigher corrcoef: 0.8399 in batch: 250, save model[0m

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 74.99 | 83.88 | 79.53 | 86.02 | 81.09 |    83.74     |      79.79      | 81.29 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


In [None]:
# bert batch 256 with mlp, r=4
SAVE_PATH = f'./saved_model/loracse_sup_bert_base_b256_mlp.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE) 
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best = 0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-11 21:05:30.445[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 3>[0m:[36m3[0m - [1mdevice: cuda, pooling: cls, model path: bert-base-uncased, batch size:256[0m
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a Be

trainable params: 147456 || all params: 109629696 || trainable%: 0.13450370235451534


 12%|█▏        | 124/1077 [00:22<02:46,  5.71it/s][32m2023-04-11 21:06:12.348[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8296824501012819, 'eval_sickr_spearman': 0.7932102010729287, 'eval_avg_sts': 0.8114463255871054}[0m
[32m2023-04-11 21:06:13.514[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m29[0m - [1mhigher corrcoef: 0.8114 in batch: 125, save model[0m
 23%|██▎       | 249/1077 [01:02<02:25,  5.70it/s][32m2023-04-11 21:06:52.790[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8293059569595671, 'eval_sickr_spearman': 0.7954730903549825, 'eval_avg_sts': 0.8123895236572748}[0m
[32m2023-04-11 21:06:53.929[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m29[0m - [1mhigher corrcoef: 0.8124 in batch: 250, save model[0m
 35%|███▍      | 374/1077 [01:42<02:03,  5.69it/s][32m2023-04-11 21:07:33.464[0m | [1mINFO    [0m | [36m__mai

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 74.98 | 83.91 | 78.83 | 84.73 | 80.90 |    83.18     |      79.82      | 80.91 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


## Lora-Roberta-base

### Roberta-base batch size 512 without mlp 

In [None]:
#roberta base without mlp
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}')
# load model
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE) 
optimizer = Lion(model.parameters(), lr=3e-4, weight_decay=1e-2)
# train
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-11 04:13:25.708[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 1>[0m:[36m1[0m - [1mdevice: cuda, pooling: cls, model path: roberta-base[0m
[32m2023-04-11 04:13:25.710[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m48[0m - [1mepoch: 0[0m
 23%|██▎       | 124/539 [00:45<02:30,  2.76it/s][32m2023-04-11 04:14:28.521[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8565715130938942, 'eval_sickr_spearman': 0.792112162062952, 'eval_avg_sts': 0.824341837578423}[0m
[32m2023-04-11 04:14:29.798[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m70[0m - [1mhigher corrcoef: 0.8566 in batch: 125, save model[0m
 46%|████▌     | 249/539 [01:48<01:44,  2.77it/s][32m2023-04-11 04:15:32.094[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8573582360553504, 'eval_sickr_spearman': 0.7906411135192069, 'eval_avg_sts': 0.82399967478

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 74.67 | 83.85 | 78.18 | 84.09 | 81.77 |    84.20     |      79.25      | 80.86 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+-------+-------+-------+-------+-------+-------+-------+-------+
|   MR  |   CR  |  SUBJ |  MPQA |  SST2 |  TREC |  MRPC |  Avg. |
+-------+-------+-------+-------+-------+-------+-------+-------+
| 83.37 | 90.30 | 92.35 | 88.99 | 88.85 | 87.60 | 74.26 | 86.53 |
+-------+-------+-------+-------+-------+-------+-------+-------+


### **Roberta-base batch size 512 with mlp 

In [None]:
# roberta-base 512 with mlp
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}')
# load model
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE) 
optimizer = Lion(model.parameters(), lr=3e-4, weight_decay=1e-2)
# train
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-11 05:00:22.250[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 1>[0m:[36m1[0m - [1mdevice: cuda, pooling: cls, model path: roberta-base[0m
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


trainable params: 294912 || all params: 124940544 || trainable%: 0.23604187284473485


[32m2023-04-11 05:00:27.032[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m48[0m - [1mepoch: 0[0m
 23%|██▎       | 124/539 [00:45<02:27,  2.82it/s][32m2023-04-11 05:01:30.009[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8443887787294697, 'eval_sickr_spearman': 0.8050052466841875, 'eval_avg_sts': 0.8246970127068286}[0m
[32m2023-04-11 05:01:31.163[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m70[0m - [1mhigher corrcoef: 0.8444 in batch: 125, save model[0m
 46%|████▌     | 249/539 [01:48<01:43,  2.81it/s][32m2023-04-11 05:02:32.350[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8603142202964573, 'eval_sickr_spearman': 0.8039910699773217, 'eval_avg_sts': 0.8321526451368895}[0m
[32m2023-04-11 05:02:33.554[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m70[0m - [1mhigher corrcoef: 0.8603 in batch: 250, save model[0

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 75.84 | 85.82 | 80.79 | 86.01 | 83.27 |    85.69     |      80.00      | 82.49 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


## Lora-Roberta-large

### Roberta-large batch size 364 without mlp with noize

In [None]:
# roberta-large without mlp
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-11 02:46:12.209[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m73[0m - [1mepoch: 0[0m
 16%|█▌        | 124/788 [01:21<07:14,  1.53it/s][32m2023-04-11 02:48:19.715[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8425739376226038, 'eval_sickr_spearman': 0.8246494868613612, 'eval_avg_sts': 0.8336117122419825}[0m
[32m2023-04-11 02:48:23.298[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m95[0m - [1mhigher corrcoef: 0.8426 in batch: 125, save model[0m
 32%|███▏      | 249/788 [03:32<05:53,  1.52it/s][32m2023-04-11 02:50:31.271[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8543153780889652, 'eval_sickr_spearman': 0.8304111056654715, 'eval_avg_sts': 0.8423632418772183}[0m
[32m2023-04-11 02:50:34.787[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m95[0m - [1mhigher corrcoef: 0.8543 in batch: 250, save model[0

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 76.91 | 85.37 | 80.61 | 85.09 | 83.00 |    85.33     |      80.00      | 82.33 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+-------+-------+-------+-------+-------+-------+-------+-------+
|   MR  |   CR  |  SUBJ |  MPQA |  SST2 |  TREC |  MRPC |  Avg. |
+-------+-------+-------+-------+-------+-------+-------+-------+
| 84.94 | 90.76 | 93.37 | 89.65 | 90.12 | 81.00 | 76.75 | 86.66 |
+-------+-------+-------+-------+-------+-------+-------+-------+


### Roberta-large batch size 364 with mlp without noize

In [None]:
# roberta-large with 364 batch size and without noize
# roberta-large new
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}')
# load model
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE) 
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-11 15:26:33.707[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 3>[0m:[36m3[0m - [1mdevice: cuda, pooling: cls, model path: roberta-large[0m
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


trainable params: 786432 || all params: 356146176 || trainable%: 0.22081719613914932


[32m2023-04-11 15:26:41.916[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m48[0m - [1mepoch: 0[0m
 16%|█▋        | 124/758 [01:20<06:40,  1.58it/s][32m2023-04-11 15:28:48.275[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8461586767449017, 'eval_sickr_spearman': 0.8299913138389023, 'eval_avg_sts': 0.838074995291902}[0m
[32m2023-04-11 15:28:51.755[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m70[0m - [1mhigher corrcoef: 0.8462 in batch: 125, save model[0m
 33%|███▎      | 249/758 [03:28<05:21,  1.58it/s][32m2023-04-11 15:30:56.661[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8696973139437615, 'eval_sickr_spearman': 0.8438459330798274, 'eval_avg_sts': 0.8567716235117944}[0m
[32m2023-04-11 15:31:00.100[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m70[0m - [1mhigher corrcoef: 0.8697 in batch: 250, save model[0m

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 78.34 | 87.54 | 83.21 | 86.99 | 84.37 |    86.91     |      81.20      | 84.08 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


### Roberta-large batch size 364 with mlp with noize

In [None]:
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}')
# load model
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE) 
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)

[32m2023-04-11 14:26:54.102[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 1>[0m:[36m1[0m - [1mdevice: cuda, pooling: cls, model path: roberta-large[0m


HBox(children=(FloatProgress(value=0.0, description='Downloading pytorch_model.bin', max=1425941629.0, style=P…




Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


trainable params: 786432 || all params: 356146176 || trainable%: 0.22081719613914932


In [None]:
# train cls with mlp with noize
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')

[32m2023-04-11 14:27:31.718[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m48[0m - [1mepoch: 0[0m
 16%|█▌        | 124/766 [01:18<06:43,  1.59it/s][32m2023-04-11 14:29:43.389[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8452444625924865, 'eval_sickr_spearman': 0.8103035094527242, 'eval_avg_sts': 0.8277739860226054}[0m
[32m2023-04-11 14:29:46.706[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m70[0m - [1mhigher corrcoef: 0.8452 in batch: 125, save model[0m
 33%|███▎      | 249/766 [03:33<05:25,  1.59it/s][32m2023-04-11 14:31:51.343[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8634561395967669, 'eval_sickr_spearman': 0.8281217032154239, 'eval_avg_sts': 0.8457889214060954}[0m
[32m2023-04-11 14:31:54.817[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m70[0m - [1mhigher corrcoef: 0.8635 in batch: 250, save model[0

In [None]:
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 78.65 | 86.89 | 82.79 | 87.01 | 83.89 |    85.87     |      81.39      | 83.78 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


### **Roberta large batch size 375 rank1 use Lion

In [None]:
BATCH_SIZE = 375
SAVE_PATH = f'./saved_model/loracse_sup_{model_path}_r{RANK}_b{BATCH_SIZE}.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=True, num_workers=4)
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE) 
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-13 00:39:45.957[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 3>[0m:[36m3[0m - [1mdevice: cuda, pooling: cls, model path: roberta-large, batch size:375[0m
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


trainable params: 98304 || all params: 355458048 || trainable%: 0.0276555842674295


[32m2023-04-13 00:39:53.976[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m6[0m - [1mepoch: 0[0m
 16%|█▋        | 120/735 [01:19<06:38,  1.54it/s][32m2023-04-13 00:41:59.689[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.7708795611471897, 'eval_sickr_spearman': 0.7010389016620021, 'eval_avg_sts': 0.735959231404596}[0m
[32m2023-04-13 00:42:03.018[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m36[0m - [1mhigher corrcoef: 0.7709 in batch: 121, save model[0m
 33%|███▎      | 241/735 [03:26<05:19,  1.55it/s][32m2023-04-13 00:44:07.371[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m51[0m - [1m{'eval_stsb_spearman': 0.8382150286390734, 'eval_sickr_spearman': 0.8066187508181749, 'eval_avg_sts': 0.8224168897286241}[0m
[32m2023-04-13 00:44:10.839[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m36[0m - [1mhigher corrcoef: 0.8382 in batch: 242, save model[0m


------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 79.06 | 88.57 | 83.62 | 87.80 | 84.50 |    87.26     |      82.00      | 84.69 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


### Roberta large batch size 376 rank 1 use Lion

In [None]:
BATCH_SIZE = 376
SAVE_PATH = f'./saved_model/loracse_sup_{model_path}_r{RANK}_b{BATCH_SIZE}.pt'
logger.info(f'device: {DEVICE}, pooling: {POOLING}, model path: {model_path}, batch size:{BATCH_SIZE}')
# load model
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle=True, num_workers=4, pin_memory=True)
assert POOLING in ['cls', 'pooler', 'last-avg', 'first-last-avg']
model = SimcseModel(pretrained_model=model_path, pooling=POOLING, peft_config = PEFT_CONFIG).to(DEVICE)
# optimizer = Tiger(model.parameters(), lr=2e-4, weight_decay=1e-2)
optimizer = Lion(model.parameters(), lr=2e-4, weight_decay=1e-2)
# train cls with mlp
best=0
train(model, train_dataloader, optimizer, None, tokenizer)
logger.info(f'train is finished, best model is saved at {SAVE_PATH}')
# eval
model.load_state_dict(torch.load(SAVE_PATH))
test_evaluate(tokenizer, model)

[32m2023-04-13 05:28:42.297[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 3>[0m:[36m3[0m - [1mdevice: cuda, pooling: cls, model path: roberta-large, batch size:376[0m
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


trainable params: 98304 || all params: 355458048 || trainable%: 0.0276555842674295


[32m2023-04-13 05:28:49.856[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m6[0m - [1mepoch: 0[0m
 17%|█▋        | 121/733 [01:20<06:36,  1.54it/s][32m2023-04-13 05:30:56.538[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m50[0m - [1m{'eval_stsb_spearman': 0.7760251013795039, 'eval_sickr_spearman': 0.7337754290487559, 'eval_avg_sts': 0.7549002652141299}[0m
[32m2023-04-13 05:31:00.031[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m26[0m - [1mhigher corrcoef: 0.7760 in batch: 122, save model[0m
 33%|███▎      | 243/733 [03:28<05:18,  1.54it/s][32m2023-04-13 05:33:05.304[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate[0m:[36m50[0m - [1m{'eval_stsb_spearman': 0.8362516892058601, 'eval_sickr_spearman': 0.7974616740167233, 'eval_avg_sts': 0.8168566816112917}[0m
[32m2023-04-13 05:33:08.745[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain[0m:[36m26[0m - [1mhigher corrcoef: 0.8363 in batch: 244, save model[0m

------ test ------
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness |  Avg. |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
| 78.68 | 88.32 | 82.97 | 87.70 | 84.32 |    86.82     |      81.68      | 84.36 |
+-------+-------+-------+-------+-------+--------------+-----------------+-------+
+------+------+------+------+------+------+------+------+
|  MR  |  CR  | SUBJ | MPQA | SST2 | TREC | MRPC | Avg. |
+------+------+------+------+------+------+------+------+
| 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
+------+------+------+------+------+------+------+------+


# Restart The GPU

In [31]:
!apt-get install psmisc

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  psmisc
0 upgraded, 1 newly installed, 0 to remove and 44 not upgraded.
Need to get 52.5 kB of archives.
After this operation, 266 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 psmisc amd64 23.1-1ubuntu0.1 [52.5 kB]
Fetched 52.5 kB in 0s (108 kB/s)  
debconf: delaying package configuration, since apt-utils is not installed
Selecting previously unselected package psmisc.
(Reading database ... 13495 files and directories currently installed.)
Preparing to unpack .../psmisc_23.1-1ubuntu0.1_amd64.deb ...
Unpacking psmisc (23.1-1ubuntu0.1) ...
Setting up psmisc (23.1-1ubuntu0.1) ...


In [1]:
!fuser -v /dev/nvidia*

                     USER        PID ACCESS COMMAND
/dev/nvidia-modeset: root     kernel mount /dev/nvidia-modeset
/dev/nvidia-uvm:     root     kernel mount /dev/nvidia-uvm
/dev/nvidia-uvm-tools:
                     root     kernel mount /dev/nvidia-uvm-tools
/dev/nvidia0:        root     kernel mount /dev/nvidia0
/dev/nvidiactl:      root     kernel mount /dev/nvidiactl


In [None]:
!kill -9 657

In [42]:
del optimizer
del model

NameError: name 'optimizer' is not defined

In [44]:
import gc
torch.cuda.empty_cache()
gc.collect()

0

In [45]:
!nvidia-smi

Thu Apr 13 13:46:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:03:00.0 Off |                  Off |
| 30%   24C    P2    81W / 300W |   1085MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------