https://www.kaggle.com/code/stpeteishii/spotify-review-torch-roberta-peft

# 关于PEFT

PEFT（Parameter-Efficient Fine-Tuning，参数高效微调）是一种机器学习技术，主要用于针对特定任务微调大型预训练模型（如 BERT、GPT 等 Transformer 模型）。PEFT 的主要目标是减少微调过程中需要更新的参数数量，从而大幅降低计算成本和内存占用，同时保持甚至提升模型性能。

# PEFT的核心概念：

效率：通过只关注一部分参数，PEFT避免了微调模型所有参数的需要，使过程更快，资源密集度更低。
内存和计算节省：PEFT大大减少了所需的GPU内存量，使在较小的硬件设置上微调非常大的模型成为可能。
性能保持：尽管微调的参数较少，PEFT技术通常仍能达到与全量微调相当、有时甚至更优的性能。
应用领域：PEFT广泛应用于自然语言处理（NLP）、计算机视觉和其他涉及大规模模型的人工智能任务，允许通用模型有效地适应特定应用。

# PEFT中使用的技术：
适配器：插入预训练模型各层的小型神经网络模块，在不改变主模型权重的情况下学习特定任务的信息。
低秩自适应（LoRA）：一种对模型权重的低秩更新进行微调的方法，减少了需要训练的参数数量。
前缀微调（Prefix Tuning）：在输入序列前添加可训练的任务特定向量（前缀）来影响模型输出，从而在不改变模型核心参数的情况下使其适应特定任务。
BitFit（仅偏置微调）：仅微调模型的偏置项，不影响大多数权重。

In [1]:
import numpy as np 
import pandas as pd 
import os
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import matplotlib.pyplot as plt 
import transformers
import random
import chardet
import warnings
# Silence every warning category for the rest of the session; this also hides
# deprecation notices raised by the libraries used below.
warnings.simplefilter('ignore')
# Gradient scaler for CUDA mixed-precision training. It is not referenced in
# the portion of the notebook shown here.
scaler = torch.cuda.amp.GradScaler() 
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression so the notebook echoes the selected device.
device

device(type='cuda')

In [2]:
def random_seed(SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    reproducible runs.

    Args:
        SEED: Integer seed shared by all generators.
    """
    random.seed(SEED)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select a different kernel on each run, so it is
    # switched off explicitly instead of relying on the current global state.
    torch.backends.cudnn.benchmark = False


SEED = 508
random_seed(SEED)

In [3]:
import os
# Switch the working directory to a local project folder; presumably this is
# what lets the relative path 'DATASET.csv' in the next cell resolve.
# NOTE(review): this is an absolute Windows path specific to one machine --
# adjust it before running the notebook elsewhere.
os.chdir(r'E:\Python code\Spotify用户评论')

In [4]:
# Load the review dataset and drop every row that contains a missing value.
data=pd.read_csv('DATASET.csv')
data=data.dropna()
# Draw a random subset of at most 5000 rows: shuffle the row positions with
# Python's `random` module (seeded earlier via random_seed) and keep the
# first 5000 of them.
n=len(data)
N=list(range(n))
random.shuffle(N)
data=data.iloc[N[0:5000]]
# Notebook preview of the sampled frame.
display(data)

Unnamed: 0,Review,label
36615,What is happening! Spotify will randomly start...,NEGATIVE
41766,Some of my songs are not downloaded and i dont...,NEGATIVE
5708,There should be an option to edit home screen,NEGATIVE
27751,I love Spotify for the large catalogue and the...,POSITIVE
22981,Its so cool so far the best app i found for fr...,POSITIVE
...,...,...
25867,The app quits by itself and restarts by itself...,NEGATIVE
47692,Just constantly recommending podcasts to me de...,NEGATIVE
9590,Keeps crashing piece of sh,POSITIVE
20563,The app is super glitchy. Currently 1 title is...,NEGATIVE


In [5]:
# Short column names used by the rest of the notebook.
data.columns = ['text', 'label']

# Distinct label strings in sorted order; a label's position in this list is
# its integer class id.
class_names = sorted(data['label'].unique().tolist())
print(class_names)

N = list(range(len(class_names)))
# label string -> class id
normal_mapping = {name: idx for name, idx in zip(class_names, N)}
# class id -> label string
reverse_mapping = {idx: name for idx, name in zip(N, class_names)}

# Replace the label strings with their integer ids.
data['label'] = data['label'].map(normal_mapping)

['NEGATIVE', 'POSITIVE']


In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sampled reviews as the test set; random_state pins the
# split so it is the same on every run.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [7]:
# Tokenizer for the `roberta-base` checkpoint. The commented-out line is an
# earlier BERT variant kept for reference.
#tokenizer = transformers.BertTokenizer.from_pretrained("../input/bert-base-uncased")
tokenizer = transformers.RobertaTokenizer.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [8]:
# Use the first training review as a sample sentence.
test_s = train['text'].iloc[0]

# Tokenize it without requesting padding or truncation.
result1 = tokenizer.encode_plus(test_s)

# Decode the ids back to text to inspect the special tokens that were added.
tokenizer.decode(result1["input_ids"])




'<s>Learning to love it! â\x9d¤</s>'

In [9]:
# Rough word count of the sample: number of space-separated pieces.
len(test_s.split(" "))

5

In [10]:
# Same sample, now padded (and truncated if necessary) to exactly 32 tokens.
result2 = tokenizer.encode_plus(
    test_s,
    add_special_tokens=True,
    max_length=32,
    # `pad_to_max_length=True` is deprecated; `padding="max_length"` is the
    # supported way to pad up to `max_length`.
    padding="max_length",
    truncation=True,
)
# Decode to confirm the trailing <pad> tokens.
tokenizer.decode(result2["input_ids"])


'<s>Learning to love it! â\x9d¤</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [11]:
# Fixed token length every review is padded to before it reaches the model.
max_sens = 32

# Sort by label and renumber the rows, then deal the rows into five folds by
# row index modulo 5.
train = train.sort_values("label").reset_index(drop=True)
train["kfold"] = train.index % 5

# Fold 0 is held out for validation; the other four folds are used for fitting.
p_train = train.loc[train["kfold"].ne(0)].reset_index(drop=True)
p_valid = train.loc[train["kfold"].eq(0)].reset_index(drop=True)

# Test rows get a clean 0..n-1 index as well.
p_test = test.reset_index(drop=True)

In [12]:
class BERTDataSet(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Tokenization relies on the module-level ``tokenizer`` and ``max_sens``
    defined earlier in the notebook.

    Args:
        sentences: Indexable collection of raw review strings.
        targets: Indexable collection of numeric labels aligned with
            ``sentences``.
    """

    def __init__(self,sentences,targets):
        self.sentences = sentences
        self.targets = targets

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self,idx):
        """Tokenize the sentence at ``idx`` and return it with its target.

        Returns:
            dict with keys ``'ids'`` (token ids, ``torch.long``), ``'mask'``
            (attention mask, ``torch.long``) and ``'targets'`` (label as a
            ``torch.float`` tensor).
        """
        sentence = self.sentences[idx]
        bert_sens = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_sens,
            # Every item must come out at exactly ``max_sens`` tokens so that
            # batches can be stacked: pad short reviews and explicitly
            # truncate long ones. (`pad_to_max_length` is deprecated, and
            # leaving truncation implicit relies on a library fallback.)
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

        ids = torch.tensor(bert_sens['input_ids'], dtype=torch.long)
        mask = torch.tensor(bert_sens['attention_mask'], dtype=torch.long)

        # Float target: the model in this notebook is created with
        # num_labels=1 and the loss defined below is an RMSE.
        target = torch.tensor(self.targets[idx], dtype=torch.float)

        return {
            'ids': ids,
            'mask': mask,
            'targets': target,
        }

In [13]:
# Wrap the text/label columns of each split in a BERTDataSet. The frames were
# re-indexed with reset_index(drop=True) above, which the dataset's
# `sentences[idx]` lookup depends on.
train_dataset = BERTDataSet(p_train["text"],p_train["label"])
valid_dataset = BERTDataSet(p_valid["text"],p_valid["label"])
test_dataset = BERTDataSet(p_test["text"],p_test["label"])

In [15]:
train_batch = 16
valid_batch = 32
test_batch = 32

# On Windows, DataLoader workers are separate spawned processes that must
# re-import the dataset class. A class defined in a notebook cell is not
# importable there, so loading with workers stalls or fails. Load in the main
# process on Windows and keep 8 workers elsewhere.
num_workers = 0 if os.name == "nt" else 8

# Only the training loader shuffles; validation and test keep their row order.
train_dataloader = DataLoader(train_dataset, batch_size=train_batch, shuffle=True,
                              num_workers=num_workers, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=valid_batch, shuffle=False,
                              num_workers=num_workers, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=test_batch, shuffle=False,
                             num_workers=num_workers, pin_memory=True)

In [17]:
# Pretrained RoBERTa encoder with a sequence-classification head; num_labels=1
# gives the head a single output per review. The commented-out line is an
# earlier BERT variant kept for reference.
model = transformers.RobertaForSequenceClassification.from_pretrained("roberta-base",num_labels=1)
#model = transformers.BertForSequenceClassification.from_pretrained("../input/bert-base-uncased",num_labels=1)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


peft setting for roberta

In [18]:
#peft setting for roberta

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# NOTE(review): the model above is loaded without any quantization arguments,
# while this helper is named for k-bit (quantized) training -- confirm that
# the call is actually needed here.
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,   # sequence-classification task
    inference_mode=False,         # presumably: build the adapters for training, not inference
    r=8,                          # LoRA rank
    lora_alpha=16,                # LoRA alpha (scaling) hyper-parameter
    lora_dropout=0.1              # dropout probability on the LoRA branch
)

# Wrap the base model with LoRA adapters and keep working with the wrapped
# model under the original name.
lora_model = get_peft_model(model, lora_config)
model = lora_model
# Print the module tree to inspect where the LoRA layers were inserted.
display(model)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [19]:
# Move the LoRA-wrapped model to the selected device and switch it to
# training mode.
model.to(device)
model.train()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [None]:
# Smoke test: push only the first training batch through the model (the loop
# exits after one iteration) and keep the result in `output`.
for a in train_dataloader:
    ids = a["ids"].to(device)
    mask = a["mask"].to(device)

    # Positional arguments: token ids first, then the attention mask.
    output = model(ids,mask)
    break

In [None]:
# Show the shape of the logits after dropping the trailing singleton
# dimension. Left as a bare expression so the notebook displays it and
# `output` keeps the model output (assigning the shape back to `output` would
# overwrite it and make this cell fail when re-run).
output["logits"].squeeze(-1).shape

In [None]:
# `transformers.AdamW` is deprecated and has been removed from recent
# transformers releases; the PyTorch implementation is the replacement and
# keeps the `AdamW` name in scope.
from torch.optim import AdamW
LR=2e-5
# Parameters frozen by the PEFT wrapper receive no gradients, so the
# optimizer leaves them untouched.
optimizer = AdamW(model.parameters(), LR, betas=(0.9, 0.999), weight_decay=1e-2)

set epochs

In [None]:
from transformers import get_linear_schedule_with_warmup
epochs = 10
#if debug:
#    epochs = 1

# Total number of batches over all epochs. len(train_dataloader) also counts a
# final partial batch, which dividing the row count by the batch size and
# truncating can miss.
# NOTE(review): assumes the training loop calls scheduler.step() once per
# batch -- confirm against the loop.
train_steps = len(train_dataloader) * epochs
print(train_steps)
# Warm up linearly over the first 10% of the steps, then decay linearly.
num_steps = int(train_steps*0.1)
scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

def loss_fn(output,target):
    """Root-mean-square error between predictions and targets.

    Args:
        output: Tensor of model predictions.
        target: Tensor of reference values.

    Returns:
        Scalar tensor: the square root of the mean squared difference.
    """
    mean_squared = nn.functional.mse_loss(output, target)
    return mean_squared.sqrt()