In [29]:
!pip install evaluate
import os
import sys
import logging
import datasets
import evaluate
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from peft import TaskType, PromptEncoderConfig, get_peft_model
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




运行环境为本地主机

## 预处理数据

注册logger

In [30]:
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info(r"running %s" % ''.join(sys.argv))

In [31]:
trainDataPath = r"/kaggle/input/imdb-sentiment-analysis/labeledTrainData.tsv"
testDataPath = r"/kaggle/input/imdb-sentiment-analysis/testData.tsv"

train = pd.read_csv(trainDataPath, header=0, delimiter="\t", quoting=3)
test = pd.read_csv(testDataPath, header=0, delimiter="\t", quoting=3)
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [32]:
train, val = train_test_split(train, test_size=0.2)

train_dict = {'label': train["sentiment"], 'text': train['review']}
val_dict = {'label': val["sentiment"], 'text': val['review']}
test_dict = {"text": test['review']}

train_dataset = datasets.Dataset.from_dict(train_dict)
val_dataset = datasets.Dataset.from_dict(val_dict)
test_dataset = datasets.Dataset.from_dict(test_dict)

In [33]:
checkpoint = "microsoft/deberta-v2-xxlarge"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    return tokenizer(examples['text'], max_length=512, truncation=True)


tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)
print(tokenized_train)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 20000
})


## 创建模型

In [34]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

peft_config = PromptEncoderConfig(
    num_virtual_tokens=20,
    encoder_hidden_size=128,
    task_type=TaskType.SEQ_CLS
)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

pytorch_model.bin:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v2-xxlarge and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 445,186 || all params: 1,567,358,724 || trainable%: 0.0284


## Trainer

In [35]:
def compute_metrics(eval_preds):
    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [36]:
training_args = TrainingArguments(
        run_name="deberta-prefix",
        output_dir='./deberta-v2-xxlarge-prefix',  
        learning_rate=1e-5,
        num_train_epochs=3,  
        per_device_train_batch_size=1,  
        per_device_eval_batch_size=1,  
        weight_decay=0.01,
        eval_strategy="epoch",
        report_to="none",
)

使用​​前缀调整（Prefix Tuning）​​方法微调模型时，当前模型不支持​​past key values​​（历史键值对）功能，而这是前缀调整的必要条件

In [None]:
trainer = Trainer(
    model=model,  
    args=training_args,  
    train_dataset=tokenized_train, 
    eval_dataset=tokenized_val, 
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss


## 预测，并保存数据

In [None]:
import os

# 确保result文件夹存在
os.makedirs("./result", exist_ok=True)  # 自动创建且不报错（Python 3.2+）

prediction_outputs = trainer.predict(tokenized_test)
test_pred = np.argmax(prediction_outputs[0], axis=-1).flatten()
print(test_pred)

result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred})
result_output.to_csv("./result/deberta_prefix.csv", index=False, quoting=3)