In [1]:
!pip install transformers datasets sentencepiece scikit-learn
!pip install transformers[sentencepiece]




In [7]:
# Step 1: Import necessary libraries
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [8]:
# Step 2: Load the dataset
# Both CSVs are read from the current working directory, train first, then test.
df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [11]:
# Step 3: Process labels and handle tie cases
def preprocess_pairwise(row):
    """Turn one raw comparison row into a pairwise classification example.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing ``prompt``,
            ``response_a``, ``response_b`` and the three outcome flags
            ``winner_model_a``, ``winner_model_b`` and ``winner_tie``.

    Returns:
        A dict with ``text_a``, ``text_b`` and an integer ``label``
        (0 = model A wins, 1 = model B wins, 2 = tie), or ``None`` when none
        of the outcome flags equals 1.
    """
    prompt = row['prompt']
    text_a = f"[Prompt] {prompt} [Response] {row['response_a']}"
    text_b = f"[Prompt] {prompt} [Response] {row['response_b']}"

    # The position of a column in this tuple is its class id; the first flag
    # that is set wins, and later flags are not even looked at.
    outcome_columns = ("winner_model_a", "winner_model_b", "winner_tie")
    label = next(
        (class_id for class_id, column in enumerate(outcome_columns) if row[column] == 1),
        None,
    )
    if label is None:
        return None  # invalid data: no outcome flag is set

    return {"text_a": text_a, "text_b": text_b, "label": label}

In [12]:
# Convert every raw row into a pairwise example; rows for which
# preprocess_pairwise returns None (no winner flag set) are dropped.
processed_data = df.apply(preprocess_pairwise, axis=1).dropna().tolist()
processed_df = pd.DataFrame(processed_data)

# Step 4: Split into training and validation sets (90% / 10%, fixed seed)
train_df, val_df = train_test_split(processed_df, test_size=0.1, random_state=42)
# preserve_index=False stops the shuffled pandas index from being stored in
# the datasets as a stray "__index_level_0__" column.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds = Dataset.from_pandas(val_df, preserve_index=False)

In [13]:
# Step 4: Tokenization
# Checkpoint identifier shared by the tokenizer here and the model loaded later,
# so both always refer to the same pretrained weights/vocabulary.
MODEL_NAME = "microsoft/deberta-v3-large"
# Tokenizer matching MODEL_NAME; used by tokenize_function and the Trainer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
def tokenize_function(example, max_length=512, tok=None):
    """Tokenize a batch of (text_a, text_b) pairs as sentence-pair inputs.

    Padding is intentionally NOT applied here: the Trainer is given a
    ``DataCollatorWithPadding``, which pads each batch only to its own longest
    sequence. Padding every example to ``max_length`` up front would make that
    collator a no-op and waste memory and compute on short examples.

    Args:
        example: Batch mapping with ``text_a`` and ``text_b`` entries.
        max_length: Maximum combined token length; longer pairs are truncated.
        tok: Optional tokenizer callable to use instead of the notebook-level
            ``tokenizer`` (useful for experiments and tests).

    Returns:
        Whatever the tokenizer returns for the pair (its encoded features).
    """
    # Resolve lazily so the global is only required when no override is given.
    encode = tokenizer if tok is None else tok
    return encode(
        example["text_a"], example["text_b"],
        truncation=True, max_length=max_length,
    )

In [15]:
# Tokenize both splits in batched mode: training split first, then validation.
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True) for split in (train_ds, val_ds)
)

Map:   0%|          | 0/51729 [00:00<?, ? examples/s]

Map:   0%|          | 0/5748 [00:00<?, ? examples/s]

In [16]:
# Load the pretrained checkpoint with a 3-way sequence-classification head,
# one output per label id produced in preprocessing (0 = A wins, 1 = B wins, 2 = tie).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

In [17]:
# Training configuration, gathered in one mapping so the knobs are easy to scan.
# Evaluation and checkpointing both happen once per epoch, which is what lets
# load_best_model_at_end restore the best checkpoint when training finishes.
# NOTE(review): learning_rate is not set, so the library default applies;
# consider setting it explicitly for a model of this size — TODO confirm.
training_config = {
    "output_dir": "./results",
    "logging_dir": "./logs",
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 1,
    "load_best_model_at_end": True,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "weight_decay": 0.01,
}
training_args = TrainingArguments(**training_config)

In [18]:
# Collator handed to the Trainer to assemble batches; it uses the same
# tokenizer as the rest of the notebook for its padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [19]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(logits, labels)`` as unpacked below; the predicted
            class of each row is the index of its largest logit.

    Returns:
        dict: ``{"accuracy": <fraction of rows predicted correctly>}``.
    """
    scores, gold = eval_pred
    predicted = np.asarray(scores).argmax(axis=1)
    return {"accuracy": accuracy_score(gold, predicted)}


In [20]:

# Wire the model, training configuration, tokenized splits, batch collator and
# accuracy metric into a single Trainer used for both training and prediction.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,  # evaluated according to training_args
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # reports {"accuracy": ...}
)


In [None]:
# Fine-tune the model. Per training_args this runs 3 epochs, evaluating and
# checkpointing after each one, and reloads the best checkpoint at the end.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: text_b, __index_level_0__, text_a. If text_b, __index_level_0__, text_a are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 51729
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 38799
  Number of trainable parameters = 435064835
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Build the test-time text pairs with the same "[Prompt] ... [Response] ..."
# template used for training. Columns are assembled column-wise instead of via
# a row-by-row iterrows() loop: it is faster, and an empty test set still
# yields a frame that has both text_a and text_b columns.
# .map(str) mirrors f-string formatting (e.g. a missing value becomes "nan").
prompt_text = test_df["prompt"].map(str)
test_data = pd.DataFrame({
    "text_a": "[Prompt] " + prompt_text + " [Response] " + test_df["response_a"].map(str),
    "text_b": "[Prompt] " + prompt_text + " [Response] " + test_df["response_b"].map(str),
}).reset_index(drop=True)  # plain 0..n-1 index, as the list-of-dicts form produced

In [None]:
# Tokenize the test pairs with the same function used for the training data.
test_ds = Dataset.from_pandas(test_data)
tokenized_test = test_ds.map(tokenize_function, batched=True)

# Run inference, convert the raw logits to per-class probabilities, and take
# the most likely class for each row.
predictions = trainer.predict(tokenized_test)
test_logits = torch.tensor(predictions.predictions)
probs = test_logits.softmax(dim=1)
predicted_labels = probs.argmax(dim=1).numpy()

# Step 10: Save the submission file
# Class ids map back to the competition's answer strings.
label_map = {0: "a", 1: "b", 2: "tie"}
test_df["prediction"] = list(map(label_map.__getitem__, predicted_labels))
test_df[["id", "prediction"]].to_csv("submission.csv", index=False)

print("✅ Submission file saved as submission.csv")