In [3]:
from dataclasses import dataclass
from typing import Optional, Union

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoModelForMultipleChoice, AutoTokenizer, Trainer, TrainingArguments
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase

# Checkpoint directories to run inference with; the inference cell loads each entry in turn.
model_paths = [
    "../output/001_additional_data/003/checkpoint-21000",
]

In [2]:
# Answer letter -> class index, and the inverse lookup derived from it.
option_to_index = dict(zip("ABCDE", range(5)))
index_to_option = dict(zip(option_to_index.values(), option_to_index.keys()))


def preprocess(example):
    """Tokenize one question as five (prompt, option) pairs and attach its label.

    Reads the module-level ``tokenizer`` and ``option_to_index``, so both must be
    bound before this function is called.

    Args:
        example: mapping with the keys ``"prompt"``, ``"A"`` .. ``"E"`` and ``"answer"``.

    Returns:
        Whatever ``tokenizer`` returns for the five pairs, with an extra ``"label"``
        entry holding the class index of ``example["answer"]``.
    """
    letters = "ABCDE"
    question = example["prompt"]
    # Pair the same prompt with each candidate answer.
    prompts = [question for _ in letters]
    candidates = [example[letter] for letter in letters]

    encoded = tokenizer(prompts, candidates, truncation=False)
    encoded["label"] = option_to_index[example["answer"]]
    return encoded


@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice examples into padded ``(batch, num_choices, seq_len)`` tensors.

    Each incoming feature is a mapping whose tokenizer fields (e.g. ``input_ids``) hold one
    sequence per answer choice, plus a class index stored under ``label`` or ``labels``.
    """

    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The three settings below are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Pad and stack ``features`` into one batch.

        Args:
            features: non-empty list of per-example mappings as described on the class.
                The mappings are not modified.

        Returns:
            dict of tensors: every padded tokenizer field viewed as
            ``(batch_size, num_choices, -1)``, plus ``"labels"`` as an int64 tensor.

        Raises:
            KeyError: if the first feature has no ``label`` key and a feature lacks ``labels``.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels instead of popping them: the caller's dicts stay intact, so the
        # same feature list can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat entry per (example, choice) pair, example-major, without the label field.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Restore the (example, choice) structure that was flattened for padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [5]:
%%time

all_preds = []
all_labels = []
for model_path in model_paths:
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    test_df = pd.read_csv("../input/kaggle-llm-science-exam/test.csv")
    test_df[
        "answer"
    ] = "A"  # dummy answer that allows us to preprocess the test datataset using functionality that works for the train set

    tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
        preprocess, remove_columns=["prompt", "A", "B", "C", "D", "E", "answer"]
    )
    data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
    test_dataloader = DataLoader(tokenized_test_dataset, 10, shuffle=False, collate_fn=data_collator)

    model = AutoModelForMultipleChoice.from_pretrained(model_path).cuda()
    model.eval()
    preds = []
    labels = []
    for batch in test_dataloader:
        for k in batch.keys():
            batch[k] = batch[k].cuda()
        with torch.no_grad():
            outputs = model(**batch)
        labels.append(batch["labels"].cpu().detach())
        preds.append(outputs.logits.cpu().detach())

    preds = torch.cat(preds)
    labels = torch.cat(labels)
    all_preds.append(preds)
    all_labels.append(labels)

all_preds = torch.stack(all_preds)

  0%|          | 0/200 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


CPU times: user 4.5 s, sys: 2.39 s, total: 6.89 s
Wall time: 5.91 s


In [6]:
# Sanity check of the stacked logits; axis 0 indexes the checkpoints listed in `model_paths`.
all_preds.shape

torch.Size([1, 200, 5])

In [8]:
# Average the logits over the checkpoint axis (axis 0 of the stacked predictions).
test_predictions = all_preds.mean(0)
# Rank the options of every question from highest to lowest score. Use torch's own argsort:
# `np.argsort` on a torch tensor depends on NumPy handing off to the tensor's method, which is fragile.
predictions_as_ids = torch.argsort(test_predictions, dim=1, descending=True)
predictions_as_ids[:3]

tensor([[3, 1, 2, 0, 4],
        [3, 2, 1, 0, 4],
        [2, 0, 4, 1, 3]])

In [9]:
# Translate each ranked option index into its answer letter by indexing a letter array.
predictions_as_answer_letters = np.asarray(list("ABCDE"))[predictions_as_ids]
predictions_as_answer_letters[:3]

array([['D', 'B', 'C', 'A', 'E'],
       ['D', 'C', 'B', 'A', 'E'],
       ['C', 'A', 'E', 'B', 'D']], dtype='<U1')

In [10]:
# Keep the three highest-ranked letters of every row, joined by single spaces.
predictions_as_string = [" ".join(top_three) for top_three in predictions_as_answer_letters[:, :3]]
test_df["prediction"] = predictions_as_string
predictions_as_string[:3]

['D B C', 'D C B', 'C A E']

In [12]:
# Keep only the two columns that are written to the submission file.
submission = test_df.loc[:, ["id", "prediction"]]
submission.to_csv("submission.csv", index=False)

# Read the file back to show what was actually written to disk.
pd.read_csv("submission.csv").tail()

Unnamed: 0,id,prediction
195,195,C A E
196,196,C B A
197,197,B A D
198,198,D B C
199,199,A E D
