## localとpublicが相関しなくなってきたので、validationを増やして相関するかどうかをチェックしてみる

In [1]:
import ctypes
import gc

import torch

libc = ctypes.CDLL("libc.so.6")

from dataclasses import dataclass
from typing import Optional, Union

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoModelForMultipleChoice, AutoTokenizer, Trainer, TrainingArguments
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase

In [3]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking
def precision_at_k(r, k):
    """Precision at k"""
    assert k <= len(r)
    assert k != 0
    return sum(int(x) for x in r[:k]) / k


def map_k(true_items, predictions, K=3):
    """Score is mean average precision at 3"""
    U = len(predictions)
    map_at_k = 0.0
    for u in range(U):
        user_preds = predictions[u]
        user_true = true_items[u]
        user_results = [1 if item == user_true else 0 for item in user_preds]
        for k in range(min(len(user_preds), K)):
            map_at_k += precision_at_k(user_results, k + 1) * user_results[k]
    return map_at_k / U


import numpy as np


def predictions_to_map_output(predictions):
    sorted_answer_indices = np.argsort(-predictions)  # Sortting indices in descending order
    top_answer_indices = sorted_answer_indices[:, :]  # Taking the first three indices for each row
    top_answers = np.vectorize(index_to_option.get)(
        top_answer_indices
    )  # Transforming indices to options - i.e., 0 --> A
    return np.apply_along_axis(lambda row: " ".join(row), 1, top_answers)


@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [4]:
df_valid = pd.read_csv("../preprocessed/000_base/000/train.csv")

models = [
    {"path": "../output/005_retrieval/001/checkpoint-39563", "mode": "005"},
    {"path": "../output/005_retrieval/000/checkpoint-14625", "mode": "005"},
    {"path": "../output/004_retrieval_truncate/001/checkpoint-41100", "mode": "004"},
    {"path": "../output/004_retrieval_truncate/000/checkpoint-21125", "mode": "004"},
    {"path": "../output/003_retrieval/000/checkpoint-16250", "mode": "003"},
    {"path": "../output/002_additional_datas/002/checkpoint-54804", "mode": "002"},
    {"path": "../output/002_additional_datas/001/checkpoint-43840", "mode": "002"},
    {"path": "../output/002_additional_datas/000/checkpoint-36550", "mode": "002"},
    {"path": "../output/001_additional_data/004/checkpoint-6504", "mode": "002"},
]

In [5]:
torch.cuda.empty_cache()

In [7]:
all_preds = []
all_labels = []
for model_dict in models:
    tokenizer = AutoTokenizer.from_pretrained(model_dict["path"])

    test_df = pd.concat(
        [
            pd.read_csv("../preprocessed/000_base/000/train.csv"),
            pd.read_csv("../preprocessed/000_base/000/6000_all_categories_questions.csv").head(500),
            pd.read_csv("../preprocessed/000_base/000/6000_wiki_en_sci_questions.csv").head(500),
        ]
    ).reset_index(drop=True)
    test_df["id"] = test_df.index

    option_to_index = {option: idx for idx, option in enumerate("ABCDE")}
    index_to_option = {v: k for k, v in option_to_index.items()}

    if model_dict["mode"] == "002":

        def preprocess(example):
            first_sentence = [example["prompt"]] * 5
            second_sentences = [example[option] for option in "ABCDE"]
            tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
            tokenized_example["label"] = option_to_index[example["answer"]]
            return tokenized_example

        tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
            preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
        )

    elif model_dict["mode"] == "003":
        test_df["prompt"] = test_df["context"].str.slice(0, 800) + " #### " + test_df["prompt"]

        def preprocess(example):
            first_sentence = [example["prompt"]] * 5
            second_sentences = [example[option] for option in "ABCDE"]
            tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
            tokenized_example["label"] = option_to_index[example["answer"]]
            return tokenized_example

        tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
            preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
        )

    elif model_dict["mode"] == "004":
        test_df["prompt"] = test_df["prompt"] + " ## " + test_df["context"]

        def preprocess(example):
            first_sentence = [example["prompt"]] * 5
            second_sentences = [example[option] for option in "ABCDE"]
            tokenized_example = tokenizer(first_sentence, second_sentences, truncation=True, max_length=384)
            tokenized_example["label"] = option_to_index[example["answer"]]
            return tokenized_example

        tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
            preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
        )

    elif model_dict["mode"] == "005":
        test_df["prompt"] = (
            test_df["context"].apply(lambda x: " ".join(x.split()[:100])) + f"... [SEP] " + test_df["prompt"]
        )

        def preprocess(example):
            first_sentence = [example["prompt"]] * 5
            second_sentences = [example[option] for option in "ABCDE"]
            tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
            tokenized_example["label"] = option_to_index[example["answer"]]
            return tokenized_example

        tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
            preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
        )

    data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
    test_dataloader = DataLoader(
        tokenized_test_dataset,
        1,
        shuffle=False,
        collate_fn=data_collator,
    )

    model = AutoModelForMultipleChoice.from_pretrained(model_dict["path"]).cuda()
    model.eval()
    preds = []
    labels = []
    for batch in test_dataloader:
        for k in batch.keys():
            batch[k] = batch[k].cuda()
        with torch.no_grad():
            outputs = model(**batch)
        labels.append(batch["labels"].cpu().detach())
        preds.append(outputs.logits.cpu().detach())

    preds = torch.cat(preds)
    labels = torch.cat(labels)

    result_dict = {
        "old_map@3": map_k(df_valid["answer"].to_numpy(), predictions_to_map_output(preds[: len(df_valid), :])),
        "new_map@3": map_k(test_df["answer"].to_numpy(), predictions_to_map_output(preds)),
    }

    print(model_dict)
    print(result_dict)
    del model
    _ = gc.collect()
    libc.malloc_trim(0)
    torch.cuda.empty_cache()

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/005_retrieval/001/checkpoint-39563', 'mode': '005'}
{'old_map@3': 0.8683333333333336, 'new_map@3': 0.8152777777777804}


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/005_retrieval/000/checkpoint-14625', 'mode': '005'}
{'old_map@3': 0.7049999999999998, 'new_map@3': 0.6580555555555573}


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/004_retrieval_truncate/001/checkpoint-41100', 'mode': '004'}
{'old_map@3': 0.8516666666666669, 'new_map@3': 0.8125000000000023}


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/004_retrieval_truncate/000/checkpoint-21125', 'mode': '004'}
{'old_map@3': 0.7433333333333332, 'new_map@3': 0.6461111111111124}


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/003_retrieval/000/checkpoint-16250', 'mode': '003'}
{'old_map@3': 0.7166666666666666, 'new_map@3': 0.6611111111111124}


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/002_additional_datas/002/checkpoint-54804', 'mode': '002'}
{'old_map@3': 0.8183333333333335, 'new_map@3': 0.7552777777777812}


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/002_additional_datas/001/checkpoint-43840', 'mode': '002'}
{'old_map@3': 0.7250000000000001, 'new_map@3': 0.7186111111111146}


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/002_additional_datas/000/checkpoint-36550', 'mode': '002'}
{'old_map@3': 0.7049999999999998, 'new_map@3': 0.7375000000000037}


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/001_additional_data/004/checkpoint-6504', 'mode': '002'}
{'old_map@3': 0.6499999999999995, 'new_map@3': 0.533888888888889}


In [10]:
all_preds = []
all_labels = []
for model_dict in models:
    tokenizer = AutoTokenizer.from_pretrained(model_dict["path"])

    test_df = pd.concat(
        [
            pd.read_csv("../preprocessed/000_base/000/train.csv"),
            pd.read_csv("../preprocessed/000_base/000/6000_all_categories_questions.csv").head(200),
            pd.read_csv("../preprocessed/000_base/000/6000_wiki_en_sci_questions.csv").head(400),
        ]
    ).reset_index(drop=True)
    test_df["id"] = test_df.index

    option_to_index = {option: idx for idx, option in enumerate("ABCDE")}
    index_to_option = {v: k for k, v in option_to_index.items()}

    if model_dict["mode"] == "002":

        def preprocess(example):
            first_sentence = [example["prompt"]] * 5
            second_sentences = [example[option] for option in "ABCDE"]
            tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
            tokenized_example["label"] = option_to_index[example["answer"]]
            return tokenized_example

        tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
            preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
        )

    elif model_dict["mode"] == "003":
        test_df["prompt"] = test_df["context"].str.slice(0, 800) + " #### " + test_df["prompt"]

        def preprocess(example):
            first_sentence = [example["prompt"]] * 5
            second_sentences = [example[option] for option in "ABCDE"]
            tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
            tokenized_example["label"] = option_to_index[example["answer"]]
            return tokenized_example

        tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
            preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
        )

    elif model_dict["mode"] == "004":
        test_df["prompt"] = test_df["prompt"] + " ## " + test_df["context"]

        def preprocess(example):
            first_sentence = [example["prompt"]] * 5
            second_sentences = [example[option] for option in "ABCDE"]
            tokenized_example = tokenizer(first_sentence, second_sentences, truncation=True, max_length=384)
            tokenized_example["label"] = option_to_index[example["answer"]]
            return tokenized_example

        tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
            preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
        )

    elif model_dict["mode"] == "005":
        test_df["prompt"] = (
            test_df["context"].apply(lambda x: " ".join(x.split()[:100])) + f"... [SEP] " + test_df["prompt"]
        )

        def preprocess(example):
            first_sentence = [example["prompt"]] * 5
            second_sentences = [example[option] for option in "ABCDE"]
            tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
            tokenized_example["label"] = option_to_index[example["answer"]]
            return tokenized_example

        tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
            preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
        )

    data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
    test_dataloader = DataLoader(
        tokenized_test_dataset,
        1,
        shuffle=False,
        collate_fn=data_collator,
    )

    model = AutoModelForMultipleChoice.from_pretrained(model_dict["path"]).cuda()
    model.eval()
    preds = []
    labels = []
    for batch in test_dataloader:
        for k in batch.keys():
            batch[k] = batch[k].cuda()
        with torch.no_grad():
            outputs = model(**batch)
        labels.append(batch["labels"].cpu().detach())
        preds.append(outputs.logits.cpu().detach())

    preds = torch.cat(preds)
    labels = torch.cat(labels)

    result_dict = {
        "old_map@3": map_k(df_valid["answer"].to_numpy(), predictions_to_map_output(preds[: len(df_valid), :])),
        "new_map@3": map_k(test_df["answer"].to_numpy(), predictions_to_map_output(preds)),
    }

    print(model_dict)
    print(result_dict)
    del model
    _ = gc.collect()
    libc.malloc_trim(0)
    torch.cuda.empty_cache()

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/005_retrieval/001/checkpoint-39563', 'mode': '005'}
{'old_map@3': 0.8683333333333336, 'new_map@3': 0.8229166666666677}


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/005_retrieval/000/checkpoint-14625', 'mode': '005'}
{'old_map@3': 0.7049999999999998, 'new_map@3': 0.6795833333333325}


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/004_retrieval_truncate/001/checkpoint-41100', 'mode': '004'}
{'old_map@3': 0.8516666666666669, 'new_map@3': 0.8141666666666668}


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/004_retrieval_truncate/000/checkpoint-21125', 'mode': '004'}
{'old_map@3': 0.7433333333333332, 'new_map@3': 0.6762499999999991}


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/003_retrieval/000/checkpoint-16250', 'mode': '003'}
{'old_map@3': 0.7166666666666666, 'new_map@3': 0.6866666666666663}


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/002_additional_datas/002/checkpoint-54804', 'mode': '002'}
{'old_map@3': 0.8183333333333335, 'new_map@3': 0.7504166666666664}


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/002_additional_datas/001/checkpoint-43840', 'mode': '002'}
{'old_map@3': 0.7250000000000001, 'new_map@3': 0.6766666666666658}


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/002_additional_datas/000/checkpoint-36550', 'mode': '002'}
{'old_map@3': 0.7049999999999998, 'new_map@3': 0.6824999999999989}


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/001_additional_data/004/checkpoint-6504', 'mode': '002'}
{'old_map@3': 0.6499999999999995, 'new_map@3': 0.5520833333333318}
