In [1]:
import ctypes
import gc

import torch

libc = ctypes.CDLL("libc.so.6")

from dataclasses import dataclass
from typing import Optional, Union

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoModelForMultipleChoice, AutoTokenizer, Trainer, TrainingArguments
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase



In [2]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking
def precision_at_k(r, k):
    """Precision at k"""
    assert k <= len(r)
    assert k != 0
    return sum(int(x) for x in r[:k]) / k


def map_k(true_items, predictions, K=3):
    """Score is mean average precision at 3"""
    U = len(predictions)
    map_at_k = 0.0
    for u in range(U):
        user_preds = predictions[u]
        user_true = true_items[u]
        user_results = [1 if item == user_true else 0 for item in user_preds]
        for k in range(min(len(user_preds), K)):
            map_at_k += precision_at_k(user_results, k + 1) * user_results[k]
    return map_at_k / U


import numpy as np


def predictions_to_map_output(predictions):
    sorted_answer_indices = np.argsort(-predictions)  # Sortting indices in descending order
    top_answer_indices = sorted_answer_indices[:, :]  # Taking the first three indices for each row
    top_answers = np.vectorize(index_to_option.get)(
        top_answer_indices
    )  # Transforming indices to options - i.e., 0 --> A
    return np.apply_along_axis(lambda row: " ".join(row), 1, top_answers)


@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [3]:
df_valid = pd.read_csv("../preprocessed/000_base/000/train.csv")

model_dict = {"path": "../output/005_retrieval/001/checkpoint-39563", "mode": "005"}

all_preds = []
all_labels = []
tokenizer = AutoTokenizer.from_pretrained(model_dict["path"])

test_df = pd.concat(
    [
        pd.read_csv("../preprocessed/000_base/000/train.csv"),
        pd.read_csv("../preprocessed/000_base/000/6000_all_categories_questions.csv").head(500),
        pd.read_csv("../preprocessed/000_base/000/6000_wiki_en_sci_questions.csv").head(500),
    ]
).reset_index(drop=True)
test_df["id"] = test_df.index

option_to_index = {option: idx for idx, option in enumerate("ABCDE")}
index_to_option = {v: k for k, v in option_to_index.items()}

if model_dict["mode"] == "002":

    def preprocess(example):
        first_sentence = [example["prompt"]] * 5
        second_sentences = [example[option] for option in "ABCDE"]
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
        tokenized_example["label"] = option_to_index[example["answer"]]
        return tokenized_example

    tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
        preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
    )

elif model_dict["mode"] == "003":
    test_df["prompt"] = test_df["context"].str.slice(0, 800) + " #### " + test_df["prompt"]

    def preprocess(example):
        first_sentence = [example["prompt"]] * 5
        second_sentences = [example[option] for option in "ABCDE"]
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
        tokenized_example["label"] = option_to_index[example["answer"]]
        return tokenized_example

    tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
        preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
    )

elif model_dict["mode"] == "004":
    test_df["prompt"] = test_df["prompt"] + " ## " + test_df["context"]

    def preprocess(example):
        first_sentence = [example["prompt"]] * 5
        second_sentences = [example[option] for option in "ABCDE"]
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation=True, max_length=384)
        tokenized_example["label"] = option_to_index[example["answer"]]
        return tokenized_example

    tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
        preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
    )

elif model_dict["mode"] == "005":
    test_df["prompt"] = (
        test_df["context"].apply(lambda x: " ".join(x.split()[:100])) + f"... [SEP] " + test_df["prompt"]
    )

    def preprocess(example):
        first_sentence = [example["prompt"]] * 5
        second_sentences = [example[option] for option in "ABCDE"]
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
        tokenized_example["label"] = option_to_index[example["answer"]]
        return tokenized_example

    tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
        preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
    )

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

In [4]:
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
test_dataloader = DataLoader(
    tokenized_test_dataset,
    1,
    shuffle=False,
    collate_fn=data_collator,
)

model = AutoModelForMultipleChoice.from_pretrained(model_dict["path"]).cuda()
model.eval()
preds = []
labels = []
for batch in test_dataloader:
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    labels.append(batch["labels"].cpu().detach())
    preds.append(outputs.logits.cpu().detach())

preds = torch.cat(preds)
labels = torch.cat(labels)

result_dict = {
    "old_map@3": map_k(df_valid["answer"].to_numpy(), predictions_to_map_output(preds[: len(df_valid), :])),
    "new_map@3": map_k(test_df["answer"].to_numpy(), predictions_to_map_output(preds)),
}

print(model_dict)
print(result_dict)
del model
_ = gc.collect()
libc.malloc_trim(0)
torch.cuda.empty_cache()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/005_retrieval/001/checkpoint-39563', 'mode': '005'}
{'old_map@3': 0.8683333333333336, 'new_map@3': 0.8152777777777804}


In [17]:
def predictions_to_map_output_top(predictions):
    sorted_answer_indices = np.argsort(-predictions)  # Sortting indices in descending order
    top_answer_indices = sorted_answer_indices[:, :]  # Taking the first three indices for each row
    top_answers = np.vectorize(index_to_option.get)(
        top_answer_indices
    )  # Transforming indices to options - i.e., 0 --> A
    return np.apply_along_axis(lambda row: " ".join(row[0]), 1, top_answers)


valid_preds = predictions_to_map_output_top(preds[: len(df_valid), :])
df_valid["preds"] = valid_preds

In [20]:
df_valid.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer,context,preds
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,The presence of a clustered thick disk-like co...,D
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,Many of these systems evolve in a self-similar...,A
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,It is possible that this usage is related with...,A
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C,Renormalization is distinct from regularizatio...,C
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D,Several qualitative observations can be made o...,D


In [39]:
wrong_df = df_valid.query("answer != preds").reset_index(drop=True)
df_valid["input"] = df_valid["context"].apply(lambda x: " ".join(x.split()[:100])) + f"... [SEP] " + df_valid["prompt"]


def print_wrong(index):
    for col, row in zip(df_valid.columns, wrong_df.iloc[index]):
        print(f"【{col}】:", row)

In [40]:
"""
ミスの原因？：質問同士がだいぶ近いので、区別が難しい
"""
print_wrong(0)

【id】: 13
【prompt】: What is the Roche limit?
【A】: The Roche limit is the distance at which tidal effects would cause an object to rotate since the forces exerted by two massive bodies produce a torque on a third object.
【B】: The Roche limit is the distance at which tidal effects would cause an object to unite since differential force from a planet results in parts becoming attracted to one another.
【C】: The Roche limit is the distance at which tidal effects would cause a planet to disintegrate since differential force from an object overcomes the planet's core.
【D】: The Roche limit is the distance at which tidal effects would cause an object to disintegrate since differential force from a planet overcomes the attraction of the parts between them.
【E】: The Roche limit is the distance at which tidal effects would cause an object to break apart due to differential force from the planet overcoming the attraction of the parts of the object for one another, which depends on the object's densi

In [41]:
"""
ミスの原因？

- hitしたwikipediaが違う
 - 正解：https://en.wikipedia.org/wiki/Causality_(physics)
 - 間違い：butterfly effect


タイトルと１行目からでは分からない内容を抽出しているので、現状のretrieval の方法ではカバーするのが難しい
"""
print_wrong(1)

【id】: 17
【prompt】: What is the butterfly effect?
【A】: The butterfly effect is a physical cause that occurs when a massive sphere is caused to roll down a slope starting from a point of unstable equilibrium, and its velocity is assumed to be caused by the force of gravity accelerating it.
【B】: The butterfly effect is a distributed causality that opens up the opportunity to understand the relationship between necessary and sufficient conditions in classical (Newtonian) physics.
【C】: The butterfly effect is a proportionality between the cause and the effect of a physical phenomenon in classical (Newtonian) physics.
【D】: The butterfly effect is a small push that is needed to set a massive sphere into motion when it is caused to roll down a slope starting from a point of unstable equilibrium.
【E】: The butterfly effect is a phenomenon that highlights the difference between the application of the notion of causality in physics and a more general use of causality as represented by Mackie's INU

In [44]:
"""
ミスの原因？


- 抽出はできているが優先順位が低いのでcutされる
  - >  == Reactive Leidenfrost effect == thumb|Reactive Leidenfrost effect of cellulose on silica, Non-volatile materials were discovered in 2015 to also exhibit a 'reactive Leidenfrost effect', whereby solid particles were observed to float above hot surfaces and skitter around erratically. 
→各選択肢との類似度を取ればcutされずに上位に入りそう
"""
print_wrong(2)

【id】: 18
【prompt】: What is the 'reactive Leidenfrost effect' observed in non-volatile materials?
【A】: The 'reactive Leidenfrost effect' is a phenomenon where solid particles float above hot surfaces and move erratically, observed in non-volatile materials.
【B】: The 'reactive Leidenfrost effect' is a phenomenon where solid particles float above hot surfaces and move erratically, observed in volatile materials.
【C】: The 'reactive Leidenfrost effect' is a phenomenon where solid particles sink into hot surfaces and move slowly, observed in non-volatile materials.
【D】: The 'reactive Leidenfrost effect' is a phenomenon where solid particles float above cold surfaces and move erratically, observed in non-volatile materials.
【E】: The 'reactive Leidenfrost effect' is a phenomenon where solid particles sink into cold surfaces and move slowly, observed in non-volatile materials.
【answer】: A
【context】: The new phenomenon of a 'reactive Leidenfrost (RL) effect' was characterized by a dimensionless 

In [45]:
"""
ミスの原因？

hit はしているがうまく抽出できていない
- 必要なS2に関する情報をhttps://en.wikipedia.org/wiki/Supermassive_black_hole　から取得できていない
- Referencesなどの不要なものが上位に入ってしまっている
→各選択肢との類似度を取ればcutされずに上位に入りそう
"""
print_wrong(3)

【id】: 28
【prompt】: What is the evidence for the existence of a supermassive black hole at the center of the Milky Way galaxy?
【A】: The Milky Way galaxy has a supermassive black hole at its center because of the bright flare activity observed near Sagittarius A*. The radius of the central object must be less than 17 light-hours, because otherwise S2 would collide with it. Observations of the star S14 indicate that the radius is no more than 6.25 light-hours, about the diameter of Uranus' orbit. No known astronomical object other than a black hole can contain 4.0 million M☉ in this volume of space.
【B】: The Milky Way galaxy has a supermassive black hole at its center because the star S14 follows an elliptical orbit with a period of 15.2 years and a pericenter of 17 light-hours from the center of the central object. From the motion of star S14, the object's mass can be estimated as 4.0 million M☉, or about 7.96×1036 kg. The radius of the central object must be less than 17 light-hours, be

In [46]:
"""
ミスの原因？

- hitしたwikipediaが違う
 - 不正解：https://en.wikipedia.org/wiki/Gravity_Probe_B
 - 正解：https://en.wikipedia.org/wiki/Spacetime
"""
print_wrong(4)

【id】: 35
【prompt】: What was the aim of the Gravity Probe B (GP-B) mission?
【A】: To prove that pressure contributes equally to spacetime curvature as does mass-energy.
【B】: To measure spacetime curvature near Earth, with particular emphasis on gravitomagnetism.
【C】: To measure the distribution of Fe and Al on the Moon's surface.
【D】: To confirm the relatively large geodetic effect due to simple spacetime curvature, and is also known as de Sitter precession.
【E】: To measure the discrepancy between active and passive mass to about 10−12.
【answer】: B
【context】: Gravity Probe B (GP-B) was a satellite-based experiment to test two unverified predictions of general relativity: the geodetic effect and frame-dragging. In a public press and media event at NASA Headquarters, GP-B Principal Investigator, Francis Everitt presented the final results of Gravity Probe B. ;19 November 2015 : Publication of GP-B Special Volume (Volume #32, Issue #22) in the peer-reviewed journal, Classical and Quantum Gr

In [47]:
"""
ミスの原因？

- hitしたwikipediaが違う
 - 不正解：https://en.wikipedia.org/wiki/Synaptic_transistor
 - 正解：https://en.wikipedia.org/wiki/Memristor
"""
print_wrong(5)

【id】: 39
【prompt】: What is the synapstor or synapse transistor?
【A】: A device used to demonstrate a neuro-inspired circuit that shows short-term potentiation for learning and inactivity-based forgetting.
【B】: A device used to demonstrate a neuro-inspired circuit that shows long-term potentiation for learning and activity-based forgetting.
【C】: A device used to demonstrate a neuro-inspired circuit that shows short-term depression for learning and inactivity-based forgetting.
【D】: A device used to demonstrate a neuro-inspired circuit that shows short-term potentiation for learning and activity-based forgetting.
【E】: A device used to demonstrate a neuro-inspired circuit that shows long-term potentiation for learning and inactivity-based forgetting.
【answer】: E
【context】: A synaptic transistor is an electrical device that can learn in ways similar to a neural synapse. The input and output of the synaptic transistor are continuous analog values, rather than digital on-off signals. A network

In [48]:
"""
ミスの原因？

- hitしたwikipediaが違う
 - 不正解: https://en.wikipedia.org/wiki/MACS0647-JD
 - 正解：https://en.wikipedia.org/wiki/Observable_universe
"""
print_wrong(6)

【id】: 41
【prompt】: What is the proper distance for a redshift of 8.2?
【A】: The proper distance for a redshift of 8.2 is about 6.2 Gpc, or about 24 billion light-years.
【B】: The proper distance for a redshift of 8.2 is about 7.2 Gpc, or about 26 billion light-years.
【C】: The proper distance for a redshift of 8.2 is about 9.2 Gpc, or about 30 billion light-years.
【D】: The proper distance for a redshift of 8.2 is about 8.2 Gpc, or about 28 billion light-years.
【E】: The proper distance for a redshift of 8.2 is about 10.2 Gpc, or about 32 billion light-years.
【answer】: C
【context】: __NOTOC__ MACS0647-JD is a galaxy with a redshift of about z = 10.7, equivalent to a light travel distance of 13.26 billion light-years (4 billion parsecs). Using Hubble's law, the redshift can be used to estimate the distance of an object from Earth. Photometric redshifts were originally determined by calculating the expected observed data from a known emission spectrum at a range of redshifts. In the absence of

In [49]:
"""
ミスの原因？

- hitしたwikipediaが違う
    - 不正解：https://en.wikipedia.org/wiki/Isaac_Newton
    - 正解：https://en.wikipedia.org/wiki/Newton%27s_law_of_universal_gravitation
"""
print_wrong(7)

【id】: 53
【prompt】: What did Newton adopt after his correspondence with Hooke in 1679-1680?
【A】: The language of inward or centripetal force.
【B】: The language of gravitational force.
【C】: The language of outward or centrifugal force.
【D】: The language of tangential and radial displacements.
【E】: The language of electromagnetic force.
【answer】: A
【context】: Newton and Hooke had brief exchanges in 1679–80, when Hooke, appointed to manage the Royal Society's correspondence, opened up a correspondence intended to elicit contributions from Newton to Royal Society transactions, which had the effect of stimulating Newton to work out a proof that the elliptical form of planetary orbits would result from a centripetal force inversely proportional to the square of the radius vector. Newton was well-versed in both classics and modern languages. In the , Newton formulated the laws of motion and universal gravitation that formed the dominant scientific viewpoint for centuries until it was supersede