In [1]:
import ctypes
import gc

import torch

libc = ctypes.CDLL("libc.so.6")

from dataclasses import dataclass
from typing import Optional, Union

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoModelForMultipleChoice, AutoTokenizer, Trainer, TrainingArguments
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase



In [2]:
import blingfire as bf
import faiss
from faiss import read_index, write_index
from sentence_transformers import SentenceTransformer

In [3]:
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking
def precision_at_k(r, k):
    """Precision at k"""
    assert k <= len(r)
    assert k != 0
    return sum(int(x) for x in r[:k]) / k


def map_k(true_items, predictions, K=3):
    """Score is mean average precision at 3"""
    U = len(predictions)
    map_at_k = 0.0
    for u in range(U):
        user_preds = predictions[u]
        user_true = true_items[u]
        user_results = [1 if item == user_true else 0 for item in user_preds]
        for k in range(min(len(user_preds), K)):
            map_at_k += precision_at_k(user_results, k + 1) * user_results[k]
    return map_at_k / U


import numpy as np


def predictions_to_map_output(predictions):
    sorted_answer_indices = np.argsort(-predictions)  # Sortting indices in descending order
    top_answer_indices = sorted_answer_indices[:, :]  # Taking the first three indices for each row
    top_answers = np.vectorize(index_to_option.get)(
        top_answer_indices
    )  # Transforming indices to options - i.e., 0 --> A
    return np.apply_along_axis(lambda row: " ".join(row), 1, top_answers)


@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [4]:
df_valid = pd.read_csv("../preprocessed/002_gpu/003/train.csv")

model_dict = {"path": "../output/007_validation/006/checkpoint-47476", "mode": "007"}

all_preds = []
all_labels = []
tokenizer = AutoTokenizer.from_pretrained(model_dict["path"])

test_df = pd.concat(
    [
        pd.read_csv("../preprocessed/002_gpu/003/train.csv"),
        pd.read_csv("../preprocessed/002_gpu/003/6000_wiki_en_sci_questions.csv").head(200),  # 2000
    ]
).reset_index(drop=True)
test_df["id"] = test_df.index

option_to_index = {option: idx for idx, option in enumerate("ABCDE")}
index_to_option = {v: k for k, v in option_to_index.items()}

if model_dict["mode"] == "002":

    def preprocess(example):
        first_sentence = [example["prompt"]] * 5
        second_sentences = [example[option] for option in "ABCDE"]
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
        tokenized_example["label"] = option_to_index[example["answer"]]
        return tokenized_example

    tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
        preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
    )

elif model_dict["mode"] == "003":
    test_df["prompt"] = test_df["context"].str.slice(0, 800) + " #### " + test_df["prompt"]

    def preprocess(example):
        first_sentence = [example["prompt"]] * 5
        second_sentences = [example[option] for option in "ABCDE"]
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
        tokenized_example["label"] = option_to_index[example["answer"]]
        return tokenized_example

    tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
        preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
    )

elif model_dict["mode"] == "004":
    test_df["prompt"] = test_df["prompt"] + " ## " + test_df["context"]

    def preprocess(example):
        first_sentence = [example["prompt"]] * 5
        second_sentences = [example[option] for option in "ABCDE"]
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation=True, max_length=384)
        tokenized_example["label"] = option_to_index[example["answer"]]
        return tokenized_example

    tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
        preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
    )

elif model_dict["mode"] == "005":
    test_df["prompt"] = (
        test_df["context"].apply(lambda x: " ".join(x.split()[:100])) + f"... [SEP] " + test_df["prompt"]
    )

    def preprocess(example):
        first_sentence = [example["prompt"]] * 5
        second_sentences = [example[option] for option in "ABCDE"]
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
        tokenized_example["label"] = option_to_index[example["answer"]]
        return tokenized_example

    tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
        preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
    )

elif model_dict["mode"] == "007":
    test_df["prompt"] = (
        test_df["context"].apply(lambda x: " ".join(x.split()[:250])) + f"... [SEP] " + test_df["prompt"]
    )

    def preprocess(example):
        first_sentence = [example["prompt"]] * 5
        second_sentences = [example[option] for option in "ABCDE"]
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation=False)
        tokenized_example["label"] = option_to_index[example["answer"]]
        return tokenized_example

    tokenized_test_dataset = Dataset.from_pandas(test_df.drop(columns=["id"])).map(
        preprocess, remove_columns=["context", "prompt", "A", "B", "C", "D", "E", "answer"]
    )

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [5]:
from tqdm.auto import tqdm

data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
test_dataloader = DataLoader(
    tokenized_test_dataset,
    1,
    shuffle=False,
    collate_fn=data_collator,
)

model = AutoModelForMultipleChoice.from_pretrained(model_dict["path"]).cuda()
model.eval()
preds = []
labels = []
for batch in tqdm(test_dataloader):
    for k in batch.keys():
        batch[k] = batch[k].cuda()
    with torch.no_grad():
        outputs = model(**batch)
    labels.append(batch["labels"].cpu().detach())
    preds.append(outputs.logits.cpu().detach())

preds = torch.cat(preds)
labels = torch.cat(labels)

result_dict = {
    "best_map@3": map_k(df_valid["answer"].to_numpy(), predictions_to_map_output(preds[: len(df_valid), :])),
    "new_map@3": map_k(test_df["answer"].to_numpy(), predictions_to_map_output(preds)),
}

print(model_dict)
print(result_dict)
del model
_ = gc.collect()
libc.malloc_trim(0)
torch.cuda.empty_cache()

  0%|          | 0/400 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'path': '../output/007_validation/006/checkpoint-47476', 'mode': '007'}
{'best_map@3': 0.9083333333333334, 'new_map@3': 0.8766666666666666}


In [7]:
def predictions_to_map_output_top(predictions):
    sorted_answer_indices = np.argsort(-predictions)  # Sortting indices in descending order
    top_answer_indices = sorted_answer_indices[:, :]  # Taking the first three indices for each row
    top_answers = np.vectorize(index_to_option.get)(
        top_answer_indices
    )  # Transforming indices to options - i.e., 0 --> A
    return np.apply_along_axis(lambda row: " ".join(row[0]), 1, top_answers)


valid_preds = predictions_to_map_output_top(preds[: len(test_df), :])
test_df["preds"] = valid_preds

In [8]:
test_df.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer,context,preds
0,0,While almost all astrophysicists today reject ...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,While almost all astrophysicists today reject ...,D
1,1,Many of these systems evolve in a self-similar...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,Many of these systems evolve in a self-similar...,A
2,2,It is possible that this usage is related with...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,It is possible that this usage is related with...,D
3,3,Renormalization is distinct from regularizatio...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C,Renormalization is distinct from regularizatio...,C
4,4,Several qualitative observations can be made o...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D,Several qualitative observations can be made o...,D


## ミスのチェック

In [9]:
wrong_df = test_df.query("answer != preds").reset_index(drop=True)
test_df["input"] = test_df["context"].apply(lambda x: " ".join(x.split()[:70])) + f"... [SEP] " + test_df["prompt"]


def print_wrong(index):
    for col, row in zip(test_df.columns, wrong_df.iloc[index]):
        if col == "input":
            encoded_input = tokenizer(row)
            # print(f"【{col}】:", tokenizer.decode(encoded_input["input_ids"]))
        else:
            print(f"【{col}】:", row)

In [10]:
"""
retrieve は完璧だが不正解
"""
print_wrong(0)

【id】: 2
【prompt】: It is possible that this usage is related with the Greek name of the island of Sicily, Trinacria (Τρινακρία "having three headlands").Liddell and Scott’s Greek-English Lexicon (A Lexicon Abridged from), Oxford, 1944, p.27, Cassell's Latin Dictionary, Marchant, J.R.V, & Charles, Joseph F., (Eds.), Revised Edition, 1928 The Sicilian triskeles is shown with the head of Medusa at the center.Matthews, Jeff (2005) Symbols of Naples The ancient symbol has been re-introduced in modern flags of Sicily since 1848. An early flag of Sicily, proposed in 1848, included the Sicilian triskeles or "Trinacria symbol". It has been suggested that its origin lies in Sicily, an island which has been associated with the triskelion since ancient times. The triskeles was adopted as emblem by the rulers of Syracuse. The oldest find of a triskeles in Sicily is a vase dated to 700 BCE, for which researchers assume a Minoan-Mycenaean origin. ===Roman period and Late Antiquity=== Late examples of 

In [11]:
def print_each_token(text, tokenizer=tokenizer):
    encoded_input = tokenizer(text)
    result = []
    for idx in encoded_input["input_ids"]:
        result.append(tokenizer.decode([idx]))
    print(text)
    print(" ".join(result))


print_each_token('(Τρινακρία "having three headlands").Liddell and Scott’s')
print_each_token("having three headlands")

(Τρινακρία "having three headlands").Liddell and Scott’s
[CLS] ( Τ ρι να κ ρί α " having three headland s " ) . Li d dell and Scott ’ s [SEP]
having three headlands
[CLS] having three headland s [SEP]


In [12]:
"""
retrieve は完璧だが不正解
正解：https://en.wikipedia.org/wiki/Wigner%E2%80%93Weyl_transform
"""

print_wrong(1)

【id】: 8
【prompt】: While such phenomena are sometimes referred to as "redshifts" and "blueshifts", in astrophysics light-matter interactions that result in energy shifts in the radiation field are generally referred to as "reddening" rather than "redshifting" which, as a term, is normally reserved for the effects discussed above. In physics, a redshift is an increase in the wavelength, and corresponding decrease in the frequency and photon energy, of electromagnetic radiation (such as light). The opposite change, a decrease in wavelength and simultaneous increase in frequency and energy, is known as a negative redshift, or blueshift. A red shift can be observed when part of the energy of the photon is transferred to the interacting matter, where it adds to its internal energy in a process called Stokes Raman scattering. For example, Doppler effect blueshifts () are associated with objects approaching (moving closer to) the observer with the light shifting to greater energies. This pheno

In [13]:
print_each_token('referred to as "reddening"')
print_each_token("referred to as  reddening ")
print_each_token("reddening")
print_each_token("Reddening")

referred to as "reddening"
[CLS] referred to as " re dden ing " [SEP]
referred to as  reddening 
[CLS] referred to as red den ing [SEP]
reddening
[CLS] red den ing [SEP]
Reddening
[CLS] Red den ing [SEP]


In [14]:
"""
retrieve は完璧だが不正解
"""

print_wrong(2)

【id】: 13
【prompt】: The Roche limit is the distance from a planet at which tidal effects would cause an object to disintegrate because the differential force of gravity from the planet overcomes the attraction of the parts of the object for one another. In celestial mechanics, the Roche limit, also called Roche radius, is the distance from a celestial body within which a second celestial body, held together only by its own force of gravity, will disintegrate because the first body's tidal forces exceed the second body's self-gravitation. It is also different from the Roche limit, which is the distance at which an object held together only by gravity begins to break up due to tidal forces. But note that, as defined above, the Roche limit refers to a body held together solely by the gravitational forces which cause otherwise unconnected particles to coalesce, thus forming the body in question. The Roche Division should not be confused with the Roche limit which is the distance at which a 

In [15]:
"""
hit はできているが優先度が低い
"""

print_wrong(3)

【id】: 28
【prompt】: In February 2021, astronomers released, for the first time, a very high-resolution image of 25,000 active supermassive black holes, covering four percent of the Northern celestial hemisphere, based on ultra-low radio wavelengths, as detected by the Low-Frequency Array (LOFAR) in Europe. ==See also== * * * * * * * * ==Notes== ==References== ==Further reading== * * * * * * ==External links== * Black Holes: Gravity's Relentless Pull Interactive multimedia Web site about the physics and astronomy of black holes from the Space Telescope Science Institute * Images of supermassive black holes * NASA images of supermassive black holes * <!---[https://web.archive.org/web/20160613034320/http://www.einstein- online.info/spotlights/milkyway_bh/?set_language=en The black hole at the heart of the Milky Way * ESO video clip of stars orbiting a galactic black hole * Star Orbiting Massive Milky Way Centre Approaches to within 17 Light- Hours ESO, October 21, 2002 * Images, Animations

In [16]:
from transformers import AutoModel, AutoTokenizer

stokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

text = "The Milky Way galaxy has a supermassive black hole at its center because of the bright flare activity observed near Sagittarius A*. The radius of the central object must be less than 17 light-hours, because otherwise S2 would collide with it. Observations of the star S14 indicate that the radius is no more than 6.25 light-hours, about the diameter of Uranus' orbit. No known astronomical object other than a black hole can contain 4.0 million M☉ in this volume of space."
print_each_token(text, stokenizer)

The Milky Way galaxy has a supermassive black hole at its center because of the bright flare activity observed near Sagittarius A*. The radius of the central object must be less than 17 light-hours, because otherwise S2 would collide with it. Observations of the star S14 indicate that the radius is no more than 6.25 light-hours, about the diameter of Uranus' orbit. No known astronomical object other than a black hole can contain 4.0 million M☉ in this volume of space.
[CLS] the milky way galaxy has a super ##mas ##sive black hole at its center because of the bright flare activity observed near sa ##git ##tar ##ius a * . the radius of the central object must be less than 17 light - hours , because otherwise s ##2 would col ##lide with it . observations of the star s ##14 indicate that the radius is no more than 6 . 25 light - hours , about the diameter of ur ##anus ' orbit . no known astronomical object other than a black hole can contain 4 . 0 million m ##☉ in this volume of space . [S

In [17]:
"""
"""

print_wrong(4)

【id】: 34
【prompt】: The Penrose process (also called Penrose mechanism) is theorised by Sir Roger Penrose as a means whereby energy can be extracted from a rotating black hole.R. Penrose and R. M. Floyd, "Extraction of Rotational Energy from a Black Hole", Nature Physical Science 229, 177 (1971).Misner, Thorne, and Wheeler, Gravitation, Freeman and Company, 1973. The process takes advantage of the ergosphere – a region of spacetime around the black hole dragged by its rotation faster than the speed of light, meaning that from the point of an outside observer any matter inside is forced to move in the direction of the rotation of the black hole. thumb|upright=1.2|Trajectories of bodies in a Penrose process. However, this is not a reverse of the Penrose process, as both increase the entropy of the black hole by throwing material into it. == See also == * * * High Life, a 2018 science- fiction film that includes a mission to harness the process * == References == == Further reading == * * 

In [18]:
"""
- hitしたwikipediaが違う
 - 不正解：https://en.wikipedia.org/wiki/Gravity_Probe_B
 - 正解：https://en.wikipedia.org/wiki/Spacetime
"""
print_wrong(5)

【id】: 35
【prompt】: Gravity Probe B (GP-B) was a satellite-based experiment to test two unverified predictions of general relativity: the geodetic effect and frame-dragging. In a public press and media event at NASA Headquarters, GP-B Principal Investigator, Francis Everitt presented the final results of Gravity Probe B. ;19 November 2015 : Publication of GP-B Special Volume (Volume #32, Issue #22) in the peer-reviewed journal, Classical and Quantum Gravity. Final science results were reported in 2011. ==Experimental setup== thumb The Gravity Probe B experiment comprised four London moment gyroscopes and a reference telescope sighted on IM Pegasi, a binary star in the constellation Pegasus. Gravity Probe B was expected to measure this effect to an accuracy of one part in 10,000, the most stringent check on general relativistic predictions to date. The Gravity Probe B mission timeline describes the events during the flight of Gravity Probe B, the science phase of its experimental campaig

In [19]:
"""
- hitしたwikipediaが違う
 - 不正解：https://en.wikipedia.org/wiki/Synaptic_transistor
 - 正解：https://en.wikipedia.org/wiki/Memristor
"""

print_wrong(6)

【id】: 39
【prompt】: A synaptic transistor is an electrical device that can learn in ways similar to a neural synapse. The transistor is designed to mimic the feature of the human synapse known as plasticity, or the variation of the speed and strength of the signal going from neuron to neuron. IEEE Trans. Electron Dev. and these organic synapstors were used to demonstrate an associative memory, which can be trained to present a pavlovian response.Pavlov's Dog Associative Learning Demonstrated on Synaptic- like Organic Transistors. A compact model was developed,Functional Model of a Nanoparticle-Organic Memory Transistor for Use as a Spiking Synapse. In effect, the gatekeeper neuron acts as the transistor of a gated synapse by modulating the transmission of the signal between the pre-synaptic and post-synaptic neurons. Neural Computations A recent report showed that these organic synapse- transistors (synapstor) are working at 1 volt and with a plasticity typical response time in the rang

In [20]:
"""
document の hit ができていない
- 不正解：https://en.wikipedia.org/wiki/Light-second
- 正解：https://en.wikipedia.org/wiki/Light-year
"""
print_wrong(7)

【id】: 49
【prompt】: Unit Definition Equivalent distance in Example Unit Definition m Example km miles light-second Average distance from the Earth to the Moon is about 1.282 light-seconds light-minute 60 light-seconds Average distance from the Earth to the Sun is 8.317 light-minutes light-hour 60 light- minutes = light-seconds The Perihelion of Saturn's orbit is about 1.25 light-hours light-day 24 light-hours = light-seconds Voyager 1 is currently 0.9 light-days from the Sun light-week 7 light-days = light-seconds The Oort cloud is thought to extend between 41 and 82 light- weeks out from the Sun light-month 30 light-days = light-seconds light-year 365.25 light-days = light-seconds Proxima Centauri is the nearest star to the Sun, about 4.24 light years away ==See also== * 100 megametres * Geometrized unit system * Light-year == References == Category:Units of length Category:Units of measurement in astronomy It is usually quoted as "light-time for unit distance" in tables of astronomica

In [21]:
"""
hit できていない
- 不正解：https://en.wikipedia.org/wiki/Physical_properties_of_soil
- 正解；https://en.wikipedia.org/wiki/Electrical_resistivity_and_conductivity
"""
print_wrong(8)

【id】: 52
【prompt】: Resistivity refers to the resistance to conduction of electric currents and affects the rate of corrosion of metal and concrete structures which are buried in soil. The resistance (R) of an object is defined as the ratio of voltage across it (V) to current through it (I), while the conductance (G) is the inverse: :R = {V\over I}, \qquad G = {I\over V} = \frac{1}{R} For a wide variety of materials and conditions, V and I are directly proportional to each other, and therefore R and G are constants (although they will depend on the size and shape of the object, the material it is made of, and other factors like temperature or strain). The SI unit of electrical resistance is the ohm (Ω), while electrical conductance is measured in siemens (S). ====Characteristics==== The resistance of an object depends in large part on the material it is made of—objects made of electrical insulators like rubber tend to have very high resistance and low conductivity, while objects made of

In [22]:
"""
- 不正解：https://en.wikipedia.org/wiki/Higher_order_coherence
- 正解：https://en.wikipedia.org/wiki/Coherence_(physics)
"""
print_wrong(9)

【id】: 65
【prompt】: The coherence encountered in most optical experiments, including the classic Young's double slit experiment and Mach–Zehnder interferometer, is first order coherence. Young's double slit experiment demonstrates the dependence of interference on coherence, specifically on the first-order correlation. The interferometric visibility gives a practical way to measure the coherence of two waves (or one wave with itself). This experiment is equivalent to the Mach–Zehnder interferometer with the caveat that Young's double slit experiment is concerned with spatial coherence, while the Mach–Zehnder interferometer relies on temporal coherence. The chief benefit of coherence scanning interferometry is that systems can be designed that do not suffer from the 2 pi ambiguity of coherent interferometry, and as seen in Fig. 18, which scans a 180μm x 140μm x 10μm volume, it is well suited to profiling steps and rough surfaces. The theory of partial coherence was awoken in the 1930s du

In [23]:
print_wrong(10)

【id】: 70
【prompt】: Ning Li and Douglas Torr, of the University of Alabama in Huntsville proposed how a time dependent magnetic field could cause the spins of the lattice ions in a superconductor to generate detectable gravitomagnetic and gravitoelectric fields in a series of papers published between 1991 and 1993. This experiment measured the magnetic fields of four superconducting gyroscopes to determine their spin axes. It is characterized by the Meissner effect, the complete ejection of magnetic field lines from the interior of the superconductor during its transitions into the superconducting state. Many studies have attempted to reproduce Podkletnov's experiment, always to negative results.Woods, C., Cooke, S., Helme, J., and Caldwell, C., "Gravity Modification by High Temperature Superconductors," Joint Propulsion Conference, AIAA 2001–3363, (2001).Hathaway, G., Cleveland, B., and Bao, Y., "Gravity Modification Experiment using a Rotating Superconducting Disc and Radio Frequency 

In [24]:
print_wrong(11)

【id】: 72
【prompt】: A gas pycnometer, the gas-based manifestation of a pycnometer, compares the change in pressure caused by a measured change in a closed volume containing a reference (usually a steel sphere of known volume) with the change in pressure caused by the sample under the same conditions. A gas pycnometer is a laboratory device used for measuring the density—or, more accurately, the volume—of solids, be they regularly shaped, porous or non-porous, monolithic, powdered, granular or in some way comminuted, employing some method of gas displacement and the volume:pressure relationship known as Boyle's Law. The simplest type of gas pycnometer (due to its relative lack of moving parts) consists of two chambers, one (with a removable gas-tight lid) to hold the sample and a second chamber of fixed, known (via calibration) internal volume – referred to as the reference volume or added volume. A gas pycnometer is also sometimes referred to as a helium pycnometer. ==Types of gas pycno

In [25]:
"""
正解：https://en.wikipedia.org/wiki/Born_reciprocity#:~:text=In%201981%2C%20Eduardo%20R.,been%20expanded%20upon%20by%20others.
"""

print_wrong(12)

【id】: 96
【prompt】: However, Einstein was convinced that a valid theory of gravity would necessarily have to include the relativity of inertia: == Inertial induction == In 1953, in order to express Mach's Principle in quantitative terms, the Cambridge University physicist Dennis W. Sciama proposed the addition of an acceleration dependent term to the Newtonian gravitation equation. The writing in which Einstein found inspiration was Mach's book The Science of Mechanics (1883, tr. 1893), where the philosopher criticized Newton's idea of absolute space, in particular the argument that Newton gave sustaining the existence of an advantaged reference system: what is commonly called "Newton's bucket argument". Einstein brought the principle into mainstream physics while working on general relativity. Indeed, it was Einstein who first coined the phrase Mach's principle. In the context of Born rigidity, Max Born (1909) subsequently coined the term "hyperbolic motion" () for the case of constant

## retrieve のチェック

In [None]:
smodel = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")
smodel.max_seq_length = 386
smodel = smodel.half()

### ドキュメント候補の取得

In [26]:
all_df = pd.read_parquet("../preprocessed/102_sentence_details/000/all.parquet")

In [27]:
all_df.head()

Unnamed: 0,id,file
0,49495844,a.parquet
1,3579086,a.parquet
2,3579086,a.parquet
3,3579086,a.parquet
4,3579086,a.parquet


In [28]:
df = pd.read_csv("../preprocessed/002_gpu/003/train.csv")
df["answer_all"] = df.apply(lambda x: " ".join([x["A"], x["B"], x["C"], x["D"], x["E"]]), axis=1)
df["prompt_answer_stem"] = df["prompt"] + " " + df["answer_all"]
df.head()

Unnamed: 0,id,prompt,A,B,C,D,E,answer,context,answer_all,prompt_answer_stem
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,While almost all astrophysicists today reject ...,MOND is a theory that reduces the observed mis...,Which of the following statements accurately d...
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,Many of these systems evolve in a self-similar...,Dynamic scaling refers to the evolution of sel...,Which of the following is an accurate definiti...
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,It is possible that this usage is related with...,The triskeles symbol was reconstructed as a fe...,Which of the following statements accurately d...
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C,Renormalization is distinct from regularizatio...,Regularizing the mass-energy of an electron wi...,What is the significance of regularization in ...
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D,Several qualitative observations can be made o...,The angular spacing of features in the diffrac...,Which of the following statements accurately d...


In [29]:
df.iloc[35].prompt_answer_stem

"What was the aim of the Gravity Probe B (GP-B) mission? To prove that pressure contributes equally to spacetime curvature as does mass-energy. To measure spacetime curvature near Earth, with particular emphasis on gravitomagnetism. To measure the distribution of Fe and Al on the Moon's surface. To confirm the relatively large geodetic effect due to simple spacetime curvature, and is also known as de Sitter precession. To measure the discrepancy between active and passive mass to about 10−12."

In [30]:
index_path = "../preprocessed/102_sentence_details/000/ivfpq_100_64_8.index"
sentence_index = read_index(index_path)  # index 読み込み

#### id:35

In [33]:
# answer も含める場合
idx = 35
sentence_index.nprobe = 10

smodel = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")
smodel.max_seq_length = 386
smodel = smodel.half()
prompt_embeddings = smodel.encode(
    [df.iloc[idx].prompt_answer_stem],
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)
# prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
search_score, search_index = sentence_index.search(prompt_embeddings, 20)
search_score, search_index

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(array([[0.66306055, 0.75334704, 0.79065216, 0.8074268 , 0.8325209 ,
         0.8396863 , 0.84900326, 0.8545032 , 0.85537136, 0.862408  ,
         0.8745032 , 0.87702626, 0.8776181 , 0.8788476 , 0.8793816 ,
         0.88369614, 0.88645774, 0.8890206 , 0.8922587 , 0.89465016]],
       dtype=float32),
 array([[ 8888877, 25021550,  7842669, 23985600, 26156094,  8888879,
         15855304,  8888886,  1337339, 26994400, 26974958, 17763718,
          8888890,  8888338, 10675594, 24528497, 15123012, 27012441,
          7624194,  8344393]]))

In [34]:
row_id = 25021550
print(all_df.iloc[row_id])
file_df = pd.read_parquet(f"../input/wikipedia-20230701/{all_df.iloc[row_id].file}")
file_df[file_df["id"] == all_df.iloc[row_id].id]

id          28758
file    s.parquet
Name: 25021550, dtype: object


Unnamed: 0,id,title,text,categories
326272,28758,Spacetime,"In physics, spacetime is a mathematical model ...","[Concepts in physics, Theoretical physics, The..."


In [36]:
sentence_index.nprobe = 10


prompt_embeddings = smodel.encode(
    [df.iloc[35].prompt],
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)
# prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
search_score, search_index = sentence_index.search(prompt_embeddings, 10)
search_score, search_index

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(array([[0.7878667 , 0.8294137 , 0.8301118 , 0.8629383 , 0.8669293 ,
         0.8794828 , 0.8952486 , 0.90531075, 0.90820307, 0.9099476 ]],
       dtype=float32),
 array([[ 8888877,  7842669,  8888886,  8888881,  8888885, 26994400,
          8888765,  8888890, 26994401,  7710460]]))

#### id 49

In [38]:
# answer も含める場合
idx = 49

sentence_index.nprobe = 10

prompt_embeddings = smodel.encode(
    [df.iloc[idx].prompt_answer_stem],
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)
# prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
search_score, search_index = sentence_index.search(prompt_embeddings, 20)
search_score, search_index

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(array([[0.712788  , 0.7207524 , 0.73808515, 0.75057006, 0.75323325,
         0.7535617 , 0.75887865, 0.7613823 , 0.7614457 , 0.7880337 ,
         0.788491  , 0.79881144, 0.808414  , 0.8136295 , 0.81395   ,
         0.81738585, 0.8180226 , 0.8190703 , 0.82230335, 0.8299021 ]],
       dtype=float32),
 array([[21793105,  2176217, 16209442,  1669014,  1669013, 17341760,
         13425171, 26783936, 20721124,  9380425, 21678838,  7259319,
          1669031, 16205895,  6658020, 19872399, 10099319, 24580459,
         27728427, 20436721]]))

In [39]:
# 不正解
row_id = 13425171
print(all_df.iloc[row_id])
file_df = pd.read_parquet(f"../input/wikipedia-20230701/{all_df.iloc[row_id].file}")
file_df[file_df["id"] == all_df.iloc[row_id].id]

id         178713
file    l.parquet
Name: 13425171, dtype: object


Unnamed: 0,id,title,text,categories
130594,178713,Light-second,The light-second is a unit of length useful in...,"[Units of length, Units of measurement in astr..."


In [40]:
file_df[file_df["title"].str.contains("Light-year")]

Unnamed: 0,id,title,text,categories


In [41]:
# 含まれていない
# https://en.wikipedia.org/w/index.php?title=Light-year&action=info
all_df[all_df["id"] == "23473595"]

Unnamed: 0,id,file


#### id52

In [43]:
# answer も含める場合
idx = 52

sentence_index.nprobe = 10
prompt_embeddings = smodel.encode(
    [df.iloc[idx].prompt_answer_stem],
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)
# prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
search_score, search_index = sentence_index.search(prompt_embeddings, 20)
search_score, search_index

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(array([[0.5912847 , 0.6261332 , 0.71793985, 0.72031945, 0.7344239 ,
         0.74571604, 0.7522569 , 0.77276134, 0.7784686 , 0.7818648 ,
         0.7829955 , 0.78772324, 0.7908109 , 0.79602337, 0.8035359 ,
         0.8140356 , 0.8199215 , 0.8229825 , 0.82347393, 0.8304806 ]],
       dtype=float32),
 array([[21465521, 21679312, 21465522,  7699985, 22854255,  6730010,
         22854174,  6717287, 20501546,  6719231,  6719713,  6719232,
         28584633, 27963806, 22854984, 28275088,  4906466, 14247367,
         22854240, 20501556]]))

In [44]:
file_df = pd.read_parquet(f"../input/wikipedia-20230701/e.parquet")

In [45]:
file_df[file_df["title"].str.contains("Electrical resistivity")]

Unnamed: 0,id,title,text,categories
72455,28205882,Electrical resistivity measurement of concrete,Concrete electrical resistivity can be obtaine...,"[Concrete, Impedance measurements]"
72456,3625483,Electrical resistivity tomography,thumb|2D resistivity inversion of ERT data thu...,"[Geophysical imaging, Inverse problems, Impeda..."


In [46]:
sentence_index.nprobe = 10
prompt_embeddings = smodel.encode(
    [df.iloc[idx].prompt],
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)
# prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
search_score, search_index = sentence_index.search(prompt_embeddings, 20)
search_score, search_index

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(array([[0.5747607 , 0.62398666, 0.6531236 , 0.69509405, 0.75609064,
         0.7599351 , 0.7737852 , 0.7831699 , 0.7901595 , 0.79393417,
         0.8020921 , 0.80468166, 0.80734813, 0.8141816 , 0.8166397 ,
         0.82229686, 0.82948357, 0.8394228 , 0.8448426 , 0.8565721 ]],
       dtype=float32),
 array([[21679312, 22854240, 22854174, 21465521,  3868828,  2900037,
         29140107, 22854785, 16931119, 22854255, 28584633,  7753013,
         22854982,  7699985,  6719713, 22854214,  6719232, 20501546,
          1540973, 19063710]]))

In [47]:
row_id = 6719713
print(all_df.iloc[row_id])
file_df[file_df["id"] == all_df.iloc[row_id].id]

id           9550
file    e.parquet
Name: 6719713, dtype: object


Unnamed: 0,id,title,text,categories
72499,9550,Electricity,Electricity is the set of physical phenomena a...,[]


In [48]:
# 含まれていない　https://en.wikipedia.org/w/index.php?title=Electrical_resistivity_and_conductivity&action=info
all_df[all_df["id"] == "61580"]

Unnamed: 0,id,file


#### id:65

In [49]:
# answer も含める場合
idx = 65

sentence_index.nprobe = 10
prompt_embeddings = smodel.encode(
    [df.iloc[idx].prompt_answer_stem],
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)
# prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
search_score, search_index = sentence_index.search(prompt_embeddings, 20)
search_score, search_index

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(array([[0.66265875, 0.70874953, 0.7234033 , 0.74069417, 0.7741275 ,
         0.7742683 , 0.780319  , 0.78573036, 0.78830934, 0.7885235 ,
         0.8090006 , 0.8111434 , 0.8143849 , 0.82143414, 0.8238852 ,
         0.8292051 , 0.8485422 , 0.85400474, 0.85409874, 0.8563406 ]],
       dtype=float32),
 array([[ 4638886,  9708866, 28709280,  4638880, 10610001,  5692950,
          4638885,  9708876, 10305303,  4745051,  9708869, 20694899,
         28709282, 25618442,  4638989, 22364488, 10610029, 27960320,
         28709286,  2948657]]))

In [50]:
sentence_index.nprobe = 10
prompt_embeddings = smodel.encode(
    [df.iloc[idx].prompt],
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)
# prompt_embeddings = prompt_embeddings.detach().cpu().numpy()
search_score, search_index = sentence_index.search(prompt_embeddings, 20)
search_score, search_index

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(array([[0.6600527 , 0.68630296, 0.6867917 , 0.7384731 , 0.75806123,
         0.7703807 , 0.77907646, 0.7915026 , 0.79507077, 0.7964417 ,
         0.7970445 , 0.79819703, 0.7998413 , 0.8006329 , 0.8019266 ,
         0.8019775 , 0.8061156 , 0.81213206, 0.82345235, 0.8261176 ]],
       dtype=float32),
 array([[ 9708866,  4745051,  4638886,  9708876,  9708869,  5692950,
         10305303, 10610005,  2948657, 10610001, 20694770, 28709282,
         20694899,  4638880, 17093942,  4638989, 22364488,  4638885,
         10609891, 13074685]]))

In [51]:
file_df = pd.read_parquet(f"../input/wikipedia-20230701/c.parquet")

In [52]:
row_id = 4638886
print(all_df.iloc[row_id])

id        7985327
file    c.parquet
Name: 4638886, dtype: object


In [53]:
file_df[file_df["id"] == all_df.iloc[row_id].id]

Unnamed: 0,id,title,text,categories
287979,7985327,Coherence theory (optics),"In physics, coherence theory is the study of o...","[Interferometry, Physical optics]"


In [54]:
all_df[all_df["id"] == "240011"]

Unnamed: 0,id,file


### similarity のチェック
hitはしているが、優先度が高くない理由を確かめる。必要に応じて別モデルの類似度のほうが良いのかもチェックする

In [55]:
df = pd.read_csv("../preprocessed/002_gpu/003/train.csv")
df["answer_all"] = df.apply(lambda x: " ".join([x["A"], x["B"], x["C"], x["D"], x["E"]]), axis=1)
df["prompt_answer_stem"] = df["prompt"] + " " + df["answer_all"]

#### id: 28
https://en.wikipedia.org/wiki/Supermassive_black_hole#:~:text=In%20February%202021%2C%20astronomers%20released,Array%20(LOFAR)%20in%20Europe.

page: 215706

In [56]:
index = 28
page_id = "215706"
print(df.iloc[index].prompt_answer_stem)

What is the evidence for the existence of a supermassive black hole at the center of the Milky Way galaxy? The Milky Way galaxy has a supermassive black hole at its center because of the bright flare activity observed near Sagittarius A*. The radius of the central object must be less than 17 light-hours, because otherwise S2 would collide with it. Observations of the star S14 indicate that the radius is no more than 6.25 light-hours, about the diameter of Uranus' orbit. No known astronomical object other than a black hole can contain 4.0 million M☉ in this volume of space. The Milky Way galaxy has a supermassive black hole at its center because the star S14 follows an elliptical orbit with a period of 15.2 years and a pericenter of 17 light-hours from the center of the central object. From the motion of star S14, the object's mass can be estimated as 4.0 million M☉, or about 7.96×1036 kg. The radius of the central object must be less than 17 light-hours, because otherwise S14 would col

In [57]:
file_df = pd.read_parquet(f"../input/wikipedia-20230701/s.parquet")
file_df[file_df["id"] == page_id].text

457253    A supermassive black hole (SMBH or sometimes S...
Name: text, dtype: object

In [64]:
import re

document = file_df[file_df["id"] == page_id].iloc[0].text

# 不要と思われる部分は削除する
document = document.split("==See also==")[0]
document = document.split("== See also ==")[0]
document = document.split("==References==")[0]
document = document.split("== References ==")[0]
document = document.split("==Further reading==")[0]
document = document.split("== Further reading ==")[0]
document = document.split("==External links==")[0]
document = document.split("== External links ==")[0]

# 見出し部分を : に置換
pattern = r"={2,}\s?(.*?)\s?={2,}"
document = re.sub(pattern, r"\1 :", document)

document_sentences = []
_, sentence_offsets = bf.text_to_sentences_and_offsets(document)
for o in sentence_offsets:
    if o[1] - o[0] > 3:
        sentence = document[o[0] : o[1]]
        abs_offsets = (o[0], o[1])
        row = {}
        document_sentences.append(sentence)

In [65]:
wrong_sentence = document_sentences[0]
print(wrong_sentence)

A supermassive black hole (SMBH or sometimes SBH) is the largest type of black hole, with its mass being on the order of hundreds of thousands, or millions to billions of times the mass of the Sun ().


In [66]:
"""
埋め込みを作って、類似度を計算し、上位からスコアとテキストを5件表示する
"""

text = df.iloc[index].prompt_answer_stem

smodel = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")
smodel.max_seq_length = 386
smodel = smodel.half()

question_embeddings = smodel.encode(
    [text],
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)

document_embeddings = smodel.encode(
    document_sentences,
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)

num_include = 5

prompt_index = faiss.index_factory(document_embeddings.shape[1], "Flat")
prompt_index.add(document_embeddings)
ss, ii = prompt_index.search(question_embeddings, num_include)

for i in range(num_include):
    print(ss[0, i], document_sentences[ii[0, i]])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/24 [00:00<?, ?it/s]

0.4537541 In the Milky Way : Evidence indicates that the Milky Way galaxy has a supermassive black hole at its center, 26,000 light-years from the Solar System, in a region called Sagittarius A* because: * The star S2 follows an elliptical orbit with a period of 15.2 years and a pericenter (closest distance) of 17 light-hours ( or 120 AU) from the center of the central object.
0.65130067 Observational evidence indicates that almost every large galaxy has a supermassive black hole at its center.
0.663813 Individual studies : The nearby Andromeda Galaxy, 2.5 million light-years away, contains a central black hole, significantly larger than the Milky Way's.
0.67229897 Observations of the star S14 indicate that the radius is no more than 6.25 light-hours, about the diameter of Uranus' orbit.
0.67305374 The largest supermassive black hole in the Milky Way's vicinity appears to be that of Messier 87 (i.e., M87*), at a mass of at a distance of 48.92 million light-years.


#### id 70

In [72]:
index = 70
page_id = "26884"  # https://en.wikipedia.org/w/index.php?title=Superconductivity&action=info
print(df.iloc[index][df.iloc[index].answer])
print(df.iloc[index].prompt)

A magnetic field, precisely aligned with the spin axis.
What is the effect generated by a spinning superconductor?


In [73]:
# file_df = pd.read_parquet(f"../input/wikipedia-20230701/s.parquet")
file_df[file_df["id"] == page_id].text

456377    thumb|Video of the Meissner effect in a high-t...
Name: text, dtype: object

In [79]:
import re

document = file_df[file_df["id"] == page_id].iloc[0].text

# 不要と思われる部分は削除する
document = document.split("==See also==")[0]
document = document.split("== See also ==")[0]
document = document.split("==References==")[0]
document = document.split("== References ==")[0]
document = document.split("==Further reading==")[0]
document = document.split("== Further reading ==")[0]
document = document.split("==External links==")[0]
document = document.split("== External links ==")[0]

# 見出し部分を : に置換
pattern = r"={2,}\s?(.*?)\s?={2,}"
document = re.sub(pattern, r"\1 :", document)

document_sentences = []
_, sentence_offsets = bf.text_to_sentences_and_offsets(document)
for o in sentence_offsets:
    if o[1] - o[0] > 3:
        sentence = document[o[0] : o[1]]
        abs_offsets = (o[0], o[1])
        row = {}
        document_sentences.append(sentence)

In [80]:
"""
埋め込みを作って、類似度を計算し、上位からスコアとテキストを5件表示する
"""

text = df.iloc[index].prompt_answer_stem

smodel = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")
smodel.max_seq_length = 386
smodel = smodel.half()

question_embeddings = smodel.encode(
    [text],
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)

document_embeddings = smodel.encode(
    document_sentences,
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)

num_include = 10

prompt_index = faiss.index_factory(document_embeddings.shape[1], "Flat")
prompt_index.add(document_embeddings)
ss, ii = prompt_index.search(question_embeddings, num_include)

for i in range(num_include):
    print(ss[0, i], document_sentences[ii[0, i]])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

0.41832596 London moment : Conversely, a spinning superconductor generates a magnetic field, precisely aligned with the spin axis.
0.82812345 This experiment measured the magnetic fields of four superconducting gyroscopes to determine their spin axes.
0.838499 It is characterized by the Meissner effect, the complete ejection of magnetic field lines from the interior of the superconductor during its transitions into the superconducting state.
0.9291841 The situation is different in a superconductor.
0.93931603 Meissner effect : When a superconductor is placed in a weak external magnetic field H, and cooled below its transition temperature, the magnetic field is ejected.
0.9738614 This is because the Gibbs free energy of the superconducting phase increases quadratically with the magnetic field while the free energy of the normal phase is roughly independent of the magnetic field.
0.9791089 A superconductor with little or no magnetic field within it is said to be in the Meissner state.
0.

#### id:72

In [81]:
index = 72
page_id = "37379"  # https://en.wikipedia.org/wiki/Relative_density
print(df.iloc[index][df.iloc[index].answer])
print(df.iloc[index].prompt)

A device used to determine the density of a liquid.
What is a pycnometer?


In [82]:
file_df = pd.read_parquet(f"../input/wikipedia-20230701/r.parquet")
file_df[file_df["id"] == page_id].text

82277    {\rho_\mathrm{H_2O}} }} Relative density, or s...
Name: text, dtype: object

In [85]:
import re

document = file_df[file_df["id"] == page_id].iloc[0].text

# 不要と思われる部分は削除する
document = document.split("==See also==")[0]
document = document.split("== See also ==")[0]
document = document.split("==References==")[0]
document = document.split("== References ==")[0]
document = document.split("==Further reading==")[0]
document = document.split("== Further reading ==")[0]
document = document.split("==External links==")[0]
document = document.split("== External links ==")[0]

# 見出し部分を : に置換
pattern = r"={2,}\s?(.*?)\s?={2,}"
document = re.sub(pattern, r"\1 :", document)

document_sentences = []
_, sentence_offsets = bf.text_to_sentences_and_offsets(document)
for o in sentence_offsets:
    if o[1] - o[0] > 3:
        sentence = document[o[0] : o[1]]
        abs_offsets = (o[0], o[1])
        row = {}
        document_sentences.append(sentence)

In [86]:
"""
埋め込みを作って、類似度を計算し、上位からスコアとテキストを5件表示する
"""

text = df.iloc[index].prompt_answer_stem

smodel = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")
smodel.max_seq_length = 386
smodel = smodel.half()

question_embeddings = smodel.encode(
    [text],
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)

document_embeddings = smodel.encode(
    document_sentences,
    batch_size=8,
    device="cuda",
    show_progress_bar=True,
    # convert_to_tensor=True,
    normalize_embeddings=True,
)

num_include = 10

prompt_index = faiss.index_factory(document_embeddings.shape[1], "Flat")
prompt_index.add(document_embeddings)
ss, ii = prompt_index.search(question_embeddings, num_include)

for i in range(num_include):
    print(ss[0, i], document_sentences[ii[0, i]])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

0.3426491 A gas pycnometer, the gas-based manifestation of a pycnometer, compares the change in pressure caused by a measured change in a closed volume containing a reference (usually a steel sphere of known volume) with the change in pressure caused by the sample under the same conditions.
0.50191444 A pycnometer is usually made of glass, with a close-fitting ground glass stopper with a capillary tube through it, so that air bubbles may escape from the apparatus.
0.6370988 This shows that, for small Δx, changes in displacement are approximately proportional to changes in relative density. ===Pycnometer=== thumb|upright|An empty glass pycnometer and stopper thumb|upright|A filled pycnometer A pycnometer (from ), also called pyknometer or specific gravity bottle, is a device used to determine the density of a liquid.
0.772531 When a pycnometer is filled to a specific, but not necessarily accurately known volume, V and is placed upon a balance, it will exert a force F_\mathrm{b} = g\left

In [89]:
text = "0.6370988 This shows that, for small Δx, changes in displacement are approximately proportional to changes in relative density. ===Pycnometer=== thumb|upright|An empty glass pycnometer and stopper thumb|upright|A filled pycnometer A pycnometer (from ), also called pyknometer or specific gravity bottle, is a device used to determine the density of a liquid."
pattern = r" thumb\|([^A-Z]+)([A-Z])([^A-Z]+)([A-Z])"

# 正規表現パターンにマッチする部分を削除
text_without_sentences = re.sub(pattern, r"", text)
text_without_sentences

'0.6370988 This shows that, for small Δx, changes in displacement are approximately proportional to changes in relative density. ===Pycnometer=== filled pycnometer A pycnometer (from ), also called pyknometer or specific gravity bottle, is a device used to determine the density of a liquid.'

Unnamed: 0,id,file


## 005?くらいのやつ

In [41]:
"""
ミスの原因？

- hitしたwikipediaが違う
 - 正解：https://en.wikipedia.org/wiki/Causality_(physics)
 - 間違い：butterfly effect


タイトルと１行目からでは分からない内容を抽出しているので、現状のretrieval の方法ではカバーするのが難しい
"""
print_wrong(1)

【id】: 17
【prompt】: What is the butterfly effect?
【A】: The butterfly effect is a physical cause that occurs when a massive sphere is caused to roll down a slope starting from a point of unstable equilibrium, and its velocity is assumed to be caused by the force of gravity accelerating it.
【B】: The butterfly effect is a distributed causality that opens up the opportunity to understand the relationship between necessary and sufficient conditions in classical (Newtonian) physics.
【C】: The butterfly effect is a proportionality between the cause and the effect of a physical phenomenon in classical (Newtonian) physics.
【D】: The butterfly effect is a small push that is needed to set a massive sphere into motion when it is caused to roll down a slope starting from a point of unstable equilibrium.
【E】: The butterfly effect is a phenomenon that highlights the difference between the application of the notion of causality in physics and a more general use of causality as represented by Mackie's INU

In [44]:
"""
ミスの原因？


- 抽出はできているが優先順位が低いのでcutされる
  - >  == Reactive Leidenfrost effect == thumb|Reactive Leidenfrost effect of cellulose on silica, Non-volatile materials were discovered in 2015 to also exhibit a 'reactive Leidenfrost effect', whereby solid particles were observed to float above hot surfaces and skitter around erratically. 
→各選択肢との類似度を取ればcutされずに上位に入りそう
"""
print_wrong(2)

【id】: 18
【prompt】: What is the 'reactive Leidenfrost effect' observed in non-volatile materials?
【A】: The 'reactive Leidenfrost effect' is a phenomenon where solid particles float above hot surfaces and move erratically, observed in non-volatile materials.
【B】: The 'reactive Leidenfrost effect' is a phenomenon where solid particles float above hot surfaces and move erratically, observed in volatile materials.
【C】: The 'reactive Leidenfrost effect' is a phenomenon where solid particles sink into hot surfaces and move slowly, observed in non-volatile materials.
【D】: The 'reactive Leidenfrost effect' is a phenomenon where solid particles float above cold surfaces and move erratically, observed in non-volatile materials.
【E】: The 'reactive Leidenfrost effect' is a phenomenon where solid particles sink into cold surfaces and move slowly, observed in non-volatile materials.
【answer】: A
【context】: The new phenomenon of a 'reactive Leidenfrost (RL) effect' was characterized by a dimensionless 

In [45]:
"""
ミスの原因？

hit はしているがうまく抽出できていない
- 必要なS2に関する情報をhttps://en.wikipedia.org/wiki/Supermassive_black_hole　から取得できていない
- Referencesなどの不要なものが上位に入ってしまっている
→各選択肢との類似度を取ればcutされずに上位に入りそう
"""
print_wrong(3)

【id】: 28
【prompt】: What is the evidence for the existence of a supermassive black hole at the center of the Milky Way galaxy?
【A】: The Milky Way galaxy has a supermassive black hole at its center because of the bright flare activity observed near Sagittarius A*. The radius of the central object must be less than 17 light-hours, because otherwise S2 would collide with it. Observations of the star S14 indicate that the radius is no more than 6.25 light-hours, about the diameter of Uranus' orbit. No known astronomical object other than a black hole can contain 4.0 million M☉ in this volume of space.
【B】: The Milky Way galaxy has a supermassive black hole at its center because the star S14 follows an elliptical orbit with a period of 15.2 years and a pericenter of 17 light-hours from the center of the central object. From the motion of star S14, the object's mass can be estimated as 4.0 million M☉, or about 7.96×1036 kg. The radius of the central object must be less than 17 light-hours, be

In [46]:
"""
ミスの原因？

- hitしたwikipediaが違う
 - 不正解：https://en.wikipedia.org/wiki/Gravity_Probe_B
 - 正解：https://en.wikipedia.org/wiki/Spacetime
"""
print_wrong(4)

【id】: 35
【prompt】: What was the aim of the Gravity Probe B (GP-B) mission?
【A】: To prove that pressure contributes equally to spacetime curvature as does mass-energy.
【B】: To measure spacetime curvature near Earth, with particular emphasis on gravitomagnetism.
【C】: To measure the distribution of Fe and Al on the Moon's surface.
【D】: To confirm the relatively large geodetic effect due to simple spacetime curvature, and is also known as de Sitter precession.
【E】: To measure the discrepancy between active and passive mass to about 10−12.
【answer】: B
【context】: Gravity Probe B (GP-B) was a satellite-based experiment to test two unverified predictions of general relativity: the geodetic effect and frame-dragging. In a public press and media event at NASA Headquarters, GP-B Principal Investigator, Francis Everitt presented the final results of Gravity Probe B. ;19 November 2015 : Publication of GP-B Special Volume (Volume #32, Issue #22) in the peer-reviewed journal, Classical and Quantum Gr

In [47]:
"""
ミスの原因？

- hitしたwikipediaが違う
 - 不正解：https://en.wikipedia.org/wiki/Synaptic_transistor
 - 正解：https://en.wikipedia.org/wiki/Memristor
"""
print_wrong(5)

【id】: 39
【prompt】: What is the synapstor or synapse transistor?
【A】: A device used to demonstrate a neuro-inspired circuit that shows short-term potentiation for learning and inactivity-based forgetting.
【B】: A device used to demonstrate a neuro-inspired circuit that shows long-term potentiation for learning and activity-based forgetting.
【C】: A device used to demonstrate a neuro-inspired circuit that shows short-term depression for learning and inactivity-based forgetting.
【D】: A device used to demonstrate a neuro-inspired circuit that shows short-term potentiation for learning and activity-based forgetting.
【E】: A device used to demonstrate a neuro-inspired circuit that shows long-term potentiation for learning and inactivity-based forgetting.
【answer】: E
【context】: A synaptic transistor is an electrical device that can learn in ways similar to a neural synapse. The input and output of the synaptic transistor are continuous analog values, rather than digital on-off signals. A network

In [48]:
"""
ミスの原因？

- hitしたwikipediaが違う
 - 不正解: https://en.wikipedia.org/wiki/MACS0647-JD
 - 正解：https://en.wikipedia.org/wiki/Observable_universe
"""
print_wrong(6)

【id】: 41
【prompt】: What is the proper distance for a redshift of 8.2?
【A】: The proper distance for a redshift of 8.2 is about 6.2 Gpc, or about 24 billion light-years.
【B】: The proper distance for a redshift of 8.2 is about 7.2 Gpc, or about 26 billion light-years.
【C】: The proper distance for a redshift of 8.2 is about 9.2 Gpc, or about 30 billion light-years.
【D】: The proper distance for a redshift of 8.2 is about 8.2 Gpc, or about 28 billion light-years.
【E】: The proper distance for a redshift of 8.2 is about 10.2 Gpc, or about 32 billion light-years.
【answer】: C
【context】: __NOTOC__ MACS0647-JD is a galaxy with a redshift of about z = 10.7, equivalent to a light travel distance of 13.26 billion light-years (4 billion parsecs). Using Hubble's law, the redshift can be used to estimate the distance of an object from Earth. Photometric redshifts were originally determined by calculating the expected observed data from a known emission spectrum at a range of redshifts. In the absence of

In [49]:
"""
ミスの原因？

- hitしたwikipediaが違う
    - 不正解：https://en.wikipedia.org/wiki/Isaac_Newton
    - 正解：https://en.wikipedia.org/wiki/Newton%27s_law_of_universal_gravitation
"""
print_wrong(7)

【id】: 53
【prompt】: What did Newton adopt after his correspondence with Hooke in 1679-1680?
【A】: The language of inward or centripetal force.
【B】: The language of gravitational force.
【C】: The language of outward or centrifugal force.
【D】: The language of tangential and radial displacements.
【E】: The language of electromagnetic force.
【answer】: A
【context】: Newton and Hooke had brief exchanges in 1679–80, when Hooke, appointed to manage the Royal Society's correspondence, opened up a correspondence intended to elicit contributions from Newton to Royal Society transactions, which had the effect of stimulating Newton to work out a proof that the elliptical form of planetary orbits would result from a centripetal force inversely proportional to the square of the radius vector. Newton was well-versed in both classics and modern languages. In the , Newton formulated the laws of motion and universal gravitation that formed the dominant scientific viewpoint for centuries until it was supersede