In [1]:
import faiss
import Levenshtein
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from datasets import load_dataset, load_from_disk
from faiss import read_index, write_index
from matplotlib import pyplot as plt
from sentence_transformers import SentenceTransformer

sns.set()
%matplotlib inline



In [2]:
# df = pd.read_csv("../preprocessed/332_tfidf/000/data2.csv")
# max_prob = np.load("../output/600_max/111/data2_pred.npy")
df = pd.read_csv("../preprocessed/334_tfidf_gpu/000/data2.csv")
tfidf_prob = np.load("../output/350_1st_infer/tfidf_gpu/data2_pred.npy")
max_prob = np.load("../output/600_max/110/data2_pred.npy")

ignore_index = [
    201,
    205,
    207,
    213,
    217,
    226,
    228,
    251,
    254,
    260,
    262,
    270,
    286,
    296,
    308,
    312,
    338,
    417,
    430,
    444,
    454,
    478,
    486,
    491,
    507,
    508,
    530,
    550,
    574,
    586,
    593,
    612,
]

num = 1200
df = df.drop(ignore_index).head(num)
tfidf_prob = np.delete(tfidf_prob, ignore_index, axis=0)[:num]
max_prob = np.delete(max_prob, ignore_index, axis=0)[:num]
"""

df = pd.read_csv("../preprocessed/332_tfidf/000/val_500_enhanced.csv")
tfidf_prob = np.load("../output/370_1st_infer/tfidf_gpu/data4_pred.npy")
max_prob = np.load("../output/650_max/110/data4_pred.npy")

ignore_index = []
num = 1000
df = df.drop(ignore_index).head(num).reset_index(drop=True)
tfidf_prob = np.delete(tfidf_prob, ignore_index, axis=0)[:num]
max_prob = np.delete(max_prob, ignore_index, axis=0)[:num]
"""

'\n\ndf = pd.read_csv("../preprocessed/332_tfidf/000/val_500_enhanced.csv")\ntfidf_prob = np.load("../output/370_1st_infer/tfidf_gpu/data4_pred.npy")\nmax_prob = np.load("../output/650_max/110/data4_pred.npy")\n\nignore_index = []\nnum = 1000\ndf = df.drop(ignore_index).head(num).reset_index(drop=True)\ntfidf_prob = np.delete(tfidf_prob, ignore_index, axis=0)[:num]\nmax_prob = np.delete(max_prob, ignore_index, axis=0)[:num]\n'

In [3]:
import Levenshtein
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

stop_words = list(stopwords.words("english"))


def get_tfidf(row: dict[str, str]) -> np.ndarray:
    """
    tfidfを計算する。大きめのngramにすることでうまく計算できるようにする
    """
    # tfidfの計算
    tfidf = TfidfVectorizer(
        ngram_range=(3, 7),
        # token_pattern=r"(?u)\b[\w/.-]+\b|!|/|\?|\"|\'",
        stop_words=stop_words,
    )
    base_cols = ["A", "B", "C", "D", "E"]
    fit_cols = base_cols + ["context", "prompt"]
    tfidf_vec = tfidf.fit([row[col] for col in fit_cols])
    # base_cols と context の 類似度を計算
    base_vec = tfidf_vec.transform([row[col] for col in base_cols])
    context_vec = tfidf_vec.transform([row["context"]])
    sim = cosine_similarity(base_vec, context_vec)
    return sim


def add_feat_by_prob(df, max_prob):
    first_prob = np.sort(max_prob)[:, -1]
    second_prob = np.sort(max_prob)[:, -2]
    third_prob = np.sort(max_prob)[:, -3]
    prob_diff = first_prob - second_prob
    df["first_prob"] = first_prob
    df["second_prob"] = second_prob
    df["third_prob"] = third_prob
    df["prob_diff"] = prob_diff
    df["prob_diff23"] = second_prob - third_prob

    option_to_index = {option: idx for idx, option in enumerate("ABCDE")}
    index_to_option = {v: k for k, v in option_to_index.items()}
    first_option = np.argsort(max_prob)[:, -1]
    df["first_option_index"] = first_option
    df["first_option"] = df["first_option_index"].map(index_to_option)
    second_option = np.argsort(max_prob)[:, -2]
    df["second_option_index"] = second_option
    df["second_option"] = df["second_option_index"].map(index_to_option)
    third_option = np.argsort(max_prob)[:, -3]
    df["third_option_index"] = third_option
    df["third_option"] = df["third_option_index"].map(index_to_option)

    df["first_len"] = 0
    df["second_len"] = 0
    df["third_len"] = 0
    for i, row in df.iterrows():
        df.loc[i, "first_len"] = len(row[row["first_option"]])
        df.loc[i, "second_len"] = len(row[row["second_option"]])
        df.loc[i, "third_len"] = len(row[row["third_option"]])

    # first と second のレーベンシュタイン距離
    dists = []
    for i, row in tqdm(df.iterrows()):
        dists.append(Levenshtein.distance(row[row["first_option"]], row[row["second_option"]]))
    df["dist_1_2"] = dists
    df["dist_1_2_rate"] = df["dist_1_2"] / df[["first_len", "second_len"]].max(axis=1)

    dists = []
    for i, row in tqdm(df.iterrows()):
        dists.append(Levenshtein.distance(row[row["first_option"]], row[row["third_option"]]))
    df["dist_1_3"] = dists
    df["dist_1_3_rate"] = df["dist_1_3"] / df[["first_len", "third_len"]].max(axis=1)

    # 正解がfirst, second, other のどれかを見る
    df["answer_location"] = "other"
    df.loc[df["first_option"] == df["answer"], "answer_location"] = "first"
    df.loc[df["second_option"] == df["answer"], "answer_location"] = "second"
    df.loc[df["third_option"] == df["answer"], "answer_location"] = "third"

    print("tfidfを計算")
    tfidf_array = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        tfidf_array.append(get_tfidf(row).squeeze())
    tfidf_array = np.array(tfidf_array)
    print(f"tfidf_array:{tfidf_array.shape}")

    df["first_tfidf"] = tfidf_array[np.arange(len(df)), first_option]
    df["second_tfidf"] = tfidf_array[np.arange(len(df)), second_option]
    df["third_tfidf"] = tfidf_array[np.arange(len(df)), third_option]
    df["should_swap"] = (
        (df["prob_diff"] < 0.2)
        & (df["dist_1_2_rate"] < 0.2)
        & (20 < df["first_len"])
        & (0.001 < df["first_tfidf"])
        & (df["first_tfidf"] < df["second_tfidf"])
    )
    return df


df = add_feat_by_prob(df, max_prob)

df.head()

0it [00:00, ?it/s]

0it [00:00, ?it/s]

tfidfを計算


  0%|          | 0/1168 [00:00<?, ?it/s]

tfidf_array:(1168, 5)


Unnamed: 0,prompt,A,B,C,D,E,answer,context,first_prob,second_prob,...,third_len,dist_1_2,dist_1_2_rate,dist_1_3,dist_1_3_rate,answer_location,first_tfidf,second_tfidf,third_tfidf,should_swap
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,Modified Newtonian dynamics > Modified Newtoni...,0.999922,0.068292,...,192,7,0.035176,106,0.540816,first,0.003931,0.003716,0.000971,False
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,Dynamic scaling > Dynamic scaling > Here the e...,0.905612,0.209489,...,282,13,0.045614,31,0.108772,first,0.006034,0.009055,0.005918,False
2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,Triskelion > Use in European antiquity > Class...,0.811835,0.564985,...,367,203,0.602374,191,0.520436,first,0.004756,0.001791,0.005005,False
3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C,Regularization (physics) > Classical physics e...,0.944936,0.373496,...,219,144,0.555985,88,0.339768,first,0.001228,0.005128,0.0,False
4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D,Diffraction > Patterns > Several qualitative o...,0.997662,0.508306,...,241,11,0.045267,12,0.049793,first,0.093072,0.014546,0.015075,False


In [None]:
df["sum_first_second_tfidf"] = df["first_tfidf"] + df["second_tfidf"]
df[df["answer_location"] != "first"].sort_values(["sum_first_second_tfidf", "first_prob"])

In [7]:
condition = (df["first_prob"] < 0.9) & (df["max_tfidf_prob"] < 0.9)
# (0.001 < df["first_tfidf"]) & (df["dist_1_2_rate"] < 0.5) & (20 < df["first_len"])
tmp_df = df.loc[condition]
print(f"all:{len(tmp_df)} wrong:{len(tmp_df[tmp_df['answer_location'] != 'first'])}")
sns.histplot(data=tmp_df, x="sum_first_second_tfidf", hue="answer_location", multiple="stack", binwidth=0.01)

KeyError: 'max_tfidf_prob'

In [12]:
df[
    (df["first_prob"] < 0.99)
        &(df["prob_diff"] < 0.2)
        & (df["dist_1_2_rate"] < 0.5)
        & (20 < df["first_len"])
        & ((0.01 < df["first_tfidf"]) | (0.01 < df["second_tfidf"]))
        & (df["second_tfidf"] / (df["first_tfidf"] + 1e-9) > 1.2)
]

Unnamed: 0,prompt,A,B,C,D,E,answer,context,first_prob,second_prob,...,third_len,dist_1_2,dist_1_2_rate,dist_1_3,dist_1_3_rate,answer_location,first_tfidf,second_tfidf,third_tfidf,should_swap
28,What is the evidence for the existence of a su...,The Milky Way galaxy has a supermassive black ...,The Milky Way galaxy has a supermassive black ...,The Milky Way galaxy has a supermassive black ...,The Milky Way galaxy has a supermassive black ...,The star S2 follows an elliptical orbit with a...,E,Supermassive black hole > Evidence > From the ...,0.740287,0.553895,...,242,81,0.122356,472,0.712991,second,0.058329,0.097596,0.024519,True
39,What is the synapstor or synapse transistor?,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,E,"Memristor > Implementations > In July 2008, Er...",0.457426,0.410623,...,132,4,0.029851,2,0.014925,second,0.00687,0.010015,0.005513,True
447,Which of the following statements accurately d...,Bis-gamma-glutamylcystine reductase acts on ga...,Bis-gamma-glutamylcystine reductase acts on ga...,Bis-gamma-glutamylcystine reductase acts on ga...,Bis-gamma-glutamylcystine reductase acts on bi...,Bis-gamma-glutamylcystine reductase acts on ga...,C,Bis-gamma-glutamylcystine reductase > Bis-gamm...,0.660314,0.463919,...,169,5,0.028736,74,0.43787,first,0.031188,0.042505,0.050282,True


In [45]:
df[
    (df["first_prob"] < 0.99)
    & (df["prob_diff"] < 0.2)
    & (df["dist_1_2_rate"] < 0.5)
    & (20 < df["first_len"])
    & ((0.01 < df["first_tfidf"]) | (0.01 < df["second_tfidf"]))
    & (df["second_tfidf"] / (df["first_tfidf"] + 1e-9) > 1.4)
]

Unnamed: 0,prompt,A,B,C,D,E,answer,context,first_prob,second_prob,...,third_len,dist_1_2,dist_1_2_rate,dist_1_3,dist_1_3_rate,answer_location,first_tfidf,second_tfidf,third_tfidf,should_swap
13,What is the Roche limit?,The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,D,Roche limit > Roche limit > In celestial mecha...,0.947438,0.834348,...,160,139,0.460265,172,0.569536,second,0.012845,0.022513,0.003466,False
28,What is the evidence for the existence of a su...,The Milky Way galaxy has a supermassive black ...,The Milky Way galaxy has a supermassive black ...,The Milky Way galaxy has a supermassive black ...,The Milky Way galaxy has a supermassive black ...,The star S2 follows an elliptical orbit with a...,E,Supermassive black hole > Evidence > From the ...,0.740287,0.553895,...,242,81,0.122356,472,0.712991,second,0.080636,0.131271,0.060301,True
39,What is the synapstor or synapse transistor?,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,E,"Memristor > Implementations > In July 2008, Er...",0.457426,0.410623,...,132,4,0.029851,2,0.014925,second,0.007564,0.011028,0.006071,True


In [50]:
df[
    (df["second_prob"] > 0.4)
    & (df["dist_1_2_rate"] < 0.5)
    & (20 < df["first_len"])
    & ((0.01 < df["first_tfidf"]) | (0.01 < df["second_tfidf"]))
    & (df["second_tfidf"] / (df["first_tfidf"] + 1e-9) > 1.4)
]

Unnamed: 0,prompt,A,B,C,D,E,answer,context,first_prob,second_prob,...,third_len,dist_1_2,dist_1_2_rate,answer_location,first_tfidf,second_tfidf,should_swap,dist_1_3,dist_1_3_rate,third_tfidf
7,Which of the following statements accurately d...,The blocking temperature of an antiferromagnet...,The blocking temperature of an antiferromagnet...,The blocking temperature of an antiferromagnet...,The blocking temperature of an antiferromagnet...,The blocking temperature of an antiferromagnet...,D,Antiferromagnetism > Antiferromagnetic materia...,0.999612,0.47694,...,289,51,0.186131,first,0.003286,0.036598,False,65,0.224913,0.018249
13,What is the Roche limit?,The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,The Roche limit is the distance at which tidal...,D,Roche limit > Roche limit > In celestial mecha...,0.947438,0.834348,...,160,139,0.460265,second,0.012845,0.022513,False,172,0.569536,0.003466
28,What is the evidence for the existence of a su...,The Milky Way galaxy has a supermassive black ...,The Milky Way galaxy has a supermassive black ...,The Milky Way galaxy has a supermassive black ...,The Milky Way galaxy has a supermassive black ...,The star S2 follows an elliptical orbit with a...,E,Supermassive black hole > Evidence > From the ...,0.740287,0.553895,...,242,81,0.122356,second,0.080636,0.131271,True,472,0.712991,0.060301
39,What is the synapstor or synapse transistor?,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,A device used to demonstrate a neuro-inspired ...,E,"Memristor > Implementations > In July 2008, Er...",0.457426,0.410623,...,132,4,0.029851,second,0.007564,0.011028,True,2,0.014925,0.006071
219,What are fenamates?,Organic compounds derived from anthranilic aci...,Organic compounds derived from 2-chlorobenzoic...,Organic compounds derived from acridone that s...,Organic compounds derived from 2-chlorobenzofu...,Organic compounds derived from fenamic acid th...,A,Fenamic acid > Fenamic acid > Fenamic acid is ...,0.847758,0.5469,...,95,63,0.488372,first,0.014271,0.027533,False,72,0.55814,0.001098


In [87]:
df["second_option_index"]

0       1
1       1
2       3
3       0
4       0
       ..
1195    3
1196    1
1197    2
1198    2
1199    4
Name: second_option_index, Length: 1200, dtype: int64

In [90]:
max_prob[df["should_swap"], df.loc[df["should_swap"], "second_option_index"]] = (
    max_prob[df["should_swap"], df.loc[df["should_swap"], "first_option_index"]] + 1.0
)

In [79]:
df.iloc[447]

prompt             Which of the following statements accurately d...
A                  Bis-gamma-glutamylcystine reductase acts on ga...
B                  Bis-gamma-glutamylcystine reductase acts on ga...
C                  Bis-gamma-glutamylcystine reductase acts on ga...
D                  Bis-gamma-glutamylcystine reductase acts on bi...
E                  Bis-gamma-glutamylcystine reductase acts on ga...
answer                                                             C
context            Bis-gamma-glutamylcystine reductase > Bis-gamm...
first_prob                                                  0.660314
second_prob                                                 0.475209
third_prob                                                  0.131103
prob_diff                                                   0.185105
prob_diff23                                                 0.344106
first_option                                                       C
second_option                     

In [65]:
df.iloc[179].C

"The difference in distance travelled by light from the planet (or its moon) to Earth when the Earth is at the point in its orbit that is closest to its planet than when the Earth is at the farthest point in its orbit, the difference in distance being the diameter of the Earth's orbit around the Sun."

In [66]:
df.iloc[179].E

"The difference in the gravitational pull of the planet on its moons when the Earth is at the point in its orbit that is closest to its planet than when the Earth is at the farthest point in its orbit, the difference in distance being the diameter of the Earth's orbit around the Sun."

In [67]:
df.iloc[179].context

'Speed of light > Measurement > Ole Christensen Rømer used an astronomical measurement to make the first quantitative estimate of the speed of light in the year 1676. When measured from Earth, the periods of moons orbiting a distant planet are shorter when the Earth is approaching the planet than when the Earth is receding from it. The distance travelled by light from the planet (or its moon) to Earth is shorter when the Earth is at the point in its orbit that is closest to its planet than when the Earth is at the farthest point in its orbit, the difference in distance being the diameter of the Earth\'s orbit around the Sun. The observed change in the moon\'s orbital period is caused by the difference in the time it takes light to traverse the shorter or longer distance. Rømer observed this effect for Jupiter\'s innermost major moon Io and deduced that light takes 22 minutes to cross the diameter of the Earth\'s orbit. Libration point orbit > Libration point orbit > In orbital mechanic