In [1]:
import os
import unicodedata

import numpy as np
import pandas as pd
from datasets import load_dataset, load_from_disk
from tqdm.auto import tqdm

In [2]:
%%time
def _flatten_passage(example):
    """Build the single-line retrieval text for one parsed paragraph record.

    The title, section and text fields are joined with single spaces, newlines
    are turned into spaces, and apostrophes are removed.
    """
    merged = f"{example['title']} {example['section']} {example['text']}"
    return {"temp_text": merged.replace("\n", " ").replace("'", "")}


# Load the pre-parsed paragraph dataset saved on disk.
paraphs_parsed_dataset = load_from_disk("../input/all-paraphs-parsed-expanded")

# Add the flattened text as a "temp_text" column (4 worker processes) and keep
# only that column.
flattened_dataset = paraphs_parsed_dataset.map(_flatten_passage, num_proc=4)
modified_texts = flattened_dataset["temp_text"]

# One row per paragraph; row position is later used as the passage id.
wiki_df = pd.DataFrame({"text": modified_texts})

CPU times: user 5.7 s, sys: 2.03 s, total: 7.73 s
Wall time: 17.4 s


In [3]:
# Multiple-choice questions: one row per question with columns prompt, A-E, answer.
df = pd.read_csv("../preprocessed/901_concat/data2.csv")


def _compose_query(row):
    """Return the retrieval query text for one question row.

    The prompt appears three times followed by options A through E, each piece
    on its own line (the repetition presumably up-weights the prompt terms in
    the TF-IDF query vector -- confirm with the author).
    """
    pieces = [row["prompt"]] * 3 + [row[option] for option in ("A", "B", "C", "D", "E")]
    # Format each piece exactly as an f-string placeholder would.
    return "\n".join(f"{piece}" for piece in pieces)


query_texts = df.apply(_compose_query, axis=1)
df["text"] = query_texts.values

df.head()

Unnamed: 0,prompt,A,B,C,D,E,answer,text
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,Which of the following statements accurately d...
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,Which of the following is an accurate definiti...
2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,Which of the following statements accurately d...
3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C,What is the significance of regularization in ...
4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D,Which of the following statements accurately d...


In [4]:
# Words excluded from the TF-IDF vocabulary (passed as `stop_words` to
# TfidfVectorizer). Each entry appears exactly once; the original list
# contained "the" twice.
#
# NOTE(review): entries containing an apostrophe ("it's", "don't", ...) cannot
# match any token produced by the token_pattern used in this notebook, which
# splits the apostrophe off as its own token -- confirm whether these entries
# are still wanted or the pattern should change.
stop_words = [
    "each",
    "you",
    "the",
    "use",
    "used",
    "where",
    "themselves",
    "nor",
    "it's",
    "how",
    "don't",
    "just",
    "your",
    "about",
    "himself",
    "with",
    "weren't",
    "hers",
    "wouldn't",
    "more",
    "its",
    "were",
    "his",
    "their",
    "then",
    "been",
    "myself",
    "re",
    "not",
    "ours",
    "will",
    "needn",
    "which",
    "here",
    "hadn",
    "it",
    "our",
    "there",
    "than",
    "most",
    "couldn't",
    "both",
    "some",
    "for",
    "up",
    "couldn",
    "that'll",
    "she's",
    "over",
    "this",
    "now",
    "until",
    "these",
    "few",
    "haven",
    "of",
    "wouldn",
    "into",
    "too",
    "to",
    "very",
    "shan",
    "before",
    "they",
    "between",
    "doesn't",
    "are",
    "was",
    "out",
    "we",
    "me",
    "after",
    "has",
    "isn't",
    "have",
    "such",
    "should",
    "yourselves",
    "or",
    "during",
    "herself",
    "doing",
    "in",
    "shouldn't",
    "won't",
    "when",
    "do",
    "through",
    "she",
    "having",
    "him",
    "haven't",
    "against",
    "itself",
    "that",
    "did",
    "theirs",
    "can",
    "those",
    "own",
    "so",
    "and",
    "who",
    "you've",
    "yourself",
    "her",
    "he",
    "only",
    "what",
    "ourselves",
    "again",
    "had",
    "you'd",
    "is",
    "other",
    "why",
    "while",
    "from",
    "them",
    "if",
    "above",
    "does",
    "whom",
    "yours",
    "but",
    "being",
    "wasn't",
    "be",
]

In [5]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

corpus_df_valid = df["text"]
modified_texts = wiki_df["text"]

# Tokenisation settings shared by both vectorizers: unigrams and bigrams, the
# custom token pattern, and the notebook's stop-word list.
tfidf_options = dict(
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b[\w/.-]+\b|!|/|\?|\"|\'",
    stop_words=stop_words,
)

# Pass 1: learn the vocabulary from the question texts only.
vectorizer1 = TfidfVectorizer(**tfidf_options)
vectorizer1.fit(corpus_df_valid)
vocab_df_valid = vectorizer1.get_feature_names_out()
print(f"length of vectorizer vocab is {len(vocab_df_valid)}")

# Pass 2: keep that vocabulary fixed, fit on the wiki passages, then encode the
# questions with the resulting vectorizer.
vectorizer = TfidfVectorizer(vocabulary=vocab_df_valid, **tfidf_options)
vectorizer.fit(modified_texts)
corpus_tf_idf = vectorizer.transform(corpus_df_valid)
print(f"length of vectorizer vocab is {len(vectorizer.get_feature_names_out())}")



length of vectorizer vocab is 50256
length of vectorizer vocab is 50256
CPU times: user 7min 3s, sys: 1.44 s, total: 7min 4s
Wall time: 7min 4s


In [6]:
# Shape of the TF-IDF matrix produced by transforming df["text"]:
# (number of question rows, vocabulary size).
corpus_tf_idf.shape

(1200, 50256)

In [7]:
%%time
# Chunked top-k retrieval: score every question against every wiki passage
# without materialising the full (questions x passages) score matrix at once.
chunk_size = 100000  # passages vectorised and scored per iteration
top_per_chunk = 10  # candidates kept per question from each chunk
top_per_query = 10  # final passages kept per question

all_chunk_top_indices = []
all_chunk_top_values = []

for idx in tqdm(range(0, len(wiki_df), chunk_size)):
    # Positional slice of the passages belonging to this chunk.
    wiki_vectors = vectorizer.transform(wiki_df["text"].iloc[idx : idx + chunk_size])

    # `@` is always a matrix product; `*` is only one for scipy.sparse *matrix*
    # types and would be element-wise for sparse arrays.
    temp_scores = (corpus_tf_idf @ wiki_vectors.T).toarray()

    # The final chunk may hold fewer passages than top_per_chunk; clamp k so
    # argpartition does not raise on an out-of-bounds kth.
    k = min(top_per_chunk, temp_scores.shape[1])
    chunk_top_indices = np.argpartition(temp_scores, -k, axis=1)[:, -k:]
    chunk_top_values = np.take_along_axis(temp_scores, chunk_top_indices, axis=1)

    # Shift chunk-local column positions to global row positions in wiki_df.
    all_chunk_top_indices.append(chunk_top_indices + idx)
    all_chunk_top_values.append(chunk_top_values)

# Candidate pool per question: the per-chunk winners side by side.
top_indices_array = np.concatenate(all_chunk_top_indices, axis=1)
top_values_array = np.concatenate(all_chunk_top_values, axis=1)

# One argsort ranks the pool; scores and passage ids are gathered with the same
# ordering. Columns are in ascending score order, so the best match is last.
merged_top_indices = np.argsort(top_values_array, axis=1)[:, -top_per_query:]
merged_top_scores = np.take_along_axis(top_values_array, merged_top_indices, axis=1)
articles_indices = np.take_along_axis(top_indices_array, merged_top_indices, axis=1)

  0%|          | 0/22 [00:00<?, ?it/s]

CPU times: user 8min 28s, sys: 11.2 s, total: 8min 40s
Wall time: 8min 40s


In [33]:
# NOTE(review): scratch check that str(None) gives the string 'None'. No other
# visible cell uses this result, so this cell looks safe to delete.
str(None)

'None'

In [None]:
# NOTE(review): displays the whole question DataFrame; df.head() or
# df.sample(5) would keep the saved notebook output small.
df

In [32]:
# Manual spot check of the retrieval results for a single question.
index = 96

# Print the four passages at the end of this question's row in
# articles_indices (columns -1 through -4), each followed by a blank line.
for rank in range(1, 5):
    passage_id = int(articles_indices[index, -rank])
    print(wiki_df.iloc[passage_id].text)
    print()

print()

# Show the question itself, then display the text of its correct option.
question_row = df.iloc[index]
print(question_row.prompt)
question_row[question_row.answer]

Born reciprocity Born reciprocity However Borns idea of a quantum metric operator was later taken up by Hideki Yukawa when developing his nonlocal quantum theory in the 1950s. In 1981, Eduardo R. Caianiello proposed a "maximal acceleration", similarly as there is a minimal length at Planck scale, and this concept of maximal acceleration has been expanded upon by others. It has also been suggested that Born reciprocity may be the underlying physical reason for the T-duality symmetry in string theory, and that Born reciprocity may be of relevance to developing a quantum geometry.Born chose the term "reciprocity" for the reason that in a crystal lattice, the motion of a particle can be described in p-space by means of the reciprocal lattice.

Glossary of Lie groups and Lie algebras M maximal 1. For "maximal compact subgroup", see #compact. 2. For "maximal torus", see #torus.

That that is is that that is not is not is that it it is That that is is that that is not is not is that it it is 

'Eduardo R. Caianiello'