In [1]:
import os
import unicodedata

import numpy as np
import pandas as pd
from datasets import load_dataset, load_from_disk
from tqdm.auto import tqdm

In [2]:
import cudf
import cuml
import cupy
from cuml import PCA
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))  # 使用言語に応じて変更



In [3]:
%%time
paraphs_parsed_dataset = load_from_disk("../input/all-paraphs-parsed-expanded")
modified_texts = paraphs_parsed_dataset.map(
    lambda example: {
        "temp_text": f"{example['title']} {example['section']} {example['text']}".replace("\n", " ").replace("'", "")
    },
    num_proc=4,
)["temp_text"]

wiki_df = pd.DataFrame({"text": modified_texts})

CPU times: user 4.9 s, sys: 1.4 s, total: 6.3 s
Wall time: 6.27 s


In [4]:
import re

from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)


def clean_text_for_fts(text):
    token_pattern = re.compile(r"(?u)\b[\w/.-]+\b|!|/|\?|\"|\'")
    text = " ".join(token_pattern.findall(text))
    return text


wiki_df["tokenized_text"] = wiki_df["text"].parallel_apply(clean_text_for_fts)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=525320), Label(value='0 / 525320')…

In [38]:
def clean_text_for_fts(text):
    token_pattern = re.compile(r"(?u)\b[\w/.-]+\b|!|/|\?|\"|\'")
    text = " ".join(token_pattern.findall(text))
    return text


clean_text_for_fts("Which of the following statements accurately d... 2.5 2-5	")

'Which of the following statements accurately d 2.5 2-5'

In [17]:
df = pd.read_csv("../preprocessed/901_concat/data2.csv")

df["all_text"] = df.apply(
    lambda row: f'{row["prompt"]}\n{row["A"]}\n{row["B"]}\n{row["C"]}\n{row["D"]}\n{row["E"]}',
    axis=1,
).values
df["tokenized_text"] = df["all_text"].parallel_apply(clean_text_for_fts)

df.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=300), Label(value='0 / 300'))), HB…

Unnamed: 0,prompt,A,B,C,D,E,answer,all_text,tokenized_text
0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,D,Which of the following statements accurately d...,Which of the following statements accurately d...
1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A,Which of the following is an accurate definiti...,Which of the following is an accurate definiti...
2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A,Which of the following statements accurately d...,Which of the following statements accurately d...
3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,C,What is the significance of regularization in ...,What is the significance of regularization in ...
4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,D,Which of the following statements accurately d...,Which of the following statements accurately d...


In [47]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer as SkTfidfVectorizer

corpus_df_valid = df["tokenized_text"]

vectorizer1 = SkTfidfVectorizer(ngram_range=(1, 1), stop_words=list(stop_words))
vectorizer1.fit(corpus_df_valid)
vocab_df_valid = vectorizer1.get_feature_names_out()
print(len(vocab_df_valid))

10295
CPU times: user 130 ms, sys: 20.8 ms, total: 151 ms
Wall time: 148 ms


In [48]:
%%time
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=list(stop_words), vocabulary=cudf.Series(vocab_df_valid))
vectorizer.fit(wiki_df["tokenized_text"])

CPU times: user 1.98 s, sys: 4.91 s, total: 6.89 s
Wall time: 7.16 s


<cuml.feature_extraction._tfidf_vectorizer.TfidfVectorizer at 0x7f064e876350>

In [18]:
"""
%%time
from sklearn.feature_extraction.text import TfidfVectorizer as SkTfidfVectorizer

corpus_df_valid = df["all_text"]

vectorizer1 = SkTfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words)
vectorizer1.fit(df["all_text"])
vocab_df_valid = vectorizer1.get_feature_names_out()

vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words, vocabulary=vocab_df_valid)
vectorizer.fit(wiki_df["text"])

%%time
vectorizer = SkTfidfVectorizer(
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b[\w/.-]+\b|!|/|\?|\"|\'",
    stop_words=list(stop_words),
    vocabulary=vocab_df_valid,
)
vectorizer.fit(wiki_df["text"])
"""

CPU times: user 179 ms, sys: 94 µs, total: 179 ms
Wall time: 176 ms


9439

In [20]:
chunk_size = 100000
top_per_chunk = 10
top_per_query = 10

all_chunk_top_indices = []
all_chunk_top_values = []

In [21]:
corpus_tf_idf = vectorizer.transform(corpus_df_valid)
corpus_tf_idf.shape

(10, 367)

In [22]:
%%time
wiki_vectors = vectorizer.transform(wiki_df["text"][0 : 0 + chunk_size])

CPU times: user 167 ms, sys: 322 ms, total: 489 ms
Wall time: 486 ms


In [23]:
%%time
temp_scores = (corpus_tf_idf * wiki_vectors.T).toarray()

CPU times: user 20.9 ms, sys: 8.6 ms, total: 29.5 ms
Wall time: 27 ms


In [24]:
%%time
chunk_size = 100000
top_per_chunk = 10
top_per_query = 10

all_chunk_top_indices = []
all_chunk_top_values = []

for idx in tqdm(range(0, len(wiki_df), chunk_size)):
    wiki_vectors = vectorizer.transform(wiki_df["text"][idx : idx + chunk_size])
    temp_scores = (corpus_tf_idf * wiki_vectors.T).toarray()
    chunk_top_indices = temp_scores.argpartition(-top_per_chunk, axis=1)[:, -top_per_chunk:]
    chunk_top_values = temp_scores[np.arange(temp_scores.shape[0])[:, np.newaxis], chunk_top_indices]

    all_chunk_top_indices.append(chunk_top_indices + idx)
    all_chunk_top_values.append(chunk_top_values)

  0%|          | 0/22 [00:00<?, ?it/s]

CPU times: user 3.92 s, sys: 8.34 s, total: 12.3 s
Wall time: 12.2 s


In [25]:
top_indices_array = np.concatenate(all_chunk_top_indices, axis=1)
top_values_array = np.concatenate(all_chunk_top_values, axis=1)

merged_top_scores = np.sort(top_values_array, axis=1)[:, -top_per_query:]
merged_top_indices = top_values_array.argsort(axis=1)[:, -top_per_query:]
articles_indices = top_indices_array[np.arange(top_indices_array.shape[0])[:, np.newaxis], merged_top_indices]

In [34]:
index = 7
print(wiki_df.iloc[int(articles_indices[index, -1])].text)
print()
print(wiki_df.iloc[int(articles_indices[index, -2])].text)
print()
print(wiki_df.iloc[int(articles_indices[index, -3])].text)
print()

print()
print(df.iloc[index].all_text)
df.iloc[index]

Antiferromagnetism Antiferromagnetic materials Antiferromagnets can couple to ferromagnets, for instance, through a mechanism known as exchange bias, in which the ferromagnetic film is either grown upon the antiferromagnet or annealed in an aligning magnetic field, causing the surface atoms of the ferromagnet to align with the surface atoms of the antiferromagnet. This provides the ability to "pin" the orientation of a ferromagnetic film, which provides one of the main uses in so-called spin valves, which are the basis of magnetic sensors including modern hard disk drive read heads. The temperature at or above which an antiferromagnetic layer loses its ability to "pin" the magnetization direction of an adjacent ferromagnetic layer is called the blocking temperature of that layer and is usually lower than the Néel temperature.

Bismuth ferrite Bismuth ferrite Bismuth ferrite (BiFeO3, also commonly referred to as BFO in materials science) is an inorganic chemical compound with perovskite

prompt            Which of the following statements accurately d...
A                 The blocking temperature of an antiferromagnet...
B                 The blocking temperature of an antiferromagnet...
C                 The blocking temperature of an antiferromagnet...
D                 The blocking temperature of an antiferromagnet...
E                 The blocking temperature of an antiferromagnet...
answer                                                            D
all_text          Which of the following statements accurately d...
tokenized_text    Which of the following statements accurately d...
Name: 7, dtype: object