In [35]:
import sqlite3

import pandas as pd

In [78]:
# Read the parquet file and keep only the first rows (DataFrame.head() default)
# so the remaining cells operate on a small sample.
df = pd.read_parquet("../input/llm-science-wikipedia-data-b/b.parquet").head()
df

Unnamed: 0,id,title,text
0,59737009,Biodiversity Monitoring Switzerland,The Biodiversity Monitoring Switzerland (BDM) ...
1,59737081,Bigoudène,"In Breton tradition, a coiffe bigoudène is a w..."
2,59737128,Barnabas Bala,Barnabas Yusuf Bala (20 December 1956 – 11 Jul...
3,59739888,Bagdah,Bagdah or Bagdaha may refer to: Bagdah (commun...
4,59739916,Big John Hamilton,Big John Hamilton may refer to: Big John Hamil...


In [79]:
# Chunking configuration read by split_sentences.
# Each chunk joins up to `window_size` sentences; consecutive chunks start
# `sliding_size` sentences apart.
window_size, sliding_size = 4, 3
# A sentence is kept only when filter_len < (end offset - start offset) < filter_len_max.
filter_len, filter_len_max = 3, 1000
import blingfire as bf


def extract_chunk_by_sliding_window(text_list: list[str], window_size: int, sliding_size: int) -> list[str]:
    """Join consecutive texts into space-separated chunks using a sliding window.

    A window starts at every ``sliding_size``-th index of ``text_list`` and joins
    up to ``window_size`` items with a single space. Windows that start near the
    end of the list contain fewer than ``window_size`` items.

    Args:
        text_list: Texts to combine, in order.
        window_size: Maximum number of texts joined into one chunk.
        sliding_size: Step between the start indices of consecutive windows.

    Returns:
        One joined string per window start; an empty list when ``text_list`` is empty.
    """
    window_starts = range(0, len(text_list), sliding_size)
    return [" ".join(text_list[start : start + window_size]) for start in window_starts]


def split_sentences(text):
    """Split ``text`` into sentences and regroup them into sliding-window chunks.

    Newlines are replaced by spaces, sentence offsets are obtained from
    ``bf.text_to_sentences_and_offsets``, and only sentences whose offset span
    is strictly between the module-level ``filter_len`` and ``filter_len_max``
    are kept. The kept sentences are combined by
    ``extract_chunk_by_sliding_window`` using the module-level ``window_size``
    and ``sliding_size``.

    Args:
        text: Document text to split.

    Returns:
        The list of chunk strings built from the kept sentences.
    """
    flattened = text.replace("\n", " ")
    _, offsets = bf.text_to_sentences_and_offsets(flattened)
    kept_sentences = [
        flattened[span[0] : span[1]]
        for span in offsets
        if filter_len < span[1] - span[0] < filter_len_max
    ]
    return extract_chunk_by_sliding_window(kept_sentences, window_size, sliding_size)


# Replace each row's text with its list of chunks, emit one row per chunk,
# then prefix every chunk with the row's title.
# NOTE: this cell reads and overwrites `df`, so it is not idempotent: re-running
# it without reloading `df` chunks the text and applies the title prefix again.
df = (
    df.assign(text=df["text"].apply(split_sentences))
    .explode(["text"])
    .reset_index(drop=True)
    .assign(text=lambda frame: frame["title"] + " > " + frame["text"])
)
df

Unnamed: 0,id,title,text
0,59737009,Biodiversity Monitoring Switzerland,Biodiversity Monitoring Switzerland > The Biod...
1,59737009,Biodiversity Monitoring Switzerland,Biodiversity Monitoring Switzerland > Biodiver...
2,59737009,Biodiversity Monitoring Switzerland,Biodiversity Monitoring Switzerland > Together...
3,59737009,Biodiversity Monitoring Switzerland,Biodiversity Monitoring Switzerland > The Biod...
4,59737009,Biodiversity Monitoring Switzerland,Biodiversity Monitoring Switzerland > Data on ...
...,...,...,...
74,59737128,Barnabas Bala,Barnabas Bala > Retrieved 23 January 2019. Ade...
75,59737128,Barnabas Bala,Barnabas Bala > Retrieved 21 August 2019. Form...
76,59737128,Barnabas Bala,Barnabas Bala > Retrieved 11 July 2021. v t e
77,59739888,Bagdah,Bagdah > Bagdah or Bagdaha may refer to: Bagda...


In [71]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# NLTK resources for English; change the language name to match the corpus language.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")


def clean_text_for_fts(text):
    """Normalize ``text`` to lowercase ASCII alphanumeric tokens separated by single spaces.

    Every character outside ``[a-zA-Z0-9]`` is replaced by a space, the result is
    lowercased, and runs of whitespace are collapsed. No stop-word removal or
    stemming is applied: the previous implementation computed a stemmed,
    stop-word-filtered token list on every call but discarded it and returned
    the normalized text, so that unused work has been removed.

    Args:
        text: Value to normalize. Non-string values (e.g. ``None`` or the NaN that
            ``DataFrame.explode`` produces for an empty list) yield ``""``.

    Returns:
        The normalized string; ``""`` for empty or non-string input.
    """
    # Guard against missing values so a single empty document does not abort .apply().
    if not isinstance(text, str):
        return ""
    alnum_only = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # split() with no arguments already strips and collapses whitespace.
    return " ".join(alnum_only.lower().split())


# Add normalized copies of the title and text columns (title first, then text).
df = df.assign(
    clean_title=df["title"].apply(clean_text_for_fts),
    clean_text=df["text"].apply(clean_text_for_fts),
)
df

Unnamed: 0,id,title,text,clean_text,clean_title
0,70803281,Bolton Street Park,Bolton Street Park > Bolton Street Park > Ball...,bolton street park bolton street park ballpark...,bolton street park
1,70803281,Bolton Street Park,Bolton Street Park > The Washington Senators t...,bolton street park the washington senators tra...,bolton street park
2,70803281,Bolton Street Park,Bolton Street Park > Bolton Street Park > The ...,bolton street park bolton street park the wash...,bolton street park
3,70803281,Bolton Street Park,Bolton Street Park > The South Atlantic League...,bolton street park the south atlantic league w...,bolton street park
4,70803281,Bolton Street Park,Bolton Street Park > Bolton Street Park > The ...,bolton street park bolton street park the sout...,bolton street park
5,70803281,Bolton Street Park,Bolton Street Park > ## Present site ## Fairmo...,bolton street park present site fairmount bapt...,bolton street park
6,70803281,Bolton Street Park,Bolton Street Park > Bolton Street Park > ## P...,bolton street park bolton street park present ...,bolton street park
7,70803281,Bolton Street Park,Bolton Street Park > Shoeless Joe Jackson had ...,bolton street park shoeless joe jackson had pl...,bolton street park
8,70803281,Bolton Street Park,Bolton Street Park > Bolton Street Park > Shoe...,bolton street park bolton street park shoeless...,bolton street park
9,70803359,Boat Rock,Boat Rock > Boat Rock > Boat Rock is a tiny sa...,boat rock boat rock boat rock is a tiny sandst...,boat rock


In [86]:
# Display the current contents of `df`.
df

Unnamed: 0,id,title,text
0,59737009,Biodiversity Monitoring Switzerland,Biodiversity Monitoring Switzerland > The Biod...
1,59737009,Biodiversity Monitoring Switzerland,Biodiversity Monitoring Switzerland > Biodiver...
2,59737009,Biodiversity Monitoring Switzerland,Biodiversity Monitoring Switzerland > Together...
3,59737009,Biodiversity Monitoring Switzerland,Biodiversity Monitoring Switzerland > The Biod...
4,59737009,Biodiversity Monitoring Switzerland,Biodiversity Monitoring Switzerland > Data on ...
...,...,...,...
74,59737128,Barnabas Bala,Barnabas Bala > Retrieved 23 January 2019. Ade...
75,59737128,Barnabas Bala,Barnabas Bala > Retrieved 21 August 2019. Form...
76,59737128,Barnabas Bala,Barnabas Bala > Retrieved 11 July 2021. v t e
77,59739888,Bagdah,Bagdah > Bagdah or Bagdaha may refer to: Bagda...


In [82]:
%%time
import os

# Build a fresh SQLite database with an FTS5 table over the text and title columns.
file_path = "tutorial.db"

# Start from a clean slate: delete the database file if it already exists.
if os.path.exists(file_path):
    os.remove(file_path)

db = sqlite3.connect(file_path)
cur = db.cursor()

# Both columns are indexed; the UNINDEXED column option is not applied here.
cur.execute('create virtual table imdb using fts5(text , title, tokenize="porter unicode61");')

# Insert one (text, title) record per DataFrame row, then persist the inserts.
cur.executemany("insert into imdb (text, title) values (?,?);", df[["text", "title"]].to_records(index=False))
db.commit()

CPU times: user 13.3 ms, sys: 481 µs, total: 13.7 ms
Wall time: 19.9 ms


In [111]:
# Open the FTS database at `file_path` and run a sample full-text query.
file_path = "../preprocessed/325_fts_db/000/fts.db"
db = sqlite3.connect(file_path)
cur = db.cursor()


# FTS5 query matching rows that contain ANY of the whitespace-separated terms.
q = " OR ".join("on".split())

# Bind the query as a parameter instead of interpolating it into the SQL.
# Inside double quotes SQLite first tries to resolve the token as an identifier,
# so a query equal to a column name (e.g. "title") would be read as a column
# reference, and any quote character in the query would break the statement.
res = cur.execute(
    """select text, rank
                      from imdb
                      where text MATCH ?
                      ORDER BY rank
                      limit 5""",
    (q,),
).fetchall()
res

[("Allison Steiner > Today Steiner is a professor in the Department of Climate and Space Sciences and Engineering. Steiner's research focuses on atmospheric chemistry, and specifically, exchanges between the biosphere and the atmosphere. One of the most prominent publications she worked on focused on furthering climate change research worldwide through the use of climate modeling, while other highly cited publications focus on air pollution's effect on regional air quality.",
  -1.530544533817006),
 ('Blossoms 666 > The first issue was released on January 23, 2019, making Blossoms 666 the first limited series to be published by the Archie Horror imprint. The second issue was released on March 6, the third issue on April 17, and the fourth issue on May 29. The fifth and final issue was released on July 17.',
  -1.5255910996964115),
 ('Box Office (album) > Archived from the original on 4 July 2018. Retrieved 12 April 2020. Box Office by Aja on iTunes, archived from the original on 2019-0