In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

In [13]:
from sentence_transformers import SentenceTransformer

# 1. Preprocess wiki text data

In [2]:
# Read the raw wiki dump as a list of lines (each line keeps its trailing "\n").
# The encoding is pinned so decoding does not depend on the platform's default
# locale encoding. NOTE(review): assumes the dump is UTF-8 -- confirm.
with open("data/wiki/parsed.xml", encoding="utf-8") as f:
    all_data = f.readlines()

In [3]:
# Number of raw lines read from the dump.
len(all_data)

1026423

In [4]:
# Parse the raw dump into {article title: [text lines]}.
#
# Layout handled by this loop:
#   * a line equal to "==========" (after stripping) separates articles;
#   * after a separator, the first line that has "<title>" within its first
#     150 characters and also contains "</title><text>" opens a new article;
#     the text following that marker becomes the article's first entry;
#   * every other non-blank line is appended, stripped, to the current article.
#
# Lines that appear before any article has been opened are skipped instead of
# raising KeyError, and a "<title>" line without the "</title><text>" marker is
# treated as ordinary text instead of raising IndexError.
TITLE_TEXT_SEP = "</title><text>"

data_dict = dict()
curr_title = None  # None until the first article header has been seen
search_title = True
for line in all_data:
    stripped = line.strip()
    if stripped == "==========":
        search_title = True
    elif search_title and "<title>" in line[:150] and TITLE_TEXT_SEP in line:
        # The title is the text between "<title>" and the next title tag.
        curr_title = line.replace("</title>", "<title>").split("<title>")[1]
        # NOTE: a repeated title replaces the lines stored for the earlier one.
        data_dict[curr_title] = [line.split(TITLE_TEXT_SEP)[1].strip()]
        search_title = False
    elif stripped != "" and curr_title is not None:
        data_dict[curr_title].append(stripped)

In [5]:
# Titles of every parsed article, in the order they were inserted into data_dict.
article_names = [*data_dict]
len(article_names)

21217

In [6]:
# Fold short "heading" lines into the paragraph that follows them.
#
# For each article, a line shorter than HEADING_MAX_LEN characters is treated
# as a heading: it is appended to the comma-separated `header` string and
# buffered (followed by ". ") until the next long line, which it is then
# prefixed to. Headings with no long line after them stay in `header` but do
# not appear in `contents`.
#
# Each data_dict value is replaced by {"header": str, "contents": list[str]}.
# Entries that already have that shape are skipped, so re-running this cell is
# a no-op instead of re-processing (and corrupting) the converted entries.
HEADING_MAX_LEN = 50

for title, contents in data_dict.items():
    if isinstance(contents, dict):
        # Already converted by a previous run of this cell.
        continue
    headers = title
    pending_headings = ""
    new_contents = []
    for line in contents:
        if len(line) < HEADING_MAX_LEN:
            pending_headings += line + ". "
            headers += ", " + line
        else:
            # pending_headings is "" when no heading preceded this paragraph.
            new_contents.append(pending_headings + line)
            pending_headings = ""
    # Re-assigning an existing key does not resize the dict, so this is safe
    # while iterating over data_dict.items().
    data_dict[title] = {
        "header": headers,
        "contents": new_contents,
    }
    

In [7]:
# One row per article: its title plus the comma-joined heading string built above.
df_wiki = pd.DataFrame({"title": [*data_dict]})
df_wiki["header"] = df_wiki["title"].map(lambda name: data_dict[name]["header"])
df_wiki.head()

Unnamed: 0,title,header
0,Anarchism,"Anarchism, Etymology, terminology, and definit..."
1,Albedo,"Albedo, Terrestrial albedo, White-sky, black-s..."
2,A,"A, History, Typographic variants, Use in writi..."
3,Alabama,"Alabama, Etymology, History, Pre-European sett..."
4,Achilles,"Achilles, Etymology, Description, Birth and ea..."


In [8]:
# Attach each article's list of merged paragraphs, looked up by title.
df_wiki["contents"] = df_wiki["title"].map(lambda name: data_dict[name]["contents"])

In [9]:
# Preview the article table now that the `contents` column has been added.
df_wiki.head()

Unnamed: 0,title,header,contents
0,Anarchism,"Anarchism, Etymology, terminology, and definit...",[Anarchism is a political philosophy and movem...
1,Albedo,"Albedo, Terrestrial albedo, White-sky, black-s...",[Albedo (; ) is the measure of the diffuse ref...
2,A,"A, History, Typographic variants, Use in writi...","[A, or a, is the first letter and the first vo..."
3,Alabama,"Alabama, Etymology, History, Pre-European sett...",[Alabama is a state in the Southeastern regio...
4,Achilles,"Achilles, Etymology, Description, Birth and ea...","[In Greek mythology, Achilles ( ) or Achilleus..."


In [10]:
# Flatten the articles into one [title, paragraph] record per paragraph.
df_para_ = [
    [title, paragraph]
    for title, vals in data_dict.items()
    for paragraph in vals["contents"]
]
df_wiki_paragraph = pd.DataFrame(df_para_, columns=["title", "paragraph"])

In [11]:
# Display the paragraph-level table (one row per title/paragraph pair).
df_wiki_paragraph

Unnamed: 0,title,paragraph
0,Anarchism,Anarchism is a political philosophy and moveme...
1,Anarchism,"Etymology, terminology, and definition. Main a..."
2,Anarchism,History. Main article: History of anarchism. P...
3,Anarchism,"Modern era. During the French Revolution, part..."
4,Anarchism,"Post-war era. By the end of World War II, the ..."
...,...,...
340699,Horn,"Places. Cape Horn, the southernmost point of S..."
340700,Horn,"Music. Horn (album), an album by Pharaoh Overl..."
340701,Horn,"Slang. Telephone, also known as ""the horn"" Two..."
340702,Horn,"Other uses. Horn (Chinese constellation), part..."


# 2. Sentence Transformer

In [40]:
# Scratch calculation with hard-coded operands.
# NOTE(review): presumably "paragraphs per 1,000 articles" (about 340704
# paragraph rows over roughly 21000 articles) -- confirm the intent.
340704 / 21000 * 1000

16224.0

In [14]:
# Load the sentence-embedding model used to encode both the titles and the query.
smodel = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [15]:
# Embed every article title. The search cell uses positions into this array to
# pick rows of df_wiki, so it must stay aligned with df_wiki's row order.
title_enc = smodel.encode(df_wiki["title"].values)

In [48]:
# Free-text description of the topic to retrieve articles for.
query = """Ocean ecosystem and aquatic animals, includes whales, dolphins, and porpoises that have fully aquatic lifestyle"""
# Number of best-matching titles to keep.
k = 250

In [49]:
# Rank all article titles against the query and keep the k best matches.
query_emb = smodel.encode(query)
# Dot product of every title embedding with the query embedding.
# NOTE(review): this equals cosine similarity only if the model returns
# unit-length embeddings -- confirm for the chosen model.
scores = title_enc @ query_emb
arg_sort = np.argsort(scores)[::-1]  # positions into title_enc, best score first
top_idx = arg_sort[:k]
# Index with the k selected positions directly instead of reordering the whole
# score array and slicing afterwards; the values are the same.
top_score = scores[top_idx]
# top_idx holds positions, not index labels, so select rows with .iloc. Using
# .loc here is only correct while df_wiki keeps its default RangeIndex.
tb_desc_tmp = df_wiki.iloc[top_idx].copy()
tb_desc_tmp["score"] = top_score
tb_desc_tmp

Unnamed: 0,title,header,contents,score
9838,Marine biology,"Marine biology, Biological oceanography, Marin...",[Marine biology is the scientific study of the...,0.544665
10350,List of marine aquarium fish species,"List of marine aquarium fish species, Angelfis...",[The following list of marine aquarium fish sp...,0.512990
5649,List of freshwater aquarium invertebrate species,List of freshwater aquarium invertebrate speci...,"[This is a list of invertebrates, animals with...",0.476913
4262,Dolphin,"Dolphin, Etymology, Hybridization, Evolution, ...",[A dolphin is an aquatic mammal within the inf...,0.469884
16643,Whale,"Whale, Etymology and definitions, Taxonomy and...",[Whales are a widely distributed and diverse g...,0.469153
...,...,...,...,...
10301,Strategic sealift ships,"Strategic sealift ships, Ships, Sergeant Matej...",[Strategic sealift ships are part of the Unite...,0.176647
13454,Sahara desert (ecoregion),"Sahara desert (ecoregion), Setting, Climate, H...","[The Sahara desert, as defined by the World Wi...",0.176530
4337,Elephant,"Elephant, Etymology, Taxonomy, Evolution and e...",[Elephants are the largest existing land anima...,0.176375
9017,Life,"Life, Definitions, Biology, Alternative defini...",[Life is a quality that distinguishes matter t...,0.176247


In [53]:
# Persist the top-k matching articles (title, header, contents, score).
# Make sure the output directory exists first so the write does not fail on a
# fresh checkout where data/ocean_processed has not been created yet.
Path("data/ocean_processed").mkdir(parents=True, exist_ok=True)
tb_desc_tmp.to_parquet("data/ocean_processed/titles.parquet")

In [56]:
# Keep only the paragraphs whose article title is among the selected top-k titles.
ocean_paragraphs = df_wiki_paragraph.loc[
    df_wiki_paragraph["title"].isin(tb_desc_tmp["title"])
]
ocean_paragraphs

Unnamed: 0,title,paragraph
285,Animalia (book),Animalia is an illustrated children's book by ...
286,Animalia (book),Synopsis. Animalia is an alliterative alphabet...
287,Animalia (book),Related products. Julia MacRae Books published...
288,Animalia (book),Adaptations. A television series was also crea...
289,Animalia (book),Awards. Animalia won the Young Australian's Be...
...,...,...
337970,Gill,Amphibians. Tadpoles of amphibians have from t...
337971,Gill,"Invertebrates. Crustaceans, molluscs, and some..."
337972,Gill,Plastrons. A plastron is a type of structural ...
337973,Gill,See also. Aquatic respiration Artificial gills...


In [57]:
# Persist the paragraphs belonging to the selected articles.
# Make sure the output directory exists first so the write does not fail on a
# fresh checkout where data/ocean_processed has not been created yet.
Path("data/ocean_processed").mkdir(parents=True, exist_ok=True)
ocean_paragraphs.to_parquet("data/ocean_processed/paragraphs.parquet")

In [58]:
# Alias, not a copy: ocean_titles and tb_desc_tmp refer to the same DataFrame.
ocean_titles = tb_desc_tmp

# Focus on the ocean ecosystem and cetacean animals

In [61]:
from semantic_search_engine import SemanticSearchEngine