In [1]:
from similarity_search.model import EmbeddingModel
from elasticsearch import Elasticsearch
import eland as ed
import pandas as pd
import tqdm
from tqdm.auto import tqdm

# `tqdm` is the class imported from `tqdm.auto` on the previous line (that import
# rebinds the name first bound by `import tqdm`). Calling `.pandas()` on it enables
# the `progress_apply` method referenced later in this notebook.
tqdm.pandas()

In [4]:
# Connection target and input-data location used throughout the notebook.
# NOTE(review): 0.0.0.0 is normally a bind-all address; as a client destination it
# may not resolve on every platform — confirm, or point at localhost instead.
ES_URL = "http://0.0.0.0:9200"
PASSAGE_FILE = "./data/marco-passage-with-embedding-msmarco-MiniLM-L-12-v3.pkl"

# One client instance, reused by the later cells.
client = Elasticsearch(ES_URL)

# Probe for the "collections" index; the printed marker only signals that it exists.
collections_index_exists = client.indices.exists(index="collections")
if collections_index_exists:
    print("sd")

sd


In [3]:
# Gather the constructor arguments first so the configuration reads as one unit.
# The Elasticsearch-side id is the model name lower-cased with dashes replaced by
# underscores.
embedding_model_config = dict(
    url=ES_URL,
    hub_model_id="sentence-transformers/msmarco-MiniLM-L-12-v3",
    task_type="text_embedding",
    es_client=client,
    es_model_id="msmarco-MiniLM-L-12-v3".lower().replace("-", "_"),
)
model = EmbeddingModel(**embedding_model_config)

# Deploy only when the model reports that it does not exist yet.
model_already_exists = model.exists
if not model_already_exists:
    model.deploy()

In [4]:
# Load the pickled frame referenced by PASSAGE_FILE.
# NOTE(review): unpickling can execute arbitrary code — only load this file when it
# comes from a trusted source.
data = pd.read_pickle(PASSAGE_FILE)

# Sanity check: display the shape of the embedding stored under label 0.
first_embedding = data["text_embeddings"][0]
first_embedding.shape

(384,)

In [5]:
# Schema for the "collections" index: an integer id, the analysed passage text, and
# an indexed dense vector compared with cosine similarity. `dims` is 384, matching
# the embedding shape displayed after loading the data.
collections_mappings = {
    "mappings": {
        "properties": {
            "id": {"type": "integer"},
            "text": {
                "type": "text",
            },
            "text_embeddings": {
                "index": True,
                "type": "dense_vector",
                "dims": 384,
                "similarity": "cosine",
            },
        }
    }
}

# Create the index only when it is missing, so re-running this cell does not fail
# on an already-existing index.
if not client.indices.exists(index="collections"):
    # Use the typed `mappings` keyword rather than the deprecated catch-all `body`
    # parameter; this matches the keyword style used for `document=` when indexing.
    client.indices.create(
        index="collections",
        mappings=collections_mappings["mappings"],
    )

In [6]:
def index_df(row, es_client, index="collections"):
    """Index a single row as one document.

    Args:
        row: Mapping-like row (for example what ``DataFrame.apply(..., axis=1)``
            passes in) providing the ``id``, ``text`` and ``text_embeddings``
            entries.
        es_client: Object exposing ``index(index=..., document=...)``.
        index: Name of the target index. Defaults to ``"collections"``, the
            value that was previously hard-coded.

    Returns:
        None. The client's response is discarded.
    """
    # Copy only the three mapped fields; any other entries in the row are ignored.
    doc = {field: row[field] for field in ("id", "text", "text_embeddings")}
    # NOTE(review): this sends one request per row; for large frames consider the
    # client's bulk helpers instead.
    es_client.index(index=index, document=doc)


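# Uncomment to index every row of `data`; this calls `index_df` once per row,
# i.e. one index request per document:
# data.progress_apply(lambda x: index_df(x, client), axis=1)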

In [7]:
from similarity_search.index import Index


# Handle on the "collections" index, built from the shared client and the embedding model.
# NOTE(review): the meaning of the positional `10` is not visible in this notebook —
# confirm against the signature of `similarity_search.index.Index`.
_collections = Index("collections", client, 10, model)
# Display the handle's `fields` attribute.
_collections.fields

['id', 'text', 'text_embeddings']

In [8]:
from similarity_search.utils import get_df


# Full-text query for "hydrogen" against the "text" field, requesting only the
# "id" and "text" fields in the response.
returned_fields = ["id", "text"]
resp = _collections.full_text_search("text", "hydrogen", fields=returned_fields)

# Render the response through the project's helper.
get_df(resp)

[{'score': 10.957546,
  'source': {'id': 3905806,
   'text': 'hydrogen price per ton hydrogen price in industrial pure hydrogen price hot-sales liquid hydrogen price hot-selling liquid hydrogen price industrial hydrogen price methyl hydrogen price ethylene hydrogen price liquid hydrogen price 99 liquid hydrogen chloride liquid hydrogen fluoride buy liquid hydrogen tank liquid hydrogen uses liquid hydrogen More ...'}},
 {'score': 10.290877,
  'source': {'id': 1721142,
   'text': 'Buy Compressed Hydrogen Gas or Liquid Hydrogen (H2) Buy Compressed Hydrogen Gas or Liquid Hydrogen (H2) We offer compressed hydrogen gas and liquid hydrogen (H2) in a variety of purities and concentrations. See the chart below and download the spec sheets and safety data sheets for more information on buying liquid hydrogen and hydrogen gas from Praxair.'}},
 {'score': 10.084263,
  'source': {'id': 8596871,
   'text': 'Hydrogen safety covers the safe production, handling and use of hydrogen-particularly hydroge

In [9]:
# kNN query for "hydrogen" against the "text_embeddings" field, requesting only the
# "id" and "text" fields in the response.
returned_fields = ["id", "text"]
resp = _collections.knn_search("text_embeddings", "hydrogen", fields=returned_fields)

# Show the raw response.
resp

  resp = self.es_client.knn_search(index=self.name, knn=query, source=fields)


[{'score': 0.7867255,
  'source': {'id': 128984,
   'text': 'Hydrogen gas has the molecular formula H 2. At room temperature and under standard pressure conditions, hydrogen is a gas that is tasteless, odorless and colorless. Hydrogen can exist as a liquid under high pressure and an extremely low temperature of 20.28 kelvin (−252.87°C, −423.17 °F). Hydrogen is often stored in this way as liquid hydrogen takes up less space than hydrogen in its normal gas form. Liquid hydrogen is also used as a rocket fuel.'}},
 {'score': 0.78181136,
  'source': {'id': 4509079,
   'text': 'Wiktionary (0.00 / 0 votes) Rate this definition: hydrogen (Noun). The lightest chemical element (symbol H) with an atomic number of 1 and atomic weight of 1.00794. hydrogen (Noun). Molecular hydrogen (H), a colourless, odourless and flammable gas at room temperature.'}},
 {'score': 0.77747226,
  'source': {'id': 4509080,
   'text': 'Wiktionary (0.00 / 0 votes) Rate this definition: hydrogen (Noun). The lightest chemi