In [None]:
# Copyright (C) 2019-2023 vdaas.org vald team <vald@vdaas.org>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# You may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#	https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Let's use the OpenAI Embeddings API and the vector search engine Vald to search for similar sentences!

## Data Preparation
We will use the [AG News](https://huggingface.co/datasets/ag_news) test data this time.

In [None]:
!pip install datasets pandas ipywidgets

In [None]:
from datasets import load_dataset

dataset = load_dataset("ag_news", split="test")

In [None]:
len(dataset["text"])

In [None]:
import pandas as pd

df = pd.DataFrame(dataset["text"], columns=["text"])

## Vectorization of text
There are several ways to vectorize sentences. This notebook shows two of them: the paid OpenAI Embeddings API and the free sentence-transformers library.

### When using the OpenAI Embeddings API
Please create an OpenAI account, issue an API key [here](https://platform.openai.com/api-keys), and replace sk-XXX on the right-hand side of the following line with your key. Do not wrap the key in double quotes.

In [None]:
%env OPENAI_API_KEY=sk-XXX

In [None]:
!pip install openai

In [None]:
import os
import time
import openai

# Read the API key from the OPENAI_API_KEY environment variable (set by the
# %env cell); a KeyError here means the variable is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]
# Client object used by get_embedding to call the Embeddings API.
client = openai.OpenAI()


def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding for a single piece of text.

    Args:
        text: Input string; newlines are replaced by spaces before the request.
        model: Name of the OpenAI embedding model to call.

    Returns:
        The ``embedding`` field of the first item in the API response.
    """
    # Pause before every request (presumably to stay under the API rate limit).
    time.sleep(0.2)
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

In [None]:
model = "text-embedding-ada-002"
len(get_embedding("This is a test text.", model))

In [None]:
from tqdm import tqdm

tqdm.pandas()

df["text_embedding"] = df["text"].progress_apply(lambda x: get_embedding(x, model))

In [None]:
# The embedding is processed and saved so that it can be restored.
w_df = df.copy()
w_df["text_embedding"] = w_df["text_embedding"].apply(list)
w_df.to_csv("./text-embedding-openai.csv", index=False)

### When using sentence-transformers
This example uses a multilingual model.

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# When using CPU
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

# When using GPU
# model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", device="cuda")

In [None]:
def get_embedding(text, model):
    """Encode ``text`` with ``model`` and return whatever ``model.encode`` yields.

    Args:
        text: Sentence to vectorize.
        model: Object exposing an ``encode(text)`` method.
    """
    embedding = model.encode(text)
    return embedding

In [None]:
len(get_embedding("This is a test text.", model))

In [None]:
from tqdm import tqdm

tqdm.pandas()

df["text_embedding"] = df["text"].progress_apply(lambda x: get_embedding(x, model))

In [None]:
# Save the embeddings so that they can be restored later from the CSV.
# Each vector is written as a list of plain Python floats. If the embeddings
# are NumPy arrays, list(array) would keep NumPy scalar elements, which
# NumPy >= 2.0 renders as "np.float32(...)" in the CSV text, so the stored
# value could not be parsed back as a list of numbers.
w_df = df.copy()
w_df["text_embedding"] = w_df["text_embedding"].progress_apply(
    lambda vec: [float(x) for x in vec]
)
w_df.to_csv("./text-embedding-st.csv", index=False)

## Preparation of the Vald cluster
Please refer to [Get Started](https://vald.vdaas.org/docs/tutorial/get-started/) here to build a Vald cluster.

Set agent.ngt.dimension in values.yaml to the number of dimensions of the vectors you actually want to insert (1536 if you use OpenAI Embeddings API, 768 if you use sentence-transformers). And set agent.ngt.distance_type to l2.

In [None]:
!pip install vald-client-python

In [None]:
import pandas as pd

df = pd.read_csv("./text-embedding-openai.csv")
# df = pd.read_csv("./text-embedding-st.csv")

In [None]:
import numpy as np
from tqdm import tqdm

tqdm.pandas()

import ast

# The CSV stores each embedding as the text of a Python list. Parse it with
# ast.literal_eval, which only accepts literals, instead of eval, which would
# execute any code found in the file. Each parsed list is then turned into a
# NumPy array.
df["text_embedding"] = (
    df["text_embedding"].progress_apply(ast.literal_eval).progress_apply(np.array)
)

In [None]:
import grpc
import numpy as np
from vald.v1.payload import payload_pb2
from vald.v1.vald import search_pb2_grpc, upsert_pb2_grpc

In [None]:
## Host name to connect to (Host:Port)
host = "localhost:80"

dimension = 1536  # When using OpenAI Embeddings API
# dimension = 768  # When using sentence-transformers

In [None]:
channel = grpc.insecure_channel(host)

Let's check that we can insert a vector.

In [None]:
usstub = upsert_pb2_grpc.UpsertStub(channel)

In [None]:
vec = payload_pb2.Object.Vector(id="0", vector=df["text_embedding"][0])
uscfg = payload_pb2.Upsert.Config(skip_strict_exist_check=True)
usstub.Upsert(payload_pb2.Upsert.Request(vector=vec, config=uscfg))

Next, check that the inserted vector can be searched.

After inserting the data, wait a few minutes because the search results will not be reflected until the index creation is finished.

In [None]:
sstub = search_pb2_grpc.SearchStub(channel)

In [None]:
svec = np.array([0.01] * dimension, dtype="float32")  # Test vector for query
scfg = payload_pb2.Search.Config(num=10, radius=-1.0, epsilon=0.01, timeout=3000000000)
sstub.Search(payload_pb2.Search.Request(vector=svec, config=scfg))

## Insert all text into Vald
Even after the insertion is complete, the search results will not be reflected until the index creation is finished, so please wait a few more minutes before searching.

In [None]:
from tqdm import tqdm

# The upsert config is the same for every row, so build it once before the loop.
uscfg = payload_pb2.Upsert.Config(skip_strict_exist_check=True)

# Upsert the rows one request at a time, using each row's index label (as a
# string) as the vector id.
for row in tqdm(df.itertuples(), total=len(df)):
    vec = payload_pb2.Object.Vector(id=str(row.Index), vector=row.text_embedding)
    usstub.Upsert(payload_pb2.Upsert.Request(vector=vec, config=uscfg))

## Search for text similar to any query

In [None]:
def get_search_response(text, model, k):
    """Embed ``text`` and run a Vald search for it.

    Args:
        text: Query sentence.
        model: Model handed through to ``get_embedding``.
        k: Number of results to request (``num`` in the search config).

    Returns:
        The response object returned by ``sstub.Search``.
    """
    query_vector = get_embedding(text, model)
    search_config = payload_pb2.Search.Config(
        num=k,
        radius=-1.0,
        epsilon=0.01,
        timeout=3000000000,
    )
    request = payload_pb2.Search.Request(vector=query_vector, config=search_config)
    return sstub.Search(request)

In [None]:
def display_results_top_k(text, model, k):
    """Print the text and distance of each of the top-``k`` search results.

    Result ids are converted to ``int`` and used to look up the matching row in
    the notebook-level ``df["text"]``.

    Args:
        text: Query sentence.
        model: Model handed through to ``get_search_response``.
        k: Number of results to request.
    """
    for hit in get_search_response(text, model, k=k).results:
        matched_text = df["text"][int(hit.id)]
        matched_distance = hit.distance
        print(f"text: {matched_text}, distance: {matched_distance}")
        print()

In [None]:
text = "Automatic identification of difficult sentences."
display_results_top_k(text, model, k=3)

# When you do not use a vector search engine
Vector distances can also be calculated on the application side. However, with Vald, searches remain fast even as the amount of data grows.

Since Vald performs approximate nearest neighbor search, you may be concerned about its accuracy. Let's compare the accuracy and speed of Vald with those of an exact calculation implemented with numpy.

The trade-off between accuracy and speed can be adjusted with the parameters of Vald, and in this case, agent.ngt.creation_edge_size=20 and agent.ngt.search_edge_size=40 are set.

## Comparison of accuracy

In [None]:
def display_top_k_with_numpy(text, df, k):
    """Print the ``k`` rows of ``df`` closest to ``text`` by exact L2 distance.

    The query is embedded with ``get_embedding`` using the notebook-level
    ``model`` variable (it is not a parameter of this function).

    Args:
        text: Query sentence.
        df: DataFrame with a "text" column and a "text_embedding" column that
            holds one vector per row.
        k: Number of nearest rows to print.
    """
    insert_features = np.array(list(df["text_embedding"].values))
    query_feature = get_embedding(text=text, model=model)
    distances = np.linalg.norm(
        query_feature - insert_features, axis=1
    )  # Equivalent to distance_type=L2
    # argsort yields row positions, so rows are looked up by position (.iloc)
    # rather than by index label.
    nearest_positions = np.argsort(distances)[:k]
    texts = df["text"]

    for pos in nearest_positions:
        # Bind the values to locals first: reusing double quotes inside a
        # double-quoted f-string is a SyntaxError before Python 3.12.
        row_text = texts.iloc[pos]
        row_distance = distances[pos]
        print(f"text: {row_text}, distance: {row_distance}")
        print()

In [None]:
text = "Automatic identification of difficult sentences."
display_top_k_with_numpy(text, df, k=3)

## Comparison of search speed
Let's compare the speed of using Vald and calculating vector distances on the application side, changing the amount of data.

### Data Preparation

We wanted about 1M rows of data after removing duplicates, so we used the following data set of [wikipedia sentences](https://huggingface.co/datasets/wikitext).

Since it uses a large amount of memory and takes a long time, we recommend that you first try with a smaller subset of the data, for example by slicing it.

In [None]:
from datasets import load_dataset

dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")

In [None]:
df = pd.DataFrame(data=dataset["text"], columns=["text"])

In [None]:
len(df)

In [None]:
# Remove duplicates
df = df.drop_duplicates(subset="text", keep="first", ignore_index=True)

In [None]:
len(df)

In [None]:
df.to_csv("./wikitext-uniq.csv", index=False)

### Vectorization of text
For speed, use sentence-transformers instead of the OpenAI Embeddings API for vectorization.

In [None]:
import pandas as pd

df = pd.read_csv("./wikitext-uniq.csv")

In [None]:
from sentence_transformers import SentenceTransformer

# When using CPU
# model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

# When using GPU
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", device="cuda")

In [None]:
def get_embedding(text, model):
    """Vectorize ``text`` by delegating to ``model.encode``.

    Args:
        text: Sentence to vectorize.
        model: Object exposing an ``encode(text)`` method.

    Returns:
        The value returned by ``model.encode(text)``.
    """
    vector = model.encode(text)
    return vector

In [None]:
from tqdm import tqdm

tqdm.pandas()

df["text_embedding"] = df["text"].progress_apply(lambda x: get_embedding(x, model))

In [None]:
# Save the embeddings so that they can be restored later from the CSV.
# Each vector is written as a list of plain Python floats. If the embeddings
# are NumPy arrays, list(array) would keep NumPy scalar elements, which
# NumPy >= 2.0 renders as "np.float32(...)" in the CSV text, so the stored
# value could not be parsed back as a list of numbers.
w_df = df.copy()
w_df["text_embedding"] = w_df["text_embedding"].apply(
    lambda vec: [float(x) for x in vec]
)
w_df.to_csv("./wikitext-uniq-with-text-embedding.csv", index=False)

### Query Speed Comparison

In [None]:
import pandas as pd

df = pd.read_csv("./wikitext-uniq-with-text-embedding.csv")

In [None]:
import numpy as np
from tqdm import tqdm

tqdm.pandas()

import ast

# The CSV stores each embedding as the text of a Python list. Parse it with
# ast.literal_eval, which only accepts literals, instead of eval, which would
# execute any code found in the file. Each parsed list is then turned into a
# NumPy array.
df["text_embedding"] = (
    df["text_embedding"].progress_apply(ast.literal_eval).progress_apply(np.array)
)

#### When you do not use a vector search engine

In [None]:
def get_insert_features(df):
    """Collect the "text_embedding" column of ``df`` into one NumPy array.

    Args:
        df: DataFrame whose "text_embedding" column holds one vector per row.

    Returns:
        ``np.array`` built from the list of per-row vectors.
    """
    embedding_rows = list(df["text_embedding"].values)
    return np.array(embedding_rows)

In [None]:
def get_indexes_top_k_with_numpy(insert_features, query_feature, k):
    """Return the row positions of the ``k`` vectors nearest to the query.

    Args:
        insert_features: 2-D array with one candidate vector per row.
        query_feature: Query vector, subtracted from every row.
        k: Number of positions to return.

    Returns:
        Array of the first ``k`` positions when rows are sorted by ascending
        L2 distance (equivalent to distance_type=L2).
    """
    differences = query_feature - insert_features
    l2_distances = np.linalg.norm(differences, axis=1)
    ranked_positions = np.argsort(l2_distances)
    return ranked_positions[:k]

In [None]:
text = "Where are the idyllic areas?"
query_feature = get_embedding(text=text, model=model)

##### 10,000 data

In [None]:
insert_features = get_insert_features(df[:10000])

In [None]:
%%time
indexes = get_indexes_top_k_with_numpy(insert_features, query_feature, k=3)

##### 100,000 data

In [None]:
insert_features = get_insert_features(df[:100000])

In [None]:
%%time
indexes = get_indexes_top_k_with_numpy(insert_features, query_feature, k=3)

##### 970,000 data

In [None]:
insert_features = get_insert_features(df)

In [None]:
%%time
indexes = get_indexes_top_k_with_numpy(insert_features, query_feature, k=3)

#### When you use the vector search engine Vald
The first request is slow because it also establishes the connection, so measure a second time to get a value closer to the actual search speed.

In [None]:
import grpc
from vald.v1.payload import payload_pb2
from vald.v1.vald import search_pb2_grpc, upsert_pb2_grpc

In [None]:
## Host name to connect to (Host:Port)
host = "localhost:80"

dimension = 768

In [None]:
channel = grpc.insecure_channel(host)

In [None]:
usstub = upsert_pb2_grpc.UpsertStub(channel)
sstub = search_pb2_grpc.SearchStub(channel)

In [None]:
from tqdm import tqdm

uscfg = payload_pb2.Upsert.Config(skip_strict_exist_check=True)


def multi_upsert(df, chunk_size=200):
    """Upsert the rows of ``df`` into Vald in batches of ``chunk_size``.

    Each row becomes one upsert request whose id is the row's index label (as a
    string) and whose vector is the row's ``text_embedding``. Every batch is
    sent with a single ``MultiUpsert`` call. Uses the notebook-level ``usstub``
    and ``uscfg``.

    Args:
        df: DataFrame with a "text_embedding" column.
        chunk_size: Number of rows sent per ``MultiUpsert`` call.
    """
    chunk_starts = range(0, len(df), chunk_size)
    for start in tqdm(chunk_starts):
        chunk = df[start : start + chunk_size]
        batch = []
        for row in chunk.itertuples():
            vector = payload_pb2.Object.Vector(
                id=str(row.Index), vector=row.text_embedding
            )
            batch.append(payload_pb2.Upsert.Request(vector=vector, config=uscfg))
        usstub.MultiUpsert(payload_pb2.Upsert.MultiRequest(requests=batch))

In [None]:
def get_indexes_top_k(vec, k):
    """Search Vald with ``vec`` and return the ids of the results as integers.

    Args:
        vec: Query vector sent in the search request.
        k: Number of results to request (``num`` in the search config).

    Returns:
        List with ``int(result.id)`` for every result in the response, in the
        order the response lists them.
    """
    search_config = payload_pb2.Search.Config(
        num=k,
        radius=-1.0,
        epsilon=0.01,
        timeout=3000000000,
    )
    request = payload_pb2.Search.Request(vector=vec, config=search_config)
    hits = sstub.Search(request).results
    found_ids = []
    for hit in hits:
        found_ids.append(int(hit.id))
    return found_ids

In [None]:
# For initial communication
text = "This is a test text."
query_feature = get_embedding(text=text, model=model)
multi_upsert(df[:10])

In [None]:
indexes = get_indexes_top_k(query_feature, k=3)

##### 10,000 data

In [None]:
multi_upsert(df[:10000])

In [None]:
text = "Where are the idyllic areas?"
query_feature = get_embedding(text=text, model=model)

In [None]:
%%time
indexes = get_indexes_top_k(query_feature, k=3)

##### 100,000 data

In [None]:
multi_upsert(df[10000:100000])

In [None]:
%%time
indexes = get_indexes_top_k(query_feature, k=3)

##### 970,000 data

In [None]:
multi_upsert(df[100000:])

In [None]:
%%time
indexes = get_indexes_top_k(query_feature, k=3)

The search speed depends on the vector distribution and settings, but in LY Corporation's in-house environment, the 99th-percentile search latency is less than 200 ms even when the number of vectors exceeds 10 million.

## Acknowledgments
The wikitext data set was used without modification under the following license.

https://creativecommons.org/licenses/by-sa/4.0/deed.en

We would like to thank Wikipedia and the creator of the data set for making the data available.