In [None]:
# IPython magic: turn off the Jedi-backed tab completer for this kernel session.
%config Completer.use_jedi = False

## Mock data

Mock data to speed up development. Replace this mock data with the actual parsed dataset.

In [1]:
from random import random, seed

# Fixed seed so the mock vectors are identical on every run.
seed(345)

# Three mock products built from (asin, title, description, price) rows.
# Each product gets a 4096-value "image_vector" and a 256-value
# "reduced_image_vector" filled with pseudo-random floats. Rows are processed
# in order and, within a row, the full vector is drawn before the reduced one.
product_data = [
    {
        "asin": asin,
        "title": title,
        "description": description,
        "price": price,
        "image_vector": {"values": [random() for _ in range(4096)]},
        "reduced_image_vector": {"values": [random() for _ in range(256)]},
    }
    for asin, title, description, price in (
        ("1", "watch", "to check the time", 120.35),
        ("2", "chair", "object to seat", 39.90),
        ("3", "table", "to eat dinner on", 52.85),
    )
]

## Install pyvespa

`pip3 install pyvespa`

## Create Vespa Application Package

Create an ApplicationPackage instance to hold all relevant info about our search application.

In [2]:
from vespa.package import ApplicationPackage, Field

# Application package named "product_search"; the cells below attach the schema
# fields, field set, rank profiles and query type field to this object.
app_package = ApplicationPackage(name = "product_search")

Add relevant fields to the application schema. Note that Vespa has native support for tensors.

In [3]:
from vespa.package import Field, HNSW

# HNSW settings attached to the reduced image vector field.
reduced_vector_ann = HNSW(
    distance_metric="euclidean",
    max_links_per_node=16,
    neighbors_to_explore_at_insert=200,
)

# Schema fields, listed in the order they are added. The text fields enable
# BM25; the two tensor sizes match the vector lengths in the mock data.
schema_fields = [
    Field(name="asin", type="string", indexing=["attribute", "summary"]),
    Field(name="title", type="string", indexing=["index", "summary"], index="enable-bm25"),
    Field(name="description", type="string", indexing=["index", "summary"], index="enable-bm25"),
    Field(name="price", type="float", indexing=["attribute", "summary"]),
    Field(name="image_vector", type="tensor<float>(x[4096])", indexing=["attribute"]),
    Field(
        name="reduced_image_vector",
        type="tensor<float>(x[256])",
        indexing=["attribute"],
        ann=reduced_vector_ann,
    ),
]

app_package.schema.add_fields(*schema_fields)

When performing vector search, we want to send a vector as a query, so we need to let Vespa know what to expect by adding a query type field.

In [None]:
from vespa.package import QueryTypeField

# Type declaration for the query-side tensor `query(reduced_image_vector)`;
# its size matches the document field of the same name.
query_vector_type = QueryTypeField(
    name="ranking.features.query(reduced_image_vector)",
    type="tensor<float>(x[256])",
)

app_package.query_profile_type.add_fields(query_vector_type)

Enable term-matching search over multiple fields (title and description) by default:

In [None]:
from vespa.package import FieldSet

# Group the two text fields under the "default" field set.
default_field_set = FieldSet(name="default", fields=["title", "description"])

app_package.schema.add_field_set(default_field_set)

Create a rank profile based on BM25 to rank documents based on term-matching criteria.

In [None]:
from vespa.package import RankProfile

# Rank profile "bm25": first-phase score is the sum of the BM25 scores of the
# title and description fields.
bm25_profile = RankProfile(name="bm25", first_phase="bm25(title) + bm25(description)")

app_package.schema.add_rank_profile(bm25_profile)

Create a rank profile based on the dot-product of the query and document `reduced_image_vector` to rank documents based on vector-based search.

In [None]:
# Rank profile "dot_product": first-phase score multiplies the query and
# document reduced image vectors element-wise and sums the result.
dot_product_expression = "sum(query(reduced_image_vector)*attribute(reduced_image_vector))"

app_package.schema.add_rank_profile(
    RankProfile(name="dot_product", first_phase=dot_product_expression)
)

## Deploy your application

In [None]:
import os

from vespa.package import VespaDocker

# Absolute path handed to `deploy` as `disk_folder`. It is derived from the
# current working directory instead of a user-specific home directory, so the
# notebook runs unchanged on other machines. Point it elsewhere if desired,
# but keep it absolute, as the original cell requested.
disk_folder = os.path.abspath("product_search")

vespa_docker = VespaDocker(port=8080)

# Deploy the application package built above; `app` is used by the feed and
# query cells that follow.
app = vespa_docker.deploy(
    application_package=app_package,
    disk_folder=disk_folder,
)

In [None]:
# Last expression in the cell, so the notebook displays the app's
# `deployment_message` attribute.
app.deployment_message

## Feed data to your application

In [None]:
# Feed every mock product to the "product_search" schema, one call per product,
# using the product's "asin" as the document id and the whole dict as fields.
for product in product_data:
    app.feed_data_point(
        schema="product_search",
        data_id=product["asin"],
        fields=product,
    )

## Search

`pyvespa` introduces the concept of a `QueryModel`, which allows us to combine different ways to match and rank documents under a unified framework. It enables us to perform term-based, vector-based and hybrid search through a convenient interface.

### term-based search

The following query model instructs Vespa to match any document that shares at least one term with the query (`OR` operator) and to rank the documents according to the `bm25` rank profile we defined in our application package.

In [None]:
from vespa.query import QueryModel, OR, RankProfile as Ranking

# Query model pairing the OR match operator with the "bm25" rank profile.
or_model = QueryModel(match_phase=OR(), rank_profile=Ranking(name="bm25"))

After the query model is defined, all that is left to do is to search:

In [None]:
# Run a text query through the OR/bm25 query model; the trailing expression
# makes the notebook display the hits of the result.
res = app.query(query="men's watch", query_model=or_model)
res.hits

If we are not happy with the `OR` operator, we can easily experiment with the stricter `AND` operator, which matches only documents that contain all the query terms, or the `WeakAnd` operator, which provides a trade-off between speed and the number of documents matched.

In [None]:
from vespa.query import AND, WeakAnd

# Query model pairing the AND match operator with the "bm25" rank profile.
and_model = QueryModel(match_phase=AND(), rank_profile=Ranking(name="bm25"))

# Query model using WeakAnd over the "default" field set, also ranked by "bm25".
weak_and_match = WeakAnd(field="default", hits=1000)
weak_and_model = QueryModel(
    match_phase=weak_and_match,
    rank_profile=Ranking(name="bm25"),
)

### vector-based search

We can also perform a vector-based search by using approximate nearest neighbor (`ANN` operator) between the query and document vectors that we defined in our application package and then rank by the dot-product of those vectors.

In [None]:
from vespa.query import ANN, QueryRankingFeature

# ANN match operator: document and query vectors are both named
# "reduced_image_vector".
nn_match = ANN(
    doc_vector="reduced_image_vector",
    query_vector="reduced_image_vector",
    hits=1000,
    label="nn",
)

# Match with ANN, then rank with the "dot_product" rank profile.
nn_model = QueryModel(match_phase=nn_match, rank_profile=Ranking(name="dot_product"))

Instead of sending a `query` string, we send the value of the query vector by specifying the `QueryRankingFeature` as a query property.

In [None]:
# Use the first mock product's reduced image vector as the query vector.
vector_to_search = product_data[0]["reduced_image_vector"]["values"]

# The vector is sent as the query ranking feature "reduced_image_vector"
# instead of a query string.
query_vector_feature = QueryRankingFeature(
    name="reduced_image_vector",
    value=vector_to_search,
)

res = app.query(query_properties=[query_vector_feature], query_model=nn_model)
res.hits

### hybrid search

Performing a hybrid search is as simple as specifying the `Union` of a term-based operator (e.g. `OR`) and a vector-based operator (e.g. `ANN`).

In [None]:
from vespa.query import Union

# Term-based and vector-based match operators combined below with Union.
term_match = OR()
vector_match = ANN(
    doc_vector="reduced_image_vector",
    query_vector="reduced_image_vector",
    hits=1000,
    label="nn",
)

# Hybrid query model: union of both match operators, ranked by "dot_product".
hybrid_model = QueryModel(
    match_phase=Union(term_match, vector_match),
    rank_profile=Ranking(name="dot_product"),
)

In [None]:
# Hybrid search: send both a query string and the query vector, evaluated with
# the hybrid query model. The trailing expression displays the hits.
hybrid_vector_feature = QueryRankingFeature(
    name="reduced_image_vector",
    value=vector_to_search,
)

res = app.query(
    query="men's watch",
    query_properties=[hybrid_vector_feature],
    query_model=hybrid_model,
)
res.hits