## Basic cord19 app - without embeddings

In [None]:
from vespa.package import Document, Field, Schema, FieldSet, RankProfile, ApplicationPackage

# Document type: an id attribute plus a BM25-enabled, indexed title.
document = Document(
    fields=[
        Field(
            name="cord_uid",
            type="string",
            indexing=["attribute", "summary"],
        ),
        Field(
            name="title",
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        ),
    ]
)

# Schema "cord19": one default fieldset over the title and a single
# rank profile whose first phase is nativeRank(title).
msmarco_schema = Schema(
    name="cord19",
    document=document,
    fieldsets=[FieldSet(name="default", fields=["title"])],
    rank_profiles=[
        RankProfile(name="default", first_phase="nativeRank(title)"),
    ],
)

# Application package wrapping the schema above.
app_package = ApplicationPackage(name="cord19", schema=msmarco_schema)

## Summary of what we need to have in place

### Tensor field

Include a field in the document schema:

In [None]:
# Dense float tensor field with a single indexed dimension x of size 768,
# stored as an attribute.
field title_bert type tensor<float>(x[768]) {
    indexing: attribute
}

The current API allows us to specify a tensor field like the one above with:

In [None]:
# Python equivalent of the schema field: a 768-element float tensor attribute.
Field(
    name="title_bert",
    type="tensor<float>(x[768])",
    indexing=["attribute"],
)

### Query profile

Include the query profile

In [None]:
<!-- Query profile "default", of type "root"; sets maxHits to 1000. -->
<query-profile id="default" type="root">
  <field name="maxHits">1000</field>
</query-profile>

Include query profile type

In [None]:
<!-- Query profile type "root": declares the query tensor as a float tensor
     with dimension x of size 768. The angle brackets of the tensor type are
     entity-escaped because they appear inside an XML attribute value. -->
<query-profile-type id="root">
  <field name="ranking.features.query(tensor_bert)" type="tensor&lt;float&gt;(x[768])" />
</query-profile-type>

**Proposed API**:

In [None]:
# Query profile type "root": declares the type of the query tensor.
# The tensor type is written with plain angle brackets here; the
# &lt;/&gt; escaping is only needed in the XML form shown above, not in
# the Python string (this matches the other Python cells in this notebook).
query_profile_type = QueryProfileType(
    id="root",
    fields=[
        QueryTypeField(
            name="ranking.features.query(tensor_bert)",
            type="tensor<float>(x[768])",
        )
    ],
)

# Query profile "default" of the type above, with maxHits set to 1000.
query_profile = QueryProfile(
    id="default",
    type=query_profile_type,
    fields=[QueryField(name="maxHits", value=1000)],
)

However, we can ship the "default" query profile and the "root" query profile type with every application package by default. This leads to a much simpler API, where the user only modifies them when needed:

In [None]:
# Register the query tensor's type on the application package.
app_package.add_query_type_field(
    name="ranking.features.query(tensor_bert)",
    type="tensor<float>(x[768])",
)

### Use embeddings on an expression

Use query and document vector in a rank-profile

In [None]:
# Rank profile that scores documents by the dot product between the query
# tensor and the document's title embedding.
rank-profile bert_title inherits default {
    first-phase {
        expression: sum(query(tensor_bert)*attribute(title_bert))
    }
}

The current API allows us to specify the rank-profile above with:

In [None]:
# Python equivalent of the rank-profile: inherits "default" and ranks by
# the dot product of the query tensor and the title embedding.
RankProfile(
    name="bert_title",
    inherits="default",
    first_phase="sum(query(tensor_bert)*attribute(title_bert))",
)

### Feed document tensors

Syntax to feed tensor values

In [None]:
"tensorfield": {
    "values": [ 2.0, 3.0, 5.0, 7.0 ]
}

### Query using embeddings

In [None]:
{
  "yql": ...,
  "ranking.features.query(tensor_bert)": "[0.013267785266013195, -0.021684982513878254, ..., -0.007751454443551412]",
  ...
}

### Generate embeddings to test the approach

In [9]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers model used to embed text.
# NOTE(review): presumably produces 768-dimensional vectors, matching the
# tensor<float>(x[768]) fields used in this notebook — confirm.
model = SentenceTransformer("bert-base-nli-mean-tokens")

def create_document_embedding(text, model, normalize=True):
    """Encode ``text`` with ``model`` and return the embedding as a list.

    Args:
        text: The string to embed.
        model: Any object whose ``encode`` method accepts a list of strings
            and returns one vector per string (e.g. a SentenceTransformer).
        normalize: When true, scale the vector to unit L2 norm. A vector
            whose norm is zero is returned unchanged.

    Returns:
        The embedding as a list of Python floats.
    """
    # Keep the vector as a float64 ndarray throughout so that every path
    # (normalized, not normalized, zero norm) can end with ``.tolist()``.
    # Converting to a list up front made the final ``.tolist()`` fail
    # whenever the division below was skipped.
    vector = np.asarray(model.encode([text])[0], dtype=np.float64)
    if normalize:
        norm = np.linalg.norm(vector)
        if norm > 0.0:
            vector = vector / norm
    return vector.tolist()

In [10]:
# Try the helper on a short sample sentence using the model loaded above.
embedding = create_document_embedding(text="this is a text", model = model)

## cord19 app with embeddings

In [None]:
from vespa.package import Document, Field, Schema, FieldSet, RankProfile, ApplicationPackage

# Document type: id, BM25-enabled title, and the title embedding stored as a
# 768-element float tensor attribute.
document = Document(
    fields=[
        Field(name="cord_uid", type="string", indexing=["attribute", "summary"]),
        Field(name="title", type="string", indexing=["index", "summary"], index="enable-bm25"),
        Field(name="title_bert", type="tensor<float>(x[768])", indexing=["attribute"]),
    ]
)

# Schema with the text-only "default" rank profile and "bert_title", which
# ranks by the dot product of the query tensor and the title embedding.
msmarco_schema = Schema(
    name="cord19",
    document=document,
    fieldsets=[FieldSet(name="default", fields=["title"])],
    rank_profiles=[
        RankProfile(name="default", first_phase="nativeRank(title)"),
        RankProfile(
            name="bert_title",
            inherits="default",
            first_phase="sum(query(tensor_bert)*attribute(title_bert))",
        ),
    ],
)

# ``query_profile`` is the object built in the "Proposed API" cell.
# The closing parenthesis was missing here, which made the cell a SyntaxError.
app_package = ApplicationPackage(
    name="cord19",
    schema=msmarco_schema,
    query_profile=query_profile,
)

# Declare the type of the query tensor on the package.
# NOTE(review): the earlier "simpler API" cell calls this method
# ``add_query_type_field`` — confirm which name the final API uses.
app_package.add_query_profile_type_field(
    name="ranking.features.query(tensor_bert)",
    type="tensor<float>(x[768])",
)