In [1]:
# Load the autoreload extension and enable mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Turn off jedi for the IPython tab completer.
%config Completer.use_jedi = False

## Basic app based on BM25

In [3]:
from vespa.package import ApplicationPackage, Field, FieldSet, RankProfile

app_package = ApplicationPackage(name="cord19")

# Document fields: `cord_uid` is kept as an attribute, `title` is indexed
# with BM25 enabled; both are returned in the summary.
cord_uid_field = Field(
    name="cord_uid",
    type="string",
    indexing=["attribute", "summary"],
)
title_field = Field(
    name="title",
    type="string",
    indexing=["index", "summary"],
    index="enable-bm25",
)
app_package.schema.add_fields(cord_uid_field, title_field)

# The "default" field set contains only the title field.
default_field_set = FieldSet(name="default", fields=["title"])
app_package.schema.add_field_set(default_field_set)

# Rank profile whose first phase is the BM25 score of the title.
bm25_rank_profile = RankProfile(name="bm25", first_phase="bm25(title)")
app_package.schema.add_rank_profile(bm25_rank_profile)

## Deploy the application

Define the absolute disk path to store the application files.

In [4]:
import os

# Root folder under which the application files are stored.
# An already-set WORK_DIR environment variable is respected; otherwise the
# current user's home directory is used instead of a machine-specific path.
os.environ.setdefault("WORK_DIR", os.path.expanduser("~"))

# Always hand an absolute path to the deployment step, even if WORK_DIR was
# supplied as a relative path.
disk_folder = os.path.abspath(
    os.path.join(os.environ["WORK_DIR"], "sample_application")
)

Deploy to a Docker container.

In [5]:
from vespa.package import VespaDocker

# Deploy the application package through VespaDocker (configured with
# port 8089), using `disk_folder` as the location for the application files.
vespa_docker = VespaDocker(port=8089)
app = vespa_docker.deploy(application_package=app_package, disk_folder=disk_folder)

Waiting for configuration server.
Waiting for configuration server.
Waiting for configuration server.
Waiting for configuration server.
Waiting for application status.
Waiting for application status.


Steps required to deploy BERT:

* Add the `onnx-model` to the schema definition (`.sd`) file.
* Create a `models` folder at the same level as the `schemas` folder.
* Create a rank profile that defines the inputs and uses the model to rank.
* Add the input tensor fields related to the documents.
* Add a query profile with the relevant tensors.

## Deploy application from disk

In [None]:
# Drop the container reference held by `vespa_docker`.
# NOTE(review): presumably done so the next deploy call does not reuse the
# previous container object — confirm against VespaDocker's behaviour.
vespa_docker.container = None

In [None]:
# Display the container reference currently stored on `vespa_docker`.
vespa_docker.container

In [None]:
import os

# Resolve the application folder from the notebook's working directory, the
# same convention the "cord19/..." data files in this notebook rely on,
# instead of hard-coding one user's absolute path.
# NOTE(review): assumes the notebook is run from the folder that contains
# `cord19/sample_application` — confirm.
sample_application_folder = os.path.abspath(
    os.path.join("cord19", "sample_application")
)

# Deploy the application files found on disk, with 10G of container memory.
app = vespa_docker.deploy_from_disk(
    application_name="cord19",
    disk_folder=sample_application_folder,
    container_memory="10G",
)

In [None]:
# Display the message recorded on `app` by the deployment.
app.deployment_message

## Feed data to the application

In [None]:
import json

# Load the test sets from disk. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open.
with open("cord19/test_sets.json", "r") as test_sets_file:
    test_sets = json.load(test_sets_file)

In [None]:
# Display the loaded test sets.
test_sets

In [None]:
# Imported in this cell so it also runs on a fresh kernel; these names were
# previously only imported by a cell further down the notebook.
from vespa.query import Query, OR, RankProfile as Ranking

# For every query in every test set, fetch up to 100 hits with the "bm25"
# rank profile and keep the id and title of each hit as a document to feed.
# NOTE(review): hits are read from a "title-full" field, which is not part of
# the schema defined in this notebook — presumably `app` points at a
# different, already populated index here; confirm.
documents_to_feed = []
for test_set in test_sets:
    for query_point in test_sets[test_set]:
        query = query_point["query"]
        print(query)
        result = app.query(
            query=query,
            query_model=Query(
                match_phase=OR(),
                rank_profile=Ranking(name="bm25"),
            ),
            timeout="20s",
            hits=100,
        )
        # Every test query is expected to match at least one document.
        assert len(result.hits) > 0, "no hits returned for query: {!r}".format(query)
        for hit in result.hits:
            documents_to_feed.append(
                {
                    "cord_uid": hit["fields"]["cord_uid"],
                    "title": hit["fields"]["title-full"],
                }
            )

In [None]:
import json

# Persist the collected documents as a single JSON array.
with open("cord19/documents_to_feed.json", "w") as output_file:
    serialized_documents = json.dumps(documents_to_feed)
    output_file.write(serialized_documents)

In [None]:
import json

# Read the previously saved documents back from disk.
with open("cord19/documents_to_feed.json", "r") as input_file:
    documents_to_feed = json.loads(input_file.read())

In [None]:
# Display the first two documents as a sanity check.
documents_to_feed[0:2]

In [None]:
# NOTE(review): `training_data_batch` is not assigned in any visible cell of
# this notebook, so this cell would raise NameError on a fresh kernel —
# looks like a leftover from another session; confirm and remove or define it.
training_data_batch.head()

In [None]:
from transformers import BertTokenizerFast

# Tokenizer loaded from the "bert-base-uncased" checkpoint; later cells use
# it to turn titles and queries into token ids.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [None]:
# Feed every document to the "cord19" schema, together with the token ids of
# its title (truncated/padded to 63 tokens, no special tokens).
# NOTE(review): `title_token_ids` is not a field of the schema defined in
# this notebook — presumably it exists in the application deployed from
# disk; confirm.
for document in documents_to_feed:
    cord_uid = str(document["cord_uid"])
    title = str(document["title"])
    title_encoding = tokenizer(
        title,
        truncation=True,
        padding="max_length",
        max_length=63,
        add_special_tokens=False,
    )
    document_fields = {
        "cord_uid": cord_uid,
        "title": title,
        "title_token_ids": {"values": title_encoding["input_ids"]},
    }
    response = app.feed_data_point(
        schema="cord19",
        data_id=cord_uid,
        fields=document_fields,
    )
    print(response.text)

In [None]:
from vespa.application import Vespa

# Point `app` at a Vespa instance reachable on localhost.
# NOTE(review): this rebinds `app` to port 8080, whereas the VespaDocker
# instance earlier in the notebook was created with port=8089 — confirm
# which instance is intended.
app = Vespa(url="http://localhost", port=8080)

In [None]:
from vespa.query import Query, OR, RankProfile as Ranking

# Run a sample query with OR matching and the "default" rank profile.
query = "coronavirus origin"
default_query_model = Query(
    match_phase=OR(),
    rank_profile=Ranking(name="default"),
)
result = app.query(query=query, query_model=default_query_model, timeout="20s")

In [None]:
# Display the JSON payload of the query result.
result.json

In [None]:
from vespa.query import RankProfile as Ranking

# Run the same sample query with the "bert_index_1" rank profile, sending the
# query's token ids (truncated/padded to 64 tokens, no special tokens) as the
# `query(query_token_ids)` ranking feature.
query = "coronavirus origin"
query_encoding = tokenizer(
    str(query),
    truncation=True,
    padding="max_length",
    max_length=64,
    add_special_tokens=False,
)
query_features = {
    "ranking.features.query(query_token_ids)": str(query_encoding["input_ids"])
}
result = app.query(
    query=query,
    query_model=Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bert_index_1"),
    ),
    timeout="20s",
    debug_request=False,
    **query_features,
)

In [None]:
# Relevance value of every returned hit, in result order.
[hit["relevance"] for hit in result.hits]

In [None]:
# Display the request body stored on the result of the last query.
result.request_body

In [None]:
# Display the JSON payload of the query result.
result.json

### Define query models that we want to evaluate

In [None]:
# Import the query-side RankProfile under the alias `Ranking`, which is the
# name this cell actually uses. Importing it as plain `RankProfile` would
# shadow `vespa.package.RankProfile` from the first cell and leave the cell
# dependent on an alias created by an earlier cell.
from vespa.query import Query, OR, RankProfile as Ranking

# Query models to compare: all use OR matching and differ only in the rank
# profile applied.
query_models = {
    "or_bm25": Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25"),
    ),
    "or_bm25_bert": Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bert"),
    ),
    "or_bm25_bert_index_1": Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bert_index_1"),
    ),
}
        

In [None]:
from vespa.evaluation import (
    MatchRatio,
    NormalizedDiscountedCumulativeGain,
    Recall,
    ReciprocalRank,
)

# Metrics computed for every evaluated query; the rank-based ones are cut off
# at position 10.
eval_metrics = [
    MatchRatio(),
    Recall(at=10),
    ReciprocalRank(at=10),
    NormalizedDiscountedCumulativeGain(at=10),
]

In [None]:
        # NOTE(review): this cell is an indented fragment that references
        # `self`, `labelled_data`, `id_field`, `default_score` and `kwargs`,
        # none of which are assigned in the visible notebook cells — it looks
        # like a method body pasted in for reference and will not run as a
        # standalone cell; confirm and remove or convert to markdown.
        # It evaluates each labelled query with `self.evaluate_query` and
        # collects the per-query records into a DataFrame.
        evaluation = []
        for query_data in labelled_data:
            evaluation_query = self.evaluate_query(
                eval_metrics=eval_metrics,
                query_model=query_model,
                query_id=query_data["query_id"],
                query=query_data["query"],
                id_field=id_field,
                relevant_docs=query_data["relevant_docs"],
                default_score=default_score,
                **kwargs
            )
            evaluation.append(evaluation_query)
        evaluation = DataFrame.from_records(evaluation)


In [None]:
# List the names of the available test sets, one per line.
for test_set_name in test_sets:
    print(test_set_name)

In [None]:
# Display the relevant documents of the query currently bound to `query_data`.
# NOTE(review): in the visible cells `query_data` is only bound by the
# evaluation loop further down, so this cell depends on out-of-order
# execution — confirm and move or remove.
query_data["relevant_docs"]

In [None]:
from pandas import DataFrame

# Evaluate every query model on every test set. The result is a nested dict:
# evaluations[test_set][query_model] -> DataFrame with one record per query.
evaluations = {}
for test_set, labelled_queries in test_sets.items():
    evaluations[test_set] = {}
    for query_model in query_models:
        evaluation = []
        for query_data in labelled_queries:
            print(query_data["query_id"])
            # Token ids of the query (truncated/padded to 64 tokens, no
            # special tokens), sent as the `query(query_token_ids)` feature.
            query_encoding = tokenizer(
                str(query_data["query"]),
                truncation=True,
                padding="max_length",
                max_length=64,
                add_special_tokens=False,
            )
            query_features = {
                "ranking.features.query(query_token_ids)": str(
                    query_encoding["input_ids"]
                )
            }
            evaluation_query = app.evaluate_query(
                eval_metrics=eval_metrics,
                query_model=query_models[query_model],
                query_id=query_data["query_id"],
                query=query_data["query"],
                id_field="cord_uid",
                relevant_docs=query_data["relevant_docs"],
                hits=10,
                timeout="100s",
                **query_features,
            )
            evaluation.append(evaluation_query)
        evaluations[test_set][query_model] = DataFrame.from_records(evaluation)

In [None]:
# Display the nested dict of evaluation DataFrames.
evaluations

In [None]:
import pandas as pd

# Reshape the evaluation results into long format: one row per
# (test set, query model, metric, query) holding the metric value.
metric_frames = [
    pd.DataFrame(
        data={
            "test_set": test_set,
            "query_model": query_model,
            "metric": metric.name,
            "value": evaluations[test_set][query_model][
                metric.name + "_value"
            ].to_list(),
        }
    )
    for test_set in test_sets
    for query_model in query_models
    for metric in eval_metrics
]
metric_values = pd.concat(metric_frames, ignore_index=True)

In [None]:
# Preview the first rows of the long-format metric table.
metric_values.head()

In [None]:
# List the distinct metric names present in the table.
metric_values.metric.unique()

In [None]:
import plotly.express as px

# Box plot of the "reciprocal_rank_10" values, one box per query model.
reciprocal_rank_values = metric_values[metric_values.metric == "reciprocal_rank_10"]
fig = px.box(
    reciprocal_rank_values,
    x="query_model",
    y="value",
    title="RR @ 10",
)
fig.show()

In [None]:
# Median metric value per (query model, metric) pair.
# `numeric_only=True` restricts the aggregation to the numeric `value`
# column; without it, recent pandas versions raise a TypeError when trying
# to take the median of the string `test_set` column.
metric_values.groupby(["query_model", "metric"]).median(numeric_only=True)