In [1]:
from sentence_transformers import CrossEncoder, SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The search query, and the candidate passages that will be ranked against it.
query = "How many people live in Berlin?"

# The first two passages state the same fact with different wording (only the
# first one names Berlin explicitly); the third is about an unrelated topic.
# NOTE(review): "Germany'a" looks like a typo for "Germany's". It is kept
# verbatim because changing the text would change the model scores.
docs = [
    "Berlin has a population of 3,520,031 registered inhabitants "
    "in an area of 891.82 square kilometers.",
    "Germany'a capital has a population of 3,520,031 registered inhabitants "
    "in an area of 891.82 square kilometers.",
    "New York City is famous for the Metropolitan Museum of Art.",
]

In [3]:
# A cross-encoder takes two inputs at once and compares them
# (e.g., a query and a document).
#
# Model page: https://huggingface.co/cross-encoder/msmarco-MiniLM-L6-en-de-v1
# Every model on the hub is identified by a string name, which is also shown at
# the top of its page. The sentence_transformers package builds on top of the
# Hugging Face library, which provides a bunch of pretrained models for us to use.
CROSS_ENCODER_NAME = 'cross-encoder/msmarco-MiniLM-L6-en-de-v1'

# max_length=512 is passed through to CrossEncoder unchanged.
cross_encoder_model = CrossEncoder(CROSS_ENCODER_NAME, max_length=512)

Downloading (…)lve/main/config.json: 100%|██████████| 840/840 [00:00<00:00, 1.50MB/s]
Downloading pytorch_model.bin: 100%|██████████| 428M/428M [00:15<00:00, 27.5MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 541/541 [00:00<00:00, 1.05MB/s]
Downloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 25.9MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 9.10M/9.10M [00:00<00:00, 23.6MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 150/150 [00:00<00:00, 276kB/s]


In [4]:
# Build the model inputs: the same query paired with each document, keeping
# the documents in their original order.
pairs = list(zip([query] * len(docs), docs))

In [5]:
# A cross-encoder takes a pair (tuple[str, str]) as input. Passing the whole
# list of (query, document) pairs to predict() yields one score per pair.
scores = cross_encoder_model.predict(pairs)
# Leave the result as the cell's last expression so the notebook displays it.
scores

array([ 9.022886,  8.658829, -9.72064 ], dtype=float32)

In [None]:
# A bi-encoder takes a single input (str) and turns it into a vector.
# To estimate relevance we then compare those vectors with a dot product
# (at least in the IR setting).
#
# Model page: https://huggingface.co/sentence-transformers/msmarco-MiniLM-L12-cos-v5
BI_ENCODER_NAME = 'sentence-transformers/msmarco-MiniLM-L12-cos-v5'
biencoder_model = SentenceTransformer(BI_ENCODER_NAME)

In [None]:
# Encode the query and the documents. Note that we can encode the whole list of
# documents in a single call; we don't have to encode them one at a time.
#
# encode() has many useful arguments you should check out if you want to use a
# bi-encoder in practice.
# `query` is a single string, so this is one encode() call on one text.
query_emb = biencoder_model.encode(query)
# `docs` is a list of strings, all encoded together in this one call.
doc_emb = biencoder_model.encode(docs)

In [None]:
# Inspect what kind of object encode() returned for the list of documents.
type(doc_emb)

In [None]:
# The document embeddings are a matrix with one row per document, so the first
# entry of the shape should equal len(docs).
doc_emb.shape

In [None]:
# Compute the dot score (i.e., cosine similarity without the normalization
# step) between the query embedding and all document embeddings.
# These scores are our measure of relevance.
# NOTE(review): the "cos" in the model name suggests the embeddings are already
# unit-length, in which case dot product and cosine similarity coincide --
# confirm on the model card.
#
# NOTE: dot_score() returns a PyTorch Tensor with one row per query. [0] picks
# the row for our single query, cpu() moves that tensor to host (CPU) memory,
# and tolist() converts it into a plain Python list of values.
#
# This rebinds `scores`, replacing the cross-encoder scores computed earlier.
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

In [None]:
## Show the final ranking

# Attach each document to its score, then order the pairs in place from the
# highest score to the lowest (ties keep their original relative order).
doc_score_pairs = list(zip(docs, scores))
doc_score_pairs.sort(key=lambda pair: pair[1], reverse=True)

# Print one "score passage" line per document, best match first.
for doc, score in doc_score_pairs:
    print(score, doc)