# Can LM answer KG's questions?

Use questions for Wikidata to retrieve Wikipedia paragraphs with a pre-trained dense retrieval model

In [2]:
# load Wiki passage corpus and questions from NQ
import os
from beir.datasets.data_loader import GenericDataLoader
from utils import model_path, data_path

dataset = 'nq'
data_path = os.path.join(data_path, dataset)

nq_corpus, nq_queries, nq_qrels = GenericDataLoader(data_folder=data_path).load(split="test")

print(len(nq_corpus), 'passages')
print(len(nq_qrels), 'questions')

2681468 passages
3452 questions


In [6]:
from utils import load_data, data_path

# load WD18 questions, entities and relations corpus
dataset_version = 'WD18_entities_original_answers'  # text answers-all
# dataset_version = 'WD18_entities_wav2vec2-large-960h-lv60-self_answers'  # wav2vec2-large-960h-lv60-self BM transcripts
e_corpus, wd_queries, wd_qrels = load_data(dataset_version, 'valid', data_path)
r_corpus, _, _ = load_data('WD18_relations-extra_original_relations')

print(len(e_corpus), 'entities')
print(len(r_corpus), 'relations')
print(len(wd_qrels), 'questions')

Loaded valid split of WD18 with original queries, entities corpus and answers qrels
Loaded train split of WD18 with original queries, relations-extra corpus and relations qrels
28497 entities
8913 relations
1316 questions


In [8]:
# load model
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval import models

model_names = ['msmarco-distilbert-base-tas-b', 'nq-distilbert-base-v1']  # the last one is the best on NQ in beir
similarities = ['dot', 'cos_sim']

# choose one of the models from the list above
i = -1
model_name = model_names[i]
similarity = similarities[i]

# load model
model = DRES(models.SentenceBERT(model_name))
print(model_name, similarity)

# encode everything and retrieve answers
retriever = EvaluateRetrieval(model, score_function=similarity)
results = retriever.retrieve(nq_corpus, wd_queries)

  0%|          | 0.00/245M [00:00<?, ?B/s]

nq-distilbert-base-v1 cos_sim


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/246 [00:00<?, ?it/s]

In [9]:
# examine the results
topk = 5  # show topk predictions by the model
n = 10  # for the first n questions

for q_id, answers in list(results.items())[:n]:
    print(wd_queries[q_id])  # show WD question
    for _id, score in list(answers.items())[:topk]:  # show topn answers retrieved by the model
        print(nq_corpus[_id]['text'], score)  # show the retrieved passage with the score

What is a film directed by wiebke von carolsfeld?
Almost Famous is a 2000 American comedy-drama film written and directed by Cameron Crowe, and starring Billy Crudup, Kate Hudson and Patrick Fugit. It tells the fictional story of a teenage journalist writing for Rolling Stone magazine in the early 1970s while covering the fictitious rock band Stillwater, and his efforts to get his first cover story published. The film is semi-autobiographical, as Crowe himself was a teenage writer for Rolling Stone. 0.1929573267698288
The film was shot with two cameras side by side, with one negative edited as a sound film and the other edited as an "International Sound Version" for distribution in non-English speaking areas. 0.28667551279067993
American Idol was nominated for the Emmy's Outstanding Reality Competition Program for nine years but never won.[216] Director Bruce Gower won a Primetime Emmy Award for Outstanding Directing For A Variety, Music Or Comedy Series in 2009, and the show won a Cre

# Can KG answer LM's questions?

In [None]:
# use Wiki questions from NQ and match them to entities and relations from WikiData KG