In [34]:
import requests
from requests.auth import HTTPBasicAuth
import json
import nltk
from nltk.text import sent_tokenize
from nltk import word_tokenize
from rank_bm25 import BM25Okapi
from rank_bm25 import BM25Plus
from rank_bm25 import BM25L
from utils import remove_brackets

# Silence the urllib3 warnings triggered by the verify=False requests made in this notebook.
requests.urllib3.disable_warnings()

# Directory that receives the NLTK resources downloaded below.
NLTK_DATA_DIR = './'

# nltk.download() only stores the files; the directory must also be on NLTK's
# search path, otherwise the tokenizers cannot locate 'punkt' on a fresh kernel
# unless a copy already exists in one of the default locations.
# The membership test keeps the cell idempotent when it is re-run.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Download required resources (kept as the last expression so the cell still
# displays the download's return value).
nltk.download('punkt', download_dir=NLTK_DATA_DIR)

[nltk_data] Downloading package punkt to ./...
[nltk_data]   Package punkt is already up-to-date!


True

## Document API

In [4]:
# Search for the exact phrase "Digital assistant" (%22 is a URL-encoded double quote)
# and ask for 10 results.
url = "https://guacamole.univ-avignon.fr/dblp1/_search?q=%22Digital%20assistant%22&size=10"
# NOTE(review): the credentials are hard-coded and TLS verification is disabled
# (verify=False); consider moving the credentials to environment variables.
# The timeout prevents the cell from hanging forever if the server does not answer.
resp = requests.get(url, auth=HTTPBasicAuth('inex', 'qatc2011'), verify=False, timeout=30)
# Fail here on an HTTP error (e.g. bad credentials) instead of later, when the
# body of an error page would be parsed as if it were a search result.
resp.raise_for_status()

In [5]:
# Decode the JSON body of the search response into plain Python objects.
contents = resp.json()

In [6]:
# Show which fields the first returned hit carries under '_source'.
contents['hits']['hits'][0]['_source'].keys()

dict_keys(['id', 'authors', 'title', 'year', 'n_citation', 'page_start', 'page_end', 'doc_type', 'publisher', 'volume', 'issue', 'doi', 'references', 'fos', 'venue', 'abstract', 'nb_references', 'author'])

In [7]:
# Keep only the '_source' payload of every hit, in the order the hits were returned.
hits = contents['hits']['hits']
docs = [entry['_source'] for entry in hits]

In [8]:
# Display the first document record in full to inspect its fields and values.
docs[0]

{'id': 1564531496,
 'authors': [{'name': 'Nico Maibaum',
   'org': 'University of Rostock#TAB#',
   'id': 160412567},
  {'name': 'Igor Sedov',
   'org': 'University of Rostock#TAB#',
   'id': 2103374065},
  {'name': 'Clemens H. Cap',
   'org': 'University of Rostock#TAB#',
   'id': 670430106}],
 'title': 'A Citizen Digital Assistant for e-Government',
 'year': 2002,
 'n_citation': 5,
 'page_start': '284',
 'page_end': '287',
 'doc_type': 'Conference',
 'publisher': 'Springer, Berlin, Heidelberg',
 'volume': '',
 'issue': '',
 'doi': '10.1007/978-3-540-46138-8_46',
 'references': [1572889834, 2051944488, 2093988169, 2138209909],
 'fos': [{'name': 'World Wide Web', 'w': 0.45551},
  {'name': 'User assistance', 'w': 0.47753},
  {'name': 'E-Government', 'w': 0.0},
  {'name': 'Authentication', 'w': 0.49624},
  {'name': 'Computer security', 'w': 0.45855},
  {'name': 'Computer science', 'w': 0.43787},
  {'name': 'Smart card', 'w': 0.49259},
  {'name': 'Digital document', 'w': 0.0},
  {'name': 

In [9]:
# Split the abstract of the first document into sentences and display them.
sent_tokenize(docs[0]['abstract'])

['In this short paper we describe the architectural concept of a Citizen Digital Assistant (CDA) and preliminary results of our implementation.',
 'A CDA is a mobile user device, similar to a Personal Digital Assistant (PDA).',
 'It supports the citizen when dealing with public authorities and proves his rights - if desired, even without revealing his identity.',
 'Requirements for secure and trusted interactions in e-Government solutions are presented and shortcomings of state of the art digital ID cards are considered.',
 'The Citizen Digital Assistant eliminates these shortcomings and enables a citizen-controlled communication providing the secure management of digital documents, identities, and credentials.']

## BM25

In [15]:
# Load the topic definitions from the JSON file shipped next to the notebook.
topics_path = '../topics/SP12022topics.json'
with open(topics_path, 'rb') as topics_file:
    topics = json.load(topics_file)

In [16]:
# Select the first topic; the bare name on the last line displays it as the cell output.
topic = topics[0]
topic

{'topic_id': 'G01',
 'topic_text': 'Digital assistants like Siri and Alexa entrench gender biases says UN',
 'topic_url': 'https://www.theguardian.com/technology/2019/may/22/digital-voice-assistants-siri-alexa-gender-biases-unesco-says',
 'query_id': 'G01.1',
 'query_text': 'Digital assistant',
 'abstract_url': 'https://guacamole.univ-avignon.fr/dblp1/_search?q="Digital assistant"'}

In [17]:
# Path of the markdown file holding the article text related to this topic.
topic_content_file = '../topics/topic_related_content/topic' + topic['topic_id'] + '.md'
topic_text = topic['topic_text']
abstract_url = topic['abstract_url']

# Fetch the abstracts matching the topic's query.
# NOTE(review): the credentials are hard-coded and TLS verification is disabled
# (verify=False); consider moving the credentials to environment variables.
# The timeout prevents the cell from hanging forever if the server does not answer.
resp = requests.get(abstract_url, auth=HTTPBasicAuth('inex', 'qatc2011'), verify=False, timeout=30)
# Fail here on an HTTP error instead of with a confusing KeyError/JSON error below.
resp.raise_for_status()
contents = resp.json()

# One abstract string per returned hit, in ranking order.
abstracts = [hit['_source']['abstract'] for hit in contents['hits']['hits']]

In [38]:
# Every sentence of every abstract becomes one retrievable passage.
# Note that `docs` now holds sentence strings, no longer the raw hit records.
# NOTE(review): the score arrays printed further down repeat their first five
# values, which suggests the same abstract was returned twice - consider
# de-duplicating `abstracts` before indexing (TODO confirm).
docs = [sentence for abstract in abstracts for sentence in sent_tokenize(abstract)]

# Lower-cased word tokens, one token list per passage.
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in docs]

# Build the three BM25 variants on the same tokenized corpus.
bm25 = BM25Okapi(tokenized_corpus)
bm25plus = BM25Plus(tokenized_corpus)
bm25L = BM25L(tokenized_corpus)

In [39]:
# Flat list of lower-cased word tokens for the whole topic-related article.
topic_content = []
with open(topic_content_file, 'r', encoding='utf-8') as article_file:
    for raw_line in article_file:
        cleaned = raw_line.strip().lower()
        if not cleaned:
            # Skip blank lines.
            continue
        # remove_brackets comes from the project's utils module and is applied
        # to each line before tokenizing.
        topic_content.extend(word_tokenize(remove_brackets(cleaned)))

In [45]:
# Tokenize the lower-cased topic title once and use it as the query for all three models.
title_query_tokens = word_tokenize(topic_text.lower())

# One score per passage in `docs`, for BM25Okapi, BM25Plus and BM25L respectively.
title_as_query_scores1 = bm25.get_scores(title_query_tokens)
title_as_query_scores2 = bm25plus.get_scores(title_query_tokens)
title_as_query_scores3 = bm25L.get_scores(title_query_tokens)

print(title_as_query_scores1, title_as_query_scores2, title_as_query_scores3, sep='\n')

[0.91104245 0.80261701 0.20949334 1.00408621 1.30490518 0.91104245
 0.80261701 0.20949334 1.00408621 1.30490518 0.87581333 0.61866356
 0.         0.75849266 0.86174748 0.         0.         0.71087929
 3.58942399 2.32317842 0.20949334 0.         0.89308061 0.
 0.         0.7353412  0.         2.584723   3.26311333 0.
 0.71087929 0.7846698  0.75849266 0.96954138 0.         0.26359664
 0.         0.         0.24464239 0.51817331 0.         0.
 0.         0.         0.        ]
[5.40374656 4.70981677 4.76788128 5.75882434 6.02427548 5.40374656
 4.70981677 4.76788128 5.75882434 6.02427548 5.34824324 4.5398907
 3.96840334 5.16340505 4.76443824 3.96840334 3.96840334 5.08839041
 7.75530937 6.52240361 4.76788128 3.96840334 5.37544775 3.96840334
 3.96840334 4.64767108 3.96840334 7.18359602 8.02746038 3.96840334
 5.08839041 4.69323812 5.16340505 5.49591134 3.96840334 4.97435276
 3.96840334 3.96840334 4.9020187  4.44706333 3.96840334 3.96840334
 3.96840334 3.96840334 3.96840334]
[1.78913456 0.872

In [47]:
# Score every passage using the full article's tokens as the query:
# suffix 1 = BM25Okapi, 2 = BM25Plus, 3 = BM25L.
article_as_query_scores1 = bm25.get_scores(topic_content)
article_as_query_scores2 = bm25plus.get_scores(topic_content)
article_as_query_scores3 = bm25L.get_scores(topic_content)

# Print the three score arrays one after another.
print(
    article_as_query_scores1,
    article_as_query_scores2,
    article_as_query_scores3,
    sep='\n',
)

[ 40.00564763  51.71556647  61.73998786  53.95169306  51.7821711
  40.00564763  51.71556647  61.73998786  53.95169306  51.7821711
  38.73201225  37.33893928  13.36977693  59.58939866  25.37119845
  28.88610269  46.2363724   56.91981031  63.93168605  68.19961695
  43.79798526  64.74918758  55.04827488  64.25824217  26.38560643
  77.15492727  29.32169481 100.71077927  74.70768156  40.70575396
  46.83192013  52.23135643  87.99227928  64.87256409  48.51119183
  36.78318696  33.81508135  85.62525659  68.84870093  38.88155814
  13.78795961  24.7856463   24.7856463   12.28012259  28.4376667 ]
[430.81802341 424.29859234 437.22191613 441.43555814 446.05506826
 430.81802341 424.29859234 437.22191613 441.43555814 446.05506826
 406.15433213 400.43817233 378.86558506 467.45095073 385.36126119
 394.21146158 427.38668374 448.15023252 425.40477708 435.83486215
 438.38763675 442.5029229  435.78994519 432.41025748 394.00366789
 470.81815588 396.70658663 494.36210128 447.07653666 407.65785442
 451.133637

In [48]:
def print_top_passages(model_name, query_title, passages, title_scores, article_scores):
    """Print the highest-scoring passage for one BM25 variant.

    Args:
        model_name: Label shown in the first banner line.
        query_title: Topic title that was used as the short query.
        passages: Sequence of passages, indexed like the two score arrays.
        title_scores: Per-passage scores for the title query; must provide ``argmax()``.
        article_scores: Per-passage scores for the article query; must provide ``argmax()``.
    """
    print('============== ' + model_name + ' ==============')
    print('============== Query title ==============')
    print(query_title)
    print('============== Highest ranked passage by title ==============')
    print(passages[title_scores.argmax()])
    print('============== Highest ranked passage by content ==============')
    print(passages[article_scores.argmax()])


# One report per BM25 variant. The third label used to read 'BM25OL', although
# its scores come from the BM25L model; it now names the model that was used.
for model_name, title_scores, article_scores in (
    ('BM25Okapi', title_as_query_scores1, article_as_query_scores1),
    ('BM25plus', title_as_query_scores2, article_as_query_scores2),
    ('BM25L', title_as_query_scores3, article_as_query_scores3),
):
    print_top_passages(model_name, topic_text, docs, title_scores, article_scores)


Digital assistants like Siri and Alexa entrench gender biases says UN
Digital assistants are emerging to become more prevalent in our daily lives.
In particular, the ubiquitous access to remote resources is one of the most interesting characteristics achievable by using mobile devices such as Personal Digital Assistants, cellular phones and tablets.
Digital assistants like Siri and Alexa entrench gender biases says UN
This paper presents an architecture that allows users to search and visualize complex 3D models over Personal Digital Assistants.
In particular, the ubiquitous access to remote resources is one of the most interesting characteristics achievable by using mobile devices such as Personal Digital Assistants, cellular phones and tablets.
Digital assistants like Siri and Alexa entrench gender biases says UN
This paper presents an architecture that allows users to search and visualize complex 3D models over Personal Digital Assistants.
The object selected for visualization is fo