In [1]:
import requests
from requests.auth import HTTPBasicAuth
import json
from nltk.text import sent_tokenize
from nltk import word_tokenize
from transformers import BertModel, BertTokenizer
import torch
from utils import create_dataset

# disable warnings related to verify
requests.urllib3.disable_warnings()

# download requried resources
# nltk.download('punkt', download_dir='./')

  from .autonotebook import tqdm as notebook_tqdm


## Document API

In [None]:
url = "https://guacamole.univ-avignon.fr/dblp1/_search?q=%22Digital%20assistant%22&size=10"
resp = requests.get(url, auth=HTTPBasicAuth('inex', 'qatc2011'), verify=False)

In [None]:
contents = json.loads(resp.content)

In [None]:
contents['hits']['hits'][0]['_source'].keys()

In [None]:
docs = [hit['_source'] for hit in contents['hits']['hits']]

In [None]:
docs[0]

In [None]:
sent_tokenize(docs[0]['abstract'])

## BERT

In [2]:
# dataset = create_dataset('../topics/SP12022topics.json','../topics/topic_related_content/')
# with open('dataset.json', 'w') as f_out:
#     json.dump(dataset, f_out)
with open('dataset.json') as f_in:
    dataset = json.load(f_in)

100%|██████████| 114/114 [00:45<00:00,  2.51it/s]


In [None]:
with open('../topics/SP12022topics.json') as f_in:
    topics = json.load(f_in)

In [None]:
topics[-1]

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [5]:
# Get document embedding
data = dataset[0]
docs = data['docs']
with torch.no_grad():
    doc_tokens = tokenizer.batch_encode_plus(docs, padding=True, return_tensors='pt')
    cls_embs = model(**doc_tokens)['last_hidden_state'][:, 0, :]

In [6]:
topic_text = data['topic_text']
with torch.no_grad():
    doc_tokens = tokenizer.encode(topic_text, padding=True, return_tensors='pt')
    query_embs = model(doc_tokens)['last_hidden_state'][:, 0, :]
    title_as_query_scores = torch.matmul(cls_embs, query_embs.T).squeeze(-1).numpy()
    print(title_as_query_scores)

[135.31577 130.41748 149.39636 161.14398 149.92615 135.31577 130.41748
 149.39636 161.14398 149.92615 158.52365 132.56924 156.75887 149.72632
 159.8262  165.9302  153.39676 147.86209 177.9921  164.29349 160.3603
 146.49261 152.16037 161.59946 163.38069 161.5852  176.1186  170.62413
 153.7778  166.42651 139.54791 161.43842 153.62009 161.40884 163.5029
 143.28473 161.55833 149.6452  165.79279 143.73909 154.99445 153.79788
 149.1309  144.6236  152.81723]


In [19]:
topic_text = sent_tokenize(data['topic_content'].replace('---', '.').replace('# ', ''))
with torch.no_grad():
    doc_tokens = tokenizer.batch_encode_plus(topic_text, padding='longest', return_tensors='pt', truncation='longest_first')
    query_embs = model(**doc_tokens)['last_hidden_state'][:, 0, :]
    article_as_query_scores = torch.matmul(cls_embs, query_embs.T).sum(1).numpy()
    print(article_as_query_scores)

[1892.4512 1780.1522 2011.1755 2109.6228 2008.6647 1892.4512 1780.1522
 2011.1755 2109.6228 2008.6647 2105.9995 1804.301  1966.3557 1734.3851
 2147.2358 2087.899  2016.7938 1977.3971 2240.832  2176.9646 2134.708
 1993.1211 1972.145  2093.4468 2120.46   2194.8215 2181.8884 2213.2297
 2094.808  2131.919  1890.9269 2090.6172 2044.88   2112.2615 2128.8484
 1916.2476 2001.7626 1961.4286 2150.4255 1912.295  2037.6437 2017.2584
 1937.3527 1909.5934 2010.5336]


In [21]:
print('============== BERT ==============')
print('============== Query title ==============')
print(data['topic_text'])
print('============== Highest ranked passage by title ==============')
print(docs[title_as_query_scores.argmax()])
print('============== Highest ranked passage by content ==============')
print(docs[article_as_query_scores.argmax()])

Digital assistants like Siri and Alexa entrench gender biases says UN
Digital assistants are emerging to become more prevalent in our daily lives.
Digital assistants are emerging to become more prevalent in our daily lives.
