In [13]:
import json
import requests
import numpy as np
from requests.auth import HTTPBasicAuth
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.text import sent_tokenize

# Silence urllib3 warnings: the request made later in this notebook passes
# verify=False, which otherwise triggers urllib3's insecure-request warning.
# The call is made without a category argument, so it is not restricted to
# that single warning type.
requests.urllib3.disable_warnings()

In [2]:
def gen_url(abstract_url, size=10):
    """
    Append a ``size`` query parameter to an API URL.

    :param abstract_url: base url (str)
    :param size: number of documents to return (int)
    :return: ``abstract_url`` with ``size=<size>`` appended; joined with ``&``
        when the URL already contains a query string, ``?`` otherwise
    """
    # A bare "&size=..." appended to a URL without "?" would not be read as a
    # query parameter, so choose the separator from the URL itself.
    separator = "&" if "?" in abstract_url else "?"
    return "{}{}size={}".format(abstract_url, separator, size)

In [14]:
# Load the topic definitions and work with the first topic.
with open('../topics/SP12022topics.json') as f_in:
    topics = json.load(f_in)

topic = topics[0]

topic_content_file = '../topics/topic_related_content/topic' + topic['topic_id'] + '.md'
topic_text = topic['topic_text']
abstract_url = topic['abstract_url']

# NOTE(review): the credentials are hard-coded and TLS verification is
# disabled; consider reading the credentials from the environment and
# enabling verification before sharing this notebook.
resp = requests.get(gen_url(abstract_url, 100), auth=HTTPBasicAuth('inex', 'qatc2011'), verify=False)
resp.raise_for_status()  # surface HTTP errors before trying to parse the body
contents = resp.json()

# Split every distinct abstract into sentences. Duplicates are tracked in a
# separate set: `abstracts` holds individual sentences, so testing a whole
# abstract for membership in it would only match single-sentence abstracts
# and repeated hits would be tokenized again.
abstracts = []
seen_abstracts = set()
for hit in contents['hits']['hits']:
    abstract = hit['_source']['abstract']
    if abstract in seen_abstracts:
        continue
    seen_abstracts.add(abstract)
    abstracts.extend(sent_tokenize(abstract))


In [15]:
def similarity_dot_prod(X):
    """
    Score each data vector against the query vector with a dot product.

    :X: vectorized data, where
       X[0] = query vector
       X[1:] = data vectors
       The product is converted with ``.toarray()``, so X is expected to be
       a sparse matrix.
    :return: tuple ``(scores, best_idx)``; ``scores`` holds one row per data
       vector (the query's own row is dropped) and ``best_idx`` is the
       argmax over ``scores``, so index ``i`` refers to ``X[i + 1]``.
    """
    query_column = X[0].T
    all_scores = (X @ query_column).toarray()
    # Row 0 is the query scored against itself, which carries no information.
    data_scores = np.delete(all_scores, 0, axis=0)
    best_idx = data_scores.argmax()
    return data_scores, best_idx

# Build the corpus as a new list instead of inserting into / popping from
# `abstracts`: the cell can then be re-run safely, does not fail on an empty
# `abstracts` when peeking at abstracts[0], and never drops a real sentence
# that happens to equal the topic text.
# Row 0 is the query (topic text); row i + 1 is abstracts[i].
corpus = [topic_text] + abstracts

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# similarity_dot_prod drops the query row, so the returned index lines up
# with `abstracts` directly.
similarity_scores, most_relevent_idx = similarity_dot_prod(X)

print("most relevent doc is at index {}".format(most_relevent_idx))
abstracts[most_relevent_idx]


most relevent doc is at index 222


'It’s possible to export data to common file formats like .pdf or .word.'

In [12]:
# Sum of the weights in row 0 of X (the query row). Slice the sparse row
# first so the whole matrix is never converted to a dense array.
X[0].sum()

3.1128264635380027

In [22]:
# Vectorize a new string with the already-fitted vectorizer and display the
# result as a dense array.
vectorizer.transform(['hello world']).toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])