In [43]:
# installing haystack
! pip install git+https://github.com/deepset-ai/haystack.git

Collecting git+https://github.com/deepset-ai/haystack.git
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-req-build-izk0eu7q


Building wheels for collected packages: farm-haystack
  Building wheel for farm-haystack (setup.py) ... [?25ldone
[?25h  Created wheel for farm-haystack: filename=farm_haystack-0.5.0-py3-none-any.whl size=101538 sha256=3d6b7e5c44e977202365507600697972b974223d44dab008fd2c790315c43ac5
  Stored in directory: /tmp/pip-ephem-wheel-cache-g3nfzkl5/wheels/a7/05/3b/9b33368d9af06a39f8e6af2e97fa2af876e893ade323cfc2c9
Successfully built farm-haystack


In [44]:
# Download the Elasticsearch 7.6.2 Linux x86_64 tarball (-q: no progress output)
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
# Unpack it into ./elasticsearch-7.6.2
! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
# Hand the unpacked tree to the daemon user; the server is later launched after
# os.setuid(1) (presumably that same daemon account - confirm on the target image)
! chown -R daemon:daemon elasticsearch-7.6.2

In [45]:
# General libraries
import re, os, string, random, requests
import time
import pandas as pd
from subprocess import Popen, PIPE, STDOUT

# Haystack importings
from haystack import Finder
from haystack.reader.farm import FARMReader
from haystack.utils import print_answers
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

# Scikit-learn importings
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
# Starting ElasticSearch server as daemon.
# The server's output goes to a log file instead of a PIPE: nothing in this notebook
# ever reads the pipe, so once its buffer filled up the server would block on write.
# The handle is intentionally left open for as long as the server runs.
es_log = open("elasticsearch.log", "wb")
es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
                  stdout=es_log, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon
                  )

# Wait until ElasticSearch has started: poll the cluster-health endpoint instead of
# sleeping for a fixed time, so a fast start is not delayed and a slow or failed
# start is reported instead of silently breaking the cells below.
ES_STARTUP_TIMEOUT = 120  # seconds
es_deadline = time.monotonic() + ES_STARTUP_TIMEOUT
while True:
    if es_server.poll() is not None:
        raise RuntimeError("ElasticSearch exited during start-up; see elasticsearch.log")
    try:
        health = requests.get("http://localhost:9200/_cluster/health",
                              params={"wait_for_status": "yellow", "timeout": "5s"},
                              timeout=10)
        if health.status_code == 200:
            break
    except requests.exceptions.RequestException:
        pass  # not accepting connections yet - keep polling
    if time.monotonic() > es_deadline:
        raise RuntimeError("ElasticSearch did not become ready within %d seconds; "
                           "see elasticsearch.log" % ES_STARTUP_TIMEOUT)
    time.sleep(1)

In [47]:
def get_index(n):
    """Build a random identifier made of `n` lowercase ASCII letters.

    Letters are drawn one at a time with `random.choice`, so the result
    follows the state of the global `random` generator.
    """
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(n):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

def get_stop_words(stop_file_path):
    """Read a UTF-8 text file and return its lines as an immutable set.

    Every line is stripped of surrounding whitespace before it is added;
    duplicate lines collapse into a single entry.
    """
    with open(stop_file_path, 'r', encoding="utf-8") as handle:
        return frozenset(line.strip() for line in handle)

In [48]:
def trim_doc(doc, threshold=None):
    """Trim `doc` to roughly `threshold` characters, cutting only at sentence ends.

    The text is split on '.' and sentences are kept, in order, for as long as
    the running character count is still below the limit. Only the stripped
    length of each sentence is counted, and the sentence that crosses the limit
    is kept whole, so the result can be somewhat longer than `threshold`.

    Args:
        doc: text to trim.
        threshold: character limit; when None the module-level DOC_THRESHOLD
            is used (looked up at call time).

    Returns:
        The kept sentences re-joined with '.'.
    """
    if threshold is None:
        threshold = DOC_THRESHOLD

    kept = []
    char_count = 0
    for sentence in doc.split('.'):
        # The count never decreases, so nothing after this point can be kept.
        if char_count >= threshold:
            break
        char_count += len(sentence.strip())
        kept.append(sentence)

    return ".".join(kept)


def clean_text(text):
    """Normalise a document before indexing.

    Steps, in order: lower-case the text, delete every character listed in
    PUNCTUATION, collapse each run of whitespace (including newlines) into a
    single space, then trim the result with trim_doc().

    Args:
        text: raw document text (a str).

    Returns:
        The cleaned and trimmed text.
    """
    # Lowering text
    text = text.lower()

    # Removing punctuation: one translate pass instead of a per-character list filter
    text = text.translate(str.maketrans('', '', PUNCTUATION))

    # Removing whitespace and newlines (raw string: '\s' is an invalid escape otherwise)
    text = re.sub(r'\s+', ' ', text)

    # Trimming doc
    return trim_doc(text)

In [49]:
def sort_coo(coo_matrix):
    """Return (column, value) pairs of `coo_matrix`, highest value first.

    Pairs are built from the object's `.col` and `.data` sequences; pairs with
    equal values are ordered by larger column index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first `topn` entries of `sorted_items` to their feature names.

    Args:
        feature_names: sequence indexed by feature id.
        sorted_items: sequence of (feature id, score) pairs, already ranked.
        topn: number of leading pairs to keep.

    Returns:
        dict of feature name -> score rounded to 3 decimals, in ranking order.
        If a name occurs more than once, the later score replaces the earlier one.
    """
    return {
        feature_names[feature_idx]: round(score, 3)
        for feature_idx, score in sorted_items[:topn]
    }

In [50]:
# Constants
ES_INDEX = get_index(10) # Elastic Search DB index name (random, so it changes on every run)
# Every ASCII punctuation character except . (full-stop), which trim_doc() needs as the
# sentence delimiter. Derived from string.punctuation: same characters as the former
# hand-written literal, without its invalid "\]" escape sequence.
PUNCTUATION = string.punctuation.replace(".", "")
DOC_THRESHOLD = 10000 # character limit for a doc
TOP_K_RETRIEVER = 10 # top k documents to analyze further for a given query
TOP_K_READER = 5 # top k number of answers to return
TOP_K_KEYWORDS = 10 # top k number of keywords to retrieve in a ranked document
BASE_URL = "http://localhost:9200/"+ES_INDEX+"/_doc/" # REST endpoint of a single document in the index
STOPWORD_PATH = "../input/ai-papers/data/stopwords.txt" # stop-word list read by get_stop_words()

In [51]:
# Load the papers table from CSV and preview its first rows
data = pd.read_csv("../input/nips-papers/papers.csv")
data.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and Its Applications,,1-self-organization-of-associative-database-and-its-applications.pdf,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABASE\nAND ITS APPLICATIONS\nHisa...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cortex and Its Application to Arti...,,10-a-mean-field-theory-of-layer-iv-of-visual-cortex-and-its-application-to-a...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISUAL CORTEX\nAND ITS APPLICATION...
2,100,1988,Storing Covariance by the Associative Long-Term Potentiation and Depression ...,,100-storing-covariance-by-the-associative-long-term-potentiation-and-depress...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\nLONG?TERM POTENTIATION AND DEP...
3,1000,1994,Bayesian Query Construction for Neural Network Models,,1000-bayesian-query-construction-for-neural-network-models.pdf,Abstract Missing,Bayesian Query Construction for Neural\nNetwork Models\nGerhard Paass\nJorg ...
4,1001,1994,"Neural Network Ensembles, Cross Validation, and Active Learning",,1001-neural-network-ensembles-cross-validation-and-active-learning.pdf,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, and Active Learning\n\nAnders K..."


In [52]:
# Dimensions of the loaded table as (rows, columns)
data.shape

(7241, 7)

In [53]:
# Structuring data to haystack required format
# Format: [{'text': 'paper_content', 'meta':{'name':'title'}}]
docs = []      # one dict per paper, in the format above
corpora = []   # cleaned paper texts only (used later to fit the TF-IDF vocabulary)
doc_len = []   # character count of every cleaned paper text

# Walk the two needed columns side by side; both the body and the title go through clean_text()
for raw_title, raw_text in zip(data['title'], data['paper_text']):
    cleaned_text = clean_text(raw_text)
    doc_len.append(len(cleaned_text))
    corpora.append(cleaned_text)
    docs.append({'text': cleaned_text, 'meta': {'name': clean_text(raw_title)}})

In [54]:
# Average number of characters per document after cleaning and trimming
# (doc_len and docs are filled in lock-step, one entry per paper)
sum(doc_len)/len(docs)

10245.576577820742

In [55]:
# Connect the document store to the local ElasticSearch server, using index ES_INDEX
# and empty credentials.
# Be careful while overwriting data on the same ES index
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index=ES_INDEX)

In [56]:
# Now, let's write the dicts containing documents to our DB
# (docs holds one {'text': ..., 'meta': {'name': ...}} dict per paper).
document_store.write_documents(docs)

In [57]:
# Instantiating the ES retriever on top of the document store created above
retriever = ElasticsearchRetriever(document_store=document_store)

In [58]:
# Initializing the reader from the pre-trained "ahotrod/albert_xxlargev1_squad2_512" model
# (presumably downloaded on the first run - confirm against FARMReader's behaviour).
# Here, we can set the size of context window for our answers and use the GPU if available

reader = FARMReader(model_name_or_path="ahotrod/albert_xxlargev1_squad2_512",use_gpu=True, context_window_size=500)

In [59]:
# Combining the reader and the retriever into a single Finder object
finder = Finder(reader, retriever)

In [60]:
# Question prediction: TOP_K_RETRIEVER and TOP_K_READER are passed as the retriever
# and reader top-k limits
question = "What is the use of CNN?"
prediction = finder.get_answers(question=question, top_k_retriever=TOP_K_RETRIEVER, top_k_reader=TOP_K_READER)


Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.73s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.43s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.70s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.19s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.70s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.60s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.61s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.43s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.60s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.60s/ Batches]


In [61]:
# Printing answers with minimal detail
# Accepted values for details: minimal | medium | all

print_answers(prediction, details="minimal")

[   {   'answer': 'to extract semantic vectors',
        'context': 'there are also questions requiring extra knowledge or '
                   'commonsense reasoning such as does it appear to be rainy. '
                   'properly modeling questions is essential for solving the '
                   'vqa problem. a commonly employed strategy is to use a cnn '
                   'or an rnn to extract semantic vectors. the general issue '
                   'is that the resulting question representation lacks '
                   'detailed information from the given image which however is '
                   'vital for understanding visual content. we take the '
                   'question and image in figure 1 as an example. to answ'},
    {   'answer': 'to extract the visual representation',
        'context': 'stions about the content of an image. the answer can be a '
                   'sentence a phrase or a single word. our model contains '
                   'four components 

In [62]:
# For every returned answer, fetch the stored document it came from through the
# ElasticSearch REST endpoint and keep its title, full text and the answer itself.
top_5_docs = []

for answer in prediction['answers']:
    response = requests.get(BASE_URL + answer['document_id'])

    # Any non-200 reply aborts the whole collection
    if response.status_code != 200:
        print("Document not found! Restart the ES Server")
        break

    source = response.json()['_source']
    top_5_docs.append({
        'title': source['name'],
        'text': source['text'],
        'answer': answer['answer'],
    })

### Getting Top K keywords using TF-IDF Method

In [63]:
#load a set of stop words
stopwords=get_stop_words(STOPWORD_PATH)

# Initializing TF-IDF Vectorizer with stopwords.
# The frozenset is passed as a sorted list: newer scikit-learn releases validate
# `stop_words` and only accept 'english', a list or None.
vectorizer = TfidfVectorizer(stop_words=sorted(stopwords), smooth_idf=True, use_idf=True)

# Creating vocab with our corpora. fit() learns the same vocabulary and idf weights
# as fit_transform(); the transformed matrix was computed but never used.
vectorizer.fit(corpora)

# Storing vocab. get_feature_names() was removed from newer scikit-learn releases in
# favour of get_feature_names_out(); support both and always keep a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

  'stop_words.' % sorted(inconsistent))


In [64]:
def get_keywords(vectorizer, feature_names, doc):
    """Return the highest-scoring TF-IDF terms of a single document.

    Args:
        vectorizer: fitted vectorizer whose transform() scores the document.
        feature_names: sequence mapping a feature index to its term.
        doc: document text to score.

    Returns:
        list of at most TOP_K_KEYWORDS terms, best score first.
    """
    # Score the document and switch to COO layout so columns and scores can be paired
    doc_vector = vectorizer.transform([doc]).tocoo()

    # Rank the (column, score) pairs from highest to lowest score
    ranked_items = sort_coo(doc_vector)

    # Keep only the TOP_K_KEYWORDS best terms; the dict preserves ranking order
    top_terms = extract_topn_from_vector(feature_names, ranked_items, TOP_K_KEYWORDS)
    return list(top_terms)


In [65]:
# Attach the top TF-IDF keywords of each fetched document to its record
for doc in top_5_docs:
    doc['keywords'] = get_keywords(vectorizer, feature_names, doc['text'])

In [66]:
# One row per fetched document: title, text, answer and keywords
final = pd.DataFrame(top_5_docs)

In [67]:
# Show the question, then the result table (the bare last expression renders the DataFrame)
print(question)
print("Top 5 articles with keywords\n")
final

What is the use of CNN?
Top 5 articles with keywords



Unnamed: 0,title,text,answer,keywords
0,visual question answering with question representation update qru,visual question answering with question representation update qru ruiyu li j...,to extract semantic vectors,"[vqa, image, question, answering, reasoning, questions, regions, reasoner, c..."
1,are you talking to a machine dataset and methods for multilingual image ques...,are you talking to a machine dataset and methods for multilingual image ques...,to extract the visual representation,"[mqa, answer, question, dataset, image, lstm, fmiqa, freestyle, questionansw..."
2,semisupervised convolutional neural networks for text categorization via reg...,semisupervised convolutional neural networks for text categorization via reg...,text categorization,"[cnn, text, tvembedding, embedding, region, onehot, word, unlabeled, regions..."
3,deep convolutional neural network for image deconvolution,deep convolutional neural network for image deconvolution li xu lenovo resea...,to handle strong noise,"[deconvolution, image, denoise, artifacts, blur, ssdae, degradation, cnn, ne..."
4,visual question answering with question representation update qru,visual question answering with question representation update qru ruiyu li j...,to extract image features,"[vqa, image, question, answering, reasoning, questions, regions, reasoner, c..."
