In [None]:
!pip install git+https://github.com/deepset-ai/haystack.git
!pip install sentence-transformers

In [3]:
import os

import pandas as pd
import numpy as np

from haystack.document_store.faiss import FAISSDocumentStore
from haystack.retriever.dense import EmbeddingRetriever

In [6]:
# Folder (relative to the working directory) that holds the input data.
DIR = 'drive/MyDrive/AI'

# Build the CSV path from DIR so the data location is defined in one place only.
df = pd.read_csv(os.path.join(DIR, 'arxiv_short.csv'))

# Preview the first rows.
df.head()

Unnamed: 0,authors,title,abstract
0,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",Calculation of prompt diphoton production cross sections at Tevatron and\n ...,A fully differential calculation in perturbative quantum chromodynamics is...
1,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pebble game with colors, and u..."
2,Hongjun Pan,The evolution of the Earth-Moon system based on the dark matter field\n flu...,The evolution of Earth-Moon system is described by the dark matter field\n...
3,David Callan,A determinant of Stirling cycle numbers counts unlabeled acyclic\n single-s...,We show that a determinant of Stirling cycle numbers counts unlabeled acyc...
4,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\alpha}$,"In this paper we show how to compute the $\Lambda_{\alpha}$ norm, $\alpha\..."


In [7]:
# Number of missing values in each column.
df.isna().sum()

authors     0
title       0
abstract    0
dtype: int64

In [8]:
# Document store backed by a FAISS index created from the "Flat" factory string.
document_store_faiss = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    return_embedding=True,
)

# Retriever bound to that store; it embeds text with the named
# sentence-transformers model.
retriever_faiss = EmbeddingRetriever(
    document_store_faiss,
    embedding_model='distilroberta-base-msmarco-v2',
    model_format='sentence_transformers',
)

04/08/2021 03:49:30 - INFO - haystack.retriever.dense -   Init retriever using embeddings of model distilroberta-base-msmarco-v2
04/08/2021 03:49:30 - INFO - sentence_transformers.SentenceTransformer -   Load pretrained SentenceTransformer: distilroberta-base-msmarco-v2
04/08/2021 03:49:30 - INFO - sentence_transformers.SentenceTransformer -   Did not find folder distilroberta-base-msmarco-v2
04/08/2021 03:49:30 - INFO - sentence_transformers.SentenceTransformer -   Search model on server: http://sbert.net/models/distilroberta-base-msmarco-v2.zip
04/08/2021 03:49:30 - INFO - sentence_transformers.SentenceTransformer -   Downloading sentence transformer model from http://sbert.net/models/distilroberta-base-msmarco-v2.zip and saving it at /root/.cache/torch/sentence_transformers/sbert.net_models_distilroberta-base-msmarco-v2


HBox(children=(FloatProgress(value=0.0, max=305420057.0), HTML(value='')))




04/08/2021 03:49:52 - INFO - sentence_transformers.SentenceTransformer -   Load SentenceTransformer from folder: /root/.cache/torch/sentence_transformers/sbert.net_models_distilroberta-base-msmarco-v2


In [10]:
# Empty the store before writing the documents.
document_store_faiss.delete_all_documents()

# Turn every row into one record dict: the abstract is written under 'text'
# and the title under 'name'; 'authors' keeps its column name.
# The former mapping also listed 'author' -> 'author', which matched no column
# (the column is 'authors') and renamed nothing, so it has been removed.
document_store_faiss.write_documents(
    df[['authors', 'title', 'abstract']]
    .rename(columns={'title': 'name', 'abstract': 'text'})
    .to_dict(orient='records')
)

In [11]:
# Update the embeddings of the stored documents using the retriever.
# Slow step: the logged run below took close to 7 minutes for 50000 docs.
document_store_faiss.update_embeddings(
    retriever=retriever_faiss,
)

04/08/2021 03:51:54 - INFO - haystack.document_store.faiss -   Updating embeddings for 50000 docs...
  0%|          | 0/50000 [00:00<?, ?it/s]

HBox(children=(FloatProgress(value=0.0, description='Batches', max=313.0, style=ProgressStyle(description_widt…




 20%|██        | 10000/50000 [01:21<05:24, 123.36it/s]

HBox(children=(FloatProgress(value=0.0, description='Batches', max=313.0, style=ProgressStyle(description_widt…

 20%|██        | 10000/50000 [01:40<05:24, 123.36it/s]




 40%|████      | 20000/50000 [02:45<04:06, 121.84it/s]

HBox(children=(FloatProgress(value=0.0, description='Batches', max=313.0, style=ProgressStyle(description_widt…

 40%|████      | 20000/50000 [03:00<04:06, 121.84it/s]




 60%|██████    | 30000/50000 [04:06<02:43, 122.10it/s]

HBox(children=(FloatProgress(value=0.0, description='Batches', max=313.0, style=ProgressStyle(description_widt…

 60%|██████    | 30000/50000 [04:20<02:43, 122.10it/s]




 80%|████████  | 40000/50000 [05:28<01:21, 122.50it/s]

HBox(children=(FloatProgress(value=0.0, description='Batches', max=313.0, style=ProgressStyle(description_widt…

 80%|████████  | 40000/50000 [05:40<01:21, 122.50it/s]




100%|██████████| 50000/50000 [06:50<00:00, 121.67it/s]


In [13]:
def get_results(query, retriever, n_docs = 10):
  """Retrieve documents for `query` and return them as (text, meta) pairs.

  Args:
    query: Query passed to `retriever.retrieve`.
    retriever: Object exposing `retrieve(query, top_k=...)`; every returned
      item must provide a `text` attribute and a `to_dict()` method whose
      result has a 'meta' entry.
    n_docs: Value forwarded as `top_k`.

  Returns:
    A list of `(item.text, item.to_dict()['meta'])` tuples, in the order the
    retriever returned the items.
  """
  # Use the `query` argument; the earlier version read the global `q`, so the
  # argument was ignored and the call failed when no global `q` existed.
  hits = retriever.retrieve(query, top_k = n_docs)
  return [(hit.text, hit.to_dict()['meta']) for hit in hits]

In [18]:
# Query to look up in the indexed documents.
q = 'Poisson Dirichlet distribution with two-parameters'

# Header printed before the retrieval call.
print('-' * 100)
print('Results: ')
print('-' * 75)

# Fetch the matches and print one (text, meta) tuple per line.
res = get_results(q, retriever_faiss)
for r in res:
  print(r)

----------------------------------------------------------------------------------------------------
Results: 
---------------------------------------------------------------------------


HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…


('  The two-parameter Poisson--Dirichlet distribution is a probability\ndistribution on the totality of positive decreasing sequences with sum 1 and\nhence considered to govern masses of a random discrete distribution. A\ncharacterization of the associated point process (that is, the random point\nprocess obtained by regarding the masses as points in the positive real line)\nis given in terms of the correlation functions. Using this, we apply the theory\nof point processes to reveal the mathematical structure of the two-parameter\nPoisson--Dirichlet distribution. Also, developing the Laplace transform\napproach due to Pitman and Yor, we are able to extend several results\npreviously known for the one-parameter case. The Markov--Krein identity for the\ngeneralized Dirichlet process is discussed from the point of view of functional\nanalysis based on the two-parameter Poisson--Dirichlet distribution.\n', {'vector_id': '7498', 'authors': 'Kenji Handa', 'name': 'The two-parameter Poisson-