# This program creates an Elasticsearch index entry for each PDF file, using Python

# Here I have imported the required packages
    -> Elasticsearch for creating and searching indexes
    -> os and glob to set the working directory and list all the PDF files in it
    -> PyPDF2 to read the PDF files
    -> pandas to build a dataframe from the extracted text

In [1]:
from elasticsearch import Elasticsearch
import os
import glob
import PyPDF2
import pandas as pd

# Change into the Books directory and list the files in it

In [2]:
# Work from inside the Books directory so the bare names returned by glob
# can be opened directly.
os.chdir("./Books/")
# Match PDFs only: every entry of `files` is later handed to PyPDF2 by
# create_dataframe, so a stray non-PDF file would break the run.
# NOTE(review): the pattern is case-sensitive on case-sensitive file systems,
# so "*.PDF" files would be skipped — confirm none are expected.
files = glob.glob("*.pdf")

In [3]:
# Number of entries collected in `files`.
len(files)

11

In [4]:
# Print every collected file name, one per line.
for book in files:
    print(book)

thebook.pdf
Data-Mining.pdf
s9449-building-a-distributed-gpu-dataframe-with-python_V2.pdf
data-mining-concepts-and-techniques-2nd-edition-impressao.pdf
s9577-rapids-the-platform-inside-and-out.pdf
NIC225296.pdf
DSA_Book.pdf
[Joel_Grus]_Data_Science_from_Scratch_First_Princ.pdf
pyspark.pdf
Parallel computing.pdf
ch1.pdf


# The following function creates a dataframe from the PDF files it reads

In [5]:
def create_dataframe(files):
    """Read each PDF in *files* and collect its text into a dataframe.

    Args:
        files: Iterable of paths to PDF files.

    Returns:
        A pandas DataFrame with the columns ``Name`` (the path as given),
        ``Pages`` (the page count reported by PyPDF2) and ``Content`` (the
        extracted text of all pages, concatenated in page order). Rows are
        labelled 1, 2, 3, ... in the order the files were given. An empty
        *files* yields an empty dataframe that still has the three columns.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names; newer releases are believed to rename them —
    # confirm against the installed version before upgrading.
    df = pd.DataFrame(columns=['Name', "Pages", "Content"])

    # Row labels start at 1, not 0.
    for this_loc, file in enumerate(files, start=1):
        # The context manager closes the file once all text has been
        # extracted, so handles do not pile up across many PDFs.
        with open(file, 'rb') as pdf_file:
            pdfreader = PyPDF2.PdfFileReader(pdf_file)
            n_pages = pdfreader.numPages
            # Pages are read while the file is still open.
            this_doc = ''.join(
                pdfreader.getPage(i).extractText() for i in range(n_pages)
            )
        df.loc[this_loc] = file, n_pages, this_doc
    return df

In [6]:
# Build the dataframe: one row per entry in `files`.
df = create_dataframe(files)



In [7]:
# Preview the first rows of the dataframe.
df.head()

Unnamed: 0,Name,Pages,Content
1,thebook.pdf,234,INTRODUCTIONTOMACHINELEARNING\nIntroductiontoM...
2,Data-Mining.pdf,24,Computer Science\nAbout the Book\n˜ is textboo...
3,s9449-building-a-distributed-gpu-dataframe-wit...,50,Building a Distributed GPU DataFrame with Pyth...
4,data-mining-concepts-and-techniques-2nd-editio...,770,TheMorganKaufmannSeriesinDataManagementSystems...
5,s9577-rapids-the-platform-inside-and-out.pdf,49,Joshua \nPatterson \n3-19-2019RAPIDS: PLATFORM...


# Creating elasticsearch object

In [8]:
# No connection arguments are passed, so the client falls back to whatever
# defaults the installed elasticsearch package uses.
# NOTE(review): presumably a local node — confirm for the target environment.
es = Elasticsearch()

In [9]:
# Column labels of the dataframe; used as the field names of each indexed document.
col_names=df.columns

# Index one document per file by iterating over the dataframe rows

In [13]:
# Index one document per dataframe row into "elsbooktrial".
# Every value is converted to str, so all fields are stored as strings.
# NOTE(review): no explicit document id is passed, so re-running this cell
# presumably adds duplicate documents — confirm before re-indexing.
for _, row in df.iterrows():
    # `row` is fetched once per document instead of once per column.
    body = {name: str(row[name]) for name in col_names}
    es.index(index="elsbooktrial", doc_type="books", body=body)

In [31]:
# Phrase-match "Computing" against the "Content" field of "elsbooktrial";
# "_source" limits each returned hit to its "Name" and "Pages" fields.
search_rslt = es.search(
    index="elsbooktrial",
    body={
        "_source": ["Name", "Pages"],
        "query": {"match_phrase": {"Content": "Computing"}},
    },
)

In [32]:
# Display the raw search response.
search_rslt

{'took': 79,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 7, 'relation': 'eq'},
  'max_score': 0.83708656,
  'hits': [{'_index': 'elsbooktrial',
    '_type': 'books',
    '_id': 'oskelm8BVcgdjGvKkMxq',
    '_score': 0.83708656,
    '_source': {'Pages': '186', 'Name': 'Parallel computing.pdf'}},
   {'_index': 'elsbooktrial',
    '_type': 'books',
    '_id': 'nskelm8BVcgdjGvKjcyi',
    '_score': 0.751288,
    '_source': {'Pages': '830', 'Name': 'NIC225296.pdf'}},
   {'_index': 'elsbooktrial',
    '_type': 'books',
    '_id': 'mckelm8BVcgdjGvKisyT',
    '_score': 0.6758751,
    '_source': {'Pages': '234', 'Name': 'thebook.pdf'}},
   {'_index': 'elsbooktrial',
    '_type': 'books',
    '_id': 'm8kelm8BVcgdjGvKi8zD',
    '_score': 0.63997686,
    '_source': {'Pages': '50',
     'Name': 's9449-building-a-distributed-gpu-dataframe-with-python_V2.pdf'}},
   {'_index': 'elsbooktrial',
    '_type': 'books',
    '_id': 'n8