In [1]:
import json
import os
import re
import time

import pandas as pd
import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from requests.auth import HTTPBasicAuth

In [None]:
# Connection settings. Credentials are read from the environment so that the
# password is never stored in the notebook: export ES_PASSWORD (and optionally
# ES_USER / ES_ENDPOINT) before starting the kernel.
USER = os.environ.get('ES_USER', 'elastic')
PWD = os.environ.get('ES_PASSWORD', '')
ES_ENDPOINT = os.environ.get('ES_ENDPOINT', 'http://localhost:9200')
if not PWD:
    # Make the missing secret visible early instead of failing later with a 401
    print('ES_PASSWORD is not set; requests will be sent with an empty password.')

### Read data

In [None]:
# Load books.csv and key the rows by the 'id' column in a single chained step
df = pd.read_csv('books.csv').set_index('id')
# Preview the first rows
df.head()

In [None]:
# Prepare the inputs for indexing: the DataFrame index supplies the document
# ids and every row becomes one dict (a JSON-like record).
doc_ids = df.index
docs = df.to_dict(orient='records')
# Show the first id and the first document as a sanity check
print(doc_ids[0], docs[0], sep='\n')


### Elasticsearch Python wrapper

In [None]:
def create_index(es, index_name, settings=None):
    """
    Create an Elasticsearch index, replacing any existing index of the same name.

    Errors are not raised: the exception message is printed and False is
    returned so the notebook can keep running.

    @param es: an Elasticsearch object
    @param index_name: the name of the new index to be created
    @param settings: the index settings (request body passed to the create call)
    @return whether the index was created
    """
    try:
        # Pass the index by keyword: elasticsearch-py 8.x rejects positional
        # API arguments, and the resulting TypeError would otherwise be
        # swallowed below and reported as a failed creation.
        if es.indices.exists(index=index_name):
            # Drop the old index first; a 404 (already gone) is not an error
            es.indices.delete(index=index_name, ignore=[404])
        es.indices.create(index=index_name, body=settings)
    except Exception as ex:
        print(str(ex))
        return False
    return True

In [None]:
# Basic index settings: shard and replica counts plus a custom "std_english"
# analyzer (the standard analyzer with the built-in English stopword list).
settings_basic = dict(
    settings=dict(
        number_of_shards=4,
        number_of_replicas=2,
        analysis=dict(
            analyzer=dict(
                std_english=dict(type="standard", stopwords="_english_"),
            ),
        ),
    ),
)

In [None]:
# Name of the index used by the client-library examples
index_name = 'books'
# Open an authenticated connection to the Elasticsearch node at ES_ENDPOINT
es = Elasticsearch(hosts=[ES_ENDPOINT], http_auth=(USER, PWD))
# (Re)create the index with the basic settings and report the outcome
is_created = create_index(es, index_name, settings_basic)
print(f'Index creation: {is_created}')

In [None]:
# Index the first 10 documents one request at a time, using the DataFrame
# index values as document ids, and print each response.
# For larger imports see the bulk helpers in elasticsearch.helpers.
for doc_id, doc in zip(doc_ids[:10], docs[:10]):
    res = es.index(index=index_name, id=doc_id, document=doc)
    print(res)

In [None]:
# Get the settings of the selected index. index_name is reused instead of a
# hard-coded 'books' so this cell keeps pointing at the index created earlier
# if the name is ever changed.
es.indices.get_settings(index=index_name)

In [None]:
# Retrieve the first document by its id. index_name is reused instead of a
# hard-coded 'books' so the lookup targets the same index that was populated.
es.get(index=index_name, id=doc_ids[0])

In [None]:
# Delete the index; a 404 (index already gone) is ignored. Uses index_name and
# the same ignore=[404] form as create_index for consistency.
es.indices.delete(index=index_name, ignore=[404])

### Elasticsearch over plain HTTP with Python Requests (cURL-style)

In [None]:
class Elastic:
    """
    A convenience object to send HTTP requests to Elasticsearch
    """
    def __init__(self, endpoint, username, password):
        """
        @param endpoint: the URL of the Elasticsearch instance
        @param username: the Elasticsearch username
        @param password: the Elasticsearch password
        """
        # Headers sent with every request (JSON payloads).
        # NOTE: the _bulk endpoint expects 'application/x-ndjson' instead.
        self.header = {'Content-Type': 'application/json', 'charset':'UTF-8'}
        self.endpoint = endpoint
        self.username = username
        self.password = password
        # Dispatch table from lower-case HTTP verb to the requests function
        self.methods_mapping = {'get': requests.get,
                                'put':requests.put,
                                'post':requests.post,
                                'delete':requests.delete}

    def curl(self, method, handle, json=None):
        """
        Sends an HTTP request to the Elasticsearch instance
        @param method: can be 'get', 'put', 'post', 'delete' (case-insensitive)
        @param handle: the API handle to be appended to the Elasticsearch url
        @param json: the json payload of the HTTP request
        @return the response object returned by the requests call
        @raise KeyError: if method is not one of the supported verbs
        """
        http_method = self.methods_mapping[method.lower()]
        # Authenticate with the credentials given to this instance rather than
        # the notebook-level USER/PWD globals; requests treats a
        # (username, password) tuple as HTTP Basic auth.
        r = http_method(f'{self.endpoint}/{handle}', auth=(self.username, self.password),
                        headers=self.header, json=json)
        return r

In [None]:
# Build the Requests-based client from the notebook's endpoint and credentials
e = Elastic(endpoint=ES_ENDPOINT, username=USER, password=PWD)

In [None]:
# Request body for creating the index: shard/replica counts, a larger result
# window, a default BM25 similarity with custom b/k1, and a standard analyzer
# with English stopwords.
create_index_json = {
    "settings": {
        "number_of_shards": 4,
        "number_of_replicas": 2,
        "index.max_result_window": 20000,
        "index": {
            "similarity": {
                "default": {"type": "BM25", "b": 0.5, "k1": 0},
            },
        },
        "analysis": {
            "analyzer": {
                "std_english": {"type": "standard", "stopwords": "_english_"},
            },
        },
    },
}

# PUT books: create the index with the body above and show the JSON response
r = e.curl(method='put', handle='books', json=create_index_json)
r.json()

In [None]:
# GET books: fetch the index details and settings, then show the JSON response
r = e.curl(method='get', handle='books')
r.json()

In [None]:
# PUT books/_settings: set the index refresh_interval to -1
# (presumably to speed up the indexing that follows; confirm the intent)
r = e.curl(method='put', handle='books/_settings', json={'index' : {'refresh_interval' : -1}})
r.json()

In [None]:
# Index the first 100 documents under their individual ids, one request per
# document (use the bulk API for speedup). Slicing before zipping avoids
# pairing up the whole dataset just to keep 100 entries.
for doc_id, doc in zip(doc_ids[:100], docs[:100]):
    r = e.curl('post', f'books/_doc/{doc_id}', json=doc)
# Only the response of the last request is displayed
r.json()

In [None]:
# GET books/_doc/<id>: fetch the first document by id and show the JSON response
r = e.curl(method='get', handle=f'books/_doc/{doc_ids[0]}')
r.json()

In [None]:
# DELETE books: remove the index and show the JSON response
r = e.curl(method='delete', handle='books')
r.json()