In [1]:
from pprint import pprint
from elasticsearch import Elasticsearch

es = Elasticsearch('http://localhost:9200')
client_info = es.info()
print("Connected to Elastic Search")
pprint(client_info.body)


Connected to Elastic Search
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'dwZJ7aR-Rk-TNTwNsmuV-w',
 'name': '8c40224e1107',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-08-05T10:05:34.233336849Z',
             'build_flavor': 'default',
             'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.11.1',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.15.0'}}


# Create index

1. Simplest Way

In this method, the mappings which define the structure of documents within an index are infered automatically

In [7]:
es.indices.delete(index='my_index', ignore_unavailable=True)
es.indices.create(index="my_index")

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

# 2. Specify the number of replicas and shards
## Shards: Elasticsearch divides the data in an index into multiple shards. Each shard is a self-contained index that Elasticsearch can distribute across multiple nodes in a cluster. 

## Replicas: For fault tolerance and high availability, an index can have replica shards, which are copies of the primary shards.

In [9]:
es.indices.delete(index='my_index', ignore_unavailable=True)

es.indices.create(index="my_index",settings={"index":{
    "number_of_shards":3, 
    "number_of_replicas":2
}
    }
)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

# Insert one document

## Create a dummy index just to test inserting one document

In [10]:
es.indices.delete(index='my_index',ignore_unavailable=True)
es.indices.create(index='my_index')

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

In [12]:
document = {
    'title':'title',
    'text':'text',
    'created_on':'2024-09-22',
}
response = es.index(index='my_index',body=document)
response

ObjectApiResponse({'_index': 'my_index', '_id': 'kX83KJwBLLh-Z5ugFLKa', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [13]:
print(response["result"])

created


In [19]:
print(response["_shards"])

{'total': 2, 'successful': 1, 'failed': 0}


In [21]:
print(response)

{'_index': 'my_index', '_id': 'kX83KJwBLLh-Z5ugFLKa', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}


# Insert multiple documents

In [23]:
import json

dummy_data = json.load(open("./dummy_data.json"))
dummy_data

[{'title': 'Sample Title 1',
  'text': 'This is the first sample document text.',
  'created_on': '2024-09-22'},
 {'title': 'Sample Title 2',
  'text': 'Here is another example of a document.',
  'created_on': '2024-09-24'},
 {'title': 'Sample Title 3',
  'text': 'The content of the third document goes here.',
  'created_on': '2024-09-24'}]

In [25]:
def insert_document(document):
    response = es.index(index='my_index',body=document)
    return response

def print_def(response):
    print(f"Document Id is {response['_id']} is {response['result']} and is split into {response['_shards']['total']} shards")

for document in dummy_data:
    response = insert_document(document)
    print_def(response)


Document Id is kn9OKJwBLLh-Z5ugQbLu is created and is split into 2 shards
Document Id is k39OKJwBLLh-Z5ugQrIl is created and is split into 2 shards
Document Id is lH9OKJwBLLh-Z5ugQrI2 is created and is split into 2 shards
