In [38]:
import json
import os
import re
import time

import elasticsearch
import pandas as pd
import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from requests.auth import HTTPBasicAuth

print(elasticsearch.__version__)

(8, 6, 2)


In [39]:
# !!! CUSTOMIZE THIS SECTION WITH YOUR CREDENTIALS !!!
# Credentials are taken from the environment so that no secret is stored in
# (or leaked through) the notebook file. Export ES_USER / ES_PASSWORD before
# starting the kernel, e.g. `export ES_PASSWORD=...`.

USER = os.environ.get('ES_USER', 'elastic')
PWD = os.environ.get('ES_PASSWORD', '')  # empty when unset: requests will be rejected with 401
index_name = 'books'
ES_ENDPOINT = 'https://localhost:9200'

# CA certificate used to verify the TLS certificate of the Elasticsearch node
path_to_ca_certificates = './certs/ca/ca.crt'

### Read data

In [79]:
# load the book catalogue and use the `id` column as the row index
books_path = 'books.csv'
df = pd.read_csv(books_path).set_index('id')
df.head()

Unnamed: 0_level_0,author_name,title,country,language,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14602826,"Yearsley, Ann",Poems on several occasions,England,English,1786
14602830,"A, T.",A Satyr against Vertue. (A poem: supposed to b...,England,English,1679
14602831,"A, T.","The Aeronaut, a poem; founded almost entirely,...",Ireland,English,1816
14602832,"Albert, Prince Consort, consort of Victoria, Q...","The Prince Albert, a poem",Ireland,English,1868
14602833,"Anslow, Robert","The Defeat of the Spanish Armada, A.D. 1588. A...",England,English,1888


In [41]:
# turn every row into a plain dict (one JSON-like document per book);
# the dataframe index supplies the document ids
doc_ids = df.index
docs = df.to_dict(orient='records')
for first_item in (doc_ids[0], docs[0]):
    print(first_item)

14602826
{'author_name': 'Yearsley, Ann', 'title': 'Poems on several occasions', 'country': 'England', 'language': 'English', 'year': 1786}



### Elasticsearch Python wrapper

In [42]:
def create_index(es, index_name, settings=None):
    """
    (Re)create an Elasticsearch index: an existing index with the same name
    is deleted first, then a fresh one is created.
    @param es: an Elasticsearch object
    @param index_name: the name of the new index to be created
    @param settings: the index settings
    @return whether the index was created (False if any step raised)
    """
    try:
        # Delete unconditionally and tolerate HTTP 404 (index did not exist).
        # `options(ignore_status=...)` replaces the per-request `ignore=`
        # keyword, which is deprecated in the 8.x client, and avoids the
        # separate exists() round trip.
        es.options(ignore_status=404).indices.delete(index=index_name)
        es.indices.create(index=index_name, settings=settings)
    except Exception as ex:
        # Best-effort helper: report the problem and signal failure through
        # the return value instead of propagating the exception.
        print(str(ex))
        return False
    return True

In [43]:
# Index settings: sharding/replication plus a standard analyzer that drops English stop words
_std_english_analyzer = {"type": "standard", "stopwords": "_english_"}
settings_basic = dict(
    number_of_shards=4,
    number_of_replicas=2,
    analysis={"analyzer": {"std_english": _std_english_analyzer}},
)

In [80]:
#connect to the local elasticsearch node and authenticate
es = Elasticsearch(hosts=[ES_ENDPOINT], ca_certs=path_to_ca_certificates, basic_auth=(USER, PWD))
#create an index
# The index is (re)created explicitly so that a fresh "Restart & Run All"
# starts from an empty index with `settings_basic` applied; otherwise the
# first write would auto-create it with default settings.
is_created = create_index(es, index_name, settings=settings_basic)
print(f'Index creation: {is_created}')

Index creation: True


In [81]:
# index the first 10 documents one request at a time, each under its own id
# (for bulk imports see the helpers in elasticsearch.helpers)
first_ids, first_docs = doc_ids[0:10], docs[0:10]
for doc_id, body in zip(first_ids, first_docs):
    print(es.index(index=index_name, id=doc_id, document=body))

{'_index': 'books', '_id': '14602826', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'books', '_id': '14602830', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'books', '_id': '14602831', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'books', '_id': '14602832', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
{'_index': 'books', '_id': '14602833', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
{'_index': 'books', '_id': '14602834', '_version': 1, 'result': 'created', '_shards': {'total': 3, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'book

In [46]:
# one bulk action per book: target index, document id and document body
actions = []
for doc_id, doc in zip(doc_ids, docs):
    actions.append({"_index": index_name, "_id": doc_id, "_source": doc})

# send actions in bulk (the API takes care of chunking them optimally)
bulk(es, actions)

(52695, [])

In [47]:
# fetch and display the settings of the selected index
index_settings = es.indices.get_settings(index=index_name)
index_settings

ObjectApiResponse({'books': {'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '4', 'provided_name': 'books', 'creation_date': '1676638651484', 'analysis': {'analyzer': {'std_english': {'type': 'standard', 'stopwords': '_english_'}}}, 'number_of_replicas': '2', 'uuid': 'VCQkym_sTF2vgylZ9loxtw', 'version': {'created': '8050399'}}}}})

In [48]:
# look up a single document by its id (here: the second book of the dataset)
second_id = doc_ids[1]
es.get(index=index_name, id=second_id)

ObjectApiResponse({'_index': 'books', '_id': '14602830', '_version': 2, '_seq_no': 3, '_primary_term': 1, 'found': True, '_source': {'author_name': 'A, T.', 'title': 'A Satyr against Vertue. (A poem: supposed to be spoken by a Town-Hector )', 'country': 'England', 'language': 'English', 'year': 1679}})

In [49]:
# this is how you would delete the index
# es.options(ignore_status=404).indices.delete(index=index_name)

### Elasticsearch with python cURL (Requests)

In [50]:
class Elastic:
    """
    A convenience object to send HTTP requests to Elasticsearch
    """
    def __init__(self, endpoint, username, password, path_to_ca_certificates):
        """
        @param endpoint: the URL of the Elasticsearch instance
        @param username: the Elasticsearch username 
        @param password: the Elasticsearch password
        @param path_to_ca_certificates: path of the CA bundle passed to
               requests as `verify` for TLS certificate validation
        """
        # NOTE(review): 'charset' is sent as a separate header here; it is
        # normally a parameter of Content-Type. Left unchanged on purpose.
        self.header = {'Content-Type': 'application/json', 'charset':'UTF-8'}
        self.endpoint = endpoint
        self.username = username
        self.password = password
        self.path_to_ca_certificates = path_to_ca_certificates
        # lower-case HTTP verb -> requests function implementing it
        self.methods_mapping = {'get': requests.get, 
                                'put':requests.put, 
                                'post':requests.post, 
                                'delete':requests.delete}
        
    def curl(self, method, handle, json=None):
        """
        Sends an HTTP request to the Elasticsearch instance
        @param method: can be 'get', 'put', 'post', 'delete' (case-insensitive)
        @param handle: the API handle to be appended to the Elasticsearch url
        @param json: the json payload of the HTTP request
        @return the response object returned by requests
        @raise KeyError: if method is not one of the supported verbs
        """
        http_method = self.methods_mapping[method.lower()]
        # Authenticate with the credentials given to the constructor (not the
        # notebook-level globals); a (user, password) tuple is the requests
        # shorthand for HTTP basic auth.
        r = http_method(f'{self.endpoint}/{handle}',
                        auth=(self.username, self.password),
                        headers=self.header, json=json,
                        verify=self.path_to_ca_certificates)
        return r

In [51]:
# HTTP helper bound to the local node, using the credentials configured above
e = Elastic(endpoint=ES_ENDPOINT,
            username=USER,
            password=PWD,
            path_to_ca_certificates=path_to_ca_certificates)

In [85]:
# # create another index just as an example. In the following, we will keep using the books index
# # created using the Elasticsearch API

# create_index_json={
#   "mappings" : {
#       "properties" : {
#         "author_name" : {
#           "type" : "text"
#         },
#         "country" : {
#           "type" : "keyword"
#         },
#         "language" : {
#           "type" : "keyword"
#         },
#         "title" : {
#           "type" : "text"
#         },
#         "year" : {
#           "type" : "long"
#         }
#       }
#   },
#   "settings": {
#     "number_of_shards": 4, 
#     "number_of_replicas": 2, 
#     "index.max_result_window": 20000,
#     "index" : {
#         "similarity" : {
#           "default" : {
#             "type" : "BM25", "b": 0.75, "k1": 1.2
#           }
#         }
#     },
#     "analysis": {
#       "analyzer": {
#         "std_english": {"type": "standard", "stopwords": "_english_" }
#       }
#     }
#   }
# }

# # create an index
# r = e.curl('put', index_name, json=create_index_json)
# r.json()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'books'}

In [86]:
# # get the index details and settings
# r = e.curl('get', index_name)
# r.json()

{'books': {'aliases': {},
  'mappings': {'properties': {'author_name': {'type': 'text'},
    'country': {'type': 'keyword'},
    'language': {'type': 'keyword'},
    'title': {'type': 'text'},
    'year': {'type': 'long'}}},
  'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}},
    'number_of_shards': '4',
    'provided_name': 'books',
    'similarity': {'default': {'type': 'BM25', 'b': '0.75', 'k1': '1.2'}},
    'max_result_window': '20000',
    'creation_date': '1676639168957',
    'analysis': {'analyzer': {'std_english': {'type': 'standard',
       'stopwords': '_english_'}}},
    'number_of_replicas': '2',
    'uuid': 'pu8U4EsdTC6wZPOEDfRw9Q',
    'version': {'created': '8050399'}}}}}

In [54]:
# # deactivate refresh in preparation of data indexing
# r = e.curl('put', 'books/_settings', {'index' : {'refresh_interval' : -1}})
# r.json()

{'acknowledged': True}

In [55]:
# # index documents with their individual ids (use bulk below for speedup)
# for doc_id, doc in list(zip(doc_ids, docs))[0:10]:
#     r = e.curl('post', f'books/_doc/{doc_id}', json=doc)
# r.json()

{'_index': 'books',
 '_id': '14602838',
 '_version': 3,
 'result': 'updated',
 '_shards': {'total': 3, 'successful': 1, 'failed': 0},
 '_seq_no': 13082,
 '_primary_term': 1}

In [56]:
# # bulk indexing (via official API)

# #connect to the local elasticsearch node and authenticate
# es = Elasticsearch([ES_ENDPOINT], ca_certs=path_to_ca_certificates, basic_auth=(USER, PWD))

# actions = [
#   {
#     "_index": index_name,
#     "_id": doc_id,
#     "_source": doc
#   }
#   for doc_id, doc in list(zip(doc_ids, docs))
# ]

# # send actions in bulk (the API takes care of chunking them optimally)
# bulk(es, actions)

(52695, [])

In [57]:
# # reset the refresh interval to 2 seconds
# r = e.curl('put', 'books/_settings', {'index' : {'refresh_interval' : '2s'}})
# r.json()

{'acknowledged': True}

In [58]:
# r = e.curl('get', f'books/_doc/{doc_ids[42]}')
# r.json()

{'_index': 'books',
 '_id': '14602871',
 '_version': 2,
 '_seq_no': 13092,
 '_primary_term': 1,
 'found': True,
 '_source': {'author_name': 'Smedley, Edward, Fellow of Sidney Sussex College, Cambridge',
  'title': 'The Death of Saul and Jonathan. A poem',
  'country': 'England',
  'language': 'English',
  'year': 1814}}

In [88]:
# drop the whole index through the REST endpoint (DELETE /books)
r = e.curl(method='delete', handle='books')
r.json()

{'acknowledged': True}

### Search queries [EXERCISES]

#### Empty query

In [78]:
# empty query
# r = e.curl('get', f'books/_search')
# r.json()

{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [books]',
    'resource.type': 'index_or_alias',
    'resource.id': 'books',
    'index_uuid': '_na_',
    'index': 'books'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [books]',
  'resource.type': 'index_or_alias',
  'resource.id': 'books',
  'index_uuid': '_na_',
  'index': 'books'},
 'status': 404}

#### Aggregation query

In [89]:
# SETUP
# create another index just as an example. in the following, we will keep using the book index
# created using the Elasticsearch API

# field name -> Elasticsearch field type
field_types = {
    "author_name": "text",
    "country": "keyword",
    "language": "keyword",
    "title": "text",
    "year": "long",
}

index_settings_json = {
    "number_of_shards": 4,
    "number_of_replicas": 2,
    "index.max_result_window": 20000,
    "index": {
        "similarity": {"default": {"type": "BM25", "b": 0.75, "k1": 1.2}}
    },
    "analysis": {
        "analyzer": {"std_english": {"type": "standard", "stopwords": "_english_"}}
    },
}

create_index_json = {
    "mappings": {
        "properties": {name: {"type": kind} for name, kind in field_types.items()}
    },
    "settings": index_settings_json,
}

# create an index
r = e.curl('put', index_name, json=create_index_json)
r.json()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'books'}

In [90]:
# switch automatic refresh off while the documents are being loaded
no_refresh = {'index': {'refresh_interval': -1}}
r = e.curl('put', 'books/_settings', json=no_refresh)
r.json()

{'acknowledged': True}

In [91]:
# bulk indexing (via official API)

#connect to the local elasticsearch node and authenticate
es = Elasticsearch([ES_ENDPOINT], ca_certs=path_to_ca_certificates, basic_auth=(USER, PWD))

# one bulk action per book: target index, document id and document body
actions = [
    {"_index": index_name, "_id": doc_id, "_source": doc}
    for doc_id, doc in zip(doc_ids, docs)
]

# send actions in bulk (the API takes care of chunking them optimally)
bulk_result = bulk(es, actions)

# Automatic refresh was switched off before loading. Turn it back on (2s, the
# value used elsewhere in this notebook) and refresh once so that every
# document is visible to the search cells that follow.
es.indices.put_settings(index=index_name, settings={'index': {'refresh_interval': '2s'}})
es.indices.refresh(index=index_name)

# (number of successfully indexed documents, list of errors)
bulk_result

(52695, [])

In [97]:
""" 
EXERCISE A: 
execute an aggregation query to count the number of books writte in each country
"""
aggr_data = {
    "aggregations": {
        "by_category": {
            "terms": {
                "field": "country",
                "size": 10
            }
        }
    }
}

r = e.curl('get', f'books/_search', aggr_data)
r.json()

{'took': 7,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': 1.0,
  'hits': [{'_index': 'books',
    '_id': '14602831',
    '_score': 1.0,
    '_source': {'author_name': 'A, T.',
     'title': 'The Aeronaut, a poem; founded almost entirely, upon a statement, printed in the newspapers, of a voyage from Dublin, in October, 1812',
     'country': 'Ireland',
     'language': 'English',
     'year': 1816}},
   {'_index': 'books',
    '_id': '14602832',
    '_score': 1.0,
    '_source': {'author_name': 'Albert, Prince Consort, consort of Victoria, Queen of Great Britain',
     'title': 'The Prince Albert, a poem',
     'country': 'Ireland',
     'language': 'English',
     'year': 1868}},
   {'_index': 'books',
    '_id': '14602838',
    '_score': 1.0,
    '_source': {'author_name': 'Brabant, Henry, Sir',
     'title': 'The Eve of the Revolution; in Newcastle-upon-Tyne. (The Case 

#### Full-text query

In [103]:
""" 
EXERCISE B: 
execute a full-text query for the query "love magic"
"""

b_query = {
  "query": {
    "match": {
      "title": {
        "query": "love magic"
      }
    }
  }
}

r = e.curl('get', f'books/_search', b_query)
r.json()


{'took': 7,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 456, 'relation': 'eq'},
  'max_score': 15.985547,
  'hits': [{'_index': 'books',
    '_id': '14812010',
    '_score': 15.985547,
    '_source': {'author_name': 'Grant, Forrest, Mrs',
     'title': 'The Magic of Love. A novel',
     'country': 'England',
     'language': 'English',
     'year': 1874}},
   {'_index': 'books',
    '_id': '14628808',
    '_score': 14.215401,
    '_source': {'author_name': 'Dorisi, Lisa',
     'title': 'The Enchanter; or, Love and Magic. A Musical Drama',
     'country': 'England',
     'language': 'English',
     'year': 1760}},
   {'_index': 'books',
    '_id': '14816800',
    '_score': 12.649038,
    '_source': {'author_name': 'Lansdowne, George Granville, Baron',
     'title': 'Poems upon Several Occasions. (The British Enchanters; or, No magic like love.)',
     'country': 'England',
     'language': 'English',
     'year'

#### Exact match query

In [107]:
""" 
EXERCISE C: 
execute an exact-match query for the query "magic love"
"""

c_query = {
  "query": {
    "match_phrase": {
      "title": {
        "query": "magic love"
      }
    }
  }
}

r = e.curl('get', f'books/_search', c_query)
r.json()


{'took': 1,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

#### Multi-field full-text query with field boosting

In [110]:
""" 
EXERCISE D: 
Execute a query that searches both on tile and author
Weights the importance of matches on the author field twice as much as matches on the title
Sse "shakespeare" as query term
"""

d_query =  {
    "query": {
        "multi_match": {
            "query": "shakespeare",
            "fields": ["title", "author^2"],
        }
    }
}

r = e.curl('get', f'books/_search', d_query)
r.json()


{'took': 1,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 60, 'relation': 'eq'},
  'max_score': 9.891209,
  'hits': [{'_index': 'books',
    '_id': '14805353',
    '_score': 9.891209,
    '_source': {'author_name': 'Black, William',
     'title': 'Judith Shakespeare: a romance',
     'country': 'England',
     'language': 'English',
     'year': 1884}},
   {'_index': 'books',
    '_id': '14608354',
    '_score': 9.353176,
    '_source': {'author_name': 'Pearce, William',
     'title': 'The Haunts of Shakespeare; a poem',
     'country': 'England',
     'language': 'English',
     'year': 1778}},
   {'_index': 'books',
    '_id': '14824767',
    '_score': 9.040419,
    '_source': {'author_name': 'Williams, Robert Folkestone',
     'title': 'The youth of Shakespeare',
     'country': 'United States of America',
     'language': 'English',
     'year': 1840}},
   {'_index': 'books',
    '_id': '14829324',
    '_scor

#### Combining different queries

In [134]:
"""
EXERCISE E:
execute a *single* boolean query returning books that:
- have the "queen mary" in the title and were written in England
- should NOT have been published in the range of years [1850-1913]
"""

e_query = {
    "query": {
        "bool": {
            "must": [
                {"match": {
                    "title": {
                        "query": "queen mary"
                    }
                }},
                {"match": {
                    "country": {
                        "query": "England"
                    }
                }}
            ],
            "must_not": [
                {"range": {"year": {"gte": 1850, "lte": 1913 }}}
            ]
        }
    }
}


r = e.curl('get', f'books/_search', e_query)
r.json()

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 104, 'relation': 'eq'},
  'max_score': 14.338542,
  'hits': [{'_index': 'books',
    '_id': '14828855',
    '_score': 14.338542,
    '_source': {'author_name': 'Saint John, John, Honourable',
     'title': 'Mary Queen of Scots, a tragedy',
     'country': 'England',
     'language': 'English',
     'year': 1789}},
   {'_index': 'books',
    '_id': '14866478',
    '_score': 14.199584,
    '_source': {'author_name': 'Buckingham, Leicester Silk',
     'title': 'Memoirs of Mary Stuart, Queen of Scotland',
     'country': 'England',
     'language': 'English',
     'year': 1844}},
   {'_index': 'books',
    '_id': '14816241',
    '_score': 11.958053,
    '_source': {'author_name': 'Banks, John',
     'title': 'The Albion Queens, or, The Death of Mary Queen of Scotland',
     'country': 'England',
     'language': 'English',
     'year': 1704}},
   {'_index': 'b

#### Fuzzy queries

In [136]:
"""
EXERCISE F:
execute a fuzzy query for the query "comander" with at most 50 expansions 
and considering transpositions
"""

query_f = {
    "query": {
        "fuzzy": {
            "title": {
                "value": "comander", #the query
                "fuzziness": "AUTO", #maximum edit distance allowed, AUTO
                "max_expansions": 50, #maximum number of term variations created
                "prefix_length": 0, #characters left unchanged at start of term
                "transpositions": True, #to include transpositions in edits
            }
        }
    }
}

r = e.curl('get', f'books/_search', query_f)
r.json()

{'took': 16,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 64, 'relation': 'eq'},
  'max_score': 8.783188,
  'hits': [{'_index': 'books',
    '_id': '14643500',
    '_score': 8.783188,
    '_source': {'author_name': 'Nesbit, E. (Edith)',
     'title': 'A Pomander of Verse',
     'country': 'England',
     'language': 'English',
     'year': 1895}},
   {'_index': 'books',
    '_id': '14804160',
    '_score': 7.441244,
    '_source': {'author_name': 'Armstrong, F. Claudius',
     'title': 'The Neapolitan Commander, a romance of sea & land',
     'country': 'England',
     'language': 'English',
     'year': 1863}},
   {'_index': 'books',
    '_id': '14810095',
    '_score': 6.4313183,
    '_source': {'author_name': 'Great Britain, Hydrographic Department',
     'title': 'Report on the Bore of the Tsien-Tang Kiang, by Commander Moore',
     'country': 'England',
     'language': 'English',
     'year': 1888}},
   {'

#### Get TF of terms in document

In [141]:
"""
EXERCISE G:
get the term frequencies (in the title) of document with id = 4200
consider only words with a minimum length of 4 and a minimum term frequency of 2
"""

query_g =  {
  "fields" : ["title"],
  "term_statistics" : True,
  "field_statistics" : True,
  "positions": True,
  "filter": {
    "min_word_length": 4,
    "min_term_freq" : 2
  }
}

r = e.curl('get', f'books/_termvectors/{doc_ids[4200]}', query_g)
r.json()

{'_index': 'books',
 '_id': '14767985',
 '_version': 1,
 'found': True,
 'took': 1,
 'term_vectors': {'title': {'field_statistics': {'sum_doc_freq': 148337,
    'doc_count': 13216,
    'sum_ttf': 166497},
   'terms': {'with': {'doc_freq': 1908,
     'ttf': 2076,
     'term_freq': 2,
     'tokens': [{'position': 9, 'start_offset': 52, 'end_offset': 56},
      {'position': 24, 'start_offset': 135, 'end_offset': 139}],
     'score': 5.8698487}}}}}

### Solutions

In [68]:
# SOLUTION TO EXERCISE A

# aggregation query
query = {
    # top-level size 0: return only the aggregation buckets, no search hits
    "size": 0,
    "aggregations": {
        "by_category": {
            "terms": {
                "field": "country",
                # maximum number of country buckets returned
                "size": 100
            }
        }
    }
}
r = e.curl('get', 'books/_search', query)
r.json()

{'error': {'root_cause': [{'type': 'illegal_argument_exception',
    'reason': 'Fielddata is disabled on [country] in [books]. Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default. Please use a keyword field instead. Alternatively, set fielddata=true on [country] in order to load field data by uninverting the inverted index. Note that this can use significant memory.'}],
  'type': 'search_phase_execution_exception',
  'reason': 'all shards failed',
  'phase': 'query',
  'grouped': True,
  'failed_shards': [{'shard': 0,
    'index': 'books',
    'node': 'oPASJZaRRR-rQcIBczjcmw',
    'reason': {'type': 'illegal_argument_exception',
     'reason': 'Fielddata is disabled on [country] in [books]. Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default. Please use a keyword field instead. A

In [69]:
# SOLUTION TO EXERCISE B

# full-text query: analyzed match on the title field
title_match = {"title": {"query": "love magic"}}
query = {"query": {"match": title_match}}
r = e.curl('get', 'books/_search', json=query)
r.json()

{'took': 40,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 456, 'relation': 'eq'},
  'max_score': 16.060637,
  'hits': [{'_index': 'books',
    '_id': '14812010',
    '_score': 16.060637,
    '_source': {'author_name': 'Grant, Forrest, Mrs',
     'title': 'The Magic of Love. A novel',
     'country': 'England',
     'language': 'English',
     'year': 1874}},
   {'_index': 'books',
    '_id': '14628808',
    '_score': 14.282113,
    '_source': {'author_name': 'Dorisi, Lisa',
     'title': 'The Enchanter; or, Love and Magic. A Musical Drama',
     'country': 'England',
     'language': 'English',
     'year': 1760}},
   {'_index': 'books',
    '_id': '14816800',
    '_score': 12.691303,
    '_source': {'author_name': 'Lansdowne, George Granville, Baron',
     'title': 'Poems upon Several Occasions. (The British Enchanters; or, No magic like love.)',
     'country': 'England',
     'language': 'English',
     'year

In [108]:
# SOLUTION TO EXERCISE C

# exact match query: the terms must appear adjacent and in this order
title_phrase = {"title": {"query": "magic love"}}
query = {"query": {"match_phrase": title_phrase}}
r = e.curl('get', 'books/_search', json=query)
r.json()

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [71]:
# SOLUTION TO EXERCISE D

# Full text query, multiple fields with boosting
# (`^2` doubles the weight of matches on author_name)
multi_field = {
    "query": "shakespeare",
    "fields": ["title", "author_name^2"],
    "type": "phrase",
}
query = {"query": {"multi_match": multi_field}}
r = e.curl('get', 'books/_search', json=query)
r.json()

{'took': 21,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 95, 'relation': 'eq'},
  'max_score': 16.75016,
  'hits': [{'_index': 'books',
    '_id': '14628923',
    '_score': 16.75016,
    '_source': {'author_name': 'Shakespeare, William',
     'title': 'A Midsummer Nights Dream : With alterations and additions and several new songs  . As it is performed at the Theatre-Royal in Drury-Lane',
     'country': 'England',
     'language': 'English',
     'year': 1763}},
   {'_index': 'books',
    '_id': '14824041',
    '_score': 16.75016,
    '_ignored': ['title.keyword'],
    '_source': {'author_name': 'Shakespeare, William',
     'title': 'The plays of William Shakspeare. In ten volumes. With the corrections and illustrations of various commentators; to which are added notes by S. Johnson and G. Steevens. (An attempt to ascertain the order in which the plays attributed to Shakespeare were written, by E. Malone.) The

In [127]:
# SOLUTION TO EXERCISE E

# A bool query combines optional clauses, each holding one or more queries:
# MUST: A document must match all of the queries
# MUST_NOT: A document must not match any of the queries
# SHOULD: A document does not have to match the queries, but it is considered more relevant if it does
# FILTER: Filters with yes/no categories
required = [
    {"match": {"title": "queen mary"}},
    {"match": {"country": "England"}},
]
forbidden = [
    {"range": {"year": {"gte": 1850, "lte": 1913}}},
]
query = {
    "size": 100,
    "query": {"bool": {"must": required, "must_not": forbidden}},
}
r = e.curl('get', 'books/_search', json=query)
r.json()

{'took': 12,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 104, 'relation': 'eq'},
  'max_score': 14.338542,
  'hits': [{'_index': 'books',
    '_id': '14828855',
    '_score': 14.338542,
    '_source': {'author_name': 'Saint John, John, Honourable',
     'title': 'Mary Queen of Scots, a tragedy',
     'country': 'England',
     'language': 'English',
     'year': 1789}},
   {'_index': 'books',
    '_id': '14866478',
    '_score': 14.199584,
    '_source': {'author_name': 'Buckingham, Leicester Silk',
     'title': 'Memoirs of Mary Stuart, Queen of Scotland',
     'country': 'England',
     'language': 'English',
     'year': 1844}},
   {'_index': 'books',
    '_id': '14816241',
    '_score': 11.958053,
    '_source': {'author_name': 'Banks, John',
     'title': 'The Albion Queens, or, The Death of Mary Queen of Scotland',
     'country': 'England',
     'language': 'English',
     'year': 1704}},
   {'_index': '

In [137]:
# SOLUTION TO EXERCISE F

# fuzzy term query on the title with the rewrite method set to constant_score
fuzzy_options = {
    "value": "comander",
    "fuzziness": "AUTO",
    "max_expansions": 50,
    "prefix_length": 0,
    "transpositions": True,
    "rewrite": "constant_score",
}
query = {"query": {"fuzzy": {"title": fuzzy_options}}}
r = e.curl('get', 'books/_search', json=query)
r.json()

{'took': 8,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 64, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'books',
    '_id': '14603267',
    '_score': 1.0,
    '_source': {'author_name': 'Pindar, Peter, Jun, pseudonym [i.e. John Agg]',
     'title': 'Three R----l Bloods; or, a Lame r-t, a darling commander, and a love-sick admiral. A poem. By Peter Pindar, Jun. (Seventh edition.)',
     'country': 'England',
     'language': 'English',
     'year': 1812}},
   {'_index': 'books',
    '_id': '14809925',
    '_score': 1.0,
    '_source': {'author_name': 'Elliott, Robert, Commander, R.N.',
     'title': 'Views in India, China, and on the Shores of the Red Sea: drawn by Prout, Stanfield, Cattermole, Purser, Cox, Austen, &c. from original sketches by Commander R. Elliott ... With descriptions by Emma Roberts',
     'country': 'England',
     'language': 'English',
     'year': 1835}},
   {'_index': 'b

In [140]:
# SOLUTION TO EXERCISE G

# https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-termvectors.html
# keep only terms of at least 4 characters occurring at least twice in the document
term_filter = {"min_word_length": 4, "min_term_freq": 2}
query = {
    "fields": ["title"],
    "term_statistics": True,
    "field_statistics": True,
    "positions": True,
    "filter": term_filter,
}
#r = e.curl('get', f'books/_doc/{doc_ids[4200]}')
target_id = doc_ids[4200]
r = e.curl('get', f'books/_termvectors/{target_id}', json=query)
r.json()

{'_index': 'books',
 '_id': '14767985',
 '_version': 1,
 'found': True,
 'took': 4,
 'term_vectors': {'title': {'field_statistics': {'sum_doc_freq': 148337,
    'doc_count': 13216,
    'sum_ttf': 166497},
   'terms': {'with': {'doc_freq': 1908,
     'ttf': 2076,
     'term_freq': 2,
     'tokens': [{'position': 9, 'start_offset': 52, 'end_offset': 56},
      {'position': 24, 'start_offset': 135, 'end_offset': 139}],
     'score': 5.8698487}}}}}