In [6]:
from datetime import datetime
from elasticsearch import Elasticsearch, helpers
import requests
import json
import time

### ES client

In [7]:
from elasticsearch import Elasticsearch
# Build the client used by every cell below. The node address and default
# headers are spelled out explicitly instead of using the no-argument
# Elasticsearch() form.
es = Elasticsearch(hosts= [{
  'host': 'localhost',
  'port': 9200,
  # Sent with the requests to this host: JSON in, JSON out.
  'headers': {
    'Accept': 'application/json',
    'Content-Type': 'application/json'
  }
}])

### Create an index

In [8]:
# Create the demo index. ignore=400 tells the client not to raise on an
# HTTP 400 reply (presumably "index already exists" on a re-run — confirm).
es.indices.create(index='test-indexw', 
                  ignore=400)
# Hit the node's root endpoint directly and print the raw response body.
res = requests.get('http://localhost:9200')
print(res.content)

# Sample document to store in the index.
doc = {
    'author': 'kimchy',
    'text': 'Elasticsearch: cool. bonsai cool.',
    'timestamp': datetime.now(),
}

# Not the last expression of the cell, so this count is computed but not displayed.
es.count(index='test-indexw')

# Store the document under id 1; `res` is rebound to the index response.
res = es.index(index="test-indexw", doc_type='tweet', id=1, body=doc)
print(res)

b'{\n  "name" : "GEdcTh3",\n  "cluster_name" : "elasticsearch",\n  "cluster_uuid" : "8Du4O1XkQ6qY2mLer7ohgA",\n  "version" : {\n    "number" : "6.6.1",\n    "build_flavor" : "default",\n    "build_type" : "deb",\n    "build_hash" : "1fd8f69",\n    "build_date" : "2019-02-13T17:10:04.160291Z",\n    "build_snapshot" : false,\n    "lucene_version" : "7.6.0",\n    "minimum_wire_compatibility_version" : "5.6.0",\n    "minimum_index_compatibility_version" : "5.0.0"\n  },\n  "tagline" : "You Know, for Search"\n}\n'
{'_primary_term': 1, '_index': 'test-indexw', '_type': 'tweet', '_id': '1', '_version': 1, 'result': 'created', '_seq_no': 0, '_shards': {'successful': 1, 'failed': 0, 'total': 2}}


### Refresh and count index elements

In [9]:
# Refresh the index, then display its document count
# (the output below reports the single document indexed above).
es.indices.refresh(index="test-indexw")
es.count(index='test-indexw')

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'count': 1}

### Search elements with query 

In [10]:
# Run a match_all query against the demo index; `res` is rebound to the search response.
res = es.search(index="test-indexw", body={"query": {"match_all": {}}})
print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    # The %(name)s placeholders are filled from the hit's _source dict.
    print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])

Got 1 Hits:
2019-02-26T10:43:26.929426 kimchy: Elasticsearch: cool. bonsai cool.


### Delete index

In [11]:
# Remove the demo index; HTTP 400 and 404 replies are ignored rather than raised.
es.indices.delete(index='test-indexw', ignore=[400, 404])

{'acknowledged': True}

# Case: Index movies

In [12]:
def flatten(l):
    """Flatten one level of nesting into a single list.

    Args:
        l: Iterable whose elements are themselves iterable (e.g. a list of lists).

    Returns:
        A new list with every item of every sub-iterable, in encounter order.
    """
    # The comprehension used to be evaluated and discarded, so the function
    # always returned None; it must be returned.
    return [item for sublist in l for item in sublist]

def simplerExplain(explainJson, depth=0):
    """Render an explain tree as indented text, one node per line.

    Args:
        explainJson: Dict with 'value' and 'description' keys and, optionally,
            a 'details' list of child dicts of the same shape.
        depth: Nesting level of this node; each level indents by two spaces.

    Returns:
        A string with one "<value>, <description>" line for this node followed
        by the lines of all of its descendants, depth-first.
    """
    indent = " " * (depth * 2)
    pieces = [indent + "%s, %s\n" % (explainJson['value'], explainJson['description'])]
    if 'details' in explainJson:
        # Children are rendered one level deeper than their parent.
        pieces.extend(
            simplerExplain(child, depth=depth + 1)
            for child in explainJson['details']
        )
    return "".join(pieces)

def extract():
    """Load the movie data from 'tmdb.json' in the current working directory.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
        ValueError: If the file does not contain valid JSON.
    """
    # open() raises on failure instead of returning a falsy value, so the old
    # `if f:` check was always true and the `return {}` fallback was unreachable.
    # The context manager also closes the handle, which was leaked before.
    with open('tmdb.json') as f:
        return json.load(f)


def timer(start,end):
    """Print the time elapsed between two timestamps as HH:MM:SS.ss.

    Args:
        start: Start timestamp, in seconds.
        end: End timestamp, in seconds.
    """
    elapsed = end - start
    whole_hours, leftover = divmod(elapsed, 3600)
    whole_minutes, secs = divmod(leftover, 60)
    clock = "{:0>2}:{:0>2}:{:05.2f}".format(int(whole_hours), int(whole_minutes), secs)
    print(clock)
    
def reindex_seq(analysisSettings={}, mappingSettings={}, movieDict={}):
    '''Index the movies one request at a time into the "test-movie" index.

    Args:
        analysisSettings: Accepted but not used by this function.
        mappingSettings: Accepted but not used by this function.
        movieDict: Mapping of document id -> document body; every entry is
            sent with its own es.index call.
    '''
    for movie_id, movie in movieDict.items():
        es.index(index="test-movie", doc_type='movie',
                 id=movie_id, body=movie)

    # Emitted once, after every document has been sent.
    print ("indexing...")
    
def reindex_chunk(index, chunk_size, Settings={},
                       movieDict={}):
    """Recreate `index` and bulk-load `movieDict` into it in chunks.

    Args:
        index: Name of the index to drop, recreate and fill.
        chunk_size: Number of documents sent per bulk request.
        Settings: Body passed to the index-creation call.
        movieDict: Mapping of document id -> movie. Each movie is stored
            under a 'doc' key, so it is read back as hit['_source']['doc'].
    """
    print("Delete index")
    # 404 has to be ignored as well as 400: on a fresh cluster the index does
    # not exist yet and the delete would otherwise raise before anything is
    # created. This matches the standalone delete earlier in the notebook.
    es.indices.delete(index=index, ignore=[400, 404])
    es.indices.create(index=index, ignore=400, body=Settings)
    # Lazy generator: actions are produced only as streaming_bulk consumes them.
    bulkMovies = (
        {'_index': index, 'doc': movie, '_type': 'movie', '_id': movie_id}
        for movie_id, movie in movieDict.items()
    )
    print("Indexing")
    for ok, response in helpers.streaming_bulk(es, actions=bulkMovies, chunk_size=chunk_size, max_retries=10):
        if not ok:
            # Report any item the bulk helper flags as failed.
            print (response)


## Settings for index

In [None]:
# Index-time settings: the analyzer turns field text into tokens.
# This cell defines both the analyzer and the field mappings.

analyze_settings = {
    # Standard analyzer with the built-in English stop-word list.
    "analyzer": {
        "movies_analyzer": {
            "type": "standard",
            "stopwords": "_english_",
        },
    },
    # NOTE(review): "mappings" sits inside the dict that is passed below as
    # settings.analysis — confirm this nesting is what Elasticsearch expects.
    # NOTE(review): the fields are named title/overview here, while the search
    # cells query doc.title/doc.overview — confirm these mappings take effect.
    "mappings": {
        "movie": {
            "properties": {
                "title": {"type": "text", "analyzer": "english"},
                "overview": {"type": "text", "analyzer": "english"},
            },
        },
    },
}

# Body handed to index creation: everything above goes under settings.analysis.
settings_index = {
    "settings": {
        "analysis": analyze_settings,
    },
}

json.dumps(settings_index)

In [14]:
# Variant of the settings above that adds a stemmer filter for English words.
analyze_settings = {
    "analyzer": {
        "movies_analyzer": {
            "type": "standard",
            "stopwords": "_english_",
            # NOTE(review): a "filter" list is given on a "standard"-type
            # analyzer — confirm Elasticsearch applies it for this type.
            "filter": ["lowercase", "my_stemmer"],
        },
    },
    # Custom token filter referenced by the analyzer above.
    "filter": {
        "my_stemmer": {
            "type": "stemmer",
            "name": "english",
        },
    },
    # NOTE(review): "mappings" sits inside the dict that is passed below as
    # settings.analysis — confirm this nesting is what Elasticsearch expects.
    "mappings": {
        "movie": {
            "properties": {
                "title": {"type": "text", "analyzer": "english"},
                "overview": {"type": "text", "analyzer": "english"},
            },
        },
    },
}

# Body handed to index creation: everything above goes under settings.analysis.
settings_index = {
    "settings": {
        "analysis": analyze_settings,
    },
}

## Indexing

In [16]:
movieDict = extract()

t1 = time.time()
reindex_chunk(index='test-movie', chunk_size=500, movieDict=movieDict, Settings=settings_index)
t2 = time.time()
timer(t1, t2)

es.indices.refresh(index="test-movie")
print(es.count(index='test-movie'))

Delete index
Indexing
00:00:01.57
{'_shards': {'successful': 5, 'failed': 0, 'total': 5, 'skipped': 0}, 'count': 3051}


## Search

In [18]:
def search(query, index, n_hits=10, explain=False):
    """Run `query` against `index` and print one summary line per hit.

    Args:
        query: Request body passed to es.search.
        index: Name of the index to search.
        n_hits: Value passed as the search `size`.
        explain: Passed through to es.search as `explain`.

    Returns:
        The full response returned by es.search.
    """
    response = es.search(index=index, doc_type='movie', body=query,
                         size=n_hits, explain=explain)
    for rank, hit in enumerate(response['hits']['hits']):
        score = hit['_score']
        doc_id = hit['_id']
        # Movies are stored under a 'doc' key inside _source.
        title = hit['_source']['doc']['title']
        print(rank, score, doc_id, title)
    return response

fieldsSearch = ['doc.title', 'doc.overview']# Candidate field list ('doc.genres' left out); only referenced from the commented-out '_source' value below
usersSearch = 'basketball with cartoon aliens'

search_object = {'_source':[],# empty list is sent; fieldsSearch was the alternative 
                 'query': {
            'multi_match': { 
                'query': usersSearch,
                'fields': ['doc.title^1', 'doc.overview'] # '^1' is the boost on doc.title; the original note said that dropping a boost of 10 moves Space Jam to second place
            },
        }}

# Prints the ranked hits and keeps the raw response in `hits`.
hits = search(query=search_object, index = 'test-movie', explain=False)

0 9.522362 11260 Meet Dave
1 8.215713 2300 Space Jam
2 7.863373 38365 Grown Ups
3 7.842091 13260 Semi-Pro
4 7.292982 20856 Aliens in the Attic
5 7.14475 679 Aliens
6 7.0154967 7459 Speed Racer
7 6.710354 8078 Alien: Resurrection
8 6.5932465 80035 The Watch
9 6.3094597 888 The Flintstones


## Inspecting hits

In [26]:
# returns first 10 hits in a dictionary, here we use filter_path to filter 
hits = es.search(index='test-movie', filter_path=['hits.hits._id', 'hits.hits._type'])

print(hits['hits']['hits']) # list of dictionarys)
print('One hit: ',hits['hits']['hits'][0])

[{'_id': '10882', '_type': 'movie'}, {'_id': '10530', '_type': 'movie'}, {'_id': '16781', '_type': 'movie'}, {'_id': '11675', '_type': 'movie'}, {'_id': '9296', '_type': 'movie'}, {'_id': '13596', '_type': 'movie'}, {'_id': '144336', '_type': 'movie'}, {'_id': '582', '_type': 'movie'}, {'_id': '8012', '_type': 'movie'}, {'_id': '105', '_type': 'movie'}]
One hit:  {'_id': '10882', '_type': 'movie'}


In [27]:
# Run the search in this cell instead of relying on a `res` left over from
# another cell: read top to bottom, `res` was last bound to a search over
# 'test-indexw', whose _source has no 'doc' key.
res = es.search(index='test-movie', filter_path=['hits.hits._*'])
res['hits']['hits'][0]['_source'].keys()

dict_keys(['doc'])

In [28]:
# returns everything, matches with everything
res = es.search(index='test-movie', filter_path=['hits.hits._*'])
res['hits']['hits'][0]['_source']['doc'].keys()#['overview']# ##

dict_keys(['video', 'homepage', 'original_language', 'id', 'original_title', 'production_companies', 'vote_average', 'popularity', 'belongs_to_collection', 'poster_path', 'release_date', 'adult', 'budget', 'backdrop_path', 'status', 'spoken_languages', 'tagline', 'title', 'vote_count', 'overview', 'revenue', 'runtime', 'genres', 'directors', 'production_countries', 'cast', 'imdb_id'])

In [29]:
# Print the summary line of the first hit only; the break stops after one iteration.
for idx, hit in enumerate(res['hits']['hits']):
    print(idx, hit['_score'], hit['_id'], hit['_source']['doc']['title'])
    break

# Selected fields of that same first hit.
print("Title: ",res['hits']['hits'][0]['_source']['doc']['title'])
print("Genres: ",res['hits']['hits'][0]['_source']['doc']['genres'])
print("Overview: ",res['hits']['hits'][0]['_source']['doc']['overview'])

0 1.0 10882 Sleeping Beauty
Title:  Sleeping Beauty
Genres:  [{'id': 16, 'name': 'Animation'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10749, 'name': 'Romance'}, {'id': 10751, 'name': 'Family'}]
Overview:  A beautiful princess born in a faraway kingdom is destined by a terrible curse to prick her finger on the spindle of a spinning wheel and fall into a deep sleep that can only be awakened by true love's first kiss. Determined to protect her, her parents ask three fairies to raise her in hiding. But the evil Maleficent is just as determined to seal the princess's fate.


## Using CURL

# Analyze results

Without stopwords all words are taken into account.

Using English stop words, 'with' is no longer a token.

## Explain search

This is useful because we can see the score and also how the stemmer works on the tokens. 

In [38]:
# Display the query body that the explain search sends.
search_object

{'_source': [],
 'query': {'multi_match': {'fields': ['doc.title^1', 'doc.overview'],
   'query': 'basketball with cartoon aliens'}}}

In [37]:
# Same query, limited to one hit, with explain=True so the hit carries an
# '_explanation' tree (see the output below).
hit = es.search(index='test-movie', doc_type='movie', body= search_object, size=1, explain=True)
hit['hits']['hits']

[{'_explanation': {'description': 'max of:',
   'details': [{'description': 'sum of:',
     'details': [{'description': 'weight(doc.overview:with in 179) [PerFieldSimilarity], result of:',
       'details': [{'description': 'score(doc=179,freq=1.0 = termFreq=1.0\n), product of:',
         'details': [{'description': 'idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:',
           'details': [{'description': 'docFreq',
             'details': [],
             'value': 263.0},
            {'description': 'docCount', 'details': [], 'value': 637.0}],
           'value': 0.8842849},
          {'description': 'tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:',
           'details': [{'description': 'termFreq=1.0',
             'details': [],
             'value': 1.0},
            {'description': 'parameter k1', 'details': [], 'value': 1.2},
            {'description': 'parameter b', 'details': [], 'value': 0.75}