In [41]:
from datetime import datetime
from elasticsearch import Elasticsearch, helpers
import requests
import json
import time

sudo -i service elasticsearch start
sudo -i service elasticsearch stop

# Small tutorial using elastic

### ES client

In [42]:

#es = Elasticsearch()
es = Elasticsearch(hosts= [{
  'host': 'localhost',
  'port': 9200,
  'headers': {
    'Accept': 'application/json',
    'Content-Type': 'application/json'
  }
}])

### Create an index

In [43]:
es.indices.create(index='test-indexw', 
                  ignore=400)
res = requests.get('http://localhost:9200')
print(res.content)

doc = {
    'author': 'kimchy',
    'text': 'Elasticsearch: cool. bonsai cool.',
    'timestamp': datetime.now(),
}

print("Count: ", es.count(index='test-indexw'))

res = es.index(index="test-indexw", doc_type='tweet', id=1, body=doc)
print(res)

b'{\n  "name" : "GEdcTh3",\n  "cluster_name" : "elasticsearch",\n  "cluster_uuid" : "8Du4O1XkQ6qY2mLer7ohgA",\n  "version" : {\n    "number" : "6.6.1",\n    "build_flavor" : "default",\n    "build_type" : "deb",\n    "build_hash" : "1fd8f69",\n    "build_date" : "2019-02-13T17:10:04.160291Z",\n    "build_snapshot" : false,\n    "lucene_version" : "7.6.0",\n    "minimum_wire_compatibility_version" : "5.6.0",\n    "minimum_index_compatibility_version" : "5.0.0"\n  },\n  "tagline" : "You Know, for Search"\n}\n'
Count:  {'_shards': {'skipped': 0, 'failed': 0, 'total': 5, 'successful': 5}, 'count': 0}
{'_shards': {'failed': 0, 'total': 2, 'successful': 1}, '_type': 'tweet', 'result': 'created', '_id': '1', '_seq_no': 0, '_index': 'test-indexw', '_version': 1, '_primary_term': 1}


### Refresh and count index elements

In [44]:
es.indices.refresh(index="test-indexw")
es.count(index='test-indexw')

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'count': 1}

### Search elements with query 

In [45]:
res = es.search(index="test-indexw", body={"query": {"match_all": {}}})
print("Got %d Hits:" % res['hits']['total'])
for hit in res['hits']['hits']:
    print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])

Got 1 Hits:
2019-03-29T20:46:09.293233 kimchy: Elasticsearch: cool. bonsai cool.


### Delete index

In [46]:
es.indices.delete(index='test-indexw', ignore=[400, 404])

{'acknowledged': True}

# Case: Index movies

In [47]:
def flatten(l):
    [item for sublist in l for item in sublist]


def extract():
    f = open('tmdb.json')
    if f:
         return json.loads(f.read());        
    return {}


def timer(start,end):
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
    
def reindex_seq(analysisSettings={}, mappingSettings={}, movieDict={}):
    '''Sequencialy index new values'''
    bulkMovies =  ({'_index':'test-movie','doc':movieDict[i], '_type':'movie', '_id':i} for i in movieDict.keys() )
    #helpers.bulk(es,bulkMovies)
    
    for i in bulkMovies:
        es.index(index="test-movie", doc_type='movie', 
                 id=i['_id'], body=i['doc'])
        
    print ("indexing...")
    
def reindex_chunk(index, chunk_size, Settings={},
                       movieDict={}):
    """Batch insert new values on index for faster creation"""
    print("Delete index")
    es.indices.delete(index=index, ignore=400)
    es.indices.create(index=index, ignore=400, body=Settings)
    bulkMovies =  ({'_index':index,'doc':movieDict[i], '_type':'movie', '_id':i} for i in movieDict.keys() )
    print("Indexing")
    for ok, response in helpers.streaming_bulk(es, actions = bulkMovies, chunk_size=chunk_size, max_retries=10):
        if not ok:
            # failure inserting
            print (response)


## Settings for index

In [48]:
#index time settings analyzer (converts to tokens)
# Analyzers and mappers !

analyze_settings = {
     "analyzer": {
       "movies_analyzer": {
         "type": "standard",
         "stopwords": "_english_",
            
       }
     }
   ,

  "mappings": {
    "movie": { 
      "properties": {
        "title": {"type": "text", "analyzer": "english"},
        "overview": { "type": "text", "analyzer": "english"}
      }
    }
  },
    
}

settings_index = { #A
        "settings": {
                "analysis" : analyze_settings, #C
            }}

json.dumps(settings_index)

'{"settings": {"analysis": {"analyzer": {"movies_analyzer": {"stopwords": "_english_", "type": "standard"}}, "mappings": {"movie": {"properties": {"title": {"analyzer": "english", "type": "text"}, "overview": {"analyzer": "english", "type": "text"}}}}}}}'

In [49]:
# this setting have a filter with tstemmer for english words
analyze_settings = {
     "analyzer": {
       "movies_analyzer": {
         "type": "standard",
         "stopwords": "_english_",
         "filter" : ["lowercase", "my_stemmer"]
       }
     }
    ,
    "filter" : {
                "my_stemmer" : {
                    "type" : "stemmer",
                    "name" : "english"
                }
            },

  "mappings": {
    "movie": { 
      "properties": {
        "title": {"type": "text", "analyzer": "english"},
        "overview": { "type": "text", "analyzer": "english"}
      }
    }
  },
    
}

settings_index = { #A
        "settings": {
                "analysis" : analyze_settings, #C
            }}

## Indexing

In [50]:
movieDict = extract()

t1 = time.time()
reindex_chunk(index='test-movie', chunk_size=500, movieDict=movieDict, Settings=settings_index)
t2 = time.time()
timer(t1, t2)

es.indices.refresh(index="test-movie")
print(es.count(index='test-movie'))

Delete index
Indexing
00:00:03.38
{'_shards': {'skipped': 0, 'failed': 0, 'total': 5, 'successful': 5}, 'count': 3051}


In [9]:
movieDict

{'14160': {'adult': False,
  'backdrop_path': '/fI3ucpgaVKOUcQ82vhgWmWuLlg2.jpg',
  'belongs_to_collection': None,
  'budget': 175000000,
  'cast': [{'cast_id': 4,
    'character': 'Carl Fredricksen (voice)',
    'credit_id': '52fe45d19251416c75063875',
    'id': 68812,
    'name': 'Ed Asner',
    'order': 0,
    'profile_path': '/1EysZS86vozSb9pwD7HVGqInfDQ.jpg'},
   {'cast_id': 5,
    'character': 'Charles Muntz (voice)',
    'credit_id': '52fe45d19251416c75063879',
    'id': 290,
    'name': 'Christopher Plummer',
    'order': 1,
    'profile_path': '/fauMGxa6dc86nHNenQ8X6DlE6YV.jpg'},
   {'cast_id': 6,
    'character': 'Russell (voice)',
    'credit_id': '52fe45d19251416c7506387d',
    'id': 80676,
    'name': 'Jordan Nagai',
    'order': 2,
    'profile_path': '/cE0ylwj9U3vii34XQGFE3zk8n80.jpg'},
   {'cast_id': 7,
    'character': 'Dug / Alpha (voice)',
    'credit_id': '52fe45d19251416c75063881',
    'id': 10,
    'name': 'Bob Peterson',
    'order': 3,
    'profile_path': '/13YN

In [38]:
for i in movieDict:
    print(i)
    break

24021


## Search

In [57]:
def search(query, index, n_hits=10, explain=False):
    searchHits = es.search(index=index, doc_type='movie', body= query, size=n_hits, explain=explain)
    for idx, hit in enumerate(searchHits['hits']['hits']):
        print(idx, hit['_score'], hit['_id'], hit['_source']['doc']['title'])
    return searchHits

fieldsSearch = ['doc.title', 'doc.overview']# Limit search on this fields  'doc.genres']
usersSearch = 'basketball with cartoon aliens'

search_object = {'_source':[],#fieldsSearch, 
                 'query': {
            'multi_match': { 
                'query': usersSearch,
                'fields': ['doc.title^1', 'doc.overview'] #if we take out 10 it will return space jam as second
            },
        }}

hits = search(query=search_object, index = 'test-movie', explain=False)

0 9.522362 11260 Meet Dave
1 8.215713 2300 Space Jam
2 7.863373 38365 Grown Ups
3 7.842091 13260 Semi-Pro
4 7.292982 20856 Aliens in the Attic
5 7.14475 679 Aliens
6 7.0154967 7459 Speed Racer
7 6.710354 8078 Alien: Resurrection
8 6.5932465 80035 The Watch
9 6.3094597 888 The Flintstones


In [58]:
hits = search(query=search_object, index = 'test-movie', explain=True)
hits

0 9.522362 11260 Meet Dave
1 8.215713 2300 Space Jam
2 7.863373 38365 Grown Ups
3 7.842091 13260 Semi-Pro
4 7.292982 20856 Aliens in the Attic
5 7.14475 679 Aliens
6 7.0154967 7459 Speed Racer
7 6.710354 8078 Alien: Resurrection
8 6.5932465 80035 The Watch
9 6.3094597 888 The Flintstones


{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_explanation': {'description': 'max of:',
     'details': [{'description': 'sum of:',
       'details': [{'description': 'weight(doc.overview:with in 90) [PerFieldSimilarity], result of:',
         'details': [{'description': 'score(doc=90,freq=1.0 = termFreq=1.0\n), product of:',
           'details': [{'description': 'idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:',
             'details': [{'description': 'docFreq',
               'details': [],
               'value': 263.0},
              {'description': 'docCount', 'details': [], 'value': 637.0}],
             'value': 0.8842849},
            {'description': 'tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:',
             'details': [{'description': 'termFreq=1.0',
               'details': [],
               'value': 1.0},
              {'description': 'par

## Inspecting hits

In [59]:
# returns first 10 hits in a dictionary, here we use filter_path to filter 
hits = es.search(index='test-movie', filter_path=['hits.hits._id', 'hits.hits._type'])

print(hits['hits']['hits']) # list of dictionarys)
print('One hit: ',hits['hits']['hits'][0])

[{'_type': 'movie', '_id': '79'}, {'_type': 'movie', '_id': '10303'}, {'_type': 'movie', '_id': '73'}, {'_type': 'movie', '_id': '12403'}, {'_type': 'movie', '_id': '338'}, {'_type': 'movie', '_id': '38757'}, {'_type': 'movie', '_id': '51876'}, {'_type': 'movie', '_id': '9421'}, {'_type': 'movie', '_id': '10312'}, {'_type': 'movie', '_id': '6278'}]
One hit:  {'_type': 'movie', '_id': '79'}


In [60]:
res['hits']['hits'][0]['_source'].keys()

dict_keys(['author', 'text', 'timestamp'])

In [61]:
# returns everything, matches with everything
res = es.search(index='test-movie', filter_path=['hits.hits._*'])
res['hits']['hits'][0]['_source']['doc'].keys()#['overview']# ##

dict_keys(['backdrop_path', 'homepage', 'revenue', 'overview', 'production_companies', 'budget', 'adult', 'imdb_id', 'spoken_languages', 'production_countries', 'release_date', 'popularity', 'title', 'tagline', 'original_title', 'poster_path', 'genres', 'video', 'status', 'id', 'vote_average', 'vote_count', 'cast', 'belongs_to_collection', 'directors', 'runtime', 'original_language'])

In [62]:
for idx, hit in enumerate(res['hits']['hits']):
    print(idx, hit['_score'], hit['_id'], hit['_source']['doc']['title'])
    break

print("Title: ",res['hits']['hits'][0]['_source']['doc']['title'])
print("Genres: ",res['hits']['hits'][0]['_source']['doc']['genres'])
print("Overview: ",res['hits']['hits'][0]['_source']['doc']['overview'])

0 1.0 79 Hero
Title:  Hero
Genres:  [{'name': 'Action', 'id': 28}, {'name': 'Adventure', 'id': 12}, {'name': 'Drama', 'id': 18}, {'name': 'History', 'id': 36}]
Overview:  One man defeated three assassins who sought to murder the most powerful warlord in pre-unified China.


## Using CURL

using CURL paste the following commands tu make requests to elastic search

# Analize results 

Without stopwords all words are taken into account. We need to set an analyzer

Using english stop words 'with' is not a token anymore

### Remove stopwords with custom analyzer

### Removes stopwords and uses stem porter

## Explain search

This is useful because we can see the score and also how the query is interpreted by elastic search.

In [38]:
search_object

{'_source': [],
 'query': {'multi_match': {'fields': ['doc.title^1', 'doc.overview'],
   'query': 'basketball with cartoon aliens'}}}

### Match

### Multi-match