Elasticsearch
# Buscador de documentos

In [3]:
## Code from: https://www.reddit.com/r/IPython/comments/34t4m7/lpt_print_json_in_collapsible_format_in_ipython/

import uuid
from IPython.display import display_javascript, display_html, display
import json

class RenderJSON(object):
    """Show a JSON document as a collapsible tree in a notebook output cell.

    Parameters
    ----------
    json_data : dict, list or str
        A ``dict`` or ``list`` is serialized with ``json.dumps``. Any other
        value (typically an already-serialized JSON string) is used as given.

    Attributes
    ----------
    json_str : the JSON text that is injected into the generated JavaScript.
    uuid     : random id of the ``<div>`` that hosts the rendered tree.
    """

    def __init__(self, json_data):
        if isinstance(json_data, (dict, list)):
            self.json_str = json.dumps(json_data)
        else:
            # Keep the caller's text as-is. (This branch used to store the
            # `json` module itself, which rendered as "<module 'json' ...>".)
            self.json_str = json_data
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        """IPython display hook: emit a placeholder <div>, then the script that fills it."""
        display_html('<div id="{}" style="height: 600px; width:100%;"></div>'.format(self.uuid),
            raw=True
        )
        # NOTE(review): the renderer is fetched from rawgit.com at display time;
        # confirm that host is still serving files, otherwise nothing is drawn.
        # json_str is pasted verbatim into the script, so it must be valid JSON.
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
          document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)

In [13]:
import requests


# Request body used when the `tvseries` index is created: an explicit mapping
# for documents of type `serie`.
#   * link, image and external-id fields carry "index": "no"
#   * genres, language, status, type, url, schedule.days and the country
#     code/timezone fields are "not_analyzed" (kept as exact values)
#   * name fields use the default string settings; summary uses the
#     "english" analyzer
#   * premiered and schedule.time are dates; schedule.time sets ignore_malformed
# NOTE(review): the "string" type and the "index": "no"/"not_analyzed" values
# look like Elasticsearch 2.x mapping syntax — confirm against the server in use.
index_options = '''
{ 
  "mappings" : { 
      "serie" : {
        "properties" : {
          "_links" : {
            "properties" : {
              "nextepisode" : {
                "properties" : {
                  "href" : {
                    "type" : "string",
                    "index" : "no"
                  }
                }
              },
              "previousepisode" : {
                "properties" : {
                  "href" : {
                    "type" : "string",
                    "index" : "no"
                  }
                }
              },
              "self" : {
                "properties" : {
                  "href" : {
                    "type" : "string",
                    "index" : "no"
                   }
                }
              }
            }
          },
          "externals" : {
            "properties" : {
              "imdb" : {
                "type" : "string",
                "index" : "no"
              },
              "thetvdb" : {
                "type" : "long",
                "index": "no"
              },
              "tvrage" : {
                "type" : "long",
                "index": "no"
              }
            }
          },
          "genres" : {
            "type" : "string",
            "index": "not_analyzed"
          },
          "id" : {
            "type" : "long"
          },
          "image" : {
            "properties" : {
              "medium" : {
                "type" : "string",
                "index": "no"
              },
              "original" : {
                "type" : "string",
                "index": "no"
              }
            }
          },
          "language" : {
            "type" : "string",
            "index": "not_analyzed"
          },
          "name" : {
            "type" : "string"
          },
          "network" : {
            "properties" : {
              "country" : {
                "properties" : {
                  "code" : {
                    "type" : "string",
                    "index": "not_analyzed"
                  },
                  "name" : {
                    "type" : "string"
                  },
                  "timezone" : {
                    "type" : "string",
                    "index": "not_analyzed"
                  }
                }
              },
              "id" : {
                "type" : "long"
              },
              "name" : {
                "type" : "string"
              }
            }
          },
          "premiered" : {
            "type" : "date",
            "format" : "strict_date_optional_time||epoch_millis"
          },
          "rating" : {
            "properties" : {
              "average" : {
                "type" : "double"
              }
            }
          },
          "runtime" : {
            "type" : "long"
          },
          "schedule" : {
            "properties" : {
              "days" : {
                "type" : "string",
                "index": "not_analyzed"
              },
              "time" : {
                "type" : "date",
                "format" : "hour_minute",
                "ignore_malformed": true
              }
            }
          },
          "status" : {
            "type" : "string",
            "index": "not_analyzed"            
          },
          "summary" : {
            "type" : "string",
            "index": "analyzed",
            "analyzer": "english"
          },
          "type" : {
            "type" : "string",
            "index": "not_analyzed"            
          },
          "updated" : {
            "type" : "long"
          },
          "url" : {
            "type" : "string",
            "index": "not_analyzed"            
          },
          "webChannel" : {
            "properties" : {
              "country" : {
                "properties" : {
                  "code" : {
                    "type" : "string",
                    "index": "not_analyzed"
                  },
                  "name" : {
                    "type" : "string"
                  },
                  "timezone" : {
                    "type" : "string",
                    "index": "not_analyzed"
                  }
                }
              },
              "id" : {
                "type" : "long"
              },
              "name" : {
                "type" : "string"
              }
            }
          },
          "weight" : {
            "type" : "long"
          }
        }
      }
    }
  } 
'''

# Start from a clean slate: drop any earlier copies of the indices. The
# responses are not inspected, so a missing index does not stop the cell.
requests.delete('http://localhost:9200/my_tvseries')

requests.delete('http://localhost:9200/tvseries')

# Create the index with the mapping held in index_options and show the reply.
r = requests.post('http://localhost:9200/tvseries', data=index_options)
# print() with a single argument gives the same output on Python 2 and 3.
print(r.text)

{"acknowledged":true}


In [14]:
# Titles to index; each one is resolved to a show through TVmaze's
# single-search endpoint and stored under the show's TVmaze id.
series = ['breaking bad','blindspot','the knick','house of cards', 'orange is the new black',
          'true detective', 'game of thrones',
          'the tudors','isabel', 'versailles', 'los serrano']

for s in series:
    # Let requests build and URL-encode the query string (titles contain spaces).
    show = requests.get('http://api.tvmaze.com/singlesearch/shows', params={'q': s})
    # Fail with a clear HTTP error instead of a JSON/KeyError further down.
    show.raise_for_status()
    show_id = show.json()['id']  # not named `id`, which would shadow the builtin
    # Send the JSON document itself as the body rather than the Response object.
    response = requests.post('http://localhost:9200/tvseries/serie/' + str(show_id),
                             data=show.content)
    print(s + " indexed: " + response.text)



breaking bad indexed: {"_index":"tvseries","_type":"serie","_id":"169","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}
blindspot indexed: {"_index":"tvseries","_type":"serie","_id":"1855","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




the knick indexed: {"_index":"tvseries","_type":"serie","_id":"51","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




house of cards indexed: {"_index":"tvseries","_type":"serie","_id":"175","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




orange is the new black indexed: {"_index":"tvseries","_type":"serie","_id":"170","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




true detective indexed: {"_index":"tvseries","_type":"serie","_id":"5","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




game of thrones indexed: {"_index":"tvseries","_type":"serie","_id":"82","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




the tudors indexed: {"_index":"tvseries","_type":"serie","_id":"712","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




isabel indexed: {"_index":"tvseries","_type":"serie","_id":"9274","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




versailles indexed: {"_index":"tvseries","_type":"serie","_id":"3386","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




los serrano indexed: {"_index":"tvseries","_type":"serie","_id":"6346","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}


## Objetivos

 - Entender la arquitectura de un buscador
 - Comprender cómo analizar, indizar y formular búsquedas para diferentes aplicaciones 
 - Comprender cómo se implementa la relevancia en ES 
 - Entender cómo usar las opciones de relevancia para optimizar los resultados de búsqueda

## Arquitectura de un buscador

[TODO] Imagen


## Búsqueda en ES

Soporta diferentes operaciones/tipos de búsqueda:
  - Búsqueda estructurada: 
      - Operadores de selección: a.k.a. *SELECT* 
      - Operadores de filtrado: a.k.a. *WHERE*
  - **Búsqueda de texto completo** - más potente que *WHERE c LIKE "regexp"* 
  - Agregación - pero diferente a *GROUP BY* 
  - **Ordenación** - si bien el orden suele definirse al indexar y según el tipo - vs. *ORDER BY* 
  - Paginación - en contraste con *LIMIT*


## Operadores de búsqueda - QueryDSL

 - Query context: “How well does this document match this query clause?” 
 - Filter context: “Does this document match this query clause?” 
 


### Query DSL - búsqueda de texto completo

In [17]:
import requests

# Baseline request: `match_all` carries no conditions of its own.
payload = """
{
  "query" : {
     "match_all" : { }
  }
}
"""

# The query DSL document travels as the body of the GET to the type's _search endpoint.
r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
# Last expression of the cell, so the notebook shows it through RenderJSON's display hook.
RenderJSON(r.json())

### Query DSL - Enviando cadenas de búsqueda 

In [18]:
import requests

# `query_string` takes Lucene query syntax. A phrase must be wrapped in double
# quotes (single quotes are ordinary characters, so 'Breaking Bad' was split
# into two separate terms and only the first was tied to `name`). The quotes
# are escaped once for JSON (\") and the backslash once more for Python (\\").
payload = """
{
  "query" : {
     "query_string" : { "query" : "name:\\"Breaking Bad\\"" }
  }
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())

In [25]:
# Full-text `match` query against `summary` (mapped with the "english"
# analyzer in index_options).
payload = """
{
    "query" : {
        "match" : {
            "summary" : "New Mexico"
        }
    }
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())

### Query DSL - term

In [23]:
# `term` query on `name` with the literal value "breaking".
# NOTE(review): the value is given in lowercase, presumably because `name` is
# an analyzed field and a term query does not analyze its input — confirm.
payload = """
{
  "query" : {
     "term" : { "name": "breaking" }
  }
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())

In [None]:
Probar la misma query en mayúsculas: la consulta `term` no se analiza, así que no encuentra ningún documento.

https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html

### Query DSL - terms

In [27]:
# `terms` query: a list of candidate values for `genres`, which the mapping
# declares as not_analyzed (values are compared as written).
payload = """
{
  "query" : {
     "terms" : { "genres": ["Drama", "Crime"] }
  }
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())

Se pueden usar como query los términos de otro documento; por ejemplo, los seguidores de un seguidor.

### Query DSL - Búsqueda por rangos

In [29]:
# `range` query on the numeric field rating.average: bounds 9 (gte) and
# 10 (lte), with a boost of 2.0 applied to this clause.
payload = """
{
  "query" : {
    "range" : {
        "rating.average" : {
            "gte" : 9,
            "lte" : 10,
            "boost" : 2.0
        }
    }
}
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())

### Query DSL - Búsqueda por rangos

In [31]:
# `range` query on the date field `premiered`, with both bounds written as
# date-math expressions relative to `now` ("-1y" subtracts a year, "/y"
# rounds to the year).
payload = """
{
  "query" : {
    "range" : {
        "premiered" : {
            "gte" : "now-1y/y",
            "lte" : "now/y"
        }
    }
}
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())

### Query DSL - búsquedas booleanas

 [todo] operadores booleanos

### Query DSL - búsquedas booleanas

In [39]:
# `bool` query combining two `term` clauses: `must` requires the genre
# "Thriller", `must_not` excludes documents whose summary has the term "mexico".
payload = """
{
  "query" : {
    "bool" : {
        "must" : {
            "term" : { "genres" : "Thriller" }
        },
        "must_not" : {
            "term" : { "summary" : "mexico" }
        }
    }
  }
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())


In [40]:
# Variant of the bool query where the exclusion is a `match` query on
# `summary` instead of a `term`, plus explicit minimum_should_match and boost.
# NOTE(review): minimum_should_match is set although this query has no
# `should` clauses — confirm that is intended.
payload = """
{
  "query" : {
    "bool" : {
        "must" : {
            "term" : { "genres" : "Thriller" }
        },
        "must_not" : {
            "match" : { "summary" : "Mexico" }
        },
        "minimum_should_match" : 1,
        "boost" : 1.0
    }
  }
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())


In [52]:
# bool query with all three clause kinds: `must` (genre Thriller),
# `must_not` (summary term "mexico") and two optional `should` genre clauses,
# of which minimum_should_match asks for at least one.
payload = """
{
  "query" : {
    "bool" : {
        "must" : {
            "term" : { "genres" : "Thriller" }
        },
        "must_not" : {
            "term" : { "summary" : "mexico" }
        },
        "should" : [
           {"term" : { "genres" : "Mystery" }},
           {"term" : { "genres" : "Crime" }}

           ],
        "minimum_should_match" : 1,
        "boost" : 1.0
    }
  }
}
"""

# `?pretty` is added to the search URL for this request.
r = requests.get('http://localhost:9200/tvseries/serie/_search?pretty', data = payload)
RenderJSON(r.json())


### Query DSL - búsqueda  de frase 

In [54]:
# Phrase search: `match_phrase` for "Walter White" against the `_all` field.
payload = """
{
    "query" : {
        "match_phrase" : {
            "_all" : "Walter White"
        }
    }
}
"""

# `explain` is passed in the URL so the response includes scoring details.
r = requests.get('http://localhost:9200/tvseries/serie/_search?pretty&explain', data = payload)
RenderJSON(r.json())


### Query DSL - Búsqueda por matching parcial

In [60]:
# Partial matching: `prefix` query for terms of `name` starting with "break".
payload = """
{
  "query" : {
    "prefix" : { "name" : "break" }
  }
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search?pretty', data = payload)
RenderJSON(r.json())


- wildcards 
- regexps

Son búsquedas más costosas: en general expanden el número de términos y requieren recorrer todo el diccionario.

### Query DSL - búsquedas borrosas

In [65]:
# Fuzzy search in its short form: the misspelt "mejico" is looked up in
# `summary` with the query's default fuzziness settings.
payload = """
{
  "query" : {
    "fuzzy" : { "summary" : "mejico" }
  }
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())


In [64]:
# Same fuzzy search in its long form, spelling out the tunable options
# (boost, fuzziness, prefix_length, max_expansions) next to the value.
payload = """
{
  "query" : {
    "fuzzy" : {
        "summary" : {
            "value" :         "mejico",
            "boost" :         1.0,
            "fuzziness" :     1,
            "prefix_length" : 0,
            "max_expansions": 100
        }
    }
    }
}"""


r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())


### Query DSL - Boosting

In [67]:

# Query-time boosting: the earlier `terms` query on genres with a "boost"
# of 2.0 added to the clause.
payload = """
{
  "query" : {
     "terms" : { 
         "genres": ["Drama", "Crime"],
         "boost" : 2.0
      }
  }
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())


### Búsqueda multiíndice
[todo]

In [89]:
# Multi-index search: one `match` on `_all`, sent to a URL that lists two
# indices (tvseries, megacorp) and two types (serie, employee), comma-separated.
# NOTE(review): the `megacorp` index is not created anywhere in this notebook —
# it presumably has to exist already.
payload = """
{
    "query" : {
        "match" : {
            "_all" : "John"
        }
    }
}
"""

r = requests.get('http://localhost:9200/tvseries,megacorp/serie,employee/_search', data = payload)
RenderJSON(r.json())


In [91]:
# Same query, but the index part of the URL is the wildcard pattern
# `tvseries*` instead of an explicit list.
payload = """
{
    "query" : {
        "match" : {
            "_all" : "John"
        }
    }
}
"""

r = requests.get('http://localhost:9200/tvseries*/serie/_search', data = payload)
RenderJSON(r.json())


### Relevancia en Elasticsearch

### Relevancia:  *Practical Scoring Function*
  - recupera documentos usando un modelo booleano 
  - asigna la relevancia usando una fórmula basada en ideas de
    - TF-IDF
    - Modelo de espacio vectorial
    

### Relevancia por defecto


$$ rel(q,d) = qNorm_q \cdot coord_{q,d} \cdot \sum_{t \in q}{tf_{t,d} \cdot idf_t^2 \cdot boost_t \cdot norm_{t,d}}$$

 - $qNorm_q$ : factor de normalización de las consultas - ignorar
 - $coord_{q,d}$ : *coordination factor* - sube la importancia de los documentos que tienen más términos de la consulta 
 - $boost_t$: *query boost* - sube la importancia de un determinado término
 - $norm_{t,d}$: factor de normalización del índice - tiene en cuenta la longitud del documento y, opcionalmente, el *index boost*

## Relevancia - Default score 
 
[TODO] Query sobre las series haciendo uso de summary



In [69]:
# URI search: the query text is passed in the `q` URL parameter instead of a
# request body.
r = requests.get('http://localhost:9200/tvseries/_search?q=New Mexico')
RenderJSON(r.json())

### Explicando la relevancia 

In [70]:
# Same URI search with `explain` appended to the query string; the response is
# kept in `r` for the cell that follows.
r = requests.get('http://localhost:9200/tvseries/_search?q=New Mexico&explain')

In [71]:
# Render whatever response is currently stored in `r` (set by an earlier cell).
RenderJSON(r.json())

[TODO] Explicar bien cada uno de los parámetros

## Modelos de relevancia alternativa (texto) 

Otras medidas de relevancia para documentos:
  - Okapi BM25 
  - Se puede elegir una función de similitud por campo; sin embargo, requiere reindexar

Medidas de similitud entre cadenas:
  - Fuzzy similarity

[todo] ¿cómo cambiar la medida de relevancia?

## Otras medidas de relevancia (estructura) 

- We can take into account other relevance measures
     - Time - recency
     - Location - proximity
     - Other numerical fields
  - Difference with databases: algorithms are adapted to sort and get top k documents. 

In [93]:
# The `terms` query on genres, with a "sort" clause asking for ascending
# order by rating.average.
payload = """
{
  "query" : {
     "terms" : { "genres": ["Drama", "Crime"] }
  },
  "sort" : { "rating.average" : "asc" }
  
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())

## Definiendo la relevancia a medida

 - function score 
 - script score


[TODO] Ejemplo para tener en cuenta una fecha o una puntuación media 

In [107]:
# `function_score` wrapping the genres `terms` query, with an empty
# `random_score` function.
# NOTE(review): no seed is given, so the ordering presumably changes from run
# to run — confirm.
payload = """
{
  "query": {
    "function_score" : {
    "query" : { 
       "terms" : { "genres": ["Drama", "Crime"] }
     },
      "random_score" : {  }
    }
  }
}
"""

r = requests.get('http://localhost:9200/tvseries/serie/_search', data = payload)
RenderJSON(r.json())

In [113]:
# `function_score` with a `script_score`: the script multiplies the query's
# `_score` by the document's rating.average value.
payload = """
{
  "query": {
    "function_score" : {
    "query" : { 
       "terms" : { "genres": ["Drama", "Crime"] }
     },
      "script_score" : {
        "script" : "_score * doc['rating.average'].value"
     }
    }
  }
}
"""

# `explain` is requested in the URL for this search.
r = requests.get('http://localhost:9200/tvseries/serie/_search?explain', data = payload)
RenderJSON(r.json())

In [None]:
Requiere que habilitemos los scripts dinámicos en elasticsearch.yml

script.inline: true
script.indexed: true



## Búsqueda multicampo

Motivation: 

  * Different uses: 
    * Match different full text queries in different fields: title and author
    * Order and bool queries impact, boosting may also be used
    
    * Tuning: 
       * dis_max - selecting the score of the best fields
       * tie_breaker
       * multi_match - helper to direct the same query to different fields
       * we can select fields by using regular expressions 
       * cross fields entity search
       
   * best fields 
   * most fields 
   * cross fields 


## Relevancia multicampo

## Integración con la interfaz de búsqueda - highlighting