### ElasticSearch 

# Analytics

In [1]:
## Code from: https://www.reddit.com/r/IPython/comments/34t4m7/lpt_print_json_in_collapsible_format_in_ipython/

import uuid
from IPython.display import display_javascript, display_html, display
import json

class RenderJSON(object):
    """Display JSON as a collapsible tree in a notebook output cell.

    Parameters
    ----------
    json_data : dict, list or str
        A dict or list is serialized with ``json.dumps``. Any other value is
        assumed to already be JSON text and is embedded unchanged.

    Attributes
    ----------
    json_str : the JSON text that gets handed to the ``renderjson`` script.
    uuid     : random id used for the HTML ``<div>`` that hosts the tree.
    """

    def __init__(self, json_data):
        if isinstance(json_data, (dict, list)):
            self.json_str = json.dumps(json_data)
        else:
            # Already-encoded JSON text: keep the caller's value itself
            # (not the ``json`` module) so it can be interpolated below.
            self.json_str = json_data
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        """IPython display hook: emit the host <div>, then the script that fills it."""
        display_html('<div id="{}" style="height: 600px; width:100%;"></div>'.format(self.uuid),
            raw=True
        )
        # NOTE(review): the script is fetched from rawgit.com at display time;
        # confirm that host is still reachable or switch to another CDN.
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
          document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)

In [2]:
import requests

In [3]:
index_options = '''
{ 
  "mappings" : { 
      "serie" : {
        "properties" : {
          "_links" : {
            "properties" : {
              "nextepisode" : {
                "properties" : {
                  "href" : {
                    "type" : "string",
                    "index" : "no"
                  }
                }
              },
              "previousepisode" : {
                "properties" : {
                  "href" : {
                    "type" : "string",
                    "index" : "no"
                  }
                }
              },
              "self" : {
                "properties" : {
                  "href" : {
                    "type" : "string",
                    "index" : "no"
                   }
                }
              }
            }
          },
          "externals" : {
            "properties" : {
              "imdb" : {
                "type" : "string",
                "index" : "no"
              },
              "thetvdb" : {
                "type" : "long",
                "index": "no"
              },
              "tvrage" : {
                "type" : "long",
                "index": "no"
              }
            }
          },
          "genres" : {
            "type" : "string",
            "index": "not_analyzed"
          },
          "id" : {
            "type" : "long"
          },
          "image" : {
            "properties" : {
              "medium" : {
                "type" : "string",
                "index": "no"
              },
              "original" : {
                "type" : "string",
                "index": "no"
              }
            }
          },
          "language" : {
            "type" : "string",
            "index": "not_analyzed"
          },
          "name" : {
            "type" : "string"
          },
          "network" : {
            "properties" : {
              "country" : {
                "properties" : {
                  "code" : {
                    "type" : "string",
                    "index": "not_analyzed"
                  },
                  "name" : {
                    "type" : "string"
                  },
                  "timezone" : {
                    "type" : "string",
                    "index": "not_analyzed"
                  }
                }
              },
              "id" : {
                "type" : "long"
              },
              "name" : {
                "type" : "string"
              }
            }
          },
          "premiered" : {
            "type" : "date",
            "format" : "strict_date_optional_time||epoch_millis"
          },
          "rating" : {
            "properties" : {
              "average" : {
                "type" : "double"
              }
            }
          },
          "runtime" : {
            "type" : "long"
          },
          "schedule" : {
            "properties" : {
              "days" : {
                "type" : "string",
                "index": "not_analyzed"
              },
              "time" : {
                "type" : "date",
                "format" : "hour_minute",
                "ignore_malformed": true
              }
            }
          },
          "status" : {
            "type" : "string",
            "index": "not_analyzed"            
          },
          "summary" : {
            "type" : "string",
            "index": "analyzed",
            "analyzer": "english"
          },
          "type" : {
            "type" : "string",
            "index": "not_analyzed"            
          },
          "updated" : {
            "type" : "long"
          },
          "url" : {
            "type" : "string",
            "index": "not_analyzed"            
          },
          "webChannel" : {
            "properties" : {
              "country" : {
                "properties" : {
                  "code" : {
                    "type" : "string",
                    "index": "not_analyzed"
                  },
                  "name" : {
                    "type" : "string"
                  },
                  "timezone" : {
                    "type" : "string",
                    "index": "not_analyzed"
                  }
                }
              },
              "id" : {
                "type" : "long"
              },
              "name" : {
                "type" : "string"
              }
            }
          },
          "weight" : {
            "type" : "long"
          }
        }
      }
    }
  } 
'''

# Issue DELETE requests for both index names before creating 'tvseries'.
# The responses are not inspected, so a missing index does not stop the cell.
# NOTE(review): 'my_tvseries' is not created anywhere in the visible part of
# this notebook - presumably a leftover name from an earlier run; confirm.
requests.delete('http://localhost:9200/my_tvseries')

requests.delete('http://localhost:9200/tvseries')

# Create the 'tvseries' index, sending the mapping JSON held in index_options
# as the request body.
r = requests.post('http://localhost:9200/tvseries', data = index_options)
# Print the raw response text returned for the create-index request.
print r.text

{"acknowledged":true}


In [4]:
series = ['breaking bad','blindspot','the knick','house of cards', 'orange is the new black',
          'true detective', 'game of thrones',
          'the tudors','isabel', 'versailles', 'los serrano']

# Look up each title on TVmaze and index the returned JSON document into the
# 'tvseries' index (type 'serie'), using the show's TVmaze id as the document id.
for title in series:
    # Pass the title via params so requests URL-encodes spaces and accents.
    show = requests.get('http://api.tvmaze.com/singlesearch/shows', params={'q': title})
    # Fail with a clear HTTP error if the lookup did not succeed, instead of a
    # confusing error when reading 'id' from the body.
    show.raise_for_status()
    show_id = show.json()['id']
    # Send the JSON body bytes explicitly rather than the Response object.
    response = requests.post('http://localhost:9200/tvseries/serie/' + str(show_id),
                             data=show.content)
    # Parenthesized single-argument print behaves the same on Python 2 and 3.
    print(title + " indexed: " + response.text)



breaking bad indexed: {"_index":"tvseries","_type":"serie","_id":"169","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}
blindspot indexed: {"_index":"tvseries","_type":"serie","_id":"1855","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




the knick indexed: {"_index":"tvseries","_type":"serie","_id":"51","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




house of cards indexed: {"_index":"tvseries","_type":"serie","_id":"175","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




orange is the new black indexed: {"_index":"tvseries","_type":"serie","_id":"170","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




true detective indexed: {"_index":"tvseries","_type":"serie","_id":"5","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




game of thrones indexed: {"_index":"tvseries","_type":"serie","_id":"82","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




the tudors indexed: {"_index":"tvseries","_type":"serie","_id":"712","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




isabel indexed: {"_index":"tvseries","_type":"serie","_id":"9274","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




versailles indexed: {"_index":"tvseries","_type":"serie","_id":"3386","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}




los serrano indexed: {"_index":"tvseries","_type":"serie","_id":"6346","_version":1,"_shards":{"total":1,"successful":1,"failed":0},"created":true}


Build an overview of the results that match a query: 
    
Ejemplos de queries: 
- ¿Cuántos comentarios tienen los posts que contienen los términos de búsqueda "Big Data" o "Data Science"?
- ¿Quiénes son los autores más populares de estos términos?
- ¿Cuáles son las etiquetas más populares de estos posts?

We can run these queries against near-real-time data.

## Aggregation

* Buckets - collection of documents that match a criterion ==> GROUP BY 
* Metrics - Statistics calculated in a bucket ==> AGGREGATE FUNCTIONS : COUNT(), SUM(), MAX()

* Aggregations are executed in the context of search results - send them to the `_search` endpoint
* It is just another step

## Buckets

 - Definition: A collection of documents that match a criterion
 - Buckets can be nested


## Example : aggregation 

<pre>
GET /cars/transactions/_search
{
    "size" : 0,
    "aggs" : { 
        "popular_colors" : { 
            "terms" : { 
              "field" : "color"
            }
        }
    }
}
</pre>

   * `"size": 0` indicates that we do not need the matching documents (hits), only the aggregation results
   * `"popular_colors"`: name of the aggregation 
   

## Agregando los resultados

In [None]:
payload = """
{
    "aggs" : { 
        "popular_languages" : { 
            "terms" : { 
              "field" : "language"
            }
        }
    }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [None]:
payload = """
{
    "aggs" : { 
        "popular_languages" : { 
            "terms" : { 
              "field" : "language"
            }
        },
        "popular_countries" : { 
            "terms" : { 
              "field" : "network.country.code"
            }
        }
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [None]:
payload = """
{
    "aggs" : { 
        "popular_languages" : { 
            "terms" : { 
              "field" : "language"
            }
        },
        "popular_countries" : { 
            "terms" : { 
              "field" : "network.country.code"
            }
        },
        "popular_genres" : { 
            "terms" : { 
              "field" : "genres"
            }
        }        
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [None]:
payload = """
{
    "aggs" : { 
        "popular_languages" : { 
            "terms" : { 
              "field" : "language"
            }
        },
        "popular_countries" : { 
            "terms" : { 
              "field" : "network.country.code"
            }
        },
        "popular_country_names" : { 
            "terms" : { 
              "field" : "network.country.name"
            }
        }        
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

## Sorting

In [7]:
payload = """
{
  "query" : {
     "terms" : { "genres": ["Comedy", "Crime"] }
  },
    "aggs" : { 
        "popular_countries" : { 
            "terms" : { 
              "field" : "network.country.code",
              "order" : {
                  "_count" : "asc"
              }
            }
        }
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [None]:
payload = """
{
    "aggs" : { 
        "popular_countries" : { 
            "terms" : { 
              "field" : "network.country.code",
              "order" : {
                  "_term" : "asc"
              }
            }
        }
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

## Example: extract average

<pre>
GET /cars/transactions/_search
{
   "size" : 0,
   "aggs": {
      "colors": {
         "terms": {
            "field": "color"
         },
         "aggs": { 
            "avg_price": { 
               "avg": {
                  "field": "price" 
               }
            }
         }
      }
   }
}
</pre>

In [None]:
payload = """
{
    "aggs" : { 
        "avg_duration" : { 
            "avg" : { 
              "field" : "runtime"
            }
        }
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [None]:
payload = """
{
    "aggs" : { 
        "avg_duration" : { 
            "avg" : { 
              "field" : "runtime"
            }
        },
        "avg_rating" : { 
            "avg" : { 
              "field" : "rating.average"
            }
        }        
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [None]:
payload = """
{
    "aggs" : { 
        "popular_countries" : { 
            "terms" : { 
              "field" : "network.country.code",
              "order" : {
                  "_count" : "asc"
              }
            }
        },    
        "avg_duration" : { 
            "avg" : { 
              "field" : "runtime"
            }
        },
        "avg_rating" : { 
            "avg" : { 
              "field" : "rating.average"
            }
        }        
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

## Example: nested aggregations


In [None]:
payload = """
{
    "aggs" : { 
        "popular_countries" : { 
            "terms" : { 
              "field" : "network.country.code",
              "order" : {
                  "_count" : "asc"
              }
            },
        "aggs" : {
            "avg_duration" : { 
               "avg" : { 
                  "field" : "runtime"
               }
            },
            "avg_rating" : { 
               "avg" : { 
                  "field" : "rating.average"
               }
            }        
          
          }  
       }   
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [None]:
payload = """
{
    "aggs" : { 
        "popular_countries" : { 
            "terms" : { 
              "field" : "network.country.code",
              "order" : {
                  "avg_rating" : "desc"
              }
            },
        "aggs" : {
            "avg_duration" : { 
               "avg" : { 
                  "field" : "runtime"
               }
            },
            "avg_rating" : { 
               "avg" : { 
                  "field" : "rating.average"
               }
            }        
          
          }  
       }   
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [None]:
payload = """
{
    "aggs" : { 
        "popular_countries" : { 
            "terms" : { 
              "field" : "network.country.code",
              "order" : {
                  "stats.min" : "asc"
              }
            },
        "aggs" : {
            "stats" : { 
               "extended_stats" : { 
                  "field" : "rating.average"
               }
            }        
          }  
       }   
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

## Example: histograms

In [None]:
payload = """
{
    "aggs" : { 
        "ratings_histogram" : { 
            "histogram" : { 
              "field" : "rating.average",
              "interval": 1
            }
       }   
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [None]:
payload = """
{
    "aggs" : { 
        "ratings_histogram" : { 
            "histogram" : { 
              "field" : "rating.average",
              "interval": 1
            },
         "aggs" : {
            "avg_rating": {
               "avg" : { "field" : "rating.average" }
            }
         }
       }   
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

## Example: Time histograms

In [None]:
payload = """
{
    "aggs" : { 
        "premiere_histogram" : { 
            "date_histogram" : { 
              "field" : "premiered",
              "interval": "year",
              "format" : "yyyy"
            }
       }   
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [None]:
payload = """
{
    "aggs" : { 
        "premiere_histogram" : { 
            "date_histogram" : { 
              "field" : "premiered",
              "interval": "month",
              "format" : "yyyy-MM-dd"
            }
       }   
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

## Scope
  - Las agregaciones siempre se calculan en el contexto de una consulta (query) 
  - Por defecto, match_all

In [20]:
payload = """
{
  "query" : {
     "match" : { "summary" : "white" } 
  },
    "aggs" : { 
        "popular_genres" : { 
            "terms" : { 
              "field" : "genres",
              "order" : {
                  "_term" : "asc"
              }
            }
        }
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

### Evitando mostrar los resultados

 - Si solo estamos interesados en el resultado de la agregación, podemos evitar sacar todos los documentos

In [22]:
payload = """
{
  "query" : {
     "match" : { "summary" : "white" } 
  },
  "size" : 0,
    "aggs" : { 
        "popular_genres" : { 
            "terms" : { 
              "field" : "genres",
              "order" : {
                  "_term" : "asc"
              }
            }
        }
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [36]:
# Post filter: the filter is applied to the returned hits only, after the
# aggregations have been computed, so the genre buckets still cover every
# series matching the "white" query. "post_filter" is the current name of the
# top-level search option; the bare "filter" key is a deprecated alias.
payload = """
{
  "query" : {
     "match" : { "summary" : "white" } 
  },
  "post_filter" : {
     "terms" : { "genres": ["Medical"] }
  },
    "aggs" : { 
        "popular_genres" : { 
            "terms" : { 
              "field" : "genres",
              "order" : {
                  "_term" : "asc"
              }
            }
        }
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

## Filtering aggregations and search

In [35]:
payload = """
{
  "query" : {
     "filtered" :  {
           "filter" : {
                 "terms" : { "genres": ["Medical"] }
            }
     }
   },
   "aggs" : { 
        "popular_genres" : { 
            "terms" : { 
              "field" : "genres",
              "order" : {
                  "_term" : "asc"
              }
            }
        }
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [37]:
payload = """
{

  "query" : {
     "bool" : {
        "must" : {
            "match" : { "summary" : "white" }
        },
        "filter": {
            "term" : { "genres" : "Medical" }
        }
    }    
   },
   "aggs" : { 
        "popular_genres" : { 
            "terms" : { 
              "field" : "genres",
              "order" : {
                  "_term" : "asc"
              }
            }
        }
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [31]:
payload = """
{
  "query" : {
     "match" : { "summary" : "white" } 
  },
    "aggs" : { 
        "popular_genres" : {
           "filter" : {
              "terms" : { "genres": ["Medical"] }
            },
            "aggs" : {
               "subset" : {
                "terms" : { 
                  "field" : "genres",
                  "order" : {
                      "_term" : "asc"
                  }
                }
            }
            }
        }
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [41]:
payload = """
{
  "query" : {
     "match" : { "summary" : "white" } 
  },
  "aggs" : {
  "all" : {
    "global" : {},
    "aggs" : { 
        "popular_genres" : { 
            "terms" : { 
              "field" : "genres",
              "order" : {
                  "_term" : "asc"
              }
            }
        }
      }
   }
   }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

## Kibana

![Dashboard de ejemplo en Kibana](https://www.elastic.co/guide/en/elasticsearch/guide/current/images/elas_29in03.png)

## Approximate aggregations

 * cardinalities: distinct count - based on HyperLogLog++
    - hace una estimación estadística de un número de valores grandes
    - usa una cantidad limitada de memoria
 * percentiles  - based on T-digests

In [44]:
# Genre buckets plus an approximate distinct count of genres.
# "size": 0 suppresses the hits; the key is quoted so the body is valid JSON
# (an unquoted key is only accepted by lenient parsers).
payload = """
{
    "size": 0,
    "aggs" : { 
        "popular_genres" : { 
            "terms" : { 
              "field" : "genres",
              "order" : {
                  "_term" : "asc"
              }
            }
        },
        "distinct_genres" : {
           "cardinality" : {
              "field" : "genres"
           }
        }
      }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

In [46]:
# Genre buckets, approximate distinct genre count, and rating percentiles.
# "size": 0 suppresses the hits; the key is quoted so the body is valid JSON
# (an unquoted key is only accepted by lenient parsers).
payload = """
{
    "size": 0,
    "aggs" : { 
        "popular_genres" : { 
            "terms" : { 
              "field" : "genres",
              "order" : {
                  "_term" : "asc"
              }
            }
        },
        "distinct_genres" : {
           "cardinality" : {
              "field" : "genres"
           }
        },
       "percentile_ratings" : {
           "percentiles" : {
              "field" : "rating.average"
           }
        }
      }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())

## Significant terms

The `significant_terms` aggregation analyzes your data and finds terms that appear with a frequency that is statistically anomalous compared to the background data.

In [52]:
# Significant terms in the summaries of series matching "summary:white".
# "size": 0 suppresses the hits; the key is quoted so the body is valid JSON
# (an unquoted key is only accepted by lenient parsers).
payload = """
{
   "query" : {
     "query_string" : { "query" : "summary:white" }
    },
    "size": 0,
    "aggs" : { 
        "important_terms" : { 
            "significant_terms" : { 
              "field" : "summary"
            }
        }
    }
}
"""

r = requests.get("http://localhost:9200/tvseries/serie/_search", data = payload)

RenderJSON(r.json())