# Setup

In [1]:
from elasticsearch import (
    Elasticsearch,
    helpers
)
import pickle

In [2]:
#initialize
es = Elasticsearch()

In [3]:
movies=pickle.load(open("../movies.p","rb"))

In [4]:
try:
    es.indices.delete("tmdb")
except:
    pass

#create index here?
# genres.name needs to be keyword tokenized so that 'science fiction' doesn't get split on white space
# maybe create a text field with title and overview to search against
body = {
    "mappings": {
      "movie": {
        "properties": {
          "genres": {
            "properties": {
              "name": { 
                "type": "string",
                "index": "not_analyzed"}}},
          "title": {
            "type": "string",
            "analyzer": "english"}}}}}
es.indices.create("tmdb",body=body)

{u'acknowledged': True}

In [10]:
#doc indexer
print movies

def format_doc(doc):
    action = {
        "_index": "tmdb",
        "_type": "movie",
        "_id": doc['id\r'],
        "_source": doc
        }
    return action

def index_movies():
    actions = (format_doc(doc) for doc in movies)
    results = [details for success,details in helpers.streaming_bulk(es, actions) if not success]
    return results



In [11]:
results = index_movies()

# Examples

In [12]:
# basic facet example
body = {
    "aggs": {
        "genres": {
            "terms": {
                "field": "genres.name"
            }
        },
    }
}
es.search(index="tmdb",body=body,size=0)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'genres': {u'buckets': [],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 0}},
 u'hits': {u'hits': [], u'max_score': 0.0, u'total': 400},
 u'timed_out': False,
 u'took': 57}

In [13]:
# facet with a filter in place - notice the different aggregation numbers
body = {
    "fields":["title"],
    "query": {
        "bool": {
            "filter": [
              {"term": {"genres.name": "Science Fiction"}}
            ]
        }
    },
    "aggs": {
        "genres": {
            "terms": {
                "field": "genres.name"
            }
        },
    }

}
es.search(index="tmdb",body=body,size=5)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'genres': {u'buckets': [],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 0}},
 u'hits': {u'hits': [], u'max_score': None, u'total': 0},
 u'timed_out': False,
 u'took': 21}

# Scratch

In [14]:
# see how the genre doc conuts tum to 209 while the original language counts sum to 90 (the num docs)
body = {
    "fields":["title"],
    "query":{
        "match":{
            "genres.name": "Science Fiction"}},
    "aggs": {
        "genres": {
            "terms": {
                "field": "genres.name"
            }
        },
    }

}
es.search(index="tmdb",body=body,size=100)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'genres': {u'buckets': [],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 0}},
 u'hits': {u'hits': [], u'max_score': None, u'total': 0},
 u'timed_out': False,
 u'took': 8}

In [15]:
# facet with a filter in place
body = {
    "fields":["title"],
    "query": {
        "filtered": {
            "query": {
                "match":{
                    "title": "star trek"}},
            "filter": {
                "term": {
                    "genres.name": "Science Fiction"}}}},
    "aggs": {
        "genres": {
            "terms": {
                "field": "genres.name"
            }
        },
    }

}
es.search(index="tmdb",body=body,size=100)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'genres': {u'buckets': [],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 0}},
 u'hits': {u'hits': [], u'max_score': None, u'total': 0},
 u'timed_out': False,
 u'took': 11}

#Trash

In [16]:
{ "query": {
    "filtered": {
      "filter": {
        "term": {
          "genres.name": "Science Fiction"}}}},
  "aggs": {
    "genres": {
      "terms": {
        "field": "genres.name"}}}}


{'aggs': {'genres': {'terms': {'field': 'genres.name'}}},
 'query': {'filtered': {'filter': {'term': {'genres.name': 'Science Fiction'}}}}}

In [17]:
x= [
    {'doc_count': 7546, 'key': 'Drama'},
    {'doc_count': 5342, 'key': 'Comedy'},
    {'doc_count': 3878, 'key': 'Thriller'},
    {'doc_count': 3753, 'key': 'Action'},
    {'doc_count': 2623, 'key': 'Romance'},
    {'doc_count': 2165, 'key': 'Adventure'},
    {'doc_count': 1981, 'key': 'Horror'},
    {'doc_count': 1861, 'key': 'Crime'},
    {'doc_count': 1640, 'key': 'Family'},
    {'doc_count': 1597, 'key': 'Science Fiction'}]
sum([y['doc_count'] for y in x])

32386

In [18]:
es.cluster.stats()['indices'].keys()

[u'count',
 u'completion',
 u'fielddata',
 u'docs',
 u'segments',
 u'shards',
 u'query_cache',
 u'percolate',
 u'store']