In [1]:
import requests
import json

In [2]:
def extract(path='../chapter-3/tmdb.json'):
    """Read a JSON file and return its decoded content.

    Args:
        path: Location of the JSON file. Defaults to the TMDB dump used by
            this notebook, so ``extract()`` keeps working unchanged.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even if decoding fails; the
    # encoding is pinned to UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [3]:
def reindex(analysis_settings={}, mapping_settings={}, movie_dict={}):
    """Recreate the ``tmdb`` index and bulk-load ``movie_dict`` into it.

    All requests go to the Elasticsearch node at ``http://localhost:9200``:

    1. ``DELETE /tmdb`` - the status code is printed (the notebook's recorded
       run shows 404 when no index existed yet).
    2. ``PUT /tmdb`` with the index settings (and mappings, if given); the
       decoded response is printed.
    3. ``POST /_bulk`` with one ``index`` action per movie.

    Args:
        analysis_settings: Value placed under ``settings.index.analysis``.
        mapping_settings: Value used as the index ``mappings``; omitted from
            the request when empty/falsy.
        movie_dict: Mapping whose values are JSON-serializable movie
            documents. Each document must have an ``"id"`` key, which is
            used as the document ``_id``. The mapping's keys are not used.

    Returns:
        The response object of the bulk request. It is returned as-is; no
        status or per-item error checking is done here.

    Note:
        The mutable default arguments are only read, never modified, so they
        are safe to share between calls.
    """
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysis_settings,
            }
        }
    }

    if mapping_settings:
        settings["mappings"] = mapping_settings

    print("using stettings\n", settings)
    resp = requests.delete("http://localhost:9200/tmdb")
    print(f"previouse index removal: {resp.status_code}")
    resp = requests.put("http://localhost:9200/tmdb", json=settings)
    print("index creation", resp.json())

    # Bulk body is newline-delimited JSON: an action line followed by the
    # document line, each terminated by "\n". Collect the lines and join once
    # instead of growing a string inside the loop.
    bulk_lines = []
    for movie in movie_dict.values():
        add_cmd = {
            "index": {"_index": "tmdb", "_id": movie["id"]}
        }
        bulk_lines.append(json.dumps(add_cmd))
        bulk_lines.append(json.dumps(movie))
    bulk_movies = "".join(line + "\n" for line in bulk_lines)

    resp = requests.post(
        "http://localhost:9200/_bulk",
        data=bulk_movies,
        headers={'Content-Type': 'application/json'}
    )
    return resp

In [4]:
def search(query):
    """Run ``query`` against the ``tmdb`` index and print the ranked hits.

    Sends the query DSL body to the local Elasticsearch node and prints a
    tab-separated table with one row per hit: rank (0-based), document id,
    relevance score and movie title.

    Args:
        query: JSON-serializable search request body.

    Returns:
        None. The results are only printed.

    Raises:
        KeyError: If the response has no ``hits`` section (for example when
            Elasticsearch answers with an error object) or a hit lacks the
            expected ``_id`` / ``_score`` / ``_source.title`` entries.
    """
    # Typeless search endpoint. The former ".../tmdb/_doc/_search" form
    # names a mapping type in the path, which is deprecated in
    # Elasticsearch 7 and no longer supported in 8.
    url = 'http://localhost:9200/tmdb/_search'
    resp = requests.get(url, json=query)

    search_hits = resp.json()['hits']

    print("Num\tid\tRelevance\tMovie title")

    for idx, hit in enumerate(search_hits['hits']):
        print(f"{idx}\t{hit['_id']}\t{hit['_score']}\t{hit['_source']['title']}")

In [5]:
# Load the movie documents once; this object is what gets passed to reindex().
movie_dict = extract()

In [6]:
# Index-level analysis configuration handed to reindex().
analysis_settings = {
    "analyzer": {
        # Index-wide default analyzer.
        "default": {"type": "english"},
        # Custom chain: standard tokens -> lowercase -> Porter stem -> bigrams.
        "english_bigrams": {
            "tokenizer": "standard",
            "filter": ["lowercase", "porter_stem", "bigram_filter"],
        },
    },
    "filter": {
        # Shingle filter restricted to exactly two-token shingles, with the
        # single-token output switched off.
        "bigram_filter": {
            "type": "shingle",
            "min_shingle_size": 2,
            "max_shingle_size": 2,
            "output_unigrams": False,
        },
    },
}

In [7]:
# Field mappings handed to reindex(). "cast" and "directors" share the same
# shape: a "name" text field analyzed with "english", plus a "bigramed"
# sub-field analyzed with the custom "english_bigrams" analyzer. The
# comprehension builds a separate dict for each of the two fields.
mapping_settings = {
    "properties": {
        people_field: {
            "properties": {
                "name": {
                    "type": "text",
                    "analyzer": "english",
                    "fields": {
                        "bigramed": {
                            "type": "text",
                            "analyzer": "english_bigrams",
                        },
                    },
                },
            },
        }
        for people_field in ("cast", "directors")
    }
}

In [9]:
# Rebuild the tmdb index with the analysis and mapping settings and bulk-load
# the movies; r keeps the response of the bulk request for inspection.
r = reindex(analysis_settings, mapping_settings, movie_dict)

using stettings
 {'settings': {'number_of_shards': 1, 'index': {'analysis': {'analyzer': {'default': {'type': 'english'}, 'english_bigrams': {'tokenizer': 'standard', 'filter': ['lowercase', 'porter_stem', 'bigram_filter']}}, 'filter': {'bigram_filter': {'type': 'shingle', 'min_shingle_size': 2, 'max_shingle_size': 2, 'output_unigrams': False}}}}}, 'mappings': {'properties': {'cast': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}}}}, 'directors': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}}}}}}}
previouse index removal: 404
index creation {'acknowledged': True, 'shards_acknowledged': True, 'index': 'tmdb'}


In [16]:
# Raw search string, exactly as a user would type it.
user_search = "star trek patrick stewart william shatner"

# multi_match over the title and the bigram sub-fields of cast/director
# names, combined in "most_fields" mode.
query = {
    "query": {
        "multi_match": dict(
            query=user_search,
            fields=[
                "title",
                "cast.name.bigramed",
                "directors.name.bigramed",
            ],
            type="most_fields",
        ),
    },
}

In [None]:
# Leaving out the overview field leaves us with the albino elephant problem.

In [17]:
# Run the multi_match query and print the ranked hits.
search(query)

Num	id	Relevance	Movie title
0	172	20.892147	Star Trek V: The Final Frontier
1	193	19.903265	Star Trek: Generations
2	200	15.065897	Star Trek: Insurrection
3	201	14.39519	Star Trek: Nemesis
4	199	13.657165	Star Trek: First Contact
5	152	13.020787	Star Trek: The Motion Picture
6	154	12.988506	Star Trek II: The Wrath of Khan
7	168	12.557999	Star Trek IV: The Voyage Home
8	157	12.302769	Star Trek III: The Search for Spock
9	174	11.7537365	Star Trek VI: The Undiscovered Country


###### This is where we finished chapter 5