In [1]:
import requests

In [2]:
import json

In [3]:
# Shared HTTP headers; reindex() sends these with its _bulk request.
headers={'Content-Type': 'application/json'}

In [4]:
def extract(path='tmdb.json'):
    """Load the movie dataset from a JSON file.

    Args:
        path: Location of the JSON file to read. Defaults to ``'tmdb.json'``
            in the current working directory.

    Returns:
        The decoded top-level JSON value stored in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even when decoding fails, and the
    # explicit encoding keeps the result independent of the platform default.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


In [5]:
# Load the movie data once; later cells pass it to reindex() as movie_dict.
movies = extract()

In [6]:
def reindex(analysis_settings=None, mapping_settings=None, movie_dict=None):
    """Drop and recreate the ``tmdb`` index, then bulk-index every movie.

    Args:
        analysis_settings: Value placed under ``settings.index.analysis`` in
            the index-creation body. ``None`` or empty sends an empty object.
        mapping_settings: Value sent as the top-level ``mappings`` of the
            index-creation body; left out entirely when ``None`` or empty.
        movie_dict: Mapping whose values are the movie documents. Every
            document must carry an ``"id"`` key, which becomes its ``_id``.
            The mapping's own keys are not used.

    Returns:
        The ``requests.Response`` of the ``_bulk`` request.

    Raises:
        requests.HTTPError: If the index cannot be created.
    """
    # None sentinels instead of mutable default arguments.
    analysis_settings = analysis_settings or {}
    movie_dict = movie_dict or {}

    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysis_settings,
            }
        }
    }

    if mapping_settings:
        settings["mappings"] = mapping_settings

    # Best-effort delete: the index may not exist yet, so the outcome is ignored.
    requests.delete("http://localhost:9200/tmdb")

    # Fail loudly if creation is rejected; otherwise the bulk request below
    # would proceed without the requested settings having been applied.
    create_resp = requests.put("http://localhost:9200/tmdb", json=settings)
    create_resp.raise_for_status()

    # Bulk body: one action line followed by one document line per movie,
    # each terminated by a newline.
    bulk_lines = []
    for movie in movie_dict.values():
        add_cmd = {
            "index": {"_index": "tmdb", "_id": movie["id"]}
        }
        bulk_lines.append(json.dumps(add_cmd))
        bulk_lines.append(json.dumps(movie))
    bulk_movies = "".join(line + "\n" for line in bulk_lines)

    resp = requests.post("http://localhost:9200/_bulk", data=bulk_movies, headers=headers)
    return resp

In [7]:
# Build the index with no custom analysis or mapping settings.
resp = reindex(movie_dict=movies)

In [8]:
def search(query):
    """Run ``query`` against the ``tmdb`` index and print the ranked hits.

    Prints a tab-separated table with one row per hit: its position in the
    result list, its ``_score`` and the ``title`` from its ``_source``.

    Args:
        query: Search request body, sent to Elasticsearch as JSON.

    Raises:
        requests.HTTPError: If Elasticsearch answers with an error status.
    """
    # Typeless endpoint: the typed form (/tmdb/_doc/_search) is deprecated in
    # Elasticsearch 7.x and removed in 8.0.
    url = 'http://localhost:9200/tmdb/_search'
    resp = requests.get(url, json=query)
    # Surface the HTTP error itself instead of a KeyError on the missing 'hits'.
    resp.raise_for_status()

    search_hits = resp.json()['hits']

    print("Num\tRelevance\tMovie title")

    for idx, hit in enumerate(search_hits['hits']):
        print(f"{idx}\t{hit['_score']}\t{hit['_source']['title']}")

In [9]:
# Free-text search string embedded in the multi_match queries.
user_search = 'basketball with cartoon aliens'

In [10]:
# multi_match request over the title and overview fields.
# "title^10" uses Elasticsearch's caret syntax to boost the title field by 10.
query = {
    "query": {
        "multi_match": {
            "query": user_search,
            "fields": ["title^10", "overview"],
        }
    }
}


In [11]:
# Baseline ranking against the index built without custom settings.
search(query)

Num	Relevance	Movie title
0	85.56929	Aliens
1	73.71077	The Basketball Diaries
2	71.3202	Cowboys & Aliens
3	61.13922	Monsters vs Aliens
4	53.501823	Aliens vs Predator: Requiem
5	53.501823	Aliens in the Attic
6	45.221096	Dances with Wolves
7	45.221096	Friends with Benefits
8	45.221096	Fire with Fire
9	45.221096	Friends with Kids


In [12]:
# Ask the _analyze API how the "standard" analyzer tokenizes a sample title;
# format=yaml requests a YAML-formatted response.
resp = requests.get(
    "http://localhost:9200/tmdb/_analyze?format=yaml",
    json={"text": "Fire with Fire", "analyzer": "standard"})

In [13]:
# Show the raw token listing returned by _analyze.
print(resp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "with"
  start_offset: 5
  end_offset: 9
  type: "<ALPHANUM>"
  position: 1
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



In [14]:
# Mapping that declares title and overview as text fields using the
# "english" analyzer.
mapping_settings = {
    "properties": {
        "title": {
            "type": "text",
            "analyzer": "english"
        },
        "overview": {
            "type": "text",
            "analyzer": "english"
        }
    }
}

In [15]:
# Rebuild the index, this time with the english-analyzer mapping.
resp = reindex(mapping_settings=mapping_settings, movie_dict=movies)

In [16]:
# Re-run _analyze on the same text, now naming the title field rather than
# an explicit analyzer.
resp = requests.get(
    "http://localhost:9200/tmdb/_analyze?format=yaml",
    json={"text": "Fire with Fire", "field": "title"})

In [17]:
# Show the token listing for the title-field analysis.
print(resp.text)

---
tokens:
- token: "fire"
  start_offset: 0
  end_offset: 4
  type: "<ALPHANUM>"
  position: 0
- token: "fire"
  start_offset: 10
  end_offset: 14
  type: "<ALPHANUM>"
  position: 2



In [18]:
# Same query as before, now run against the re-created index.
search(query)

Num	Relevance	Movie title
0	78.76023	The Basketball Diaries
1	74.090744	Alien
2	74.090744	Aliens
3	74.090744	Alien³
4	59.677002	Cowboys & Aliens
5	59.677002	Aliens in the Attic
6	59.677002	Alien: Resurrection
7	49.95806	Monsters vs Aliens
8	42.961403	Aliens vs Predator: Requiem
9	42.961403	AVP: Alien vs. Predator


In [19]:
# Mutates the existing query dict in place, adding the "explain" flag.
query["explain"] = True

In [21]:
# Fetch the raw search response and pull the title and _explanation of the
# top hit. The typeless /tmdb/_search endpoint is used because the typed form
# (/tmdb/_doc/_search) is deprecated in Elasticsearch 7.x and removed in 8.0.
resp = requests.get('http://localhost:9200/tmdb/_search', json=query)
resp_json = resp.json()
title = resp_json['hits']['hits'][0]['_source']['title']
explain = resp_json['hits']['hits'][0]['_explanation']
# Uncomment to inspect the explanation of the top hit:
# print(title)
# import pprint
# pprint.pprint(explain, compact=True)

In [22]:
# Rebinds query to a fresh dict (so no "explain" key) in which the title
# field is weighted 0.1 via "title^0.1" while overview keeps its default weight.
query = {
    "query": {
        "multi_match": {
            "query": user_search,
            "fields": ["title^0.1", "overview"],
        }
    }
}

In [23]:
# Search again with the down-weighted title field.
search(query)

Num	Relevance	Movie title
0	12.882349	Space Jam
1	7.5384703	Grown Ups
2	7.4996777	Speed Racer
3	7.244087	Semi-Pro
4	7.1626425	The Flintstones
5	6.943389	Coach Carter
6	6.7653713	White Men Can't Jump
7	5.845222	Meet Dave
8	5.800564	Aliens vs Predator: Requiem
9	5.440302	Bedazzled
