In [1]:
import requests
import json

In [2]:
def extract(path='../chapter-3/tmdb.json'):
    """Load the TMDB movie dump from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to the chapter-3 dump this
            notebook has always read.

    Returns:
        The parsed JSON document (whatever top-level value the file holds).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # `with` closes the handle even when parsing fails (the previous version
    # never closed it). The encoding is pinned to UTF-8 so the result does not
    # depend on the platform's default text encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [3]:
def reindex(analysis_settings=None, mapping_settings=None, movie_dict=None,
            es_url="http://localhost:9200", index="tmdb"):
    """Drop and recreate an Elasticsearch index, then bulk-load the movies.

    Args:
        analysis_settings: Dict placed under ``settings.index.analysis`` of the
            index-creation body. ``None`` is treated as an empty dict.
        mapping_settings: Dict sent as the ``mappings`` section. Omitted from
            the request when empty or ``None``.
        movie_dict: Mapping whose values are movie documents; each document
            must have an ``"id"`` key, which is used as the document ``_id``.
            ``None`` is treated as an empty dict.
        es_url: Base URL of the Elasticsearch node.
        index: Name of the index to recreate and fill.

    Returns:
        The ``requests`` response of the final ``_bulk`` call. The function
        does not inspect it, so callers should check it themselves.
    """
    # None sentinels replace the mutable `{}` defaults of the original signature.
    analysis_settings = {} if analysis_settings is None else analysis_settings
    movie_dict = {} if movie_dict is None else movie_dict

    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysis_settings,
            }
        }
    }

    if mapping_settings:
        settings["mappings"] = mapping_settings

    index_url = f"{es_url}/{index}"

    print("using stettings\n", settings)
    # Delete first so the new settings/mappings apply to a fresh index; the
    # status code is only reported, not checked.
    resp = requests.delete(index_url)
    print(f"previouse index removal: {resp.status_code}")
    resp = requests.put(index_url, json=settings)
    print("index creation", resp.json())

    # The bulk body is newline-delimited JSON: one action line followed by one
    # document line per movie, each terminated by "\n".
    bulk_lines = []
    for movie in movie_dict.values():
        add_cmd = {"index": {"_index": index, "_id": movie["id"]}}
        bulk_lines.append(json.dumps(add_cmd))
        bulk_lines.append(json.dumps(movie))
    bulk_movies = "".join(line + "\n" for line in bulk_lines)

    resp = requests.post(
        f"{es_url}/_bulk",
        data=bulk_movies,
        headers={'Content-Type': 'application/json'}
    )
    return resp

In [4]:
def search(query, es_url="http://localhost:9200", index="tmdb"):
    """Run a search request and print the returned hits as a small table.

    Args:
        query: Request body for the ``_search`` endpoint (a dict that is sent
            as JSON).
        es_url: Base URL of the Elasticsearch node.
        index: Name of the index to search.

    Raises:
        RuntimeError: If the response body has no ``hits`` section, e.g. when
            Elasticsearch rejected the query. The message carries the status
            code and the response body.
    """
    # Typeless endpoint: the typed form (<index>/_doc/_search) is deprecated
    # in Elasticsearch 7 and removed in 8.
    url = f"{es_url}/{index}/_search"
    resp = requests.get(url, json=query)

    body = resp.json()
    if 'hits' not in body:
        # Surface the server's error instead of an opaque KeyError: 'hits'.
        raise RuntimeError(f"search failed ({resp.status_code}): {body}")

    print("Num\tid\tRelevance\tMovie title")

    for idx, hit in enumerate(body['hits']['hits']):
        print(f"{idx}\t{hit['_id']}\t{hit['_score']}\t{hit['_source']['title']}")

In [5]:
# Load the movie dump once; reindex() iterates it as a mapping of movie documents.
movie_dict = extract()

In [6]:
# Analysis configuration passed to reindex() as settings.index.analysis.
analysis_settings = {
    "analyzer": {
        # Analyzer registered under the name "default".
        "default": {"type": "english"},
        # Custom chain: standard tokens -> lowercase -> stem -> shingles.
        "english_bigrams": {
            "tokenizer": "standard",
            "filter": ["lowercase", "porter_stem", "bigram_filter"],
        },
    },
    "filter": {
        # Shingle size fixed at 2 with unigrams switched off.
        "bigram_filter": {
            "type": "shingle",
            "min_shingle_size": 2,
            "max_shingle_size": 2,
            "output_unigrams": False,
        },
    },
}

In [7]:
def _name_field_mapping():
    """Return a fresh mapping for a person-name text field.

    The field uses the "english" analyzer and carries a "bigramed" sub-field
    that uses "english_bigrams". A new dict is built on every call so the
    cast and directors entries never share state.
    """
    return {
        "type": "text",
        "analyzer": "english",
        "fields": {
            "bigramed": {"type": "text", "analyzer": "english_bigrams"},
        },
    }


# cast.name and directors.name get the same shape of mapping.
mapping_settings = {
    "properties": {
        "cast": {"properties": {"name": _name_field_mapping()}},
        "directors": {"properties": {"name": _name_field_mapping()}},
    }
}

In [8]:
# Recreate the index with the analysis and mapping settings defined above and
# bulk-load every movie; the bulk response is kept in `r`.
r = reindex(analysis_settings, mapping_settings, movie_dict)

using stettings
 {'settings': {'number_of_shards': 1, 'index': {'analysis': {'analyzer': {'default': {'type': 'english'}, 'english_bigrams': {'tokenizer': 'standard', 'filter': ['lowercase', 'porter_stem', 'bigram_filter']}}, 'filter': {'bigram_filter': {'type': 'shingle', 'min_shingle_size': 2, 'max_shingle_size': 2, 'output_unigrams': False}}}}}, 'mappings': {'properties': {'cast': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}}}}, 'directors': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}}}}}}}
previouse index removal: 200
index creation {'acknowledged': True, 'shards_acknowledged': True, 'index': 'tmdb'}


In [18]:
# Free-text user input, searched with a most_fields multi_match over the title
# and the bigram sub-fields of cast and director names.
user_search = "star trek patrick stewart william shatner"
query = {
    "query": {
        "multi_match": {
            "query": user_search,
            "fields": [
                "title",
                "cast.name.bigramed",
                "directors.name.bigramed",
            ],
            "type": "most_fields",
        }
    }
}

In [10]:
# Leaving out overview leaves us with the albino elephant problem.

In [19]:
# Print the ranked hits for the most_fields multi_match query built above.
search(query)

Num	id	Relevance	Movie title
0	172	20.892147	Star Trek V: The Final Frontier
1	193	19.903265	Star Trek: Generations
2	200	15.065897	Star Trek: Insurrection
3	201	14.39519	Star Trek: Nemesis
4	199	13.657165	Star Trek: First Contact
5	152	13.020787	Star Trek: The Motion Picture
6	154	12.988506	Star Trek II: The Wrath of Khan
7	168	12.557999	Star Trek IV: The Voyage Home
8	157	12.302769	Star Trek III: The Search for Spock
9	174	11.7537365	Star Trek VI: The Undiscovered Country


###### This is where we finished chapter 5

#### 6.3.2

In [20]:
# Same user_search, now sent as a query_string query over the title and the
# bigram sub-fields of cast and director names.
query = {
    "query": {
        "query_string": {
            "query": user_search,
            "fields": [
                "title",
                "cast.name.bigramed",
                "directors.name.bigramed",
            ],
        }
    }
}

In [21]:
# Print the ranked hits for the query_string version of the search.
search(query)

Num	id	Relevance	Movie title
0	193	10.895414	Star Trek: Generations
1	13475	10.760254	Star Trek
2	200	9.007851	Star Trek: Insurrection
3	201	9.007851	Star Trek: Nemesis
4	54138	9.007851	Star Trek Into Darkness
5	172	8.1974535	Star Trek V: The Final Frontier
6	152	7.7462955	Star Trek: The Motion Picture
7	199	7.7462955	Star Trek: First Contact
8	5851	7.0745726	Showtime
9	12610	6.878936	Osmosis Jones


In [23]:
# already works 🤔, let's add overview and see how it goes

In [24]:
# query_string search again, with overview added to the searched fields.
query = {
    "query": {
        "query_string": {
            "query": user_search,
            "fields": [
                "title",
                "overview",
                "cast.name.bigramed",
                "directors.name.bigramed",
            ],
        }
    }
}

In [25]:
# Print the ranked hits now that overview is among the searched fields.
search(query)

Num	id	Relevance	Movie title
0	18126	14.751948	Hannah Montana: The Movie
1	193	10.895414	Star Trek: Generations
2	13475	10.760254	Star Trek
3	168	10.057888	Star Trek IV: The Voyage Home
4	200	9.007851	Star Trek: Insurrection
5	201	9.007851	Star Trek: Nemesis
6	54138	9.007851	Star Trek Into Darkness
7	172	8.1974535	Star Trek V: The Final Frontier
8	9476	8.182335	A Knight's Tale
9	10567	8.1140175	Dinosaur


In [26]:
# hannah montana 🤔

In [37]:
# query_string search over the plain (non-bigram) name fields, with a ^10
# boost on cast.name.
query = {
    "query": {
        "query_string": {
            "query": user_search,
            "fields": [
                "title",
                "overview",
                "cast.name^10",
                "directors.name",
            ],
        }
    }
}

In [38]:
# Print the ranked hits for the query_string search with cast.name boosted.
search(query)

Num	id	Relevance	Movie title
0	193	135.12723	Star Trek: Generations
1	5851	97.44539	Showtime
2	12610	94.6119	Osmosis Jones
3	9904	92.86618	The Wild
4	10040	86.62215	Miss Congeniality 2: Armed and Fabulous
5	154	83.66445	Star Trek II: The Wrath of Khan
6	8834	82.89814	Conspiracy Theory
7	1649	82.85913	Bill & Ted's Bogus Journey
8	172	79.57234	Star Trek V: The Final Frontier
9	168	77.672806	Star Trek IV: The Voyage Home


In [45]:
# Add a people.name field to the existing mapping, analyzed the same way as
# cast.name and directors.name (english, plus a "bigramed" sub-field).
mapping_settings["properties"]["people"] = {
    "properties": {
        "name": {
            "type": "text",
            "analyzer": "english",
            "fields": {
                "bigramed": {"type": "text", "analyzer": "english_bigrams"},
            },
        },
    },
}

In [40]:
# Point both name fields at people.name via copy_to, so cast and director
# names end up in that one combined field.
mapping_settings["properties"]["cast"]["properties"]["name"]["copy_to"] = "people.name"
mapping_settings["properties"]["directors"]["properties"]["name"]["copy_to"] = "people.name"

In [46]:
# Rebuild the index so the updated mapping (people.name + copy_to) takes
# effect; as the last expression, the returned bulk response is displayed.
reindex(analysis_settings, mapping_settings, movie_dict)

using stettings
 {'settings': {'number_of_shards': 1, 'index': {'analysis': {'analyzer': {'default': {'type': 'english'}, 'english_bigrams': {'tokenizer': 'standard', 'filter': ['lowercase', 'porter_stem', 'bigram_filter']}}, 'filter': {'bigram_filter': {'type': 'shingle', 'min_shingle_size': 2, 'max_shingle_size': 2, 'output_unigrams': False}}}}}, 'mappings': {'properties': {'cast': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}, 'copy_to': 'people.name'}}}, 'directors': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}, 'copy_to': 'people.name'}}}, 'people': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigrammed': {'type': 'text', 'analyzer': 'english_bigrams'}}}}}}}}
previouse index removal: 200
index creation {'acknowledged': True, 'shards_acknowledged': True, 'index': 'tmdb'}


<Response [200]>

In [47]:
# Plain match query against the combined people.name field only.
query_people = {
    "query": {
        "match": {"people.name": "patrick stewart william shatner"},
    },
}

In [48]:
# Print the ranked hits for the match query on the combined people.name field.
search(query_people)

Num	id	Relevance	Movie title
0	193	13.423487	Star Trek: Generations
1	172	10.511824	Star Trek V: The Final Frontier
2	9904	9.807777	The Wild
3	5851	9.551281	Showtime
4	12610	9.049201	Osmosis Jones
5	10040	8.603318	Miss Congeniality 2: Armed and Fabulous
6	1649	8.468272	Bill & Ted's Bogus Journey
7	154	8.286876	Star Trek II: The Wrath of Khan
8	8834	8.181319	Conspiracy Theory
9	168	7.728797	Star Trek IV: The Voyage Home


In [49]:
# Match query against the _all field, kept for comparison with the copy_to
# approach used in the other cells.
user_search = "star trek patrick stewart william shatner"
query = {
    "query": {
        "match": {"_all": user_search},
    },
}

In [50]:
# Print the hits for the match query on _all.
search(query)

Num	id	Relevance	Movie title


In [51]:
# _all is not available in ES 6.0+, copy_to should be used -> skipping this

In [70]:
# cross_fields multi_match over title, overview and both name fields, with
# overview weighted down by ^0.1.
query = {
    "query": {
        "multi_match": {
            "query": user_search,
            "fields": [
                "title",
                "overview^0.1",
                "cast.name",
                "directors.name",
            ],
            "type": "cross_fields",
        }
    }
}

In [72]:
# Print the ranked hits for the cross_fields multi_match query.
search(query)

Num	id	Relevance	Movie title
0	193	13.512724	Star Trek: Generations
1	5851	9.744539	Showtime
2	12610	9.46119	Osmosis Jones
3	9904	9.286617	The Wild
4	13475	9.033916	Star Trek
5	10040	8.662214	Miss Congeniality 2: Armed and Fabulous
6	154	8.366446	Star Trek II: The Wrath of Khan
7	8834	8.289814	Conspiracy Theory
8	1649	8.285913	Bill & Ted's Bogus Journey
9	201	7.9637537	Star Trek: Nemesis


In [77]:
# most_fields multi_match that uses the combined people.name field instead of
# the separate cast/director fields; overview is weighted by ^0.5.
query = {
    "query": {
        "multi_match": {
            "query": user_search,
            "fields": [
                "title",
                "overview^0.5",
                "people.name",
            ],
            "type": "most_fields",
        }
    }
}

In [78]:
# Print the ranked hits for the most_fields query over title, overview and people.name.
search(query)

Num	id	Relevance	Movie title
0	193	24.12532	Star Trek: Generations
1	168	19.552435	Star Trek IV: The Voyage Home
2	172	17.306519	Star Trek V: The Final Frontier
3	201	17.040085	Star Trek: Nemesis
4	200	15.842175	Star Trek: Insurrection
5	154	15.081571	Star Trek II: The Wrath of Khan
6	152	15.064396	Star Trek: The Motion Picture
7	199	14.339674	Star Trek: First Contact
8	157	14.271722	Star Trek III: The Search for Spock
9	174	13.47393	Star Trek VI: The Undiscovered Country


In [79]:
# bool/should of two cross_fields clauses: one over the people fields
# (directors.name, cast.name) and one over the text fields (overview, title).
query = {
    "query": {
        "bool": {
            "should": [
                {"multi_match": {
                    "query": user_search,
                    "fields": ["directors.name", "cast.name"],
                    "type": "cross_fields",
                }},
                {"multi_match": {
                    "query": user_search,
                    "fields": ["overview", "title"],
                    "type": "cross_fields",
                }},
            ],
        }
    }
}

In [80]:
# Print the ranked hits for the bool/should query with two cross_fields clauses.
search(query)

Num	id	Relevance	Movie title
0	168	23.529747	Star Trek IV: The Voyage Home
1	193	21.658974	Star Trek: Generations
2	201	17.672958	Star Trek: Nemesis
3	18126	16.92695	Hannah Montana: The Movie
4	200	14.535864	Star Trek: Insurrection
5	154	14.071022	Star Trek II: The Wrath of Khan
6	152	13.833348	Star Trek: The Motion Picture
7	172	13.661812	Star Trek V: The Final Frontier
8	199	13.214497	Star Trek: First Contact
9	157	13.203346	Star Trek III: The Search for Spock


In [91]:
# bool/should of two cross_fields clauses: the first searches only the bigram
# sub-fields of the name fields, the second searches overview (^0.5),
# title (^2) and the plain name fields together.
query = {
    "query": {
        "bool": {
            "should": [
                {"multi_match": {
                    "query": user_search,
                    "fields": ["directors.name.bigramed", "cast.name.bigramed"],
                    "type": "cross_fields",
                }},
                {"multi_match": {
                    "query": user_search,
                    "fields": ["overview^0.5", "title^2", "directors.name", "cast.name"],
                    "type": "cross_fields",
                }},
            ],
        }
    }
}

In [92]:
# Print the ranked hits for the bool/should query that adds the bigram clause.
search(query)

Num	id	Relevance	Movie title
0	193	26.020739	Star Trek: Generations
1	201	22.518124	Star Trek: Nemesis
2	168	22.201403	Star Trek IV: The Voyage Home
3	200	21.18337	Star Trek: Insurrection
4	199	18.917883	Star Trek: First Contact
5	152	18.281506	Star Trek: The Motion Picture
6	13475	18.067833	Star Trek
7	154	17.602966	Star Trek II: The Wrath of Khan
8	172	17.309155	Star Trek V: The Final Frontier
9	157	16.917229	Star Trek III: The Search for Spock
