In [1]:
import requests
import json

In [2]:
def extract():
    f = open('../chapter-3/tmdb.json')
    if f:
        return json.loads(f.read())

In [44]:
def reindex(analysis_settings={}, mapping_settings={}, movie_dict={}, transform=None):
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysis_settings,
            }
        }
    }
    
    if mapping_settings:
        settings["mappings"] = mapping_settings
    
    
    print("using stettings\n", settings)
    resp = requests.delete("http://localhost:9200/tmdb")
    print(f"previouse index removal: {resp.status_code}")
    resp = requests.put("http://localhost:9200/tmdb", json=settings)
    print("index creation", resp.json())
    
    bulk_movies = ""
    for id, movie in movie_dict.items():
        add_cmd = {
            "index": {"_index": "tmdb", "_id": movie["id"]}
        }
        if transform:
            transform(movie)
            
        bulk_movies += json.dumps(add_cmd) + "\n" + json.dumps(movie) + "\n"
    
    resp = requests.post(
        "http://localhost:9200/_bulk",
        data=bulk_movies,
        headers={'Content-Type': 'application/json'}
    )
    return resp

In [4]:
def search(query):
    url = 'http://localhost:9200/tmdb/_doc/_search'
    resp = requests.get(url, json=query)
        
    try:
        search_hits = json.loads(resp.text)['hits']
    
        print("Num\tid\tRelevance\tMovie title")
        for idx, hit in enumerate(search_hits['hits']):
            print(f"{idx}\t{hit['_id']}\t{hit['_score']}\t{hit['_source']['title']}")
    except:
        pass
    
    return resp

In [5]:
analysis_settings = {
   "analyzer": {
      "default": {
        "type": "english"
      },
      "english_bigrams": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "porter_stem",
            "bigram_filter"
          ]
      }
    },
    "filter": {
        "bigram_filter": {
          "type": "shingle",
          "min_shingle_size": 2,
          "max_shingle_size": 2,
          "output_unigrams": False
        }
    }
}

In [6]:
mapping_settings = {
    "properties": {
        "cast": {
            "properties": {
                "name": {
                    "type": "text",
                    "analyzer": "english",
                    "fields": {
                        "bigramed": {
                            "type": "text",
                            "analyzer": "english_bigrams"
                        }     
                    }
                }
            }
        },
        "directors": {
            "properties": {
                "name": {
                    "type": "text",
                    "analyzer": "english",
                    "fields": {
                        "bigramed": {
                            "type": "text",
                            "analyzer": "english_bigrams"
                        }     
                    }
                }
            }
        }
    }
}

In [18]:
reindex(analysis_settings, mapping_settings, extract())

using stettings
 {'settings': {'number_of_shards': 1, 'index': {'analysis': {'analyzer': {'default': {'type': 'english'}, 'english_bigrams': {'tokenizer': 'standard', 'filter': ['lowercase', 'porter_stem', 'bigram_filter']}}, 'filter': {'bigram_filter': {'type': 'shingle', 'min_shingle_size': 2, 'max_shingle_size': 2, 'output_unigrams': False}}}}}, 'mappings': {'properties': {'cast': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}}}}, 'directors': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}}}}}}}
previouse index removal: 200
index creation {'acknowledged': True, 'shards_acknowledged': True, 'index': 'tmdb'}


<Response [200]>

In [8]:
user_search = "patrick stewart william shatner"
query = {
    "query": {
        "multi_match": {
            "query": user_search,
            "fields": ["title", "directors.name", "cast.name"],
            "type": "cross_fields"
        }
    }
}

In [9]:
search(query)

Num	id	Relevance	Movie title
0	193	13.487319	Star Trek: Generations
1	5851	9.604372	Showtime
2	12610	9.327592	Osmosis Jones
3	9904	9.216275	The Wild
4	10040	8.54747	Miss Congeniality 2: Armed and Fabulous
5	8834	8.327571	Conspiracy Theory
6	154	8.256831	Star Trek II: The Wrath of Khan
7	1649	8.222833	Bill & Ted's Bogus Journey
8	172	7.8560176	Star Trek V: The Final Frontier
9	168	7.669857	Star Trek IV: The Voyage Home


<Response [200]>

### 7.2.3 

In [10]:
query = {
    "query": {
        "bool": {
            "should": [
                {
                    "multi_match": {
                        "query": user_search,
                        "fields": ["title", "directors.name", "cast.name"],
                        "type": "cross_fields"
                    }
                },
                {
                    "match_phrase": {
                        "title": {"query": "star trek"}
                    }
                }
            ]
        },
    }
}

In [11]:
search(query)

Num	id	Relevance	Movie title
0	193	22.520575	Star Trek: Generations
1	200	15.981052	Star Trek: Insurrection
2	154	15.16114	Star Trek II: The Wrath of Khan
3	201	15.10723	Star Trek: Nemesis
4	152	15.076138	Star Trek: The Motion Picture
5	172	14.751929	Star Trek V: The Final Frontier
6	168	14.5619755	Star Trek IV: The Voyage Home
7	199	14.457286	Star Trek: First Contact
8	157	14.293464	Star Trek III: The Search for Spock
9	174	13.4498825	Star Trek VI: The Undiscovered Country


<Response [200]>

In [12]:
# this already works because of the switch from TFxIDF to BM25 in Lucene
# https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/
# setting boost to higher value actually simulates the book example

In [13]:
query_w_boost = {
    "query": {
        "bool": {
            "should": [
                {
                    "multi_match": {
                        "query": user_search,
                        "fields": ["title", "directors.name", "cast.name"],
                        "type": "cross_fields"
                    }
                },
                {
                    "match_phrase": {
                        "title": {"query": "star trek", "boost": 10}
                    }
                }
            ]
        },
    }
}
search(query_w_boost)

Num	id	Relevance	Movie title
0	13475	108.40272	Star Trek
1	193	103.59122	Star Trek: Generations
2	200	97.05169	Star Trek: Insurrection
3	201	96.17787	Star Trek: Nemesis
4	54138	90.07849	Star Trek Into Darkness
5	152	84.79279	Star Trek: The Motion Picture
6	199	84.17394	Star Trek: First Contact
7	154	76.31339	Star Trek II: The Wrath of Khan
8	172	75.90418	Star Trek V: The Final Frontier
9	168	75.714226	Star Trek IV: The Voyage Home


<Response [200]>

### 7.2.4

In [14]:
query = {
    "query": {
        "function_score": {
            "query": {
                "multi_match": {
                    "query": user_search,
                    "fields": ["title", "directors.name", "cast.name"],
                    "type": "cross_fields"
                }
            },
            "functions": [
                {
                    "weight": 2.5,
                    "filter": {
                        "match_phrase": { "title": "star trek"}
                    }
                }
            ]
        }
    }
}

In [15]:
resp = search(query)

Num	id	Relevance	Movie title
0	193	33.78181	Star Trek: Generations
1	154	20.916115	Star Trek II: The Wrath of Khan
2	172	19.893085	Star Trek V: The Final Frontier
3	168	19.418203	Star Trek IV: The Voyage Home
4	157	18.746922	Star Trek III: The Search for Spock
5	152	18.324604	Star Trek: The Motion Picture
6	200	17.433002	Star Trek: Insurrection
7	199	16.777475	Star Trek: First Contact
8	174	16.63797	Star Trek VI: The Undiscovered Country
9	201	15.248449	Star Trek: Nemesis


### 7.3

In [16]:
query_w_filter = {
    "query": {
        "bool": {
            "should": [
                {
                    "multi_match": {
                        "query": user_search,
                        "fields": ["title", "directors.name", "cast.name"],
                        "type": "cross_fields"
                    }
                }
            ],
            "filter": [
                {
                    "match_phrase": {
                        "title": {"query": "star trek"}
                    }
                }
            ]
        },
    }
}
search(query_w_boost)

Num	id	Relevance	Movie title
0	13475	108.40272	Star Trek
1	193	103.59122	Star Trek: Generations
2	200	97.05169	Star Trek: Insurrection
3	201	96.17787	Star Trek: Nemesis
4	54138	90.07849	Star Trek Into Darkness
5	152	84.79279	Star Trek: The Motion Picture
6	199	84.17394	Star Trek: First Contact
7	154	76.31339	Star Trek II: The Wrath of Khan
8	172	75.90418	Star Trek V: The Final Frontier
9	168	75.714226	Star Trek IV: The Voyage Home


<Response [200]>

### 7.4.2

In [28]:
SENTINEL_BEGIN = "SENTINEL_BEGIN"
SENTINEL_END = "SENTINEL_END"

def transform(movie):
    movie["title_exact_match"] = f"{SENTINEL_BEGIN} {movie['title']} {SENTINEL_END}"

In [45]:
reindex(analysis_settings, mapping_settings, extract(), transform)

using stettings
 {'settings': {'number_of_shards': 1, 'index': {'analysis': {'analyzer': {'default': {'type': 'english'}, 'english_bigrams': {'tokenizer': 'standard', 'filter': ['lowercase', 'porter_stem', 'bigram_filter']}}, 'filter': {'bigram_filter': {'type': 'shingle', 'min_shingle_size': 2, 'max_shingle_size': 2, 'output_unigrams': False}}}}}, 'mappings': {'properties': {'cast': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}}}}, 'directors': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}}}}}}}
previouse index removal: 200
index creation {'acknowledged': True, 'shards_acknowledged': True, 'index': 'tmdb'}


<Response [200]>

In [30]:
user_search = "star trek"

In [41]:
query = {
    "query": {
        "match_phrase": {
            "title_exact_match": {
                "query": f"{SENTINEL_BEGIN} {user_search} {SENTINEL_END}",
                "boost": 0.1
            }
        }
    }
}

In [46]:
search(query)

Num	id	Relevance	Movie title
0	13475	1.0588807	Star Trek


<Response [200]>

In [48]:
def transform(movie):
    movie["title_exact_match"] = f"{SENTINEL_BEGIN} {movie['title']} {SENTINEL_END}"
    
    movie["names_exact_match"] = []
    for person in movie["cast"] + movie["directors"]:
        movie["names_exact_match"].append(
            f"{SENTINEL_BEGIN} {person['name']} {SENTINEL_END}"
        )

In [49]:
reindex(analysis_settings, mapping_settings, extract(), transform)

using stettings
 {'settings': {'number_of_shards': 1, 'index': {'analysis': {'analyzer': {'default': {'type': 'english'}, 'english_bigrams': {'tokenizer': 'standard', 'filter': ['lowercase', 'porter_stem', 'bigram_filter']}}, 'filter': {'bigram_filter': {'type': 'shingle', 'min_shingle_size': 2, 'max_shingle_size': 2, 'output_unigrams': False}}}}}, 'mappings': {'properties': {'cast': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}}}}, 'directors': {'properties': {'name': {'type': 'text', 'analyzer': 'english', 'fields': {'bigramed': {'type': 'text', 'analyzer': 'english_bigrams'}}}}}}}}
previouse index removal: 200
index creation {'acknowledged': True, 'shards_acknowledged': True, 'index': 'tmdb'}


<Response [200]>

In [54]:
user_search = "Star Trek Patrick Stewart"

In [57]:
query = {
    "query": {
        "bool": {
            "should": [
                {
                    "match_phrase": {
                        "title_exact_match": {
                            "query": f"{SENTINEL_BEGIN} {user_search} {SENTINEL_END}",
                            "boost": 1000
                        }
                    }
                },
                {
                    "multi_match": {
                        "query": user_search,
                        "fields": ["title", "directors.name", "cast.name"],
                        "type": "cross_fields"
                    }
                }
            ]
        }
    }
}

In [58]:
search(query)

Num	id	Relevance	Movie title
0	13475	10.760254	Star Trek
1	193	9.007851	Star Trek: Generations
2	200	9.007851	Star Trek: Insurrection
3	201	9.007851	Star Trek: Nemesis
4	54138	9.007851	Star Trek Into Darkness
5	152	7.7462955	Star Trek: The Motion Picture
6	199	7.7462955	Star Trek: First Contact
7	204082	7.6290894	Homefront
8	426	7.1596975	Vertigo
9	154	6.794695	Star Trek II: The Wrath of Khan


<Response [200]>