In [1]:
import numpy as np
import pandas
from tqdm.auto import tqdm

In [2]:
# Load the ground-truth questions. Downstream code reads the 'id' and
# 'question' columns of each row; 'id' is compared against the 'id' of the
# documents returned by the search function.
df_question = pandas.read_csv('../data/ground-truth-retrieval.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df_question.head()


Unnamed: 0,id,question
0,39264,Who created the humanoid robot named Jet Jagua...
1,39264,What undersea race of people seized Jet Jaguar...
2,39264,Why did the Seatopians send Megalon to the sur...
3,39264,Which companies produced the movie Godzilla vs...
4,39264,What are some keywords associated with the mov...


In [4]:
# One dict per row (column name -> value), so each record can be passed
# around on its own and indexed by key.
ground_truth = df_question.to_dict(orient='records')

In [5]:
# Inspect a single record to confirm its keys.
ground_truth[0]

{'id': 39264,
 'question': 'Who created the humanoid robot named Jet Jaguar in the movie Godzilla vs. Megalon?'}

In [6]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: a sequence with one entry per query; each entry is
            a sequence of booleans, one per returned document, that is True
            where the document is the expected one.

    Returns:
        The share of queries whose entry contains True, as a float in
        [0, 1]. Returns 0.0 when ``relevance_total`` is empty.
    """
    # An empty evaluation set has no hits; return 0.0 rather than dividing
    # by zero.
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)

    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query the score is ``1 / rank`` of the first relevant result
    (ranks start at 1), or 0 when no result is relevant. The scores are
    averaged over all queries.

    Args:
        relevance_total: a sequence with one entry per query; each entry is
            a sequence of booleans in ranking order, True where the
            returned document is the expected one.

    Returns:
        The mean reciprocal rank as a float in [0, 1]. Returns 0.0 when
        ``relevance_total`` is empty.
    """
    # An empty evaluation set scores 0.0 rather than dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for position, is_relevant in enumerate(line, start=1):
            if is_relevant == True:
                total_score += 1 / position
                # Only the first relevant result counts. Without this break,
                # a query with several matching documents would add more
                # than one reciprocal rank and could push the mean above 1.
                break

    return total_score / len(relevance_total)

In [7]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch node at localhost:9200; the search helpers
# defined in this notebook send their queries through it.
es_client = Elasticsearch('http://localhost:9200')

# Name of the index that the search helpers query.
index_name = 'movies'

def elastic_search(query):
    """Search the index with fixed field boosts and return the hit sources.

    Sends a fuzzy ``multi_match`` query (type ``best_fields``) for ``query``
    to the index named by ``index_name`` via ``es_client``, asking for at
    most 10 hits.

    Args:
        query: the text to search for.

    Returns:
        A list with the ``_source`` of each hit, in the order returned by
        Elasticsearch.
    """
    multi_match = {
        "query": query,
        "fields": ["title^3", "description^2", "overview^1.5", "genres", "keywords"],
        "type": "best_fields",
        "fuzziness": "AUTO",
    }
    request_body = {"size": 10, "query": {"multi_match": multi_match}}

    hits = es_client.search(index=index_name, body=request_body)['hits']['hits']

    return [hit['_source'] for hit in hits]

In [8]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Each record is passed to ``search_function``; every returned document
    is marked relevant when its ``'id'`` equals the record's ``'id'``.
    Progress is shown with ``tqdm``.

    Args:
        ground_truth: iterable of records, each with an ``'id'`` key.
        search_function: callable that takes one record and returns a list
            of documents, each with an ``'id'`` key.

    Returns:
        A dict with keys ``'hit_rate'`` and ``'mrr'``, computed by the
        functions of the same name over the per-record relevance lists.
    """
    def relevance_flags(record):
        # Read the expected id first, then run the search for this record.
        expected_id = record['id']
        return [doc['id'] == expected_id for doc in search_function(record)]

    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

In [9]:
# Baseline: score the fixed-boost search over every ground-truth record.
# The lambda adapts evaluate()'s record argument to the plain question text
# that elastic_search() expects.
evaluate(ground_truth, lambda q: elastic_search(q['question']))

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [05:41<00:00,  2.93it/s]


{'hit_rate': 0.687, 'mrr': 0.6390579365079364}

## Finding the best parameters

In [9]:
from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll.base import scope

In [10]:
# Hyperopt search space: one independent uniform prior on [1.0, 3.0] per
# field, stored under the key (and hyperopt label) '<field>_boost'.
search_space = {
    f'{field}_boost': hp.uniform(f'{field}_boost', 1.0, 3.0)
    for field in (
        'title',
        'genres',
        'overview',
        'production_companies',
        'tagline',
        'credits',
        'keywords',
    )
}

In [11]:
def elastic_search_search(query, params):
    """Search the index using per-field boosts taken from ``params``.

    Sends a fuzzy ``multi_match`` query (type ``best_fields``) for ``query``
    to the index named by ``index_name`` via ``es_client``, asking for at
    most 10 hits. Each searched field is weighted by
    ``params['<field>_boost']``.

    Args:
        query: the text to search for.
        params: mapping with the keys ``title_boost``, ``genres_boost``,
            ``overview_boost``, ``production_companies_boost``,
            ``tagline_boost``, ``credits_boost`` and ``keywords_boost``.

    Returns:
        A list with the ``_source`` of each hit, in the order returned by
        Elasticsearch.

    Raises:
        KeyError: if ``params`` lacks one of the boost keys.
    """
    # The tuple order fixes both the order in which boost keys are read
    # and the order of the "fields" list sent to Elasticsearch.
    field_names = (
        'title',
        'genres',
        'overview',
        'production_companies',
        'tagline',
        'credits',
        'keywords',
    )
    boosted_fields = [f"{name}^{params[name + '_boost']}" for name in field_names]

    multi_match = {
        "query": query,
        "fields": boosted_fields,
        "type": "best_fields",
        "fuzziness": "AUTO",
    }
    request_body = {"size": 10, "query": {"multi_match": multi_match}}

    hits = es_client.search(index=index_name, body=request_body)['hits']['hits']

    return [hit['_source'] for hit in hits]

In [12]:
def objective(params):
    """Loss for hyperopt: the negated MRR of a boosted search.

    Evaluates ``elastic_search_search`` with the given boosts on the first
    200 ground-truth records and returns the negative of the resulting
    ``'mrr'``, so a minimiser ends up maximising MRR.

    Args:
        params: boost values, forwarded unchanged to
            ``elastic_search_search``.

    Returns:
        ``-mrr`` for this parameter set.
    """
    # Only a slice of the ground truth is used, to keep each trial short.
    tuning_subset = ground_truth[:200]

    metrics = evaluate(
        tuning_subset,
        lambda q: elastic_search_search(q['question'], params),
    )
    return -metrics['mrr']


In [13]:
# Initialize trials object to store the result history
trials = Trials()

# Run optimization
best_params = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,  # The number of iterations you want to run
    trials=trials
)

print("Best parameters:", best_params)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

  0%|          | 0/200 [00:00<?, ?it/s]

 10%|█         | 1/10 [02:18<20:42, 138.01s/trial, best loss: -0.28307341269841274]

  0%|          | 0/200 [00:00<?, ?it/s]

 20%|██        | 2/10 [04:21<17:15, 129.46s/trial, best loss: -0.3412341269841271] 

  0%|          | 0/200 [00:00<?, ?it/s]

 30%|███       | 3/10 [06:17<14:23, 123.43s/trial, best loss: -0.3412341269841271]

  0%|          | 0/200 [00:00<?, ?it/s]

 40%|████      | 4/10 [08:25<12:30, 125.02s/trial, best loss: -0.47331746031746036]

  0%|          | 0/200 [00:00<?, ?it/s]

 50%|█████     | 5/10 [10:22<10:12, 122.40s/trial, best loss: -0.47331746031746036]

  0%|          | 0/200 [00:00<?, ?it/s]

 60%|██████    | 6/10 [12:29<08:14, 123.66s/trial, best loss: -0.47331746031746036]

  0%|          | 0/200 [00:00<?, ?it/s]

 70%|███████   | 7/10 [14:35<06:13, 124.56s/trial, best loss: -0.47331746031746036]

  0%|          | 0/200 [00:00<?, ?it/s]

 80%|████████  | 8/10 [16:23<03:58, 119.16s/trial, best loss: -0.47331746031746036]

  0%|          | 0/200 [00:00<?, ?it/s]

 90%|█████████ | 9/10 [18:29<02:01, 121.42s/trial, best loss: -0.47331746031746036]

  0%|          | 0/200 [00:00<?, ?it/s]

100%|██████████| 10/10 [20:21<00:00, 122.13s/trial, best loss: -0.47331746031746036]
Best parameters: {'credits_boost': np.float64(1.808387107586882), 'genres_boost': np.float64(1.9189552249491384), 'keywords_boost': np.float64(2.2886877440317113), 'overview_boost': np.float64(2.7267845342556107), 'production_companies_boost': np.float64(2.329601260063974), 'tagline_boost': np.float64(1.2779522585346685), 'title_boost': np.float64(2.5379715627080377)}


In [15]:
# Display the boosts selected by the optimisation run.
best_params

{'credits_boost': np.float64(1.2455632927531133),
 'genres_boost': np.float64(1.8690964556691751),
 'keywords_boost': np.float64(2.495763043344679),
 'overview_boost': np.float64(2.5734245250947705),
 'production_companies_boost': np.float64(2.364879048983533),
 'tagline_boost': np.float64(2.4531032723552704),
 'title_boost': np.float64(2.6316450728726513)}

In [43]:
# Score the tuned boosts over the ground-truth records.
# NOTE(review): objective() tunes on ground_truth[:200], and those records
# are part of this evaluation too — consider scoring on held-out records
# only, so the result is not flattered by the tuning subset.
evaluate(ground_truth,lambda q: elastic_search_search(q['question'], best_params))

100%|██████████| 225/225 [01:38<00:00,  2.28it/s]


{'hit_rate': 0.6088888888888889, 'mrr': 0.5090828924162256}