# Query Elasticsearch, store results in `trec_eval` format

Environment used: Chorus - The Elasticsearch Edition available at https://github.com/querqy/chorus-elasticsearch-edition

To successfully follow the steps, clone the repository and run the quickstart:

`git clone https://github.com/querqy/chorus-elasticsearch-edition`

`cd chorus-elasticsearch-edition`

`./quickstart.sh -vector`

## Install Elasticsearch Python Client

In [None]:
!python3 -m pip install elasticsearch

## Import libraries

In [None]:
from elasticsearch import Elasticsearch
import numpy as np
import pandas as pd
import json

## Query Elasticsearch and check connection

In [None]:
# Client for the Elasticsearch instance expected on localhost:9200
# (the Chorus quickstart described above is assumed to be running).
es = Elasticsearch("http://localhost:9200")

# Fields the Querqy query is matched against; a `^N` suffix is a field boost.
searched_fields = [
    "id",
    "name",
    "title^15",
    "product_type^10",
    "short_description^5",
    "ean",
    "search_attributes",
]

# Smoke-test query: search for "laptop" through the `querqy` query type with
# the "replace" and "common_rules" rewriters applied.
querqy_query = {
    "querqy": {
        "matching_query": {"query": "laptop"},
        "query_fields": searched_fields,
        "rewriters": ["replace", "common_rules"],
    }
}
resp = es.search(index="ecommerce", query=querqy_query)

# Print the total hit count followed by one line per returned document.
total_hits = resp['hits']['total']['value']
print("Got %d Hits:" % total_hits)
for hit in resp['hits']['hits']:
    source = hit["_source"]
    print("%(id)s %(title)s: %(name)s" % source)

## Read the list of queries and get the results from Elasticsearch for each query

In [None]:
# Run every query from the query list against Elasticsearch and store the
# results as a `trec_eval` run file.
#
# The cell works with two for loops:
# 1) one to iterate over the list of queries and have a query id instead of a query
# 2) another one to iterate over the result sets to have the position of the
#    result in the result set
#
# The resulting DataFrame has the columns:
# query_id: the id of the query as the trec_eval tool needs a numeric id rather
#           than a query string as an identifier
# Q0: all lines have Q0, currently unused by trec_eval
# product_id: the id of the product in the hit list
# position: the position of the product in the result set (starting at 0)
# relevance: relevance as given by the search engine, BM25 score in the case of
#            Elasticsearch
# run: the name of the query run
RUN_COLUMNS = ['query_id', 'Q0', 'product_id', 'position', 'relevance', 'run']

# Tab-separated file without a header row; the loop below reads the query id
# from the first column and the query string from the second.
df_queries = pd.read_csv('../data/queries.txt', sep="\t", header=None)

# Collect plain dicts and build the DataFrame once at the end:
# `DataFrame.append` was removed in pandas 2.0 and copied the whole frame on
# every call.
rows = []

for query in df_queries.itertuples():
    # itertuples() yields (index, column 0, column 1), so query[1] is the
    # query id and query[2] is the query string.
    # NOTE: no `size` is passed, so the number of hits per query is the server
    # default (10 in stock Elasticsearch) - confirm if more results are needed.
    resp = es.search(index="ecommerce", query={
        "querqy": {
            "matching_query": {
                "query": query[2]
            },
            "query_fields": [
                "id",
                "name",
                "title^15",
                "product_type^10",
                "short_description^5",
                "ean",
                "search_attributes"
            ],
            "rewriters": [
                "replace",
                "common_rules"
            ]
        }
    })

    for position, hit in enumerate(resp['hits']['hits']):
        rows.append({
            'query_id': str(query[1]),
            'Q0': "Q0",
            'product_id': hit["_id"],
            'position': str(position),
            'relevance': hit["_score"],
            'run': 'es',
        })

# Passing `columns` fixes the column order required by trec_eval and keeps the
# frame well-formed even when no query returned any hit.
df_relevance = pd.DataFrame(rows, columns=RUN_COLUMNS)

# store the DataFrame without header and index, with tabs as delimiters
name = '../data/es_result'
df_relevance.to_csv(name, sep="\t", header=False, index=False)
df_relevance