# New Feature for Interface: composition & negation for different fields.

In this notebook, we use BM25 as the retrieval model to test the feature. In `search_demo.ipynb`, BM25 is replaced with our own search algorithm.

## install and import packages

In [None]:
!pip install python-terrier

In [None]:
# Standard library
import csv
import os

# Third-party
import pandas as pd
import pyterrier as pt

# Widen the column display so long text fields and indexing information stay readable.
pd.set_option('display.max_colwidth', 150)

# Initialise PyTerrier once per kernel (the first call downloads the Terrier jars).
if not pt.started():
    pt.init()

terrier-assemblies 5.7 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.7 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



## Read the dataset and create an index for each attribute

In [None]:
# Load the recipe table; later cells read its "title", "ingredients" and "docno" columns.
# NOTE(review): the absolute /content path looks like a Colab runtime — adjust when running elsewhere.
recipe_dataset = pd.read_csv('/content/recipes.csv')

In [None]:
# Text fields that each get their own index, plus the field -> opened-index lookup
# that the indexing cell fills in.
attributes = ["title", "ingredients"]
recipe_index = {}

In [None]:
import os
# Build one Terrier index per text field and keep the opened index for retrieval.
for attribute in attributes:
  index_path = os.path.join("/content", attribute)
  # overwrite=True keeps this cell re-runnable: without it DFIndexer refuses to
  # index into a directory that already holds an index from a previous run.
  pd_indexer = pt.DFIndexer(index_path, overwrite=True)
  # astype(str) guards against non-string cells (e.g. missing values) in the text column.
  indexref = pd_indexer.index(recipe_dataset[attribute].astype(str), recipe_dataset["docno"])
  recipe_index[attribute] = pt.IndexFactory.of(indexref)

00:41:08.867 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 1 empty documents
00:41:34.957 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 1 empty documents


In [None]:
# Map each docno to its row position in recipe_dataset so ranked docnos can be
# looked up with .iloc later on.
doc_list = list(recipe_dataset["docno"])
doc_dict = {docno: position for position, docno in enumerate(doc_list)}

## ranking

In [None]:
import numpy as np

def normalize(raw_scores):
  """Min-max scale retrieval scores into the range [0, 1].

  Args:
    raw_scores: sequence (list or array) of numeric scores for one query.

  Returns:
    A float numpy array of the same length. The best score maps to 1 and the
    worst to 0. If all scores are equal (including a single score) every entry
    is 1, so a lone match is not mistaken for "no match". Empty input yields
    an empty array instead of raising.
  """
  scores = np.asarray(raw_scores, dtype=float)
  if scores.size == 0:
    return scores

  lowest = scores.min()
  spread = scores.max() - lowest
  if spread == 0:
    # No range to scale by; treat every document as an equally good match.
    return np.ones_like(scores)

  # Divide by the range (not by the maximum) so the top score is exactly 1 and
  # different attributes end up on a comparable scale.
  return (scores - lowest) / spread


In [None]:
# Free-text query per field; "no-ingredients" holds terms to exclude from the ingredients field.
full_query = {
  "title": "pizza",
  "ingredients": "oil",
  "no-ingredients": "sesame",
}
# Multiplier applied to each field's normalised score; the large negative weight
# pushes documents matching the excluded terms to the bottom of the ranking.
attribute_weights = {
  "title": 1,
  "ingredients": 1,
  "no-ingredients": -100000,
}
# Inclusive [min, max] bounds applied to the ranked results.
number_filter = {
  "rating": [4, 5],
  "calories": [200, 1000],
}

doc_ranking = recipe_dataset["docno"].tolist()
output_num = 10  # number of results to return
all_samples = len(recipe_dataset)

In [None]:
# Accumulated weighted score per document; documents that no query retrieves stay at 0.
doc_ranking_dict = dict.fromkeys(doc_ranking, 0)

for attribute, query_text in full_query.items():
  # A "no-<field>" entry is a negated query that is run against the <field> index.
  is_negated = attribute.startswith("no-")
  index_field = attribute[len("no-"):] if is_negated else attribute

  query = pd.DataFrame([[attribute, query_text]], columns=["qid", "query"])
  # Only the top 5000 hits per query are scored (and, for negated queries, penalised).
  bm25 = pt.BatchRetrieve(recipe_index[index_field], wmodel="BM25", num_results=5000)
  results = bm25.transform(query).sort_values('score', ascending=False)
  if results.empty:
    # Nothing matched: this field contributes nothing, and an empty score list
    # must never be handed to normalize().
    continue

  if is_negated:
    # Every retrieved document matched the unwanted terms, so each one gets the
    # full penalty. Normalising here would map the weakest hit to 0 and let it
    # slip through unpenalised.
    norm_scores = [1.0] * len(results)
  else:
    norm_scores = normalize(results["score"].tolist())
  attribute_weight = attribute_weights[attribute]

  # Pair docnos and scores by position: after sort_values the frame's index labels
  # are no longer guaranteed to be 0..n-1 in row order, so results["docno"][i]
  # could pick a different row than norm_scores[i].
  for docno, norm_score in zip(results["docno"], norm_scores):
    doc_ranking_dict[docno] += attribute_weight * norm_score


In [None]:
# Rank every document by its combined score, best first.
sorted_ranking = sorted(doc_ranking_dict.items(), key=lambda item: item[1], reverse=True)
rating_min, rating_max = number_filter["rating"]
calories_min, calories_max = number_filter["calories"]
output_docno = []

# Walk the ranking and keep the first `output_num` documents that pass the numeric
# filters. A for-loop always advances to the next candidate, so rows with missing
# calories can no longer stall the scan, and an empty ranking simply yields [].
for docno, _ in sorted_ranking:
  if len(output_docno) >= output_num:
    break

  result = recipe_dataset.iloc[doc_dict[docno]]
  # Positional access via .iloc: plain result[4] on a label-indexed row relies on
  # a deprecated integer fallback.
  # NOTE(review): presumably column 4 is the rating and column 9 the calories text — confirm.
  rating = result.iloc[4]
  calories_text = result.iloc[9]
  if not isinstance(calories_text, str):
    # Missing calories (e.g. NaN): the row cannot satisfy the calorie filter.
    continue
  try:
    # The leading space-separated token is parsed as the calorie count.
    calories = int(calories_text.split(" ")[0])
  except ValueError:
    # Unparsable calories text: skip the row instead of aborting the scan.
    continue

  if rating_min <= rating <= rating_max and calories_min <= calories <= calories_max:
    output_docno.append(docno)

output_docno

['fAI29y5',
 'vWkTtWr',
 '7eNZUqK',
 'YHtcx9T',
 'Fd3HTS6',
 '3IJ9LHs',
 'JzhzP2O',
 'Dn8uO4J',
 'zzbweKt',
 'eYaGaFA']