<a href="https://colab.research.google.com/github/waishun78/hungry-rag/blob/main/hungry_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Installing the necessary packages
!pip install --upgrade pip
!pip install 'farm-haystack[all]' ## or 'all-gpu' for the GPU-enabled dependencies

!pip install -U accelerate
!pip install bitsandbytes
!pip install SentencePiece
!pip install evaluate
!pip install bert_score
!pip install transformers
!pip install elasticsearch

!pip install googlemaps
# !apt install libgraphviz-dev
# !pip install pygraphviz

Collecting pip
  Obtaining dependency information for pip from https://files.pythonhosted.org/packages/47/6a/453160888fab7c6a432a6e25f8afe6256d0d9f2cbd25971021da6491d899/pip-23.3.1-py3-none-any.whl.metadata
  Downloading pip-23.3.1-py3-none-any.whl.metadata (3.5 kB)
Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.2.1
    Uninstalling pip-23.2.1:
      Successfully uninstalled pip-23.2.1
Successfully installed pip-23.3.1
Collecting farm-haystack[all]
  Downloading farm_haystack-1.22.1-py3-none-any.whl.metadata (28 kB)
Collecting boilerpy3 (from farm-haystack[all])
  Downloading boilerpy3-1.0.7-py3-none-any.whl.metadata (5.8 kB)
Collecting events (from farm-haystack[all])
  Downloading Events-0.5-py3-none-any.whl.metadata (3.9 kB)
Collecting httpx (from f

In [3]:
!nvidia-smi

Wed Nov 29 03:32:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Import necessary modules
import os
from subprocess import Popen, PIPE, STDOUT
import torch
import torch.nn as nn
import bitsandbytes as bnb
import accelerate
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import LlamaTokenizer
from elasticsearch import helpers
from elasticsearch import Elasticsearch



In [5]:
# Download & extract Elasticsearch 7.0.0
# The archive is fetched quietly (-q) and unpacked into ./elasticsearch-7.0.0.

!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.0.0-linux-x86_64.tar.gz -q
!tar -xzf elasticsearch-7.0.0-linux-x86_64.tar.gz
# Hand the extracted tree to the `daemon` user; ESNode.createServer later starts
# the server process after switching to uid 1 ("as daemon").
!chown -R daemon:daemon elasticsearch-7.0.0

In [6]:
# Load the LLM and its Tokenizer
# `model_l` and `tokenizer_l` are shared by every LLM-backed pipeline node defined below.
load_in_8bit = True  # forwarded to from_pretrained to request 8-bit weights
model_name_l = "lmsys/vicuna-7b-v1.5"  # Hugging Face model id used for both model and tokenizer

model_l = AutoModelForCausalLM.from_pretrained(
    model_name_l,
    torch_dtype=torch.float16,
    load_in_8bit=load_in_8bit,
    device_map="auto"
)
tokenizer_l = LlamaTokenizer.from_pretrained(model_name_l)

Downloading config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/162 [00:00<?, ?B/s]



Downloading tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [7]:
from haystack.nodes.base import BaseComponent

class Query_Rewriter(BaseComponent):
    """Pipeline node that asks the LLM for clarifying follow-up questions.

    The node builds a one-shot prompt around the user's query, samples a
    continuation from the model and returns the generated text together with
    the query and the prompt that produced it.
    """

    outgoing_edges = 1

    def __init__(self, model, tokenizer):
        """Keep references to the generation model and its tokenizer.

        Args:
            model: Object exposing ``generate(...)`` whose result has a
                ``sequences`` attribute.
            tokenizer: Callable that encodes a prompt into a mapping containing
                ``"input_ids"`` and that exposes ``decode``.
        """
        self.model = model
        self.tokenizer = tokenizer

    def _generate(self, prompt, temperature, max_new_tokens):
        """Sample a continuation of ``prompt`` and return only the new text."""
        encoded = self.tokenizer(prompt, return_tensors='pt')
        # Use the GPU when one is available; otherwise stay on the CPU instead
        # of failing on a hard-coded "cuda" device.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        input_ids = encoded["input_ids"].to(device)
        with torch.no_grad():
            generation_output = self.model.generate(
                input_ids=input_ids,
                temperature=temperature,
                top_p=1.0,
                do_sample=True,
                return_dict_in_generate=True,
                max_new_tokens=max_new_tokens,
            )

        # The returned sequence starts with the prompt tokens; keep only what
        # was generated after them.
        new_tokens = generation_output.sequences[0][len(input_ids[0]):]
        # decode() is called without extra options, so special tokens such as
        # the end-of-sequence marker may remain in the returned text.
        return self.tokenizer.decode(new_tokens)

    def run(self, query: str):
        """Generate follow-up questions for ``query``.

        Args:
            query: The user's free-text request.

        Returns:
            A tuple ``(result, "output_1")`` where ``result`` holds the
            original ``"query"``, the full ``"prompt"`` and the generated
            ``"output"`` text.
        """
        prompt = f"""Generate some follow up questions I need to better answer your query
                  Example:
                  Question: Where to get Japanese food in Marina Bay Sands that is fine-dining and has a romantic atmosphere at night?
                  Answer:
                  1. What type of Japanese food are you looking for? e.g. sushi, ramen, teppanyaki, etc.
                  2. Do you have a preference for specific dishes or would you like to try a variety of options?
                  3. Are you looking for a specific price range for the food?
                  4. What are you looking for in a romantic atmsphere?
                  5. How late at night are you eating?
                  Question: {query}
                  Answer:"""
        output = self._generate(prompt, temperature=0.7, max_new_tokens=200)
        return {"query":query, "prompt":prompt, "output":output}, "output_1"

    def run_batch(self, queries):
        """Run :meth:`run` on every query in ``queries``.

        Returns:
            A tuple ``(results, "output_1")``; each element of ``results`` is
            the ``(result, edge)`` tuple returned by :meth:`run`.
        """
        output = [self.run(query) for query in queries]
        return output, "output_1"

2023-11-29 03:42:44,552	INFO util.py:129 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [8]:
from haystack import Pipeline
# Create the custom Query_Rewriter node on top of the shared model and tokenizer
query_rewriter = Query_Rewriter(model_l, tokenizer_l)
# Create a Haystack pipeline whose only node is the rewriter, fed directly from the query input
p = Pipeline()
p.add_node(component=query_rewriter, name="QueryRewriter", inputs=["Query"])

In [66]:
# First worked example: ask the rewriter pipeline for clarifying questions.
# NOTE(review): the variables are named `indian_*` although the query text is about waffles.
indian_query = "Find me a waffle place near Marina Bay, Singapore with good reviews"
indian_result = p.run(query = indian_query)

In [67]:
# Show the original query next to the clarifying questions generated for it
print(f'''
  query: {indian_result["query"]}
  output: {indian_result["output"]}
  ''')


  query: Find me a waffle place near Marina Bay, Singapore with good reviews
  output: 
                  1. Do you have a preference for a specific type of waffle (e.g. sweet, savory, gluten-free, etc.)?
                  2. Are you looking for a specific price range for the waffles?
                  3. Do you have any dietary restrictions or allergies?
                  4. Are you looking for a particular ambiance or setting?
                  5. What other factors are important to you in addition to the waffle place (e.g. proximity to other attractions, availability of outdoor seating, etc.)?</s>
  


In [68]:
# Bundle the query, the generated clarifying questions and hand-written answers
# into the dict shape the downstream nodes read
# ("question", "clarifier", "additional_user_criteria").
# The answers are numbered to line up with the generated questions; they were
# typed for one particular generation and may not match a fresh run.
updated_indian_query = {
    "question":indian_result["query"],
    "clarifier": indian_result["output"],
    "additional_user_criteria":
      """
      1. Chocolate
      2. Cheap
      3. No
      4. Comfort
      """
}

In [52]:
# Second worked example: a two-part request (bubble tea, then Korean fried chicken)
korean_query = "I want to buy bubble tea and then have Korean fried chicken. Can you give me some suggestions?"
korean_result = p.run(query = korean_query)

In [53]:
# Show the original query next to the clarifying questions generated for it
print(f'''
  query: {korean_result["query"]}
  output: {korean_result["output"]}
  ''')


  query: I want to buy bubble tea and then have Korean fried chicken. Can you give me some suggestions?
  output: 
                  Yes, of course! Here are some suggestions based on your request:
                  1. What type of bubble tea are you in the mood for? e.g. milk tea, fruit tea, etc.
                  2. Do you have a preferred brand or are you open to trying new brands?
                  3. Are you looking for a specific type of Korean fried chicken or are you open to trying different options?
                  4. Do you have any dietary restrictions or preferences?
                  5. How much are you looking to spend?</s>
  


In [54]:
# Same dict shape as the first example; here only the last clarifying question
# is answered and the others are marked NIL.
updated_korean_query = {
    "question":korean_result["query"],
    "clarifier": korean_result["output"],
    "additional_user_criteria":
      """
      1. NIL
      2. NIL
      3. NIL
      4. NIL
      5. 20$
      """
}

In [55]:
class Annotate_Needs(BaseComponent):
    """Pipeline node that labels the user's requirements with category tags.

    The node expects a dict with the keys ``"question"``, ``"clarifier"`` and
    ``"additional_user_criteria"``, prompts the LLM with a one-shot example and
    stores the generated annotation on that same dict.
    """

    outgoing_edges = 1

    def __init__(self, model, tokenizer):
        """Keep references to the generation model and its tokenizer.

        Args:
            model: Object exposing ``generate(...)`` whose result has a
                ``sequences`` attribute.
            tokenizer: Callable that encodes a prompt into a mapping containing
                ``"input_ids"`` and that exposes ``decode``.
        """
        self.model = model
        self.tokenizer = tokenizer

    def _generate(self, prompt, temperature, max_new_tokens):
        """Sample a continuation of ``prompt`` and return only the new text."""
        encoded = self.tokenizer(prompt, return_tensors='pt')
        # Use the GPU when one is available; otherwise stay on the CPU instead
        # of failing on a hard-coded "cuda" device.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        input_ids = encoded["input_ids"].to(device)
        with torch.no_grad():
            generation_output = self.model.generate(
                input_ids=input_ids,
                temperature=temperature,
                top_p=1.0,
                do_sample=True,
                return_dict_in_generate=True,
                max_new_tokens=max_new_tokens,
            )

        # The returned sequence starts with the prompt tokens; keep only what
        # was generated after them.
        new_tokens = generation_output.sequences[0][len(input_ids[0]):]
        # decode() is called without extra options, so special tokens such as
        # the end-of-sequence marker may remain in the returned text.
        return self.tokenizer.decode(new_tokens)

    def run(self, query: dict):
        """Annotate the requirements contained in ``query``.

        Args:
            query: Dict with ``"question"``, ``"clarifier"`` and
                ``"additional_user_criteria"`` entries.

        Returns:
            A tuple ``(query, "output_1")``. ``query`` is the dict that was
            passed in, updated in place with a ``"requirements"`` entry.
        """
        print(f'Summarizing user needs.... using query:{query}\n\n')

        prompt = f"""Generate the user's query requirements annotated with the requirement labels. Here are the categories:
                    Price, Opening, Location, Cuisine, Dietary Restrictions, Ratings, Reservations, Landmark, General (Atmosphere, idea-related needs)

                  Example:
                  Question: Where to get Japanese food in Marina Bay Sands that is fine-dining and has a romantic atmosphere at night?
                  Clarifying Questions:
                  1. What type of Japanese food are you looking for? e.g. sushi, ramen, teppanyaki, etc.
                  2. Do you have a preference for specific dishes or would you like to try a variety of options?
                  3. Are you looking for a specific price range for the food?
                  4. What are you looking for in a romantic atmsphere?
                  5. How late at night are you eating?
                  Additional User Criteria:
                  1. Sushi
                  2. Mostly just sushi
                  3. $$
                  4. Intimate with nice lighting, Inside Marina Bay Sands
                  5. 22:00 minimally
                  Answer:
                  Sushi[Cuisine], $$ [Price], Intimate with nice lighting [General-Atmosphere], Inside Marina Bay Sands [Location], 22:00 minimally [Opening]

                  ________________________________________________________________________________________________________
                  Question: {query["question"]}
                  Clarifying Questions: {query["clarifier"]}
                  Additional User Criteria:{query["additional_user_criteria"]}
                  Answer:"""
        # The result is written onto the caller's dict on purpose: later nodes
        # read "requirements" from that same object.
        query["requirements"] = self._generate(prompt, temperature=0.7, max_new_tokens=200)
        return query, "output_1"

    def run_batch(self, queries):
        """Run :meth:`run` on every query dict in ``queries``.

        Returns:
            A tuple ``(results, "output_1")``; each element of ``results`` is
            the ``(query, edge)`` tuple returned by :meth:`run`.
        """
        output = [self.run(query) for query in queries]
        return output, "output_1"

In [13]:
class Search_Term_Generator(BaseComponent):
    """Pipeline node that turns the annotated request into search queries.

    The node expects a dict with the keys ``"question"``, ``"clarifier"``,
    ``"additional_user_criteria"`` and ``"requirements"``, prompts the LLM with
    a one-shot example and stores the generated text on that same dict.
    """

    outgoing_edges = 1

    def __init__(self, model, tokenizer):
        """Keep references to the generation model and its tokenizer.

        Args:
            model: Object exposing ``generate(...)`` whose result has a
                ``sequences`` attribute.
            tokenizer: Callable that encodes a prompt into a mapping containing
                ``"input_ids"`` and that exposes ``decode``.
        """
        self.model = model
        self.tokenizer = tokenizer
        # TODO: Might need to change the temperature to be less random and favour more "safe" search terms
        # TODO: Might reduce the max tokens generated as well

    def _generate(self, prompt, temperature, max_new_tokens):
        """Sample a continuation of ``prompt`` and return only the new text."""
        encoded = self.tokenizer(prompt, return_tensors='pt')
        # Use the GPU when one is available; otherwise stay on the CPU instead
        # of failing on a hard-coded "cuda" device.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        input_ids = encoded["input_ids"].to(device)
        with torch.no_grad():
            generation_output = self.model.generate(
                input_ids=input_ids,
                temperature=temperature,
                top_p=1.0,
                do_sample=True,
                return_dict_in_generate=True,
                max_new_tokens=max_new_tokens,
            )

        # The returned sequence starts with the prompt tokens; keep only what
        # was generated after them.
        new_tokens = generation_output.sequences[0][len(input_ids[0]):]
        # decode() is called without extra options, so special tokens such as
        # the end-of-sequence marker may remain in the returned text.
        return self.tokenizer.decode(new_tokens)

    def run(self, query: dict):
        """Generate search queries for the request described by ``query``.

        Args:
            query: Dict with ``"question"``, ``"clarifier"``,
                ``"additional_user_criteria"`` and ``"requirements"`` entries.

        Returns:
            A tuple ``(query, "output_1")``. ``query`` is the dict that was
            passed in, updated in place with a ``"search_terms"`` entry.
        """
        print(f'Generating Search Query.... with query:{query}\n\n')
        prompt = f"""Generate the search queries based on the example given below. Here are the categories and increasing importance (1-5), include more important terms more often in queries:
                    Price - 3, Opening - 4, Location - 5, Cuisine - 5, Dietary Restrictions - 5, Ratings - 2, Reservations - 2, Landmark - 2 , General - 1 (Atmosphere, idea-related needs)

                  Example:
                  Question:
                  Where to get Japanese food in Marina Bay Sands that is fine-dining and has a romantic atmosphere at night?
                  Clarifying Questions:
                  1. What type of Japanese food are you looking for? e.g. sushi, ramen, teppanyaki, etc.
                  2. Do you have a preference for specific dishes or would you like to try a variety of options?
                  3. Are you looking for a specific price range for the food?
                  4. What are you looking for in a romantic atmsphere?
                  5. How late at night are you eating?
                  Additional User Input:
                  1. Sushi
                  2. Mostly just sushi
                  3. $$
                  4. Intimate with nice lighting, Inside Marina Bay Sands
                  5. 22:00 minimally
                  Requirements:
                  Sushi[Cuisine], $$ [Price], Intimate with nice lighting [General-Atmosphere], Inside Marina Bay Sands [Location], 22:00 minimally [Opening]
                  Answer:
                  Marina Bay Sands Japanese food open till 2200, Romantic Japanese restaurant in Marina Bay Sands, Central Sushi open till late 30-40SGD

                  ________________________________________________________________________________________________________
                  Question:
                  {query["question"]}
                  Clarifying Questions:
                  {query["clarifier"]}
                  Additional User Input:
                  {query["additional_user_criteria"]}
                  Requirements:
                  {query["requirements"]}
                  Answer:
                  """
        # The result is written onto the caller's dict on purpose: later nodes
        # read "search_terms" from that same object.
        query["search_terms"] = self._generate(prompt, temperature=0.8, max_new_tokens=100)
        return query, "output_1"

    def run_batch(self, queries):
        """Run :meth:`run` on every query dict in ``queries``.

        Returns:
            A tuple ``(results, "output_1")``; each element of ``results`` is
            the ``(query, edge)`` tuple returned by :meth:`run`.
        """
        output = [self.run(query) for query in queries]
        return output, "output_1"

Param Constructor Node for Yelp API (Not in use)

In [15]:
class ParamConstructor(BaseComponent):
  """Pipeline node that asks the LLM to write search parameters as JSON text.

  The node expects a dict with the keys ``"question"``, ``"clarifier"``,
  ``"additional_user_criteria"``, ``"requirements"`` and ``"search_terms"``
  and stores the generated text on that same dict.
  """

  outgoing_edges = 1

  def __init__(self, model, tokenizer):
    """Keep references to the generation model and its tokenizer.

    Args:
        model: Object exposing ``generate(...)`` whose result has a
            ``sequences`` attribute.
        tokenizer: Callable that encodes a prompt into a mapping containing
            ``"input_ids"`` and that exposes ``decode``.
    """
    self.model = model
    self.tokenizer = tokenizer

  def _generate(self, prompt, temperature, max_new_tokens):
    """Sample a continuation of ``prompt`` and return only the new text."""
    encoded = self.tokenizer(prompt, return_tensors='pt')
    # Use the GPU when one is available; otherwise stay on the CPU instead
    # of failing on a hard-coded "cuda" device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    input_ids = encoded["input_ids"].to(device)
    with torch.no_grad():
      generation_output = self.model.generate(
          input_ids=input_ids,
          temperature=temperature,
          top_p=1.0,
          do_sample=True,
          return_dict_in_generate=True,
          max_new_tokens=max_new_tokens,
      )

    # The returned sequence starts with the prompt tokens; keep only what
    # was generated after them.
    new_tokens = generation_output.sequences[0][len(input_ids[0]):]
    # decode() is called without extra options, so special tokens such as
    # the end-of-sequence marker may remain in the returned text.
    return self.tokenizer.decode(new_tokens)

  def run(self, query: dict):
    """Generate the parameter text for the request described by ``query``.

    Args:
        query: Dict with ``"question"``, ``"clarifier"``,
            ``"additional_user_criteria"``, ``"requirements"`` and
            ``"search_terms"`` entries.

    Returns:
        A tuple ``(query, "output_1")``. ``query`` is the dict that was
        passed in, updated in place with a ``"params_constructed"`` entry
        holding the raw generated text.
    """
    print(f'Generating Params.... with query:{query}')
    prompt = f"""Construct the params based on the example given below. Here are the categories and their descriptions along with increasing importance (1-5), include more important terms more often in params:
                  Location(5): This string indicates the geographic area to be used when searching for restaurants.
                  Term(5): Search term, e.g. food or restaurants. The term may also be the business's name, such as Starbucks.
                  If term is not included the endpoint will default to searching across businesses from a small number of popular categories.
                  Categories(4):Array of strings. Categories to filter the search results with. See the list of supported categories. The category filter can be a list of comma delimited categories.
                  e.g., bars,french will filter by Bars OR French.
                  Price(4):Array of integers. Pricing levels to filter the search result with: 1 = $, 2 = $$, 3 = $$$, 4 = $$$$. The price filter can be a list of comma delimited pricing levels.
                  e.g., 1, 2, 3 will filter the results to show the ones that are $, $$, or $$$.

                  Example:
                  Question:
                  Where to get Japanese food in Marina Bay Sands that is fine-dining and has a romantic atmosphere at night?
                  Clarifying Questions:
                  1. What type of Japanese food are you looking for? e.g. sushi, ramen, teppanyaki, etc.
                  2. Do you have a preference for specific dishes or would you like to try a variety of options?
                  3. Are you looking for a specific price range for the food?
                  4. What are you looking for in a romantic atmsphere?
                  5. How late at night are you eating?
                  Additional User Input:
                  1. Sushi
                  2. Mostly just sushi
                  3. $$
                  4. Intimate with nice lighting, Inside Marina Bay Sands
                  5. 22:00 minimally
                  Requirements:
                  Sushi[Cuisine], $$ [Price], Intimate with nice lighting [General-Atmosphere], Inside Marina Bay Sands [Location], 22:00 minimally [Opening]
                  Search Terms:
                  Sushi Restaurant $$ in MBS, Marina Bay Sands Japanese food open till 2200, Romantic Japanese restaurant in Marina Bay Sands, Central Sushi open till late 30-40SGD
                  Answer:
                  {{
                    "location":"Marina Bay Sands",
                    "term":"Sushi",
                    "price":[2],
                    "categories":"restaurants",
                    "open_now":"true"
                  }}


                  ________________________________________________________________________________________________________
                  Question:
                  {query["question"]}
                  Clarifying Questions:
                  {query["clarifier"]}
                  Additional User Input:
                  {query["additional_user_criteria"]}
                  Requirements:
                  {query["requirements"]}
                  Search Terms:
                  {query["search_terms"]}
                  Answer:
                  """
    # The result is written onto the caller's dict on purpose: later nodes
    # read "params_constructed" from that same object.
    query["params_constructed"] = self._generate(prompt, temperature=0.3, max_new_tokens=100)
    return query, "output_1"

  def run_batch(self, queries):
    """Run :meth:`run` on every query dict in ``queries``.

    Returns:
        A tuple ``(results, "output_1")``; each element of ``results`` is
        the ``(query, edge)`` tuple returned by :meth:`run`.
    """
    output = [self.run(query) for query in queries]
    return output, "output_1"


Yelp Node (Not in Use)

In [None]:
import urllib.parse
import json
import requests
class YelpNode(BaseComponent):
  """Pipeline node that searches businesses on api.yelp.com/v3 and attaches reviews.

  ``run`` parses the JSON text stored under ``query["params_constructed"]``,
  performs a business search with it and adds the reviews of every returned
  business to the search result.
  """

  outgoing_edges = 1

  # Marker the text generator may leave at the end of its output.
  _END_OF_SEQUENCE = "</s>"

  def __init__(self, api_key, params=None):
    """Store the API key and optional default parameters.

    Args:
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        params: Optional dict of parameters kept on the instance.
    """
    self.api_key = api_key
    # A fresh dict per instance avoids sharing one mutable default between nodes.
    self.params = {} if params is None else params

  def construct_business_url(self, params: dict) -> str:
    """Build (and print) the business-search URL for ``params``.

    List values are expanded into one ``key=value`` pair per element; every
    value is percent-encoded.
    """
    base_url = "https://api.yelp.com/v3/businesses/search?"
    query_parameters = []

    for key, value in params.items():
      # A list (e.g. several price levels) is sent as repeated keys.
      values = value if isinstance(value, list) else [value]
      query_parameters.extend(f"{key}={urllib.parse.quote(str(v))}" for v in values)

    url = base_url + "&".join(query_parameters)
    print(url)
    return url

  def get_business_review(self, alias:str):
    """Fetch the reviews endpoint for the business ``alias`` and return the parsed JSON."""
    headers = {"Authorization": f"Bearer {self.api_key}"}
    base_url = "https://api.yelp.com/v3/businesses/{alias}/reviews?limit=20&sort_by=yelp_sort".format(alias=alias)
    response = requests.get(base_url, headers=headers)
    return response.json()

  def run(self, query: dict):
    """Search businesses using the parameters generated for ``query``.

    Args:
        query: Dict whose ``"params_constructed"`` entry holds JSON text,
            optionally followed by the ``</s>`` marker.

    Returns:
        A tuple ``({"query": query, "results": result_json}, "output_1")``
        where every entry of ``result_json["businesses"]`` carries a
        ``"reviews"`` list.

    Raises:
        json.JSONDecodeError: If the parameter text is not valid JSON.
        requests.HTTPError: If the business search returns an error status.
    """
    print(f'Generating Yelp Response.... with query:{query}')
    raw_params = query["params_constructed"].strip()
    # Remove the end-of-sequence marker only when it is really there, so that
    # output cut off before the marker does not lose its last characters.
    if raw_params.endswith(self._END_OF_SEQUENCE):
      raw_params = raw_params[:-len(self._END_OF_SEQUENCE)]
    param_dict = json.loads(raw_params)
    param_dict["limit"] = "20"
    query_url = self.construct_business_url(param_dict)
    headers = {"Authorization": f"Bearer {self.api_key}"}

    response = requests.get(query_url, headers=headers)
    # Report a failed search as an HTTP error instead of a KeyError further down.
    response.raise_for_status()
    result_json = response.json()

    for business in result_json['businesses']:
      review_json = self.get_business_review(business['alias'])
      # A failed review lookup leaves this business without reviews rather
      # than aborting the whole search.
      business['reviews'] = review_json.get('reviews', [])

    # "output_1" is the edge name used by the other nodes in this notebook.
    return {"query": query, "results": result_json},  "output_1"

  def run_batch(self, queries: [dict]):
    """Build the search URL for every parameter dict in ``queries``.

    NOTE(review): unlike ``run`` this only constructs URLs and performs no
    request; confirm that this is the intended batch behaviour.
    """
    output = [self.construct_business_url(query) for query in queries]
    return output, "output_1"

In [65]:
import urllib.parse
import json
import requests
import googlemaps
class GoogleNode(BaseComponent):
  """Pipeline node that builds a textual context from Google Maps place data.

  ``run`` performs a places search with the generated search terms, fetches
  the details of the best matches and renders everything into one text block.
  """

  outgoing_edges = 1

  # Upper bound on the number of search results that are processed.
  MAX_RESULTS = 10

  def __init__(self,api_key):
    """Store the Google Maps API key used to create clients."""
    self.api_key = api_key

  @staticmethod
  def _availability(flag):
    """Render a yes/no/unknown flag as the text shown in the context."""
    if flag is None:
      return 'Information Not Available'
    return 'Yes' if flag else 'No'

  def top_results(self,searchquery):
    """Search places for ``searchquery`` and summarise the first matches.

    Returns:
        A list of at most ``MAX_RESULTS`` dicts with the keys ``place_id``,
        ``name``, ``address``, ``rating`` and ``price_level``. ``address``,
        ``rating`` and ``price_level`` are ``None`` when the search result
        does not provide them; ``rating`` is a string otherwise.
    """
    gmaps = googlemaps.Client(self.api_key)
    places_result = gmaps.places(searchquery, region="SG")
    candidates = places_result['results']
    if len(candidates) < self.MAX_RESULTS:
      print("Less than 10 options available")

    top3_results = []
    for place in candidates[:self.MAX_RESULTS]:
      # Not every place has been rated, so the field is optional.
      rating = place.get('rating')
      top3_results.append({'place_id': place['place_id'],
                           'name': place['name'],
                           'address': place.get('formatted_address'),
                           'rating': None if rating is None else str(rating),
                           'price_level': place.get('price_level')
                          })
    return top3_results

  def business_info_retrieval(self,top3_results):
    """Fetch place details for every entry of ``top3_results``.

    Returns:
        A list, in the same order, of dicts with the keys ``place_id``,
        ``phone_number``, ``delivery``, ``dine_in``, ``opening_hours`` (one
        line per entry of ``weekday_text``) and ``reviews`` (review texts
        separated by blank lines).
    """
    gmaps = googlemaps.Client(self.api_key)
    top3_business_info = []
    for summary in top3_results:
      details = gmaps.place(summary['place_id'])['result']

      # Both the opening-hours entry and its weekday_text list are optional.
      opening_hours = details.get('opening_hours') or {}
      weekday_lines = opening_hours.get('weekday_text') or []
      opening_hours_text = "".join(f"{hours}\n" for hours in weekday_lines)

      reviews = details.get('reviews') or []
      review_text = "\n" + "".join("\n" + review['text'] + "\n" for review in reviews)

      top3_business_info.append({'place_id': summary['place_id'],
                                 'phone_number': details.get('formatted_phone_number'),
                                 'delivery': details.get('delivery'),
                                 'dine_in': details.get('dine_in'),
                                 'opening_hours': opening_hours_text,
                                 'reviews': review_text})
    return top3_business_info

  def final_context_generator(self,top3_results, top3_business_info):
    """Render the search summaries and place details as one text block.

    The two lists are combined position by position; every place becomes an
    ``Option: <n>:`` section.
    """
    final_context_text = ''

    for position, (summary, details) in enumerate(zip(top3_results, top3_business_info), start=1):
      merged = {**summary, **details}
      final_context_text += f'''\n
Option: {position}:
Name: {merged['name']}
Address: {merged['address']}
Phone: {merged['phone_number']}
Delivery Available: {self._availability(merged.get('delivery'))}
Dine-In Available: {self._availability(merged.get('dine_in'))}
Opening Hours:
{merged['opening_hours']}
Price Level: {merged['price_level']} / 4 (0- Free, 1- Inexpensive, 2- Moderate, 3- Expensive, 4- Very Expensive, None- Price Data Not Available)
Rating: {merged['rating']}
Reviews: {merged['reviews']}'''
    return final_context_text


  def run(self, query: dict):
    """Build the place context for ``query["search_terms"]``.

    Returns:
        A tuple ``({"query": query, "context": text}, "output_1")``.
    """
    print(f'Generating Google Response.... with query:{query}')
    search_terms = query["search_terms"]
    input1 = self.top_results(search_terms)
    print("exists" if input1 else "nonexistent")
    input2 = self.business_info_retrieval(input1)
    input3 = self.final_context_generator(input1, input2)

    # "output_1" is the edge name used by the other nodes in this notebook.
    return {"query": query, "context": input3},  "output_1"

  def run_batch(self, queries: [dict]):
    """Run :meth:`run` on every query dict in ``queries``.

    Returns:
        A tuple ``(results, "output_1")``; each element of ``results`` is
        the ``(result, edge)`` tuple returned by :meth:`run`.
    """
    output = [self.run(query) for query in queries]
    return output, "output_1"

In [155]:
import time
class ESNode(BaseComponent):
    """Pipeline node backed by a local Elasticsearch index of food places.

    Besides the pipeline entry points (``run`` / ``run_batch``) the class
    offers helpers to start the server process, create the index, load
    documents and query them.
    """

    outgoing_edges = 1

    # Name of the index every helper of this node works on.
    INDEX_NAME = "food_index"

    def __init__(self,hosturl):
        """Remember ``hosturl``.

        NOTE(review): the value is only stored; ``connectServer`` always
        connects to localhost:9200.
        """
        self.hosturl = hosturl

    def createServer(self):
        """Start the extracted Elasticsearch 7.0.0 binary as a child process."""
        # NOTE(review): stdout/stderr are piped but never read in this class;
        # confirm the pipe cannot fill up during a long session.
        self.es_server = Popen(['elasticsearch-7.0.0/bin/elasticsearch'],
                  stdout=PIPE, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon
                 )

    def connectServer(self, settings, startup_wait=20):
        """Connect to the local server and create the index if it is missing.

        Args:
            settings: Index body passed to ``indices.create``.
            startup_wait: Seconds to sleep before connecting, giving a freshly
                started server time to come up. Pass 0 when it already runs.
        """
        time.sleep(startup_wait)
        self.es = Elasticsearch(hosts = [{"host":"localhost", "port":9200}])
        if not self.es.indices.exists(index=self.INDEX_NAME):
            food_index = self.es.indices.create(index=self.INDEX_NAME, ignore=[400,404], body=settings)
            print(food_index)
        self.pingServer()

    def pingServer(self):
        """Print whether the server answers a ping."""
        print(f"The Elastic Server is active: {self.es.ping()}")

    def json_formatter(self, dataset, index_name, index_type='_doc'):
        """Convert ``dataset`` into bulk actions for Elasticsearch.

        Args:
            dataset: Mapping whose values are iterables of row mappings; every
                row becomes one document containing all of its fields.
            index_name: Name of the index the documents are written to.
            index_type: Unused; kept so existing calls keep working.

        Returns:
            A list of ``{"_index": ..., "_source": ...}`` dicts, or ``None``
            when the dataset could not be processed (the problem is printed).
        """
        try:
            actions = []
            for rows in dataset.values():
                for row in rows:
                    source = {field: row[field] for field in row.keys()}
                    actions.append({'_index': index_name, '_source': source})
            return actions

        except Exception as e:
            print("There is a problem: {}".format(e))
            return None

    def postData(self, json_Formatted_dataset):
        """Send the prepared actions through the bulk helper; failures are printed."""
        try:
            helpers.bulk(self.es, json_Formatted_dataset)
            print("Successfully imported to elasticsearch.")
        except Exception as e:
            print(f"error: {e}")

    def runQuery(self, infoQuery, size=5):
        """Match ``infoQuery`` against the ``query`` field of the index.

        Args:
            infoQuery: Text to search for.
            size: Maximum number of hits to return.

        Returns:
            The raw search response.
        """
        return self.es.search(
            index=self.INDEX_NAME,
            body={
                "size":size,
                "query":{
                    "bool":{
                        "must":[
                                {"match":{"query":infoQuery,
                                }}
                        ]
                    }
                }
            }
        )

    @staticmethod
    def _availability(flag):
        """Render a yes/no/unknown flag as the text shown in the context."""
        if flag is None:
            return 'Information Not Available'
        return 'Yes' if flag else 'No'

    def formatResponse(self, searchResults):
        """Render the hits of a search response as one text block.

        Returns:
            One ``Option: <n>:`` section per hit, or an empty string when the
            search returned no hits.
        """
        hits = searchResults['hits']['hits']
        # Looking at the first hit is only possible when there is one.
        if hits:
            print(hits[0]['_source'].keys())
        final_context_text = ""
        for idx, hit in enumerate(hits):
            source = hit['_source']
            final_context_text += f'''\n
Option: {idx+1}:
Name: {source['name']}
Address: {source['address']}
Phone: {source['phone_number']}
Delivery Available: {self._availability(source.get('delivery'))}
Dine-In Available: {self._availability(source.get('dine_in'))}
Opening Hours:
{source['opening_hours']}
Price Level: {source['price_level']} / 4 (0- Free, 1- Inexpensive, 2- Moderate, 3- Expensive, 4- Very Expensive, None- Price Data Not Available)
Rating: {source['rating']}
Reviews: {source['reviews']}'''
        return final_context_text

    def run(self, query):
        """Build the context for ``query["search_terms"]`` from the index.

        Returns:
            A tuple ``({"query": ..., "context": ..., "results": ...}, "output_1")``
            with the formatted text and the raw search response.
        """
        print(f'Generating Context Response from ES index.... with query:{query}')
        search_terms = query["search_terms"]
        results = self.runQuery(search_terms)
        formatted_context = self.formatResponse(results)
        # "output_1" is the edge name used by the other nodes in this notebook.
        return {"query": query, "context":formatted_context, "results":results},  "output_1"

    def run_batch(self, queries: [dict]):
        """Run :meth:`run` on every query dict in ``queries``.

        Returns:
            A tuple ``(results, "output_1")``; each element of ``results`` is
            the ``(result, edge)`` tuple returned by :meth:`run`.
        """
        output = [self.run(query) for query in queries]
        return output, "output_1"
        

In [156]:
# Index body used when `food_index` is created: a single shard without
# replicas, plus one typed mapping per document field.
settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            field_name: {"type": field_type}
            for field_name, field_type in (
                ("query", "text"),
                ("name", "text"),
                ("address", "text"),
                ("opening_hours", "text"),
                ("phone_number", "text"),
                ("price_level", "integer"),
                ("reviews", "text"),
                ("delivery", "boolean"),
                ("rating", "float"),
                ("dine_in", "boolean"),
            )
        }
    },
}

In [157]:
# Instantiate the ESNode pipeline component with the address "localhost:9200/",
# then call its createServer() helper.
# NOTE(review): createServer() presumably launches the local Elasticsearch
# process that the following cell looks for - confirm in ESNode.
haystack_elastic = ESNode("localhost:9200/")
haystack_elastic.createServer()

In [158]:
%%bash
# List the running processes whose command line mentions "elasticsearch".
# If you see 1 root and 2 daemon processes, the Elasticsearch instance has started successfully.
ps -ef | grep elasticsearch

daemon       467      47  2 03:48 ?        00:01:17 /kaggle/working/elasticsearch-7.0.0/jdk/bin/java -Xms1g -Xmx1g -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=75 -XX:+UseCMSInitiatingOccupancyOnly -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.io.tmpdir=/tmp/elasticsearch-14964786339350526237 -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m -Djava.locale.providers=COMPAT -Dio.netty.allocator.type=unpooled -Des.path.home=/kaggle/working/elasticsearch-7.0.0 -Des.path.conf=/kaggle/working/elasticsearch-7.0.0/config -Des.distri

In [159]:
# Call connectServer() with the index settings/mappings dict; the cell output
# reports whether the Elasticsearch server is active.
# NOTE(review): presumably this also creates the index from `settings` - confirm in ESNode.connectServer.
haystack_elastic.connectServer(settings)

The Elastic Server is active: True


In [160]:
import json
# Load the scraped Google Maps dataset. The context manager closes the file
# handle as soon as parsing finishes (a bare open() inside json.load() left it
# unclosed), and the explicit encoding keeps non-ASCII text in the JSON
# independent of the platform's default locale.
with open("/kaggle/input/google-maps-scraped-data/FB Dataset.json", "r", encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)

# Format the records for the 'food_index' index and post them to Elasticsearch.
json_Formatted_dataset = haystack_elastic.json_formatter(dataset=dataset, index_name='food_index')
haystack_elastic.postData(json_Formatted_dataset)

  dataset = json.load(open("/kaggle/input/google-maps-scraped-data/FB Dataset.json", "r"))


Successfully imported to elasticsearch.


In [70]:
import os

annotater = Annotate_Needs(model_l, tokenizer_l)
search_terms_generator = Search_Term_Generator(model_l, tokenizer_l)

# SECURITY: the Google Maps API key is read from the environment instead of
# being hard-coded in the notebook. The key that used to be committed in this
# cell is exposed in the notebook's history and should be revoked/rotated.
google_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not google_api_key:
    raise RuntimeError("Set the GOOGLE_MAPS_API_KEY environment variable before running this cell.")
google_node = GoogleNode(google_api_key)

# Create a Haystack pipeline: Query -> Annotate_Needs -> Search_Term_Generator -> Google_Node
p2 = Pipeline()
p2.add_node(component=annotater, name="Annotate_Needs", inputs=["Query"])
p2.add_node(component=search_terms_generator, name="Search_Term_Generator", inputs=["Annotate_Needs"])
p2.add_node(component=google_node, name="Google_Node", inputs=["Search_Term_Generator"])

In [71]:
# Run pipeline p2 (Annotate_Needs -> Search_Term_Generator -> Google_Node) on `updated_indian_query`.
result2 = p2.run(query = updated_indian_query)

Summarizing user needs.... using query:{'question': 'Find me a waffle place near Marina Bay, Singapore with good reviews', 'clarifier': '\n                  1. Do you have a preference for a specific type of waffle (e.g. sweet, savory, gluten-free, etc.)?\n                  2. Are you looking for a specific price range for the waffles?\n                  3. Do you have any dietary restrictions or allergies?\n                  4. Are you looking for a particular ambiance or setting?\n                  5. What other factors are important to you in addition to the waffle place (e.g. proximity to other attractions, availability of outdoor seating, etc.)?</s>', 'additional_user_criteria': '\n      1. Chocolate\n      2. Cheap\n      3. No\n      4. Comfort\n      '}


Generating Search Query.... with query:{'question': 'Find me a waffle place near Marina Bay, Singapore with good reviews', 'clarifier': '\n                  1. Do you have a preference for a specific type of waffle (e.g. sweet

In [72]:
# Show the formatted context text stored under the 'context' key of the p2 result.
print(result2['context'])



Option: 1:
Name: Taro Waffles
Address: 2 Finlayson Green, Singapore 049247
Phone: None
Delivery Available: Yes
Dine-In Available: Yes
Opening Hours:

Price Level: None / 4 (0- Free, 1- Inexpensive, 2- Moderate, 3- Expensive, 4- Very Expensive, None- Price Data Not Available)
Rating: 0
Reviews: 


Option: 2:
Name: VENCHI Singapore Marina Bay Sands
Address: 8 Bayfront Avenue B2-56A Canal Level, The Shoppes at Marina Bay Sands, 018972
Phone: 6688 7010
Delivery Available: Information Not Available
Dine-In Available: Yes
Opening Hours:
Monday: 10:30 AM – 10:00 PM
Tuesday: 10:30 AM – 10:00 PM
Wednesday: 10:30 AM – 10:00 PM
Thursday: 10:30 AM – 10:00 PM
Friday: 10:30 AM – 11:00 PM
Saturday: 10:30 AM – 11:00 PM
Sunday: 10:30 AM – 10:00 PM

Price Level: None / 4 (0- Free, 1- Inexpensive, 2- Moderate, 3- Expensive, 4- Very Expensive, None- Price Data Not Available)
Rating: 4.3
Reviews: 

Had a mini cone that offers 2 flavours. I chose milk chocolate hazelnut and summer cocktail which had a lot

In [161]:
# Build the node instances for the Elasticsearch-backed pipeline.
annotater = Annotate_Needs(model_l, tokenizer_l)
search_terms_generator = Search_Term_Generator(model_l, tokenizer_l)

# Create a Haystack pipeline: Query -> Annotate_Needs -> Search_Term_Generator -> Elastic_Node.
# The final node is the ESNode instance (haystack_elastic) created earlier in the notebook.
p3 = Pipeline()
p3.add_node(component=annotater, name="Annotate_Needs", inputs=["Query"])
p3.add_node(component=search_terms_generator, name="Search_Term_Generator", inputs=["Annotate_Needs"])
p3.add_node(component=haystack_elastic, name="Elastic_Node", inputs=["Search_Term_Generator"])

In [162]:
# Run pipeline p3 (Annotate_Needs -> Search_Term_Generator -> Elastic_Node) on `updated_indian_query`.
result3 = p3.run(query = updated_indian_query)

Summarizing user needs.... using query:{'question': 'Find me a waffle place near Marina Bay, Singapore with good reviews', 'clarifier': '\n                  1. Do you have a preference for a specific type of waffle (e.g. sweet, savory, gluten-free, etc.)?\n                  2. Are you looking for a specific price range for the waffles?\n                  3. Do you have any dietary restrictions or allergies?\n                  4. Are you looking for a particular ambiance or setting?\n                  5. What other factors are important to you in addition to the waffle place (e.g. proximity to other attractions, availability of outdoor seating, etc.)?</s>', 'additional_user_criteria': '\n      1. Chocolate\n      2. Cheap\n      3. No\n      4. Comfort\n      '}


Generating Search Query.... with query:{'question': 'Find me a waffle place near Marina Bay, Singapore with good reviews', 'clarifier': '\n                  1. Do you have a preference for a specific type of waffle (e.g. sweet

  search = self.es.search(


dict_keys(['dine_in', 'rating', 'delivery', 'reviews', 'price_level', 'phone_number', 'opening_hours', 'address', 'name', 'query'])


In [164]:
import pandas as pd
# Flatten the hits found at result3['results']['hits']['hits'] into a DataFrame for inspection.
pd.json_normalize((result3['results']['hits']['hits']))

Unnamed: 0,_index,_type,_id,_score,_source.dine_in,_source.rating,_source.delivery,_source.reviews,_source.price_level,_source.phone_number,_source.opening_hours,_source.address,_source.name,_source.query
0,food_index,_doc,mK44GYwBpYYDngoR5IPi,6.647784,True,4.7,False,\n\nOur first time here. The service was hospi...,,6837 0402,"Monday: Closed\nTuesday: 12:00 – 2:30 PM, 6:00...","30 Victoria St, #01-26/27 CHIJMES, Singapore 1...",Whitegrass Restaurant,Best restaurants in Singapore
1,food_index,_doc,ma44GYwBpYYDngoR5IPj,6.647784,True,4.6,False,\n\nLes Amis restaurant is a gastronomic wonde...,4.0,6733 2225,"Monday: 12:00 – 2:00 PM, 7:00 – 9:00 PM\nTuesd...","1 Scotts Rd, #01 - 16 Shaw Centre, Singapore 2...",Les Amis,Best restaurants in Singapore
2,food_index,_doc,mq44GYwBpYYDngoR5IPj,6.647784,True,4.5,False,\n\nOur experience at Braci was probably one o...,,6866 1933,"Monday: 12:00 – 1:30 PM, 6:00 – 11:00 PM\nTues...","52 Boat Quay, #05-01/ #06-01, Singapore 049841",Braci,Best restaurants in Singapore
3,food_index,_doc,m644GYwBpYYDngoR5IPj,6.647784,True,4.4,True,\n\nAbsolutely loved the food here. The sea ur...,3.0,6423 1228,"Monday: 12:00 – 2:30 PM, 6:00 – 11:00 PM\nTues...","22 Ann Siang Rd, Singapore 069702",Lolla,Best restaurants in Singapore
4,food_index,_doc,nK44GYwBpYYDngoR5IPj,6.647784,True,4.5,True,\n\nHad a great 6 course lunch at Beni! Servic...,4.0,9159 3177,"Monday: 12:00 – 3:00 PM, 7:00 – 10:30 PM\nTues...","333A Orchard Road, #02-37 Mandarin Gallery, 23...",béni Singapore,Best restaurants in Singapore


In [166]:
# Show the formatted context text stored under the 'context' key of the p3 result.
print(result3['context'])



Option: 1:
Name: Whitegrass Restaurant
Address: 30 Victoria St, #01-26/27 CHIJMES, Singapore 187996
Phone: 6837 0402
Delivery Available: No
Dine-In Available: Yes
Opening Hours:
Monday: Closed
Tuesday: 12:00 – 2:30 PM, 6:00 – 10:30 PM
Wednesday: 12:00 – 2:30 PM, 6:00 – 10:30 PM
Thursday: 12:00 – 2:30 PM, 6:00 – 10:30 PM
Friday: 12:00 – 2:30 PM, 6:00 – 10:30 PM
Saturday: 12:00 – 2:30 PM, 6:00 – 10:30 PM
Sunday: Closed

Price Level: None / 4 (0- Free, 1- Inexpensive, 2- Moderate, 3- Expensive, 4- Very Expensive, None- Price Data Not Available)
Rating: 4.7
Reviews: 

Our first time here. The service was hospitable and genuine, no pretense despite being a Michelin-starred restaurant. Alex and Kim were friendly and was impeccable in their service. Food was amazing with unique combinations of fresh ingredients. Chef also came over to each table to interact which we felt was an awesome touch. Strongly recommended!

Well deserved Michelin Restaurants.
The bread is so crispy on the outside an

## Archive

In [None]:
from haystack import BaseComponent

class CustomNodeA(BaseComponent):
    """Toy pipeline node that marks the query text as processed by NodeA."""

    outgoing_edges = 1

    def run(self, query: dict):
        """Print the query, append the NodeA marker to ``query["text"]`` in place, and return it.

        Returns:
            ``(query, "output_1")`` - the same dict object that was passed in.
        """
        print("A", query)
        query["text"] += " processed by NodeA"
        result = (query, "output_1")
        return result

    def run_batch(self, queries: list):
        """Call ``run`` on every query in order and collect each return value.

        Returns:
            ``(collected, "output_1")`` where ``collected`` lists what ``run``
            returned for each query.
        """
        collected = [self.run(item) for item in queries]
        return collected, "output_1"

class CustomNodeB(BaseComponent):
    """Toy pipeline node that marks the query text as also processed by NodeB."""

    outgoing_edges = 1

    def run(self, query: dict):
        """Print the query, append the NodeB marker to ``query["text"]`` in place, and return it.

        Returns:
            ``(query, "output_1")`` - the same dict object that was passed in.
        """
        print("B", query)
        query["text"] += " and then by NodeB"
        result = (query, "output_1")
        return result

    def run_batch(self, queries: list):
        """Call ``run`` on every query in order and collect each return value.

        Returns:
            ``(collected, "output_1")`` where ``collected`` lists what ``run``
            returned for each query.
        """
        collected = [self.run(item) for item in queries]
        return collected, "output_1"

class NodeC(BaseComponent):
    """Toy pipeline node that adds a ``"hi"`` entry to the query."""

    outgoing_edges = 1

    def run(self, query: dict):
        """Print the query, set ``query["hi"]`` to ``"hi"`` in place, and return it on "output_1"."""
        print("C", query)
        query["hi"] = "hi"
        result = (query, "output_1")
        return result

    def run_batch(self, queries: list):
        """Pass the batch through unchanged on "output_1" (``run`` is not applied here)."""
        passthrough = (queries, "output_1")
        return passthrough

In [None]:
from haystack import Pipeline

# Create instances of your custom nodes
node_a = CustomNodeA()
node_b = CustomNodeB()
node_c = NodeC()

# Create a Haystack pipeline and add the nodes as a linear chain:
# Query -> NodeA -> NodeB -> NodeC
pipeline = Pipeline()
pipeline.add_node(component=node_a, name="NodeA", inputs=["Query"])
pipeline.add_node(component=node_b, name="NodeB", inputs=["NodeA"])
pipeline.add_node(component=node_c, name="NodeC", inputs=["NodeB"])


In [None]:
# Render the pipeline graph to "pipeline_retrieval.png".
# NOTE(review): draw() presumably needs pygraphviz, whose install lines are
# commented out in the setup cell - confirm before running.
pipeline.draw("pipeline_retrieval.png")

In [None]:

# Prepare initial input and run the pipeline.
# The input carries a "text" key because NodeA and NodeB append to query["text"].
initial_input= {"text": "Start"}
output = pipeline.run(query=initial_input)

# Show the pipeline's final output.
print(output)