## RULE BASED RETRIEVAL 

### aka why won't my LLM do what I tell it to when I tell it to 

### SETUP

In [1]:
import logging

from pinecone import PodSpec

from whyhow_rbr import Client, Rule, IndexNotFoundException

  from tqdm.autonotebook import tqdm


In [2]:
# Configure parameters
index_name = "whyhow-demo"
namespace = "BC-CS688"
pdfs = ["../data/full_book_one.pdf"]

In [3]:
# Logging
logging_level = logging.INFO

logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger = logging.getLogger("create_index")
logger.setLevel(logging_level)

In [4]:
# Initialize client
client = Client()

In [5]:
try:
    index = client.get_index(index_name)
    logger.info(f"Index {index_name} already exists, reusing it")
except IndexNotFoundException:
    spec = PodSpec(environment="gcp-starter")
    index = client.create_index(index_name, spec=spec)
    logger.info(f"Index {index_name} created")

2024-04-01 16:10:36,135 - INFO - create_index - Index whyhow-demo already exists, reusing it


In [6]:
# Upload, split, chunk, and vectorize documents in Pinecone
client.upload_documents(index=index, documents=pdfs, namespace=namespace)

Upserted vectors: 100%|██████████| 1156/1156 [00:08<00:00, 133.76it/s]


### RULES

In [29]:
rules = [
    Rule(
        # Replace with your filename
        filename="full_book_one.pdf",
        page_numbers=[40],
        keywords=['friends']
    ),
    Rule(
        # Replace with your filename
        filename="doc2.pdf",
        page_numbers=[2],
        keywords=[],
    )
]

In [35]:
question = "Who does Harry know? Like who are his friends?"
top_k = 5

In [36]:
result = client.query(
    question=question,
    index=index,
    namespace=namespace,
    rules=rules,
    top_k=top_k,
    process_rules_separately=False,
    keyword_trigger=False
)

In [37]:
answer = result["answer"]

logger.info(f"Answer: {answer}")

2024-04-01 16:18:22,626 - INFO - create_index - Answer: I don't have the context documents to answer who Harry's friends are. Please provide the relevant context or specify which Harry you are referring to.


### WHAT IF I WANT IT TO FIND KEYWORDS 

In [38]:
question = "What does Harry Potter like to eat?"

In [39]:
rule = Rule(
    filename="../data/full_book_one.pdf",
    page_numbers=[15, 30, 45],
    keywords=["food", "favorite"]
)

In [40]:
result = client.query(
    question=question,
    index=index,
    namespace=namespace,
    rules=[rule],
    keyword_trigger=True
)

print(result["answer"])
print(result["matches"])
print(result["used_contexts"])

Harry Potter likes to eat roast beef, roast chicken, pork chops, lamb chops, sausages, bacon, steak, boiled potatoes, roast potatoes, chips, Yorkshire pudding, peas, carrots, gravy, ketchup, chocolate éclairs, jam doughnuts, trifle, strawberries, jelly, rice pudding, treacle tart, and Bertie Bott's Every-Flavour Beans, but not mint humbugs.
[{'id': '../data/full_book_one.pdf-85-1', 'score': 0.6526559, 'metadata': {'text': 'piled with food. He had never seen so many things he liked to eat on one table: roast beef, roast chicken, pork chops and lamb chops, sausages, bacon and steak, boiled potatoes, roast potatoes, chips, Yorkshire pudding, peas, carrots, gravy , ketchup and, for some strange reason, mint humbugs. The Dursleys had never exactly starved Harry , but he’d never been allowed to eat as much as he liked. Dudley had always taken anything that Harry really wanted, even if it made him sick. Harry', 'page_number': 85, 'chunk_number': 1, 'filename': '../data/full_book_one.pdf', 'uu

### WHAT IF WE WANT IT TO RUN EACH RULE IN A ROW

In [41]:
question = "What is Harry Potter's favorite food?"

In [43]:
rule_1 = Rule(
    filename="data/full_book_one.pdf",
    page_numbers=[120, 121, 150]
)

rule_2 = Rule(
    filename="data/full_book_one.pdf",
    page_numbers=[80, 81, 82]
)

result = client.query(
    question=question,
    index=index,
    namespace=namespace,
    rules=[rule_1, rule_2],
    process_rules_separately=True
)