## Installations

In [1]:
# ! pip install dspy-ai==2.4.3
! pip install structlog
# ! pip install qdrant-client==1.8.0
# ! pip install fastembed
# ! pip install Jinja2
! pip install google-generativeai

Collecting structlog
  Using cached structlog-24.1.0-py3-none-any.whl.metadata (6.9 kB)
Using cached structlog-24.1.0-py3-none-any.whl (65 kB)
Installing collected packages: structlog
Successfully installed structlog-24.1.0
Collecting google-generativeai
  Using cached google_generativeai-0.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.1 (from google-generativeai)
  Using cached google_ai_generativelanguage-0.6.1-py3-none-any.whl.metadata (5.2 kB)
Collecting google-api-core (from google-generativeai)
  Using cached google_api_core-2.18.0-py3-none-any.whl.metadata (2.7 kB)
Collecting google-api-python-client (from google-generativeai)
  Using cached google_api_python_client-2.125.0-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Using cached google_auth-2.29.0-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting protobuf (from google-generativeai)
  Using cached protobuf-5.26.1-cp37-abi3-manylinux2014_

## Set LLM API key

In [1]:
# import openai
# openai.api_key = "sk-foobar"

import os
from getpass import getpass

# Never paste the key into the notebook: a hardcoded value is saved with the
# file and silently overwrites a valid key already exported in the shell.
# Prefer the existing environment; fall back to a hidden prompt when it is missing.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OPENAI_API_KEY: ').strip()

# An empty answer at the prompt still counts as "not set".
if not os.environ.get('OPENAI_API_KEY'):
    raise Exception('Environment variable "OPENAI_API_KEY" not set')


In [None]:
import os
# os.environ.pop('OPENAI_API_KEY')
# Show only whether the key is configured; echoing its value would store the
# secret in the saved notebook output.
bool(os.environ.get('OPENAI_API_KEY'))

## Import FAQs from the FAQs markdown

In [4]:
# Load FAQs
import re
import random
import json

# The context manager closes the file handle even if read() raises.
with open("faqs_parsings.md") as f:
    markdown_content = f.read()

def parse_questions(markdown_content):
    """Extract address / component pairs from ``Q:`` / ``A:`` formatted text.

    Each entry is expected to look like::

        Q: <raw address>
        A: <JSON object describing the address components>

    with a blank line between entries. An entry whose answer is not valid
    JSON is reported on stdout and skipped.

    Args:
        markdown_content: Full text to scan.

    Returns:
        A list of ``{"input_address": str, "address_components": str}`` dicts,
        in document order. ``address_components`` is kept as the original JSON
        text (it is validated, not converted).
    """
    # Regular expression pattern for finding questions. The answer runs lazily
    # up to the next blank-line-separated "Q:" or the end of the text.
    question_pattern = r'Q: (.+?)\nA: (.+?)(?=\n\nQ:|\Z)'

    parsed = []
    for input_address, components in re.findall(question_pattern, markdown_content, re.DOTALL):
        try:
            json.loads(components)
        except json.JSONDecodeError:
            # Only malformed JSON is expected here; anything else (including
            # KeyboardInterrupt) must propagate instead of being swallowed.
            print(f"Exception in loading address components for address: {input_address}")
            continue
        parsed.append({"input_address": input_address, "address_components": components})
    return parsed

# Parsing the markdown content to get only questions
all_faqs = parse_questions(markdown_content)

random.seed(42)
random.shuffle(all_faqs)

# Displaying the first few extracted questions
all_faqs[:5]  # Displaying only the first few for brevity


[{'input_address': 'chandra nagar thakur mohlla koteswar mandir Gwalior Gwalior Madhya Pradesh 474003',
  'address_components': '{ "house_number": "N/A", "city": "Gwalior", "state": "Madhya Pradesh", "pincode": "474003", "locality": "chandra nagar thakur mohlla", "landmark": "koteswar mandir"}'},
 {'input_address': 'WARD 09 LASANPUR BUARI DAGARUA PURNIA',
  'address_components': '{"house_number": "N/A", "ward_number": "09", "city": "Purnia", "state": "N/A", "pincode": "N/A", "locality": "Lasanpur Buari Dagarua"}'},
 {'input_address': 'Room No 133 RAK Road  Wadala Arvi Mumbai Maharashtra 400031',
  'address_components': '{ "house_number": "Room No 133", "city": "Mumbai", "state": "Maharashtra", "pincode": "400031", "locality": "Wadala", "street_name": "RAK Road"}'},
 {'input_address': 'C-102 Payal Colony Sector 14 Gurgaon',
  'address_components': '{ "house_number": "C-102", "city": "Gurgaon", "state": "Haryana", "pincode": "N/A", "colony_name": "Payal Colony", "sector": "Sector 14" } '

## Initialize the LLM and retriever models and configure them in DSPy

In [5]:
import os

import dspy
# import openai


# Read the Gemini key from the environment instead of embedding it in the notebook,
# so the secret is never saved with the file.
llm_gemini = dspy.Google(api_key=os.environ.get("GOOGLE_API_KEY"))
turbo_4 = dspy.OpenAI(model="gpt-4-turbo")
turbo_35 = dspy.OpenAI(model="gpt-3.5-turbo")

# colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')

dspy.settings.configure(lm=turbo_35)
# dspy.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstracts)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
print(dspy.settings.lm("Write a 3 line poem about neural networks, as if you are from New Delhi, India"))

# context_example = dspy.OpenAI(model="gpt-4")
# with dspy.context(llm=context_example):
#     print(context_example("Write a 3 line poem about neural networks."))

["In the heart of Delhi's tech hub,\nNeural networks hum and buzz,\nConnecting minds with a digital rub."]


## Wrap each FAQ into a `dspy.Example` object

The dspy `Example` object optionally lets you attach metadata, or additional labels, to input/output pairs.

For example, you may want to jointly supervise the answer as well as the context the retrieval system produced to feed into the answer generator.

In [6]:
len(all_faqs)

21

In [9]:
# Load into dspy datasets
import dspy

# Single split point shared by both slices, so they can never overlap or leave a gap.
TRAIN_SIZE = 9

trainset = all_faqs[:TRAIN_SIZE]  # first TRAIN_SIZE (already shuffled) examples for training
devset = all_faqs[TRAIN_SIZE:]  # every remaining example for development
# testset = all_faqs[14:] # 14 examples for testing


# Mark `input_address` as the only input field; `address_components` stays a label.
trainset = [dspy.Example(**faq).with_inputs("input_address") for faq in trainset]
devset = [dspy.Example(**faq).with_inputs("input_address") for faq in devset]
# testset = [dspy.Example(**faq).with_inputs("question") for faq in testset]

In [10]:
trainset[0]

Example({'input_address': 'chandra nagar thakur mohlla koteswar mandir Gwalior Gwalior Madhya Pradesh 474003', 'address_components': '{ "house_number": "N/A", "city": "Gwalior", "state": "Madhya Pradesh", "pincode": "474003", "locality": "chandra nagar thakur mohlla", "landmark": "koteswar mandir"}'}) (input_keys={'input_address'})

## (TODO) LLM Metrics

Define a Metric for Performance.

In [11]:
# This is a WIP, the next step is to optimize this metric as itself a DSPy module (pretty meta)

# Reference - https://github.com/stanfordnlp/dspy/blob/main/examples/tweets/tweet_metric.py

metricLM = dspy.OpenAI(model='gpt-4', max_tokens=1000, model_type='chat')

# Signature for LLM assessments.

# Signature for the LLM-as-judge call made by `metric1`: four inputs (context,
# criterion, question, answer) and one output (the rating).
# NOTE: the class docstring and every `desc` below are rendered into the prompt
# sent to the LM, so editing that text changes model behaviour.
class Assessment1(dspy.Signature):
    """Assess the quality of an answer to a question based on the context."""

    # Inputs supplied by the metric.
    context = dspy.InputField(desc="The context for answering the question.")
    assessment_criterion = dspy.InputField(desc="The evaluation criterion.")
    assessed_question = dspy.InputField(desc="The question.")
    assessed_answer = dspy.InputField(desc="The answer to the question.")
    # Output: `metric1` converts this field with float(), hence the "rating only" instruction.
    assessment_answer = dspy.OutputField(desc="A rating between 1 and 5. Only output the rating and nothing else.")

def _parse_rating(raw):
    """Convert an LM rating reply into a float.

    A clean numeric reply ("4", " 4.5 ") is converted directly. If the model
    wrapped the number in extra words ("Rating: 4"), the first number found in
    the text is used instead.

    Raises:
        ValueError: if the reply contains no number at all.
    """
    import re  # local import keeps this helper self-contained within the cell

    text = str(raw).strip()
    try:
        return float(text)
    except ValueError:
        pass

    found = re.search(r"\d+(?:\.\d+)?", text)
    if found is None:
        raise ValueError(f"Could not find a numeric rating in LM output: {raw!r}")
    return float(found.group())


def metric1(gold, pred, trace=None):
    """Score a prediction with two LLM-judged criteria (detail and faithfulness).

    Args:
        gold: Reference example; only ``gold.question`` is read.
        pred: Prediction; ``pred.answer`` and ``pred.context`` are read.
        trace: Unused; accepted so the function has the metric call signature.

    Returns:
        float: ``(detail + 2 * faithful) / 5.0`` where both ratings come from
        ``metricLM`` via the ``Assessment1`` signature.

    Raises:
        ValueError: if a rating reply contains no number.
    """
    predicted_answer = pred.answer
    question = gold.question

    # TODO: see if retrieving context from pred is working as expected.
    context = pred.context

    print(f"Test Question: {question}")
    print(f"Predicted Answer: {predicted_answer}")
    print(f"Retrieved context: {context}")

    # Criteria keep their own names so they are not shadowed by the judge's replies.
    detail_criterion = "Is the assessed answer detailed or can it contain more details based on the context?"
    faithful_criterion = "Is the assessed text grounded in the context? Say no if it includes significant facts not in the context."

    with dspy.context(lm=metricLM):
        # context = dspy.Retrieve(k=3)(question).passages
        detail = dspy.ChainOfThought(Assessment1)(context=context, assessed_question=question, assessed_answer=predicted_answer, assessment_criterion=detail_criterion)

        faithful = dspy.ChainOfThought(Assessment1)(context=context, assessed_question="not relevant for this assessment", assessed_answer=predicted_answer, assessment_criterion=faithful_criterion)

    print(f"Faithful: {faithful.assessment_answer}")
    print(f"Detail: {detail.assessment_answer}")

    # Faithfulness counts double.
    total = _parse_rating(detail.assessment_answer) + _parse_rating(faithful.assessment_answer) * 2

    # NOTE(review): with ratings in 1..5 this yields 0.6..3.0, not 0..1 -
    # confirm whether the score was meant to be divided by 15 instead.
    return total / 5.0

### Inspect the metric

In [22]:
dspy.Retrieve(k=3)("what happens in scenario of a court case?")

Prediction(
    passages=['  Additional Retention Requirements In the event of arbitration, court\n  case, Disciplinary Proceedings or disputes pending, relevant Logs are\n  backed up in a removable media and kept in safe custody till the\n  completion of the same as evidence.', '    1.  Analyze Captured Logs \n\n-   The captured logs are analyzed for anomalies and addressed with\n    appropriate resolutions. The normal schedule for analysis of the\n    various logs is given in Log Analysis Frequency.\n\n-   Whenever a Security Incident / System Error (fault) / Performance\n    issue occurs, the IT department has to take appropriate corrective\n    and preventive steps to resolve the issue. If the issue is a\n    Security Incident as per the Incident Management Policy, then it has\n    to be escalated to the CISO and treated as per the Incident\n    Management Policy.', '    1.  Backup & Retrieval activities\n\n-   Logs generated by Backup and Retrieval activities are captured and\n   

In [23]:
test_gold = trainset[0]
print(f"{test_gold.question=}")

test_gold.question='What rights do individuals have regarding their data collected and processed by FinBox?'


In [24]:
dspy.Retrieve(k=3)(test_gold.question).passages

['Obligations concerning Personal Data \n\n-   As a Data Processor, FinBox is responsible for the following:\n\n    -   To process of customer provided data as per the contractual\n        mandates/as instructed by the Data Controller\n\n    -   To protect the data from unlawful processing\n\n    -   To maintain records of processing / data inventory for the\n        personal data and Conduct Data Protection Impact Assessment\n        based on applicable data protection regulations\n\n    -   To ensure the accuracy of personal data in accordance with\n        applicable laws and regulations\n\n    -   To prevent any unauthorised access, modifications and deletion\n        of personal data in accordance with the provisions of laws and\n        regulations.\n\n    -   To implement appropriate organizational, technical and\n        procedural controls to protect the data and the process of “DUE\n        CARE” is followed to protect the data, always. Refer to section\n        3.4 for more 

In [26]:
# test_example = dspy.Example(question="What happens when the support of an asset is going to end?")
# test_pred = dspy.Example(answer="They are.")

test_pred = dspy.Example(context=dspy.Retrieve(k=2)(test_gold.question).passages, question=test_gold.question, answer="Individuals have the right to request for information")

type(metric1(test_gold, test_pred))

Test Question: What rights do individuals have regarding their data collected and processed by FinBox?
Predicted Answer: Individuals have the right to request for information
Retrieved context: ['Obligations concerning Personal Data \n\n-   As a Data Processor, FinBox is responsible for the following:\n\n    -   To process of customer provided data as per the contractual\n        mandates/as instructed by the Data Controller\n\n    -   To protect the data from unlawful processing\n\n    -   To maintain records of processing / data inventory for the\n        personal data and Conduct Data Protection Impact Assessment\n        based on applicable data protection regulations\n\n    -   To ensure the accuracy of personal data in accordance with\n        applicable laws and regulations\n\n    -   To prevent any unauthorised access, modifications and deletion\n        of personal data in accordance with the provisions of laws and\n        regulations.\n\n    -   To implement appropriate orga

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Faithful: 5
Detail: 2


float

In [None]:
# test_example = dspy.Example(question="What do cross encoders do?")
# test_pred = dspy.Example(answer="They index data.")

# type(metric1(test_example, test_pred))

In [27]:
metricLM.inspect_history(n=3)





Assess the quality of an answer to a question based on the context.

---

Follow the following format.

Context: The context for answering the question.

Assessment Criterion: The evaluation criterion.

Assessed Question: The question.

Assessed Answer: The answer to the question.

Reasoning: Let's think step by step in order to ${produce the assessment_answer}. We ...

Assessment Answer: A rating between 1 and 5. Only output the rating and nothing else.

---

Context:
[1] «Obligations concerning Personal Data 

-   As a Data Processor, FinBox is responsible for the following:

    -   To process of customer provided data as per the contractual
        mandates/as instructed by the Data Controller

    -   To protect the data from unlawful processing

    -   To maintain records of processing / data inventory for the
        personal data and Conduct Data Protection Impact Assessment
        based on applicable data protection regulations

    -   To ensure the accuracy of personal

## Create the dspy.Module

In [147]:
# dspy.settings.configure(lm=turbo_35)
dspy.settings.configure(lm=turbo_4)
# dspy.settings.configure(lm=llm_gemini)

In [143]:
# Field description shared by both signatures: it is the `desc` of the parser's
# output field and of the corrector's `input_address_components` input field.
# It is prompt text, so any edit changes what the LM is told.
# NOTE(review): there is no comma between 'street_number' and 'colony_name' in
# the example list below - confirm whether that is intentional.
address_components_desc = "A JSON consisting of a few important components: 'house_number', 'city', 'state', 'pincode'. Any other component (eg- 'block', 'ward_number', 'sector', 'society_name', 'locality', 'street_name', 'street_number' 'colony_name', 'landmark', 'care_of', 'house_name', 'village', 'district', etc.), if present, should be filled. Every part of the input_address should be parsed into exactly 1 component. Fill a component with N/A in case it is missing."

# First-pass signature: raw address in, JSON text of components out.
# The class docstring is the instruction text placed at the top of the prompt.
class ParseAddressIntoComponents(dspy.Signature):
    """You will be provided with an address in India. You need to parse it and fill it into various components. Note that you are a address parser and your job is to fill every information in exactly 1 component. Please take it seriously."""
    
    # No `desc` given for the input field.
    input_address = dspy.InputField()
    address_components = dspy.OutputField(desc=address_components_desc)


# Second-pass signature: takes the original address plus a draft parsing and
# asks the LM for a corrected set of components.
# The class docstring is the instruction text placed at the top of the prompt.
class CorrectAddressComponents(dspy.Signature):
    """Your job is to validate (and correct) the input_address to input_address_components parsing. Make sure no information is missed from the input address, or being repeated in >1 components. Also make sure each component seems correct. Your job is to find errors in the components and correct them. Please take it seriously."""

    input_address = dspy.InputField(desc="An address in India")
    # The draft to review, described with the same text the parser's output field uses.
    input_address_components = dspy.InputField(desc=address_components_desc)
    address_components = dspy.OutputField(desc="A JSON of the corrected address components.")

# class CorrectHouseNumber(dspy.Signature):
#     """Your job is to validate (and correct) the input_address to input_address_components parsing. You will be focusing only on the house_number key in the components. Do you think this `house_number` parsing is correct? Your job is not to fill new information that is not present in input_address. You are just a validator and corrector."""

#     input_address = dspy.InputField(desc="An address in India")
#     input_address_components = dspy.InputField(desc=address_components_desc)
#     address_components = dspy.OutputField(desc="A JSON of address components with corrected house_number (if correction is required).")



In [144]:
import dspy

# dspy.settings.rm.name = "Search"
# dspy.settings.rm.input_variable = "query"
# dspy.settings.rm.desc = "takes a search query about an address aspect (eg- which state is Lucknow in?) and returns one or more potentially relevant passages from a corpus (eg- Lucknow is a city in the state of Uttar Pradesh, India)"

class IntrospectiveAddressCompleteness(dspy.Module):
    """Two-pass address parser: draft the components, then have the LM review them.

    Pass 1 runs ``ParseAddressIntoComponents``; pass 2 feeds that draft to
    ``CorrectAddressComponents``. Both passes use ``dspy.ChainOfThought``.
    """

    def __init__(self, num_passages=3, react_max_iters=4):
        """Build the two predictors.

        Args:
            num_passages: Only used by the commented-out Retrieve variant below;
                kept so existing constructor calls keep working.
            react_max_iters: Only used by the commented-out ReAct variants below;
                kept so existing constructor calls keep working.
        """
        super().__init__()

        self.generate_first_draft_components = dspy.ChainOfThought(ParseAddressIntoComponents)
        # self.generate_corrected_components = dspy.ReAct(CorrectAddressComponents, tools=[dspy.settings.rm])
        self.generate_corrected_components = dspy.ChainOfThought(CorrectAddressComponents)
        # retrieve = dspy.Retrieve(k=num_passages)
        # self.generate_corrected_components = dspy.ReAct(CorrectAddressComponents, tools=[dspy.settings.rm], max_iters=react_max_iters)
        # self.generate_corrected_components = dspy.ReAct(CorrectAddressComponents, tools=[retrieve], max_iters=react_max_iters)

    def forward(self, address=None, input_address=None):
        """Parse an address and return the reviewed components.

        Args:
            address: The raw address string.
            input_address: Alias for ``address``. The datasets in this notebook
                declare ``input_address`` as their input key, so this lets the
                module be called with ``**example.inputs()``. Ignored when
                ``address`` is given.

        Returns:
            dspy.Prediction with
            ``address_components`` (output of the correction pass),
            ``address_components_l`` ([draft, corrected] outputs), and
            ``llm_inspections`` (return values of ``inspect_history(n=1)``
            captured after each pass).

        Raises:
            ValueError: if neither ``address`` nor ``input_address`` is given.
        """
        if address is None:
            address = input_address
        if address is None:
            raise ValueError("forward() needs an address (pass `address` or `input_address`)")

        llm_inspections = []
        address_components_l = []

        # Pass 1: first-draft parsing.
        prediction = self.generate_first_draft_components(input_address=address,)
        # inspect_history also prints the last prompt; its return value is kept for later review.
        llm_inspections.append(dspy.settings.lm.inspect_history(n=1))
        address_components_l.append(prediction.address_components)

        # Pass 2: review/correct the draft.
        # with dspy.context(lm=turbo_4):
        prediction = self.generate_corrected_components(input_address=address, input_address_components=prediction.address_components)
        llm_inspections.append(dspy.settings.lm.inspect_history(n=1))
        address_components_l.append(prediction.address_components)
        # print(f"Corrected: {prediction.address_components=}")

        return dspy.Prediction(address_components=prediction.address_components, llm_inspections=llm_inspections, address_components_l=address_components_l)

address_completeness_bot = IntrospectiveAddressCompleteness()
# address_completeness_bot = IntrospectiveAddressCompleteness(num_passages=2, react_max_iters=3)

In [148]:
address="3 d/o mohd hasan, jeevangarh, hn 96, ambari Jeevangarh jeevangarh, Jiwangarh, ambari,Vikasnagar Dehradun Dehradun Uttaranchal 248125"
p = address_completeness_bot(address=address)




You will be provided with an address in India. You need to parse it and fill it into various components. Note that you are a address parser and your job is to fill every information in exactly 1 component. Please take it seriously.

---

Follow the following format.

Input Address: ${input_address}
Reasoning: Let's think step by step in order to ${produce the address_components}. We ...
Address Components: A JSON consisting of a few important components: 'house_number', 'city', 'state', 'pincode'. Any other component (eg- 'block', 'ward_number', 'sector', 'society_name', 'locality', 'street_name', 'street_number' 'colony_name', 'landmark', 'care_of', 'house_name', 'village', 'district', etc.), if present, should be filled. Every part of the input_address should be parsed into exactly 1 component. Fill a component with N/A in case it is missing.

---

Input Address: 3 d/o mohd hasan, jeevangarh, hn 96, ambari Jeevangarh jeevangarh, Jiwangarh, ambari,Vikasnagar Dehradun Dehradun Uttar

In [149]:
print(p.address_components_l[0])
print(p.address_components_l[1])

```json
{
  "house_number": "96",
  "care_of": "3 d/o mohd hasan",
  "locality": "Vikasnagar",
  "street_name": "ambari Jeevangarh",
  "city": "Dehradun",
  "state": "Uttaranchal",
  "pincode": "248125",
  "landmark": "Jeevangarh"
}
```

In this JSON, every part of the input address is parsed into exactly one component. Components not explicitly mentioned in the input address, such as "district" or "village", are not included and are assumed to be N/A by default.
```json
{
  "house_number": "96",
  "care_of": "3 d/o mohd hasan",
  "locality": "Vikasnagar",
  "street_name": "ambari Jeevangarh",
  "city": "Dehradun",
  "state": "Uttaranchal",
  "pincode": "248125",
  "landmark": "Jeevangarh"
}
```

This corrected JSON ensures that each part of the input address is accounted for without repetition and placed in the appropriate component.


In [119]:
turbo_35.inspect_history(n=1)




Your job is to validate (and correct) the input_address to input_address_components parsing. Make sure no information is missed from the input address, or being repeated in >1 components. Also make sure each component seems correct. Your job is to find errors in the components and correct them. Please take it seriously.

---

Follow the following format.

Input Address: An address in India

Input Address Components: A JSON consisting of a few important components: 'house_number', 'city', 'state', 'pincode'. Any other component (eg- 'block', 'ward_number', 'sector', 'society_name', 'locality', 'street_name', 'street_number' 'colony_name', 'landmark', 'care_of', or whatever component you feel appropriate), if present, should be filled. Every part of the input_address should be parsed into exactly 1 component. Fill a component with N/A in case it is missing.

Reasoning: Let's think step by step in order to ${produce the address_components}. We ...

Address Components: A JSON of the cor

'\n\n\nYour job is to validate (and correct) the input_address to input_address_components parsing. Make sure no information is missed from the input address, or being repeated in >1 components. Also make sure each component seems correct. Your job is to find errors in the components and correct them. Please take it seriously.\n\n---\n\nFollow the following format.\n\nInput Address: An address in India\n\nInput Address Components: A JSON consisting of a few important components: \'house_number\', \'city\', \'state\', \'pincode\'. Any other component (eg- \'block\', \'ward_number\', \'sector\', \'society_name\', \'locality\', \'street_name\', \'street_number\' \'colony_name\', \'landmark\', \'care_of\', or whatever component you feel appropriate), if present, should be filled. Every part of the input_address should be parsed into exactly 1 component. Fill a component with N/A in case it is missing.\n\nReasoning: Let\'s think step by step in order to ${produce the address_components}. We

In [42]:
print(p.llm_inspections[0])




You will be provided with an address in India. You need to parse it into its components.

---

Follow the following format.

Input Address: ${input_address}
Reasoning: Let's think step by step in order to ${produce the address_components}. We ...
Address Components: A JSON consisting of a few important components: 'house_number', 'city', 'state', 'pincode'. Any other component (eg- 'block', 'ward_number', 'sector', 'society_name', 'locality', 'street_name', 'street_number' 'colony_name', 'landmark', etc.), if present, should be filled. Every part of the input_address should be parsed into exactly 1 component. Fill a component with N/A in case it is missing.

---

Input Address: NABGHARA, HAORA NABGHARA MULLICK PARA Panchla Howrah West Bengal 711322
Reasoning: Let's think step by step in order to[32m parse the address components. We first identify the house number as 'NABGHARA'. Then we have the city as 'Howrah', the state as 'West Bengal', and the pincode as '711322'. The rest of t

In [43]:
# print(address)
print(p.llm_inspections[1])




Your job is to validate (and correct) the input_address to input_address_components parsing. Make sure no component is missed from the input address, or being repeated in >1 components. Your job is not to fill new information that is not present in input_address. You are just a validator and corrector.

---

Follow the following format.

Input Address: An address in India

Input Address Components: A JSON consisting of a few important components: 'house_number', 'city', 'state', 'pincode'. Any other component (eg- 'block', 'ward_number', 'sector', 'society_name', 'locality', 'street_name', 'street_number' 'colony_name', 'landmark', etc.), if present, should be filled. Every part of the input_address should be parsed into exactly 1 component. Fill a component with N/A in case it is missing.

Reasoning: Let's think step by step in order to ${produce the address_components}. We ...

Address Components: A JSON of the corrected address components.

---

Input Address: NABGHARA, HAORA NAB

In [21]:
p.city

'Noida'

In [16]:
llm.inspect_history(n=1)




You will be provided with an address in India. You need to parse it into its components (fill with N/A in case that component is missing).

---

Follow the following format.

Input Address: ${input_address}

House Number: ${house_number}

Sector: ${sector}

City: ${city}

Pincode: ${pincode}

---

Input Address: H-1401 Tower A Noida

House Number:[32m Input Address: H-1401 Tower A Noida

House Number: H-1401
Sector: N/A
City: Noida
Pincode: N/A[0m





'\n\n\nYou will be provided with an address in India. You need to parse it into its components (fill with N/A in case that component is missing).\n\n---\n\nFollow the following format.\n\nInput Address: ${input_address}\n\nHouse Number: ${house_number}\n\nSector: ${sector}\n\nCity: ${city}\n\nPincode: ${pincode}\n\n---\n\nInput Address: H-1401 Tower A Noida\n\nHouse Number:\x1b[32m Input Address: H-1401 Tower A Noida\n\nHouse Number: H-1401\nSector: N/A\nCity: Noida\nPincode: N/A\x1b[0m\n\n\n'

### dspy.ChainOfThought

In [13]:
dspy.ChainOfThought(GenerateAnswer)(input_address="H-1401 Tower A Noida")
# llm.inspect_history(n=1)

Prediction(
    rationale='produce the completeness. We know that the address includes the building number (H-1401), the tower (Tower A), and the city (Noida). However, it does not include a specific street name or sector number.',
    completeness="The address is not complete enough for us to reach the user's home for collecting money."
)

### dspy.ReAct

In [16]:
dspy.ReAct(GenerateAnswer, tools=[dspy.settings.rm])(question="What are cross encoders?", context="N/A")

Prediction(
    answer='Cross encoders are not mentioned in the context.'
)

In [18]:
llm.inspect_history(n=1)




You will be given `context`, `question` and you will respond with `answer`.

To do this, you will interleave Thought, Action, and Observation steps.

Thought can reason about the current situation, and Action can be the following types:

(1) Search[query], which takes a search query and returns one or more potentially relevant passages from a corpus
(2) Finish[answer], which returns the final `answer` and finishes the task

---

Follow the following format.

Context: The context for answering the question. May contain relevant facts.

Question: ${question}

Thought 1: next steps to take based on last observation

Action 1: always either Search[query] or, when done, Finish[answer]

Observation 1: observations based on action

Thought 2: next steps to take based on last observation

Action 2: always either Search[query] or, when done, Finish[answer]

---

Context: N/A

Question: What are cross encoders?

Thought 1: Search[What are cross encoders?]

Action 1: Search[What are cross enco

'\n\n\nYou will be given `context`, `question` and you will respond with `answer`.\n\nTo do this, you will interleave Thought, Action, and Observation steps.\n\nThought can reason about the current situation, and Action can be the following types:\n\n(1) Search[query], which takes a search query and returns one or more potentially relevant passages from a corpus\n(2) Finish[answer], which returns the final `answer` and finishes the task\n\n---\n\nFollow the following format.\n\nContext: The context for answering the question. May contain relevant facts.\n\nQuestion: ${question}\n\nThought 1: next steps to take based on last observation\n\nAction 1: always either Search[query] or, when done, Finish[answer]\n\nObservation 1: observations based on action\n\nThought 2: next steps to take based on last observation\n\nAction 2: always either Search[query] or, when done, Finish[answer]\n\n---\n\nContext: N/A\n\nQuestion: What are cross encoders?\n\nThought 1: Search[What are cross encoders?]\

# Initialize DSPy Program

In [28]:
uncompiled_rag = RAG()

# Test uncompiled inference 

In [29]:
print(uncompiled_rag("What are re-rankers in search engines?").answer)

Based on the provided documents, I have insufficient information to answer the question.


# Check the last call to the LLM

In [None]:
llm.inspect_history(n=1)

# 4. DSPy Optimization

# Evaluate our RAG Program before it is compiled

In [30]:
# Reminder our dataset looks like this:

devset[0]

Example({'question': 'How are patches managed for FinBox Applications?', 'answer': 'Patches for FinBox Applications are deployed as and when vulnerabilities are found. The code is not Operating System specific, and hence end-user operations are not impacted during patch deployment.'}) (input_keys={'question'})

In [31]:
import os

from dspy.evaluate.evaluate import Evaluate

# The tokenizers library prints a "process just got forked" warning unless
# TOKENIZERS_PARALLELISM is set explicitly. setdefault keeps any value the user
# has already chosen.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Evaluator over the first five dev examples, single-threaded, with a progress bar
# and a results table (display_table=5).
evaluate = Evaluate(devset=devset[:5], num_threads=1, display_progress=True, display_table=5)

# Score the same `uncompiled_rag` instance that is handed to the optimizers and
# re-scored on the test set later, instead of a throwaway RAG(), so that every
# "before compilation" number refers to one object.
evaluate(uncompiled_rag, metric=metric1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  0%|          | 0/5 [00:00<?, ?it/s]

Test Question: How are patches managed for FinBox Applications?
Predicted Answer: Patches are deployed as and when vulnerabilities are found.
Retrieved context: ['1.  Purpose & Scope\n\n    1.  Purpose\n\n-   Purpose of this document is to establish patching, vulnerability\n      assessment and penetration testing policy for Moshpit Technologies\n      Pvt. Ltd.’s (hereby referred as FinBox)\n\n    1.  Scope \n\n-   This policy is applicable to\n\n    -   AWS Infrastructure of FinBox\n\n    -   Applications hosted on this infrastructure\n\n    -   FinBox end user’s devices\n\n2.  Policy Standards\n\n    1.  Patch Management\n\n-   AWS Linux Servers in Dev, Test and Production Environment\n\n    -   AWS inspector detects if patch update is required. AWS Machine\n          Image (AMI) is created and deployed manually once a patch is\n          available.\n\n    -   Patches need not be tested in ‘Test Environment’ prior to\n          deployment. AWS tests patches before they are made avai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Average Metric: 2.6 / 1  (260.0):  20%|██        | 1/5 [00:14<00:56, 14.21s/it]

Faithful: 5
Detail: 3
Test Question: What happens to logs in the event of arbitration, court case, Disciplinary Proceedings or disputes pending?
Predicted Answer: In the event of arbitration, court case, Disciplinary Proceedings or disputes pending, relevant Logs are backed up in a removable media and kept in safe custody till the completion of the same as evidence.
Retrieved context: ['  Additional Retention Requirements In the event of arbitration, court\n  case, Disciplinary Proceedings or disputes pending, relevant Logs are\n  backed up in a removable media and kept in safe custody till the\n  completion of the same as evidence.', '2.  Policy Standards\n\n    1.  Retention Period for Logs\n\n{\n  "data": [\n    {\n      "Type of": "Cloud in frastructure logs",\n      "Retention": "90 Days or as mandated by the customer.",\n      "Remarks": "Logging of cloud management console activities is enabled. Logs are reviewed automatically using AWS services. It is a continuous process. Noti

Average Metric: 5.6 / 2  (280.0):  40%|████      | 2/5 [00:25<00:37, 12.62s/it]

Faithful: 5
Detail: 5
Test Question: What are the responsibilities of the Chief Information Security Officer?
Predicted Answer: Based on the provided documents, I have insufficient information to answer the question.
Retrieved context: ['Purpose\n\n-   This policy is an internal IT policy which defines standard device\n      and device security measures such as Anti-Virus and Malware\n      protection for every laptop.\n\n-   This policy is designed to protect the organizational resources\n      against intrusion by viruses and other malware.\n\nDevice Security Policy\n\n-   All employees in the organization will use Mac Laptop and Windows\n      devices.\n\n-   MacOS is designed with advanced technologies that work together to\n      monitor, encrypt and update to keep the device safe.\n\n-   Employees are not permitted to download / install any other\n      third-party Anti-virus or Malware analysis solution on the laptop\n      provided by the organization.\n\n-   Settings of the la

Average Metric: 7.8 / 3  (260.0):  60%|██████    | 3/5 [00:42<00:29, 14.68s/it]

Faithful: 5
Detail: 1
Test Question: What is the retention period for "Application Consent logs"?
Predicted Answer: Till the time the Customer is in the system.
Retrieved context: ['2.  Policy Standards\n\n    1.  Retention Period for Logs\n\n{\n  "data": [\n    {\n      "Type of": "Cloud in frastructure logs",\n      "Retention": "90 Days or as mandated by the customer.",\n      "Remarks": "Logging of cloud management console activities is enabled. Logs are reviewed automatically using AWS services. It is a continuous process. Notifications are sent for any critical defined events. Access to logs and related AWS services is restricted. Logs are retained permanently."\n    },\n    {\n      "Type of": "Application Consent logs",\n      "Retention": "Till the time the Customer is in the system.",\n      "Remarks": "Consent Logs of Customers is captured and stored."\n    },\n    {\n      "Type of": "FinBox Bangalore Office Network and Security device Logs",\n      "Retention": "90 Days",\

Average Metric: 10.6 / 4  (265.0):  80%|████████  | 4/5 [00:46<00:10, 10.24s/it]

Faithful: 5
Detail: 4
Test Question: How are patches managed for AWS Linux Servers in Dev, Test and Production Environment?
Predicted Answer: AWS inspector detects if patch update is required. AWS Machine Image (AMI) is created and deployed manually once a patch is available.
Retrieved context: ['1.  Purpose & Scope\n\n    1.  Purpose\n\n-   Purpose of this document is to establish patching, vulnerability\n      assessment and penetration testing policy for Moshpit Technologies\n      Pvt. Ltd.’s (hereby referred as FinBox)\n\n    1.  Scope \n\n-   This policy is applicable to\n\n    -   AWS Infrastructure of FinBox\n\n    -   Applications hosted on this infrastructure\n\n    -   FinBox end user’s devices\n\n2.  Policy Standards\n\n    1.  Patch Management\n\n-   AWS Linux Servers in Dev, Test and Production Environment\n\n    -   AWS inspector detects if patch update is required. AWS Machine\n          Image (AMI) is created and deployed manually once a patch is\n          available.\

Average Metric: 13.399999999999999 / 5  (268.0): 100%|██████████| 5/5 [01:05<00:00, 13.01s/it]

Faithful: 5
Detail: 4
Average Metric: 13.399999999999999 / 5  (268.0%)



  df.loc[:, metric_name] = df[metric_name].apply(


Unnamed: 0,question,example_answer,pred_answer,context,metric1
0,How are patches managed for FinBox Applications?,"Patches for FinBox Applications are deployed as and when vulnerabilities are found. The code is not Operating System specific, and hence end-user operations are not...",Patches are deployed as and when vulnerabilities are found.,"['1. Purpose & Scope\n\n 1. Purpose\n\n- Purpose of this document is to establish patching, vulnerability\n assessment and penetration testing policy for Moshpit Technologies\n Pvt. Ltd.’s...",2.6
1,"What happens to logs in the event of arbitration, court case, Disciplinary Proceedings or disputes pending?","In such events, relevant logs are backed up in a removable media and kept in safe custody till the completion of the same as evidence.","In the event of arbitration, court case, Disciplinary Proceedings or disputes pending, relevant Logs are backed up in a removable media and kept in safe...","[' Additional Retention Requirements In the event of arbitration, court\n case, Disciplinary Proceedings or disputes pending, relevant Logs are\n backed up in a removable media...",3.0
2,What are the responsibilities of the Chief Information Security Officer?,"The Chief Information Security Officer is responsible for the implementation of industry standard security frameworks such as ISO 27001 to secure FinBox systems, establishing product...","Based on the provided documents, I have insufficient information to answer the question.",['Purpose\n\n- This policy is an internal IT policy which defines standard device\n and device security measures such as Anti-Virus and Malware\n protection for every laptop.\n\n-...,2.2
3,"What is the retention period for ""Application Consent logs""?","The ""Application Consent logs"" are retained till the time the Customer is in the system.",Till the time the Customer is in the system.,"['2. Policy Standards\n\n 1. Retention Period for Logs\n\n{\n ""data"": [\n {\n ""Type of"": ""Cloud in frastructure logs"",\n ""Retention"": ""90 Days or as mandated by the...",2.8
4,"How are patches managed for AWS Linux Servers in Dev, Test and Production Environment?",AWS inspector detects if a patch update is required. AWS Machine Image (AMI) is created and deployed manually once a patch is available. Patches need...,AWS inspector detects if patch update is required. AWS Machine Image (AMI) is created and deployed manually once a patch is available.,"['1. Purpose & Scope\n\n 1. Purpose\n\n- Purpose of this document is to establish patching, vulnerability\n assessment and penetration testing policy for Moshpit Technologies\n Pvt. Ltd.’s...",2.8


268.0

In [32]:
# Show the five most recent calls recorded in `llm`'s history (here, those made during the evaluation run above).
llm.inspect_history(n=5)




You are a chat agent, interacting directly with an end user. Your task is to answer questions based ONLY on the context provided. Otherwise, tell the user that based on the provided documents, you have insufficient information to answer the question.

---

Follow the following format.

Context: The context for answering the question. May contain relevant facts.

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: ${answer}

---

Context:
[1] «1.  Purpose & Scope

    1.  Purpose

-   Purpose of this document is to establish patching, vulnerability
      assessment and penetration testing policy for Moshpit Technologies
      Pvt. Ltd.’s (hereby referred as FinBox)

    1.  Scope 

-   This policy is applicable to

    -   AWS Infrastructure of FinBox

    -   Applications hosted on this infrastructure

    -   FinBox end user’s devices

2.  Policy Standards

    1.  Patch Management

-   AWS Linux Servers in Dev, Test and Pro

'\n\n\nYou are a chat agent, interacting directly with an end user. Your task is to answer questions based ONLY on the context provided. Otherwise, tell the user that based on the provided documents, you have insufficient information to answer the question.\n\n---\n\nFollow the following format.\n\nContext: The context for answering the question. May contain relevant facts.\n\nQuestion: ${question}\n\nReasoning: Let\'s think step by step in order to ${produce the answer}. We ...\n\nAnswer: ${answer}\n\n---\n\nContext:\n[1] «1.  Purpose & Scope\n\n    1.  Purpose\n\n-   Purpose of this document is to establish patching, vulnerability\n      assessment and penetration testing policy for Moshpit Technologies\n      Pvt. Ltd.’s (hereby referred as FinBox)\n\n    1.  Scope \n\n-   This policy is applicable to\n\n    -   AWS Infrastructure of FinBox\n\n    -   Applications hosted on this infrastructure\n\n    -   FinBox end user’s devices\n\n2.  Policy Standards\n\n    1.  Patch Management\n

# Metric Analysis

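Judging by the scores logged in the evaluation output above, `metric1` combines the two ratings (each at most 5) as (Faithful*2 + Detail) / 5, so the maximum value per question is (5*2 + 5) / 5 = 3.

3 * 10 test questions = 30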

In [None]:
# Show the most recent call recorded in `llm`'s history.
llm.inspect_history(n=1)

In [None]:
# Show the three most recent calls recorded by `metricLM`.
# NOTE(review): presumably this is the LM that `metric1` uses for grading - confirm against its definition.
metricLM.inspect_history(n=3)

# BootstrapFewShot

In [None]:
from dspy.teleprompt import BootstrapFewShot

# Few-shot bootstrapping scored with metric1: up to 8 labeled demos, up to 3 rounds.
teleprompter = BootstrapFewShot(
    metric=metric1,
    max_labeled_demos=8,
    max_rounds=3,
)

# The student is the existing baseline instance; constructing a fresh RAG() inline
# here is an equally common pattern.
compiled_rag = teleprompter.compile(
    uncompiled_rag,
    trainset=trainset,
)

### Inspect the compiled prompt

In [None]:
# Query the BootstrapFewShot-compiled program and display only the `answer` field of its prediction.
compiled_rag("What do cross encoders do?").answer

In [None]:
# Show the most recent call recorded in `llm`'s history, i.e. the prompt built by the compiled program if cells run in order.
llm.inspect_history(n=1)

### Evaluate the Compiled RAG Program

In [None]:
# Score the BootstrapFewShot-compiled program with the current `evaluate` object and metric1.
evaluate(compiled_rag, metric=metric1)

# BootstrapFewShotWithRandomSearch

In [None]:
# Caution: I accidentally spent $12 running this with `num_candidate_programs=20`.

In [None]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# Random-search settings, collected in one place so the search budget
# (two candidate programs, one round) is easy to see and adjust.
random_search_options = dict(
    metric=metric1,
    max_bootstrapped_demos=4,
    max_labeled_demos=4,
    max_rounds=1,
    num_candidate_programs=2,
    num_threads=2,
)
teleprompter = BootstrapFewShotWithRandomSearch(**random_search_options)

# The student is the existing baseline instance; a fresh RAG() could be passed instead.
second_compiled_rag = teleprompter.compile(uncompiled_rag, trainset=trainset)

In [None]:
# Query the random-search-compiled program; the whole returned prediction is displayed, not just `.answer`.
second_compiled_rag("What do cross encoders do?")

In [None]:
# Show the most recent call recorded in `llm`'s history.
llm.inspect_history(n=1)

In [None]:
# Score the random-search-compiled program with the current `evaluate` object and metric1.
evaluate(second_compiled_rag, metric=metric1)

# BayesianSignatureOptimizer

In [None]:
from dspy.teleprompt import BayesianSignatureOptimizer

# Separate GPT-4 client handed to the optimizer as `prompt_model`; the task model
# stays whatever LM is currently configured in dspy.settings.
llm_prompter = dspy.OpenAI(model='gpt-4', max_tokens=2000, model_type='chat')

teleprompter = BayesianSignatureOptimizer(task_model=dspy.settings.lm,
                                          metric=metric1,
                                          prompt_model=llm_prompter,
                                          n=5,
                                          verbose=False)

# Options forwarded to the optimizer's internal evaluation runs via `eval_kwargs`.
kwargs = dict(num_threads=1, display_progress=True, display_table=0)

# Optimize on `trainset`, like the other two optimizers in this notebook. The keyword
# is named `devset`, but feeding it the dev split would tune the prompt on the very
# examples that the later `evaluate(third_compiled_rag, ...)` cell scores it on,
# inflating that score.
third_compiled_rag = teleprompter.compile(RAG(), devset=trainset,
                                         optuna_trials_num=3,
                                         max_bootstrapped_demos=4,
                                         max_labeled_demos=4,
                                         eval_kwargs=kwargs)

In [None]:
# Query the BayesianSignatureOptimizer-compiled program; the whole returned prediction is displayed.
third_compiled_rag("What do cross encoders do?")

# Check this out!!

Below you can see how the BayesianSignatureOptimizer jointly (1) optimizes the task instruction to:

```
Assess the context and answer the given questions that are predominantly about software usage, process optimization, and troubleshooting. Focus on providing accurate information related to tech or software-related queries.
```

as well as (2) sourcing input-output examples for the prompt!

In [None]:
# Show the most recent call recorded in `llm`'s history to see the optimized instruction and the selected examples.
llm.inspect_history(n=1)

In [None]:
# Score the BayesianSignatureOptimizer-compiled program with the current `evaluate` object and metric1.
evaluate(third_compiled_rag, metric=metric1)

# Test Set Eval

In [None]:
# Rebuild the evaluator over the test set.
from dspy.evaluate.evaluate import Evaluate

# From here on `evaluate` scores programs on `testset` instead of the dev subset.
# The cells below reuse it for each program.
evaluate = Evaluate(
    devset=testset,
    num_threads=1,
    display_progress=True,
    display_table=5,
)

In [None]:
# Test-set score of the uncompiled baseline (`evaluate` was rebuilt over `testset` in the cell above).
evaluate(uncompiled_rag, metric=metric1)

In [None]:
# Test-set score of the BootstrapFewShot-compiled program.
evaluate(compiled_rag, metric=metric1)

In [None]:
# Test-set score of the BootstrapFewShotWithRandomSearch-compiled program.
evaluate(second_compiled_rag, metric=metric1)

In [None]:
# Test-set score of the BayesianSignatureOptimizer-compiled program.
evaluate(third_compiled_rag, metric=metric1)