In [6]:
import dspy
from dspy.datasets import HotPotQA

In [7]:
import os
from dotenv import load_dotenv
# Load environment variables from the file at '../.env', then read the OpenAI key.
# os.getenv returns None when the variable is unset, so a missing key is not
# detected in this cell.
load_dotenv('../.env')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

### Basic Setup of Data + DSPy config

In [8]:
# Language model (OpenAI gpt-3.5-turbo) and retrieval model (a ColBERTv2 endpoint).
# NOTE(review): the retriever URL is a hard-coded IP address — confirm it is still reachable.
turbo = dspy.OpenAI(model='gpt-3.5-turbo', api_key=OPENAI_API_KEY)
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')

# Register both as the lm / rm in dspy.settings.
dspy.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstracts)

# Small HotPotQA sample with fixed seeds: 20 train examples, 50 dev examples, no test split.
dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0)

# Presumably marks `question` as the only input field of each example — TODO confirm
# against the DSPy Example.with_inputs documentation.
trainset = [x.with_inputs('question') for x in dataset.train]
devset = [x.with_inputs('question') for x in dataset.dev]

  table = cls._concat_blocks(blocks, axis=0)


In [107]:
# Save these datasets so we can reload them
# without having to load HotPotQA again.
import json
def serialize(d):
    """Return a shallow copy of ``d`` that ``json.dump`` can handle.

    Every value that is a ``set`` is converted to a ``list``; all other
    values are carried over unchanged. The input mapping is not modified.
    """
    return {
        key: list(value) if isinstance(value, set) else value
        for key, value in d.items()
    }
# Persist both splits as JSON. Each example goes through serialize() so that any
# set-valued field becomes a list, and the files are opened in `with` blocks so
# the handles are flushed and closed as soon as the dump finishes.
with open('data/trainset.json', 'w') as f:
    json.dump([serialize(e.toDict()) for e in trainset], f, indent=2)
with open('data/devset.json', 'w') as f:
    json.dump([serialize(e.toDict()) for e in devset], f, indent=2)

In [27]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    # NOTE(review): the docstring above and the `desc` strings below are presumably
    # used by DSPy when building the prompt — treat them as runtime text, not as
    # ordinary documentation. TODO confirm.

    # Inputs: the retrieved context and the question.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    # Output: the answer.
    answer = dspy.OutputField(desc="often between 1 and 5 words")

class RAG(dspy.Module):
    """Two-stage pipeline: retrieve passages for a question, then answer from them."""

    def __init__(self, num_passages=3):
        """Build the retrieval and answer-generation stages.

        num_passages: value passed as ``k`` to ``dspy.Retrieve``.
        """
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Answer ``question`` and return a Prediction holding the passages and the answer."""
        passages = self.retrieve(question).passages
        generated = self.generate_answer(context=passages, question=question)
        return dspy.Prediction(context=passages, answer=generated.answer)

In [28]:
# Demonstrate basic inference on one dev example with the uncompiled program.
i = 0
print(devset[i].question, devset[i].answer)
uncompiled_rag = RAG()
# Query the same example that was just printed, so changing `i` keeps the
# printed question and the prediction in sync.
uncompiled_rag(question=devset[i].question)

Are both Cangzhou and Qionghai in the Hebei province of China? no


Prediction(
    context=['Cangzhou | Cangzhou () is a prefecture-level city in eastern Hebei province, People\'s Republic of China. At the 2010 census, Cangzhou\'s built-up ("or metro") area made of Yunhe, Xinhua districts and Cang County largely being conurbated had a population of 1,205,814 inhabitants, while the prefecture-level administrative unit in total has a population of 7,134,062. It lies approximately 90 km from the major port city of Tianjin, and 180 km from Beijing.', 'Haixing County | Haixing County () is a county of southeastern Hebei province, China, bordering Shandong to the southeast. It is administered by Cangzhou City, and, , had a population of 220,000 residing in an area of 836 km2 . Both G18 Rongcheng–Wuhai Expressway and G25 Changchun–Shenzhen Expressway pass through the county.', "Dongguang County | Dongguang County () is a county under the jurisdiction of Cangzhou City, in southeastern Hebei province, People's Republic of China, bordering Shandong to the south

In [29]:
from dspy.teleprompt import BootstrapFewShot

def validate_context_and_answer(example, pred, trace=None):
    """Validation metric: the predicted answer must be correct AND supported.

    Both DSPy checks are always evaluated: exact match of the answer, and
    passage match of the answer against the retrieved context. The result is
    the same value ``exact and passage`` would produce. ``trace`` is accepted
    but not used.
    """
    exact_match = dspy.evaluate.answer_exact_match(example, pred)
    passage_match = dspy.evaluate.answer_passage_match(example, pred)
    if not exact_match:
        return exact_match
    return passage_match

# Set up a basic teleprompter, which will compile our RAG program.
# Its metric is the validation function defined above.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)

# Compile a fresh RAG instance against the training split.
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)

 55%|█████▌    | 11/20 [00:00<00:00, 576.26it/s]

Bootstrapped 4 full traces after 12 examples in round 0.





In [30]:
# Note: the uncompiled and the compiled program are instances of the same class.
type(uncompiled_rag), type(compiled_rag)

(__main__.RAG, __main__.RAG)

### Prove that we can save/load compiled rag
- And then run a valid inference on loaded pipeline

In [44]:
# Path the compiled program is saved to; the load cell below reads the same path.
model_fn = 'data/compiled_1.json'
compiled_rag.save(model_fn)

In [46]:
# Create a fresh RAG instance, load the saved state from `model_fn` into it,
# and invoke it on one question to check that the restored pipeline runs.
new_rag = RAG()
new_rag.load(model_fn)
new_rag(question = devset[0].question)

Prediction(
    context=['Cangzhou | Cangzhou () is a prefecture-level city in eastern Hebei province, People\'s Republic of China. At the 2010 census, Cangzhou\'s built-up ("or metro") area made of Yunhe, Xinhua districts and Cang County largely being conurbated had a population of 1,205,814 inhabitants, while the prefecture-level administrative unit in total has a population of 7,134,062. It lies approximately 90 km from the major port city of Tianjin, and 180 km from Beijing.', 'Haixing County | Haixing County () is a county of southeastern Hebei province, China, bordering Shandong to the southeast. It is administered by Cangzhou City, and, , had a population of 220,000 residing in an area of 836 km2 . Both G18 Rongcheng–Wuhai Expressway and G25 Changchun–Shenzhen Expressway pass through the county.', "Dongguang County | Dongguang County () is a county under the jurisdiction of Cangzhou City, in southeastern Hebei province, People's Republic of China, bordering Shandong to the south

### Eval both rag programs on `devset`

In [47]:
from dspy.evaluate.evaluate import Evaluate

# Evaluation harness over the dev split. Despite the `compiled_` prefix, the
# cells below reuse it for both the compiled and the uncompiled program.
compiled_eval = Evaluate(
    devset=devset, 
    num_threads=1, 
    display_progress=True, 
    # display_table=5,
)

# Score on exact match of the answer only; the bootstrapping metric defined
# earlier additionally required a passage match.
metric = dspy.evaluate.answer_exact_match

In [48]:
# Evaluate the compiled program. Later cells read the result as
# (score, per-example results): [0] is the score, [1] the list of result tuples.
outputs_compiled = compiled_eval(compiled_rag, metric=metric, return_outputs=True)

Average Metric: 27 / 50  (54.0): 100%|██████████| 50/50 [00:00<00:00, 939.72it/s]

Average Metric: 27 / 50  (54.0%)



  df = df.applymap(truncate_cell)


In [49]:
# Evaluate the uncompiled program with the same harness and metric, so the two
# runs are directly comparable.
outputs_uncompiled = compiled_eval(uncompiled_rag, metric=metric, return_outputs=True)

Average Metric: 27 / 50  (54.0): 100%|██████████| 50/50 [00:00<00:00, 1237.82it/s]

Average Metric: 27 / 50  (54.0%)





In [36]:
# Element [0] of each result is the aggregate score; both runs get the same score.
outputs_compiled[0], outputs_uncompiled[0]

(54.0, 54.0)

### See if there were any differences in the results
 - even though the score is the same

In [67]:
import pandas as pd
from IPython.display import display

In [65]:
# Inspect the first per-example result: element [0] of the tuple is the dev
# example itself (question, gold answer, and metadata).
outputs_compiled[1][0][0].toDict()

{'question': 'Are both Cangzhou and Qionghai in the Hebei province of China?',
 'answer': 'no',
 'gold_titles': {'Cangzhou', 'Qionghai'},
 'dspy_uuid': '73181150-9dc4-45f1-8f2f-543e675de741',
 'dspy_split': 'dev'}

In [102]:
def get_qa_tbl(outputs):
    """Build a question / gold-answer table from evaluation outputs.

    Parameters
    ----------
    outputs : sequence
        Evaluation result as used in this notebook: ``outputs[1]`` is the list
        of per-example tuples, and element ``[0]`` of each tuple is the example
        (an object exposing ``toDict()``).

    Returns
    -------
    pandas.DataFrame
        One row per example with exactly the columns ``question`` and
        ``answer``. An empty result list yields an empty frame with those two
        columns instead of raising ``IndexError``.
    """
    records = [result[0].toDict() for result in outputs[1]]
    # Selecting the two columns in the constructor removes the need to read the
    # key set off the first record, which failed when there were no records.
    return pd.DataFrame(records, columns=['question', 'answer'])

# Question / gold-answer table, built here from the compiled run's outputs.
qa_tbl = get_qa_tbl(outputs_compiled)
qa_tbl.head(3)

Unnamed: 0,question,answer
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no
1,Who conducts the draft in which Marc-Andre Fleury was drafted to the Vegas Golden Knights for the 2017-18 season?,National Hockey League
2,"The Wings entered a new era, following the retirement of which Canadian retired professional ice hockey player and current general manager of the Tampa Bay Lightning of the National Hockey League (NHL)?",Steve Yzerman


In [103]:
def get_answer_grade_tbl(outputs):
    """Build a response / grade table from evaluation outputs.

    Parameters
    ----------
    outputs : sequence
        Evaluation result as used in this notebook: ``outputs[1]`` is the list
        of per-example tuples, where element ``[1]`` is the prediction (an
        object exposing ``toDict()``) and element ``[2]`` is its grade.

    Returns
    -------
    pandas.DataFrame
        One row per example with the columns ``response`` (the prediction's
        ``answer`` field) and ``grade``.

    Notes
    -----
    The previous version took its column names from ``outputs[1][1][0]``, the
    *example* of the second result, rather than from a prediction. That only
    worked because examples and predictions both have an ``answer`` key, and it
    raised ``IndexError`` whenever fewer than two results were present.
    """
    records = [
        {'response': result[1].toDict().get('answer'), 'grade': result[2]}
        for result in outputs[1]
    ]
    return pd.DataFrame(records, columns=['response', 'grade'])

# Response / grade tables for each run; rows line up because both runs were
# evaluated on the same devset.
tbl_compiled = get_answer_grade_tbl(outputs_compiled)
tbl_uncompiled = get_answer_grade_tbl(outputs_uncompiled)

# Preview the compiled run only.
display(tbl_compiled.head(3))

Unnamed: 0,response,grade
0,Yes,False
1,National Hockey League,True
2,Steve Yzerman,True


In [105]:
# View where the two runs were graded differently.
# Side-by-side frame whose top column level is 'compiled' / 'uncompiled'.
cmp = pd.concat([tbl_compiled, tbl_uncompiled], axis=1, keys=['compiled', 'uncompiled'])
# Boolean mask: True on rows where the grades of the two runs disagree.
diff_ind = cmp['compiled']['grade'] != cmp['uncompiled']['grade']
display(cmp[diff_ind])

Unnamed: 0_level_0,compiled,compiled,uncompiled,uncompiled
Unnamed: 0_level_1,response,grade,response,grade
0,Yes,False,No,True
21,Deepa Mehta,True,Tony Kaye,False
22,the good market or the good deal,False,The Good Market,True
38,Exon.,True,University of Exeter,False


In [106]:
# Put the question and gold answer next to each differing response so the
# disagreements can be analysed; column width is uncapped for this display only.
with pd.option_context('display.max_colwidth', None):
    display(
        pd.concat(
            [qa_tbl[diff_ind], cmp[diff_ind]],
            axis=1,
        )
    )

Unnamed: 0,question,answer,"(compiled, response)","(compiled, grade)","(uncompiled, response)","(uncompiled, grade)"
0,Are both Cangzhou and Qionghai in the Hebei province of China?,no,Yes,False,No,True
21,"Who was born first, Tony Kaye or Deepa Mehta?",Deepa Mehta,Deepa Mehta,True,Tony Kaye,False
22,What is the English translation of the name of the store that Macy's replaced in Boise Town Square?,the good market,the good market or the good deal,False,The Good Market,True
38,What is the post-nominal abbreviation for the university where the Banded Mongoose Research Project is based?,Exon,Exon.,True,University of Exeter,False


In [None]:
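# So really, among the four rows where the grades differ:
# - row 0: the compiled RAG gets it wrong and the uncompiled one gets it right (more by chance?)
# - rows 21 and 38: the compiled RAG is clearly correct where the uncompiled one is wrong
# - row 22: the compiled RAG is marked incorrect only because exact match rejects a fuzzy match
# So the compiled RAG is slightly better, even though the scores tie.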
