# RAG Evaluations

In [1]:
import os
import dspy

In [2]:
# Move to the project root exactly once. A bare os.chdir('../') climbs one
# more directory every time this cell is re-run; the `src` package imported
# in the next cell lives at the project root, so its presence means we are
# already where we need to be.
if not os.path.isdir('src'):
    os.chdir('../')

In [3]:
from src.chromadb_rm import ChromadbRM

In [6]:
# Never hardcode API keys in a notebook: they end up in version control and
# in every shared copy. Any key that was previously committed here must be
# treated as compromised and rotated.
# Use the key already present in the environment, otherwise prompt for it
# without echoing it to the screen or storing it in the notebook.
import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OPENAI_API_KEY: ')

In [7]:
# Langfuse credentials must not be hardcoded: read them from the environment
# or prompt for them. Keys that were previously committed here should be
# rotated.
import getpass

for _langfuse_var in ('LANGFUSE_SECRET_KEY', 'LANGFUSE_PUBLIC_KEY'):
    if not os.environ.get(_langfuse_var):
        os.environ[_langfuse_var] = getpass.getpass(_langfuse_var + ': ')

# The host is not a secret, so it stays in the notebook.
os.environ["LANGFUSE_HOST"] = 'https://us.cloud.langfuse.com'

In [8]:
# DSPy signature for the answer-generation step: (context, question) -> answer.
# NOTE(review): DSPy presumably uses the class docstring and the `desc`
# strings below as prompt text, so treat them as runtime strings rather than
# documentation - confirm before rewording any of them.
class GenerateAnswer(dspy.Signature):
    """Answer questions given the context"""

    # Retrieved passages; RAG.forward passes the retriever's `.passages` here.
    context = dspy.InputField(desc="may contain relevant facts")
    # The question being answered, passed through unchanged by RAG.forward.
    question = dspy.InputField()
    # The generated answer; the description asks for a 1-5 word factual reply.
    answer = dspy.OutputField(desc="Short factual answer to the question. 1 - 5 words long.")

class RAG(dspy.Module):
    """Retrieve-then-answer pipeline.

    Fetches `num_passages` passages for a question with `dspy.Retrieve` and
    feeds them, together with the question, to a `dspy.ChainOfThought`
    predictor built on the `GenerateAnswer` signature.
    """

    def __init__(self, num_passages=5):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Answer `question`; returns a Prediction with `context` and `answer`."""
        passages = self.retrieve(question).passages
        generated = self.generate_answer(context=passages, question=question)
        # Expose the retrieved passages alongside the answer so downstream
        # cells can read both `response.context` and `response.answer`.
        return dspy.Prediction(context=passages, answer=generated.answer)

In [9]:
def setup(
    collection_name="test-overlap-0",
    persist_directory="chroma.db",
    local_embed_model="sentence-transformers/paraphrase-MiniLM-L6-v2",
    openai_model='gpt-3.5-turbo',
    num_passages=5,
):
    """
    Set up the DSPy language model and retrieval model, and build a RAG program.

    Calling `setup()` with no arguments behaves exactly as before; the
    parameters only expose what used to be hard-coded so other collections
    or models can be evaluated without editing this function.

    Args:
        collection_name: Chroma collection to retrieve from.
        persist_directory: Directory of the persisted Chroma database.
        local_embed_model: Name of the local embedding model passed to ChromadbRM.
        openai_model: OpenAI model name passed to `dspy.OpenAI`.
        num_passages: Number of passages the RAG program retrieves per question.

    Returns:
        A fresh, un-compiled `RAG` instance.

    Raises:
        KeyError: if OPENAI_API_KEY is not set in the environment.
    """

    turbo = dspy.OpenAI(model=openai_model)

    chroma_rm = ChromadbRM(
        collection_name=collection_name,
        persist_directory=persist_directory,
        local_embed_model=local_embed_model,
        openai_api_key=os.environ["OPENAI_API_KEY"],
    )

    # Global DSPy configuration: every module created afterwards uses these.
    dspy.settings.configure(lm=turbo, rm=chroma_rm)

    return RAG(num_passages=num_passages)

In [10]:
rag = setup()

Collection Count: 7850




In [12]:
# Load the synthetic QA dataset and keep only the two columns the evaluation
# uses: the question and its list of ground-truth answers.
import pandas as pd

df = pd.read_csv("./data/processed/synthetic_dataset.csv")[['question', 'ground_truths']]

In [13]:
df.head()

Unnamed: 0,question,ground_truths
0,Who directed the 2007 production of How to Curse?,['Josie Rourke']
1,"Who starred as ""Jason Tyler"" in the 2006 episo...",['Robert Boulter']
2,Who was Du Fu's paternal grandfather?,['Du Shenyan']
3,When did Du Fu meet Li Bai for the first time?,['Autumn of 744']
4,What was Du Fu's first official post in the ca...,"[""Registrar of the Right Commandant's office""]"


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Split the data into train and test sets.
# Fix the seed so the split - and the CSV files written from it in the next
# cell - is identical on every Restart & Run All; otherwise results from
# different runs are computed on different test questions.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
# Persist both splits, then read them back so that `train` and `test` are
# exactly what is stored on disk (fresh 0..n-1 index, string-typed columns).
train_csv = "./data/processed/train_synthetic.csv"
test_csv = "./data/processed/test_synthetic.csv"

train.to_csv(train_csv, index=False)
test.to_csv(test_csv, index=False)

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [17]:
import tqdm

# Run the un-compiled RAG program on every test question and collect, per
# question, the retrieved contexts, the generated answer and the ground truth.
eval_results_rows = []

# Each iteration makes a retrieval call and an LLM call, so show progress
# (tqdm was imported here but never used).
for _, row in tqdm.tqdm(test.iterrows(), total=len(test), desc="RAG inference"):
    question = row['question']
    response = rag(question)
    eval_results_rows.append({
        'question': question,
        'contexts': response.context,
        'answer': response.answer,
        'ground_truths': row['ground_truths'],
    })

# Build the results frame once from the list of row dicts.
df_eval_results = pd.DataFrame(eval_results_rows)


In [18]:
df_eval_results

Unnamed: 0,question,contexts,answer,ground_truths
0,"What was the purpose of Operation Torch, devis...",[. many ships also used a forced draught to ge...,Gain control of North Africa.,['To occupy French North Africa']
1,What is the estimated weight of Tres Zapotes M...,[. it has since been moved to the museo comuni...,7.8 tons,['7.8 tons']
2,Who was widely praised for their performance i...,"[american beauty, they gave their top awards t...",Kevin Spacey,"['Spacey, Mendes, Ball']"
3,Who won the IWGP Heavyweight Championship on h...,[njcaa heavyweight champion ( 1998 ) north dak...,Lesnar,['Lesnar']
4,What is the industrial process for the product...,[= = reactions of oxaziridines = = = = = hydra...,Peroxide process.,['Peroxide process']
...,...,...,...,...
150,What musical style has Hed PE referred to thei...,"[= = musical style = =, . hed pe's music is a ...",G@-@ punk,['G-punk']
151,What was the combat efficiency of the 23rd Reg...,[. when the attack finally ceased shortly afte...,38 percent,['38 percent']
152,What is the reproductive strategy of most temn...,"[evolving from temnospondyls ), . like most li...",External fertilization,['External fertilization']
153,What force was greatly feared by nationalists ...,"[. on the third day of fighting, 14 august, th...",B-Specials.,['Ulster Special Constabulary']


In [19]:
import ast

# `ground_truths` comes from the CSV as the string repr of a list, e.g.
# "['Josie Rourke']"; turn it into a real list.
# Only string values are parsed so that re-running this cell is harmless:
# ast.literal_eval raises ValueError when handed an already-parsed list.
df_eval_results['ground_truths'] = df_eval_results['ground_truths'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [20]:
# Save the df_eval_results DataFrame to a csv file
import time
EXP_NAME = "SIMPLE_RAG_NO_OVERLAP"
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
# to_csv does not create missing directories; make sure ./results exists so a
# fresh checkout does not lose the (expensive) inference results.
os.makedirs('./results', exist_ok=True)
df_eval_results.to_csv('./results/inference_' + EXP_NAME + '_' + TIMESTAMP + '.csv', index=False)

Now that we have answers for all the questions, we can evaluate the RAG model.

In [24]:
from datasets import Dataset
# `evaluate` was never imported in the live cells of this notebook, so on a
# fresh kernel the call below raised NameError.
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
)

ds = Dataset.from_pandas(df_eval_results)

# raise_exceptions=False is passed so failures on individual rows do not abort
# the whole evaluation. A failure of the evaluation as a whole is deliberately
# NOT caught: swallowing it would leave `result` undefined (or stale from an
# earlier run) while the following cells save and log it.
result = evaluate(
    dataset=ds,
    metrics=[
        context_relevancy,
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
    raise_exceptions=False,
)



Evaluating:   0%|          | 0/775 [00:00<?, ?it/s]

ERROR:ragas.executor:Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\asyncio\tasks.py", line 571, in _wait_for_one
    return f.result()  # May raise f.exception().
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\metrics\base.py", line 91, in ascore
    raise e
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\metrics\base.py", line 87, in ascore
    score = await self._ascore(row=row, callbacks=group_cm, is_async=is_async)
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\metrics\_faithfulness.py", line 190, in _ascore
    assert isinstanc

In [25]:
# from ragas.metrics import (
#     answer_relevancy,
#     faithfulness,
#     context_recall,
#     context_precision,
#     answer_similarity,
#     context_relevancy
# )
# from datasets import Dataset
# from ragas import evaluate

# ds = Dataset.from_pandas(df_eval_results)

# result = evaluate(
#     ds,
#     metrics=[
#         faithfulness,
#         answer_relevancy,
#         context_relevancy,
#         context_recall,
#         context_precision
#     ],
# )

In [26]:
result

{'context_relevancy': 0.6607, 'context_precision': 0.6197, 'context_recall': 0.6382, 'faithfulness': 0.6828, 'answer_relevancy': 0.6104}

In [27]:
# save the result
result.to_pandas().to_csv('./results/evaluation_' + EXP_NAME + '_' + TIMESTAMP + '.csv', index=False)

In [28]:
result.to_pandas()

Unnamed: 0,question,contexts,answer,ground_truths,ground_truth,context_relevancy,context_precision,context_recall,faithfulness,answer_relevancy
0,"What was the purpose of Operation Torch, devis...",[. many ships also used a forced draught to ge...,Gain control of North Africa.,[To occupy French North Africa],To occupy French North Africa,,,,,
1,What is the estimated weight of Tres Zapotes M...,[. it has since been moved to the museo comuni...,7.8 tons,[7.8 tons],7.8 tons,,,0.000000,0.000000,1.000000
2,Who was widely praised for their performance i...,"[american beauty, they gave their top awards t...",Kevin Spacey,"[Spacey, Mendes, Ball]","Spacey, Mendes, Ball",0.000000,0.843472,0.037037,1.000000,1.000000
3,Who won the IWGP Heavyweight Championship on h...,[njcaa heavyweight champion ( 1998 ) north dak...,Lesnar,[Lesnar],Lesnar,1.000000,0.905973,0.111111,0.000000,0.000000
4,What is the industrial process for the product...,[= = reactions of oxaziridines = = = = = hydra...,Peroxide process.,[Peroxide process],Peroxide process,,0.927463,0.428571,0.333333,1.000000
...,...,...,...,...,...,...,...,...,...,...
150,What musical style has Hed PE referred to thei...,"[= = musical style = =, . hed pe's music is a ...",G@-@ punk,[G-punk],G-punk,0.375000,0.583333,1.000000,0.000000,0.861600
151,What was the combat efficiency of the 23rd Reg...,[. when the attack finally ceased shortly afte...,38 percent,[38 percent],38 percent,0.272727,1.000000,1.000000,1.000000,0.971468
152,What is the reproductive strategy of most temn...,"[evolving from temnospondyls ), . like most li...",External fertilization,[External fertilization],External fertilization,0.750000,0.588889,1.000000,1.000000,0.916365
153,What force was greatly feared by nationalists ...,"[. on the third day of fighting, 14 august, th...",B-Specials.,[Ulster Special Constabulary],Ulster Special Constabulary,0.272727,1.000000,1.000000,1.000000,0.877489


In [29]:
os.environ['WANDB_NOTEBOOK_NAME'] = '05_eval_rag.ipynb'

In [30]:
# Do not hardcode the Weights & Biases API key; take it from the environment
# or prompt for it. A key that was previously committed here should be rotated.
import getpass

if not os.environ.get('WANDB_API_KEY'):
    os.environ['WANDB_API_KEY'] = getpass.getpass('WANDB_API_KEY: ')

In [31]:
# Logging to wandb

import wandb

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="wikitext-rag-eval",

    # track hyperparameters and run metadata
    config={
        "number_of_questions": len(ds),
        "comments": "Simple QA RAG model with no teleprompter - chunk overlap size 0",
        "model": "RAG",
        "dataset": "Synthetic",
        "num_passages": 5,
        "openai_model": "gpt-3.5-turbo",
        # Must match the collection that setup() actually queried; the run
        # was previously recorded as "test-overlap-64" although retrieval
        # used the overlap-0 collection.
        "chroma_collection_name": "test-overlap-0",
        "chroma_persist_directory": "chroma.db",
        "chroma_local_embed_model": "sentence-transformers/paraphrase-MiniLM-L6-v2",

    }
)

# Log the aggregate metric scores of this experiment.
wandb.log(result)

wandb.finish()

wandb: Currently logged in as: kuotzuwei15 (kuotzuwei15-national-yang-ming-chiao-tung-university). Use `wandb login --relogin` to force relogin


VBox(children=(Label(value='0.015 MB of 0.026 MB uploaded\r'), FloatProgress(value=0.5838916292424354, max=1.0…

0,1
answer_relevancy,▁
context_precision,▁
context_recall,▁
context_relevancy,▁
faithfulness,▁

0,1
answer_relevancy,0.61042
context_precision,0.61974
context_recall,0.63823
context_relevancy,0.66074
faithfulness,0.6828


----

Now, let's compile the RAG using teleprompters.

In [32]:
# Rebind instead of mutating in place: the cell stays idempotent and no other
# reference to the frame is changed behind the reader's back.
train = train.reset_index(drop=True)

In [33]:
train = train[:10]

In [34]:
train

Unnamed: 0,question,ground_truths
0,What is the Philippine version of the auto ric...,['Tricycles']
1,What type of shot could lodge in the hull of a...,['Red hot shot']
2,When did Clayton Kershaw debut in the MLB?,['2008']
3,"Who influenced Bacon's use of the ""space frame...",['Alberto Giacometti.']
4,What was the first studio album released by th...,['Church of Realities']
5,"What was the Appreciation Index figure for ""Th...",['87']
6,Who was awarded the Medal of Honor for fightin...,['Private First Class Luther H. Story.']
7,When did NY 93 become state-maintained between...,"['October 1, 1998']"
8,"How many digital downloads had ""Kiss You"" sold...","['207,000']"
9,Who was appointed as the Allied Commander-in-C...,['Lieutenant General Dwight D. Eisenhower']


In [35]:
import ast

# Turn the first five training rows into DSPy examples. Each example uses the
# first entry of the parsed `ground_truths` list as its answer and marks
# `question` as the only input field.
trainset = [
    dspy.Example(
        question=train['question'].iloc[row_pos],
        answer=ast.literal_eval(train['ground_truths'].iloc[row_pos])[0],
    ).with_inputs('question')
    for row_pos in range(5)
]

In [36]:
trainset

[Example({'question': 'What is the Philippine version of the auto rickshaw?', 'answer': 'Tricycles'}) (input_keys={'question'}),
 Example({'question': 'What type of shot could lodge in the hull of a wooden ship and cause a fire?', 'answer': 'Red hot shot'}) (input_keys={'question'}),
 Example({'question': 'When did Clayton Kershaw debut in the MLB?', 'answer': '2008'}) (input_keys={'question'}),
 Example({'question': 'Who influenced Bacon\'s use of the "space frame" in his artwork?', 'answer': 'Alberto Giacometti.'}) (input_keys={'question'}),
 Example({'question': 'What was the first studio album released by the band?', 'answer': 'Church of Realities'}) (input_keys={'question'})]

In [37]:
from dspy.teleprompt import BootstrapFewShot

def validate_context_and_answer(example, pred, trace=None):
    """Bootstrapping metric: the prediction must be right AND grounded.

    Combines DSPy's exact-match check on the answer with its passage-match
    check on the retrieved context. `trace` is accepted but not used.
    """
    exact_match = dspy.evaluate.answer_exact_match(example, pred)
    passage_match = dspy.evaluate.answer_passage_match(example, pred)
    # Both checks are always evaluated; the result is their conjunction.
    return exact_match and passage_match

# Set up a basic teleprompter, which will compile our RAG program.
# The metric defined above is handed to BootstrapFewShot.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)

# Compile!
# Note that a fresh RAG() instance is compiled here, not the `rag` object
# returned by setup(); `trainset` holds the five examples built earlier.
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)

100%|██████████| 5/5 [00:05<00:00,  1.19s/it]


Bootstrapped 1 full traces after 5 examples in round 0.


In [38]:
import ast
def get_evals(dataset, rag):
    """Run `rag` on every question in `dataset` and collect the results.

    Args:
        dataset: DataFrame with a `question` column and a `ground_truths`
            column. Ground truths may be the string repr of a list (as read
            back from CSV) or an already-parsed list.
        rag: Callable taking a question and returning an object with
            `.context` (retrieved passages) and `.answer` attributes.

    Returns:
        DataFrame with columns `question`, `contexts`, `answer` and
        `ground_truths` (parsed lists), one row per input row. An empty
        `dataset` yields an empty frame with the same columns instead of
        raising KeyError.
    """
    columns = ['question', 'contexts', 'answer', 'ground_truths']
    eval_results_rows = []

    for _, row in dataset.iterrows():
        question = row['question']
        response = rag(question)

        # Parse only string values so already-parsed lists pass through
        # instead of making ast.literal_eval raise.
        ground_truths = row['ground_truths']
        if isinstance(ground_truths, str):
            ground_truths = ast.literal_eval(ground_truths)

        eval_results_rows.append({
            'question': question,
            'contexts': response.context,
            'answer': response.answer,
            'ground_truths': ground_truths,
        })

    # Passing explicit columns keeps the schema stable even with zero rows.
    return pd.DataFrame(eval_results_rows, columns=columns)


In [39]:

df_eval_results = get_evals(test, compiled_rag)


In [40]:
# Save the df_eval_results DataFrame to a csv file
import time
EXP_NAME = "COMPILED_RAG_OVERLAP_0"
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
# to_csv does not create missing directories; make sure ./results exists.
os.makedirs('./results', exist_ok=True)
df_eval_results.to_csv('./results/inference_' + EXP_NAME + '_' + TIMESTAMP + '.csv', index=False)

Now that we have answers for all the questions, we can evaluate the compiled RAG model.

In [41]:
from datasets import Dataset
# `evaluate` was never imported in the live cells of this notebook, so on a
# fresh kernel the call below raised NameError.
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
)

ds = Dataset.from_pandas(df_eval_results)

# raise_exceptions=False is passed so failures on individual rows do not abort
# the whole evaluation. A failure of the evaluation as a whole is deliberately
# NOT caught: if it were swallowed, `result` would still hold the scores of
# the previous experiment, and the following cells would save and log them
# under this experiment's name.
result = evaluate(
    dataset=ds,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_relevancy,
        context_recall,
        context_precision,
    ],
    raise_exceptions=False,
)



Evaluating:   0%|          | 0/775 [00:00<?, ?it/s]

ERROR:ragas.executor:Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\asyncio\tasks.py", line 571, in _wait_for_one
    return f.result()  # May raise f.exception().
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\metrics\base.py", line 91, in ascore
    raise e
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\metrics\base.py", line 87, in ascore
    score = await self._ascore(row=row, callbacks=group_cm, is_async=is_async)
  File "c:\Users\kuotz\anaconda3\envs\langchian2\lib\site-packages\ragas\metrics\_faithfulness.py", line 190, in _ascore
    assert isinstanc

In [42]:
# ds = Dataset.from_pandas(df_eval_results)

# result = evaluate(
#     ds,
#     metrics=[
#         context_precision,
#         faithfulness,
#         answer_relevancy,
#         context_recall,
#         answer_similarity,
#         context_relevancy
#     ],
# )

In [43]:
result

{'faithfulness': 0.7160, 'answer_relevancy': 0.6861, 'context_relevancy': 0.5496, 'context_recall': 0.7275, 'context_precision': 0.5440}

In [44]:
# save the result
result.to_pandas().to_csv('./results/evaluation_' + EXP_NAME + '_' + TIMESTAMP + '.csv', index=False)

In [45]:
result.to_pandas()

Unnamed: 0,question,contexts,answer,ground_truths,ground_truth,faithfulness,answer_relevancy,context_relevancy,context_recall,context_precision
0,"What was the purpose of Operation Torch, devis...",[. many ships also used a forced draught to ge...,Military strategy.,[To occupy French North Africa],To occupy French North Africa,,,,,
1,What is the estimated weight of Tres Zapotes M...,[. it has since been moved to the museo comuni...,7.8 tons.,[7.8 tons],7.8 tons,,,0.000000,0.738929,0.000000
2,Who was widely praised for their performance i...,"[american beauty, they gave their top awards t...",Kevin Spacey.,"[Spacey, Mendes, Ball]","Spacey, Mendes, Ball",1.0,0.000000,1.000000,0.915835,0.037037
3,Who won the IWGP Heavyweight Championship on h...,[njcaa heavyweight champion ( 1998 ) north dak...,Brock Lesnar.,[Lesnar],Lesnar,1.0,1.000000,,0.927463,0.111111
4,What is the industrial process for the product...,[= = reactions of oxaziridines = = = = = hydra...,Peroxide process.,[Peroxide process],Peroxide process,0.0,0.000000,1.000000,0.810523,0.428571
...,...,...,...,...,...,...,...,...,...,...
150,What musical style has Hed PE referred to thei...,"[= = musical style = =, . hed pe's music is a ...",G-punk.,[G-punk],G-punk,0.0,0.850167,0.375000,1.000000,0.333333
151,What was the combat efficiency of the 23rd Reg...,[. when the attack finally ceased shortly afte...,38 percent.,[38 percent],38 percent,1.0,0.970747,0.272727,1.000000,1.000000
152,What is the reproductive strategy of most temn...,"[evolving from temnospondyls ), . like most li...",External fertilization.,[External fertilization],External fertilization,1.0,0.916365,0.750000,1.000000,0.588889
153,What force was greatly feared by nationalists ...,"[. on the third day of fighting, 14 august, th...",Ulster Special Constabulary (B-Specials),[Ulster Special Constabulary],Ulster Special Constabulary,1.0,0.877489,0.272727,1.000000,1.000000


In [46]:
# Logging to wandb

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="wikitext-rag-eval",

    # track hyperparameters and run metadata
    # (the "comments" field marks this run as the compiled version)
    config={
        "number_of_questions": len(ds),
        "comments": "Compiled QA RAG model with teleprompter - OVERLAP 0",
        "model": "RAG",
        "dataset": "Synthetic",
        "num_passages": 5,
        "openai_model": "gpt-3.5-turbo",
        # Must match the collection that setup() actually queried; it was
        # previously recorded as just "test".
        "chroma_collection_name": "test-overlap-0",
        "chroma_persist_directory": "chroma.db",
        "chroma_local_embed_model": "sentence-transformers/paraphrase-MiniLM-L6-v2",

    }
)

# Log the aggregate metric scores of this experiment.
wandb.log(result)

wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.026 MB uploaded\r'), FloatProgress(value=0.052009717314487634, max=1…

0,1
answer_relevancy,▁
context_precision,▁
context_recall,▁
context_relevancy,▁
faithfulness,▁

0,1
answer_relevancy,0.6861
context_precision,0.54397
context_recall,0.72751
context_relevancy,0.54958
faithfulness,0.71605


-------

No Retrieval
---

In [47]:
# DSPy signature for the no-retrieval baseline: question -> answer, with no
# context field.
# NOTE(review): DSPy presumably uses the class docstring and the `desc` string
# as prompt text, so treat them as runtime strings - confirm before rewording.
class BasicQA(dspy.Signature):
    """Answer questions with short factoid answers."""

    # The question being answered.
    question = dspy.InputField()
    # The generated answer; the description hints at a 1-5 word length.
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [48]:
# Define the predictor.
generate_answer = dspy.Predict(BasicQA)

In [49]:
# Closed-book baseline: ask the plain predictor (no retrieval) every test
# question and keep the answer next to the ground truth.
eval_results_rows = []

for _, test_row in test.iterrows():
    prediction = generate_answer(question=test_row['question'])
    eval_results_rows.append({
        'question': test_row['question'],
        'answer': prediction.answer,
        'ground_truths': test_row['ground_truths'],
    })

# One frame built from all collected rows.
df_eval_results = pd.DataFrame(eval_results_rows)

# `ground_truths` arrives as the string repr of a list; parse it into a list.
df_eval_results['ground_truths'] = df_eval_results['ground_truths'].apply(ast.literal_eval)

In [52]:
from datasets import Dataset
# `evaluate` was never imported in the live cells of this notebook, so on a
# fresh kernel the call below raised NameError.
from ragas import evaluate
from ragas.metrics import (
    answer_similarity
)

ds = Dataset.from_pandas(df_eval_results)

# There are no retrieved contexts in this baseline, so only answer similarity
# is scored. A failure of the evaluation as a whole is deliberately NOT
# caught: if it were swallowed, `result` would still hold the scores of the
# previous experiment, and the following cells would save and log them under
# this experiment's name.
result = evaluate(
    dataset=ds,
    metrics=[
        answer_similarity
    ],
    raise_exceptions=False,
)



Evaluating:   0%|          | 0/155 [00:00<?, ?it/s]

In [None]:
# ds = Dataset.from_pandas(df_eval_results)

# result = evaluate(
#     ds,
#     metrics=[
#         answer_similarity
#     ],
# )

evaluating with [answer_similarity]


100%|██████████| 6/6 [00:03<00:00,  1.59it/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [53]:
result

{'answer_similarity': 0.8686}

In [54]:
import time

EXP_NAME = "BASIC_QA_OVERLAP_64"
# Take a fresh timestamp for this experiment; TIMESTAMP was otherwise still
# the one generated for the compiled-RAG run.
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
# to_csv does not create missing directories; make sure ./results exists.
os.makedirs('./results', exist_ok=True)
# save the result
result.to_pandas().to_csv('./results/evaluation_' + EXP_NAME + '_' + TIMESTAMP + '.csv', index=False)

In [55]:
result.to_pandas()

Unnamed: 0,question,answer,ground_truths,ground_truth,answer_similarity
0,"What was the purpose of Operation Torch, devis...",To invade North Africa.,[To occupy French North Africa],To occupy French North Africa,0.928378
1,What is the estimated weight of Tres Zapotes M...,about 25 tons,[7.8 tons],7.8 tons,0.891800
2,Who was widely praised for their performance i...,Kevin Spacey,"[Spacey, Mendes, Ball]","Spacey, Mendes, Ball",0.860952
3,Who won the IWGP Heavyweight Championship on h...,Brock Lesnar,[Lesnar],Lesnar,0.952020
4,What is the industrial process for the product...,Nazarov cyclization,[Peroxide process],Peroxide process,0.771584
...,...,...,...,...,...
150,What musical style has Hed PE referred to thei...,G-punk,[G-punk],G-punk,1.000000
151,What was the combat efficiency of the 23rd Reg...,75%,[38 percent],38 percent,0.843014
152,What is the reproductive strategy of most temn...,External fertilization,[External fertilization],External fertilization,1.000000
153,What force was greatly feared by nationalists ...,British Army,[Ulster Special Constabulary],Ulster Special Constabulary,0.813200


In [56]:
# Logging to wandb

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="wikitext-rag-eval",

    # track hyperparameters and run metadata
    config={
        "number_of_questions": len(ds),
        "comments": "No RAG model - just basic QA model - OVERLAP 64",
        # This run uses dspy.Predict(BasicQA) with no retriever, so it must
        # not be recorded as a "RAG" run with retrieval settings it never
        # used; the same keys are kept so runs stay comparable in the
        # dashboard.
        "model": "BasicQA",
        "dataset": "Synthetic",
        "num_passages": 0,
        "openai_model": "gpt-3.5-turbo",
        "chroma_collection_name": None,
        "chroma_persist_directory": None,
        "chroma_local_embed_model": None,

    }
)

# Log the aggregate answer-similarity score of the baseline.
wandb.log(result)

wandb.finish()

VBox(children=(Label(value='0.015 MB of 0.026 MB uploaded\r'), FloatProgress(value=0.5872857195749565, max=1.0…

0,1
answer_similarity,▁

0,1
answer_similarity,0.8686
