In [1]:
%load_ext autoreload
%autoreload 2

In [39]:
import re
import json
import wandb
import pandas as pd
import numpy as np
from tqdm import tqdm

## Get Ground Truth (Ref) Contexts

In [34]:
# Load the annotated evaluation set; the file stores one JSON record per line.
annotated_path = "/Users/ayushthakur/integrations/wandbot-eval/wbeval/wandbot/data/eval/wandbot_cleaned_annotated_dataset_11-12-2023.jsonl"
df = pd.read_json(annotated_path, lines=True, orient="records")

# Keep only rows flagged as W&B queries whose annotation is "correct".
is_wandb_query = df["is_wandb_query"] == "YES"
is_correct = df["correctness"] == "correct"
df = df[is_wandb_query & is_correct]
len(df)

98

In [35]:
# Preview the first rows of the filtered annotated dataset.
df.head()

Unnamed: 0,question,answer,context,correctness,is_wandb_query,notes
0,Hey I have a question about using wandb with f...,When integrating `wandb` (Weights & Biases) wi...,Source:\thttps://docs.wandb.ai/guides/track/tr...,correct,YES,The answer clearly explains the recommended pr...
1,Hey with wandb is it possible to link from the...,"Yes, with `wandb`, you can link to the best ru...",Source:\thttps://docs.wandb.ai/guides/track/pu...,correct,YES,This answer correctly explains how to use the ...
2,Explain how I can version datasets with Weight...,Versioning datasets with Weights & Biases (W&B...,Source:\thttps://github.com/wandb/examples/tre...,correct,YES,The answer correctly summarizes the informatio...
4,"Hi, can anybody help me with this issue? wandb...",The `wandb.sdk.service.service.ServiceStartTim...,Source:\thttps://docs.wandb.ai/guides/track/lo...,correct,YES,This requires more data to debug and probably ...
5,what is the difference between artifact.add_fi...,`artifact.add_file` and `wandb.save` are both ...,Source:\thttps://docs.wandb.ai/guides/artifact...,correct,YES,The answer correctly identifies the distinctio...


In [36]:
import re

# question text -> list of reference context strings (populated by the loop in this cell)
ref_query_contexts = {}

def split_contexts(text):
    """Split a reference-context blob into its individual ``Source:`` chunks.

    A chunk begins at ``Source:`` followed by an http(s) URL and extends up
    to, but not including, the next line consisting of ``---`` (or the end of
    ``text`` when no such separator follows).

    Args:
        text: The concatenated contexts as a single string.

    Returns:
        A list of chunk strings, each stripped of surrounding whitespace and
        still carrying its ``Source: <url>`` prefix.
    """
    chunk_pattern = r"(Source:\s*https?://[^\s]+\s*.*?)(?=\n---\n|$)"
    # The lone capture group covers everything the pattern consumes (the
    # lookahead consumes nothing), so findall returns the full matches.
    # DOTALL lets a chunk span multiple lines.
    raw_chunks = re.findall(chunk_pattern, text, re.DOTALL)
    return [chunk.strip() for chunk in raw_chunks]

# df['context'] holds all reference contexts of a question as one text blob.
# Split it per source and store the bodies keyed by the question text.
for idx, row in df.iterrows():
    contexts = split_contexts(row['context'])
    # Every annotated question is expected to carry exactly 5 contexts; name
    # the offending row so a failure can be traced back to the data.
    assert len(contexts) == 5, f"row {idx}: expected 5 contexts, got {len(contexts)}"
    ref_query_contexts[row['question']] = [
        # Anchor at the start (\A) so only the leading "Source: <url>" header
        # is removed; an identical-looking reference quoted inside the body
        # text stays untouched.
        re.sub(r"\ASource:\s*https?://[^\s]+\s*", "", context).strip()
        for context in contexts
    ]

## Get Contexts for Best System

In [71]:
# Read the exported evaluation table of the best system's run.
best_results_path = "/Users/ayushthakur/integrations/wandbot-eval/wbeval/wandbot/artifacts/run-3b3vex63-EvaluationResults:v0/Evaluation Results.table.json"
with open(best_results_path) as f:
    data = json.load(f)

# The export is a dict that keeps the column names and the row values apart.
columns, data = data["columns"], data["data"]
df = pd.DataFrame(data=data, columns=columns)

In [79]:
# Show a single row to see which columns the evaluation table provides.
df.head(1)

Unnamed: 0,idx,system_prompt,question,answer,model,sources,source_documents,total_tokens,prompt_tokens,completion_tokens,...,answer_relevancy_result,answer_relevancy_reason,answer_relevancy_score_(ragas),answer_faithfulness_score,answer_faithfulness_result,answer_faithfulness_reason,answer_faithfulness_score_(ragas),answer_similarity_score_(ragas),context_precision_score,context_recall_score
0,0,"system: You are wandbot, an expert support ass...",Hey I have a question about using wandb with f...,When integrating `wandb` with FastAPI or any o...,gpt-4-1106-preview,https://docs.wandb.ai/guides/integrations/fast...,"[{""source"": ""https://docs.wandb.ai/guides/inte...",5508,5026,482,...,True,The generated answer is relevant and addresses...,0.860686,1,False,The generated answer provides a recommendation...,,0.693624,0.0,1.0


In [80]:
# question text -> list of retrieved document texts for the best system
context_sys_best = dict()

for idx, row in tqdm(df.iterrows(), total=len(df)):
    # `source_documents` appears to hold a JSON-encoded list of dicts (the
    # table preview shows `[{"source": ...`), so decode it with json.loads.
    # eval() would execute any Python expression stored in the artifact and
    # would interpret JSON escapes with Python string-literal rules.
    context_sys_best[row.question] = [
        context_dict["text"] for context_dict in json.loads(row.source_documents)
    ]

98it [00:00, 5600.02it/s]


## Get Contexts for Current Commit

In [91]:
# Look up the evaluation run for the current commit through the W&B API.
api = wandb.Api()
run = api.run("wandbot/wandbot-eval/8kfxwv2c")

# Download every artifact this run logged.
# NOTE(review): download() is called without a target directory, while the
# cell that reads the table uses an absolute path under .../artifacts/ —
# confirm the working directory makes the two line up.
for artifact in run.logged_artifacts():
    artifact.download()

[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [92]:
# Read the exported evaluation table of the current commit's run.
current_results_path = "/Users/ayushthakur/integrations/wandbot-eval/wbeval/wandbot/artifacts/run-8kfxwv2c-EvaluationResults:v0/Evaluation Results.table.json"
with open(current_results_path) as f:
    data = json.load(f)

# The export is a dict that keeps the column names and the row values apart.
columns, data = data["columns"], data["data"]
df = pd.DataFrame(data=data, columns=columns)

In [96]:
# Show a single row to see which columns the evaluation table provides.
df.head(1)

Unnamed: 0,idx,system_prompt,question,answer,model,sources,source_documents,total_tokens,prompt_tokens,completion_tokens,...,contexts,answer_correctness_score,answer_correctness_result,answer_correctness_reason,answer_relevancy_score,answer_relevancy_result,answer_relevancy_reason,answer_faithfulness_score,answer_faithfulness_result,answer_faithfulness_reason
0,0,System: As Wandbot - a support expert in Weigh...,Hey I have a question about using wandb with f...,When using `wandb` with FastAPI or any other w...,gpt-4-1106-preview,https://docs.wandb.ai/quickstart\nhttps://gith...,source: https://docs.wandb.ai/quickstart\nsour...,5031,4492,539,...,source: https://docs.wandb.ai/quickstart\nsour...,3,True,The generated answer provides a correct and co...,3,True,The generated answer is relevant and provides ...,1,False,The generated answer provides a recommendation...


In [123]:
import re

# question text -> list of retrieved context strings (populated by the loop in this cell)
context_current = {}

def split_contexts(text):
    """Split a retrieved-documents blob into one string per document.

    A document is recognised by its three metadata lines
    (``source: <http(s) url>``, ``source_type: ...``, ``has_code: ...``) and
    must be closed by a ``---`` line that is followed either by the next
    ``source:`` line or by the end of ``text``. Documents lacking that closing
    separator are not returned.

    Note: this definition reuses the name of the ``Source:``-based splitter
    defined earlier in the notebook and replaces it once this cell has run.

    Args:
        text: The concatenated documents as a single string.

    Returns:
        A list with one string per document: the metadata lines followed by
        the document body, without the ``---`` separators.
    """
    pattern = r"(?:---\s*\n)*?(source:\s*https?://[^\s]+\nsource_type:\s*.*?\nhas_code:\s*.*?\n)((?:.*?)(?=(?:\n---\s*\nsource:|\n---\s*$)))"

    # With two capture groups findall yields (metadata, body) tuples; DOTALL
    # allows the body to extend over several lines.
    return [metadata + body for metadata, body in re.findall(pattern, text, re.DOTALL)]

# df['source_documents'] holds all retrieved documents of a question as one
# text blob; a closing separator is appended so the last document is
# terminated the same way as the others before splitting.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    contexts = split_contexts(row['source_documents'] + "\n---\n")
    if len(contexts) != 10:
        # Report instead of failing: some rows do not split into 10 documents.
        print(f"row {idx}: expected 10 contexts, got {len(contexts)}")
    context_current[row['question']] = [
        # Drop only the metadata lines at the very start of each document.
        # Anchoring with \A keeps body lines that merely contain "source:",
        # "source_type:" or "has_code:" (e.g. inside a config snippet) intact.
        re.sub(r"\A(?:(?:source|source_type|has_code):.*\n)+", "", context).strip()
        for context in contexts
    ]

98it [00:00, 1971.16it/s]

5
21
22
28
50
58
61
76
79
80
97





In [82]:
def calculate_metrics(a, b):
    """Measure how well the items of ``b`` overlap with the items of ``a``.

    Both inputs are reduced to sets first, so ordering and repeated entries
    are ignored and two items count as shared only when they are equal.

    Args:
        a: Iterable of hashable items used as the reference set; recall is
            the shared fraction of this set.
        b: Iterable of hashable items used as the candidate set; precision is
            the shared fraction of this set.

    Returns:
        A dict with float values under the keys ``"precision"``,
        ``"recall"``, ``"f1_score"`` and ``"jaccard_index"``. A ratio whose
        denominator would be zero is reported as ``0.0``.
    """
    set_a = set(a)
    set_b = set(b)

    n_shared = len(set_a & set_b)
    n_union = len(set_a | set_b)

    # Use 0.0 (not 0) for the degenerate cases so every value is a float no
    # matter which branch produced it.
    precision = n_shared / len(set_b) if set_b else 0.0
    recall = n_shared / len(set_a) if set_a else 0.0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    jaccard_index = n_shared / n_union if n_union else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "jaccard_index": jaccard_index
    }

# Toy inputs: two five-item lists that have exactly three entries in common.
a = [f"text {i}" for i in (1, 2, 3, 4, 5)]
b = [f"text {i}" for i in (3, 5, 7, 1, 10)]

# Sanity check: 3 shared of 5 on each side, 7 distinct items overall.
metrics = calculate_metrics(a, b)
print(metrics)

{'precision': 0.6, 'recall': 0.6, 'f1_score': 0.6, 'jaccard_index': 0.42857142857142855}


In [86]:
# Pick one question (row position 37) to inspect by hand.
# NOTE(review): `df` is rebound by each table-loading cell, so the table this
# reads from depends on which of them ran last; 37 is an arbitrary position.
k = df["question"].iloc[37]

In [87]:
# For question k: reference contexts (a) and the best system's retrieved contexts (b).
a = ref_query_contexts[k]
b = context_sys_best[k]

In [88]:
# Overlap metrics for the two lists; items count as shared only on exact equality.
calculate_metrics(a, b)

{'precision': 0.0, 'recall': 0.0, 'f1_score': 0, 'jaccard_index': 0.0}

In [89]:
# Display the first list passed to calculate_metrics for manual inspection.
a

['Project defaults\n\n\nChange the default behavior for your account within the **Project** **Defaults** section. You can manage the proceeding:\n\n\n* **Default location to create new projects** - Select the dropdown menu and choose the entity to set as the new default. Specify either your account or a team you are a member of.\n* **Default projects privacy in your personal account** - Set a project to public (anyone can view), private (only you can view and contribute) or open (anyone can submit runs or write the reports) automatically when you create a project. You can optionally create a team to collaborate on private projects.\n* **Enable code savings in your personal account** - Permit Weights and Biases to save the latest git commit hash by default. To enable code saving, toggle the Enable code savings in your personal account option. For more information about saving and comparing code, see Code Saving.',
 'Settings Page\n\n\nWithin your individual user account you can edit: yo

In [90]:
# Display the second list passed to calculate_metrics for manual inspection.
b

['Versions tab\nThe versions tab shows all versions of the artifact as well as columns for each of the numeric values of the Run History at the time of logging the version. This allows you to compare performance and quickly identify versions of interest.\nProject Defaults\nYou can change your project default settings manually in your User Settings at\n/settings.\n- Default location to create new projects: This is set to your own personal entity by default. By clicking on the dropdown, you can switch between your personal entity and the teams you\'re part of.\n- Default project privacy in your personal account: This is set to \'Private\' by default. In other words, your projects will be private and can only be accessed by you.\n- Enable code saving in your personal account: This is turned off by default. You can turn this on to save the main script or notebook to W&B.\nThese settings can also be specified by passing arguments to\nwandb.init.\nFrequently Asked Questions\nHow can I delete