In [1]:
!pip -qqq install pip --progress-bar off
!pip -qqq install groq==0.9.0 --progress-bar off
!pip -qqq install datasets==2.19.2 --progress-bar off
!pip -qqq install deepeval==0.21.55 --progress-bar off

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.31.0, but you have requests 2.32.3 which is incompatible.[0m[31m
[0m  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-metadata 1.15.0 requires protobuf<4.21,>=3.20.3; python_version < "3.11", but you have protobuf 4.25.1 which is incompatible.[0m[31m
[0m

In [2]:
import json
import os
import pickle
import random
import time
import typing as t
from dataclasses import dataclass, field
from pathlib import Path
from textwrap import dedent
from typing import Any, Dict, List, Optional

import groq
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
)
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase
from google.colab import userdata
from groq import Groq
from IPython.display import display
from tqdm import tqdm

# Export the Groq credential (read from Colab's secret store) and the two
# opt-out switches in one step, before any client object is created.
# NOTE(review): ERROR_REPORTING is presumably read by deepeval — confirm.
os.environ.update(
    {
        "GROQ_API_KEY": userdata.get("GROQ_API_KEY"),
        "DEEPEVAL_TELEMETRY_OPT_OUT": "YES",
        "ERROR_REPORTING": "NO",
    }
)

# Number of dataset examples drawn for the evaluation sample.
N_ITEMS = 20
# Model used as the judge for scores and DeepEval metrics.
CRITIC_MODEL = "llama3-70b-8192"

# Seeds NumPy's global RNG only; the stdlib `random` module stays unseeded.
np.random.seed(42)


def save_list(items, file_path):
    """Pickle ``items`` into the file at ``file_path``.

    Best-effort: any exception raised while opening the file or pickling is
    caught and reported on stdout, so the function always returns ``None``.

    Args:
        items: Any picklable object (the notebook passes lists and dicts).
        file_path: Destination path; anything ``pathlib.Path`` accepts.
    """
    try:
        with Path(file_path).open("wb") as handle:
            pickle.dump(items, handle)
    except Exception as error:
        print(f"An error occurred while saving the list of objects to file: {error}")


def load_list(file_path):
    """Load and return the object pickled in ``file_path``.

    Best-effort: if the file cannot be opened or unpickled, the error is
    reported on stdout and ``None`` is returned instead of raising.

    Args:
        file_path: Path of a pickle file; anything ``pathlib.Path`` accepts.

    Returns:
        The unpickled object, or ``None`` on failure.
    """
    # Unpickling can execute arbitrary code: only load files this notebook wrote.
    try:
        with Path(file_path).open("rb") as handle:
            return pickle.load(handle)
    except Exception as error:
        print(f"An error occurred while loading the list of objects from file: {error}")
    return None

## Load Dataset

In [3]:
# Fetch the financial-qa-10K question/answer/context dataset by its hub id.
dataset = load_dataset("virattt/financial-qa-10K")
# Bare last expression so the notebook renders the dataset summary.
dataset

Downloading readme:   0%|          | 0.00/419 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'context', 'ticker', 'filing'],
        num_rows: 7000
    })
})

In [4]:
# Peek at the first training record to see the available fields.
dataset["train"][0]

{'question': 'What area did NVIDIA initially focus on before expanding to other computationally intensive fields?',
 'answer': 'NVIDIA initially focused on PC graphics.',
 'context': 'Since our original focus on PC graphics, we have expanded to several other large and important computationally intensive fields.',
 'ticker': 'NVDA',
 'filing': '2023_10K'}

In [5]:
# Shuffle the training split and keep the first N_ITEMS rows as the evaluation sample.
# NOTE(review): shuffle() gets no explicit seed here; presumably the
# np.random.seed(42) call in the setup cell is what keeps this repeatable — confirm.
sample = dataset["train"].shuffle().select(range(N_ITEMS))

In [6]:
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


def predict(prompt: str, model: str, client: Groq = client):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one ``user`` message of the chat request.
        model: Groq model identifier to query.
        client: Groq client to use; defaults to the module-level ``client``
            (bound when this function is defined).

    Returns:
        The content of the first choice, or ``None`` when a Groq connection,
        rate-limit or status error was caught and reported on stdout.
    """
    user_message = {
        "role": "user",
        "content": prompt,
    }
    try:
        response = client.chat.completions.create(
            messages=[user_message],
            model=model,
        )
        return response.choices[0].message.content
    except groq.APIConnectionError as error:
        print("The server could not be reached")
        print(error.__cause__)
    # Must stay ahead of APIStatusError so the 429 case gets its own message.
    except groq.RateLimitError:
        print("A 429 status code was received; we should back off a bit.")
    except groq.APIStatusError as error:
        print("Another non-200-range status code was received")
        print(error.status_code)
        print(error.response)
    # Every handled error falls through to here.
    return None

In [7]:
def format_prompt(question: str, context: str) -> str:
    """Build the question-answering prompt sent to each model.

    The template is dedented *before* the values are inserted. Dedenting the
    already-interpolated text would let a multi-line ``context`` or
    ``question`` (whose continuation lines carry no indentation) reduce the
    common margin to zero, leaving the whole prompt indented by eight spaces.

    Args:
        question: The question the model must answer.
        context: The passage the answer must be based on; may span lines.

    Returns:
        The prompt text; it starts and ends with a newline.
    """
    template = dedent(
        """
        Use the following context:
        ```
        {context}
        ```
        to answer the question:
        ```
        {question}
        ```

        Your answer must be succinct!
        Answer:
    """
    )
    # str.format only interprets braces in the template, never in the values.
    return template.format(context=context, question=question)

In [8]:
@dataclass
class QuestionAnswer:
    """One evaluated item: the question, a model's answer, and the references."""

    # Question text taken from the dataset record.
    question: str
    # Answer produced by the model under evaluation (the output of predict()).
    answer: str
    # Reference answer taken from the dataset record.
    true_answer: str
    # Context passage the model was given.
    context: str

In [9]:
def extract_predictions(dataset: Dataset, model: str) -> List[QuestionAnswer]:
    """Query ``model`` once per dataset record and collect the answers.

    Args:
        dataset: Records exposing ``question``, ``context`` and ``answer`` keys.
        model: Model identifier forwarded to ``predict``.

    Returns:
        One ``QuestionAnswer`` per record, in iteration order. ``answer`` is
        whatever ``predict`` returned for that record's prompt.
    """
    collected = []
    for record in tqdm(dataset):
        prompt = format_prompt(record["question"], record["context"])
        collected.append(
            QuestionAnswer(
                question=record["question"],
                answer=predict(prompt, model),
                true_answer=record["answer"],
                context=record["context"],
            )
        )
    return collected

In [10]:
# Model identifiers to compare against each other.
models = ["gemma-7b-it", "llama3-8b-8192", "mixtral-8x7b-32768"]

# Maps model name -> list of QuestionAnswer, one per sampled record.
predictions = {}
for model in models:
    predictions[model] = extract_predictions(sample, model=model)

100%|██████████| 20/20 [00:06<00:00,  2.97it/s]
100%|██████████| 20/20 [00:05<00:00,  3.80it/s]
100%|██████████| 20/20 [00:06<00:00,  3.10it/s]


In [None]:
# Pickle the predictions dict to disk (save_list reports failures instead of raising).
save_list(predictions, "predictions.pkl")

## Simple Evaluation

In [11]:
# Critic prompt template; filled with str.format() using the keys
# question, answer, context and true_answer.
eval_prompt = """

Consider the question: {question}
and answer: {answer}
based on the context: {context}
compare with the true answer: {true_answer}

Score how correct the response is on a scale from 0 to 10.
Respond with the integer number only.
"""

In [13]:
# Pick one gemma-7b-it prediction (index 10) as a worked example.
prediction = predictions["gemma-7b-it"][10]
# Show its fields as a plain dict.
prediction.__dict__

{'question': 'What are the interest rate ranges for the unsecured notes due between 2024 to 2029?',
 'answer': 'Interest rates for unsecured notes due between 2024 and 2029 range from 2.90% to 7.38%.',
 'true_answer': 'The interest rates for the unsecured notes due between 2024 to 2029 range from 2.90% to 7.38%.',
 'context': 'Unsecured notes due between 2024 and 2029 have interest rates ranging from 2.90% to 7.38%.'}

In [15]:
# Ask the critic model to grade the example prediction; the cell output is
# the raw text that predict() returns.
predict(
    eval_prompt.format(
        question=prediction.question,
        answer=prediction.answer,
        context=prediction.context,
        true_answer=prediction.true_answer,
    ),
    CRITIC_MODEL,
)

'9'

In [17]:
# Grade every prediction with the critic model; scores maps
# model name -> list of integer scores.
# The loop variable names are kept as-is: they leak into the notebook's
# globals and later cells read `prediction`.
scores = {}
for model, model_predictions in predictions.items():
    model_scores = []
    for prediction in tqdm(model_predictions):
        score = predict(
            eval_prompt.format(
                question=prediction.question,
                answer=prediction.answer,
                context=prediction.context,
                true_answer=prediction.true_answer,
            ),
            CRITIC_MODEL,
        )
        try:
            model_scores.append(int(score))
        except (TypeError, ValueError):
            # predict() returns None after a handled API error (TypeError) and
            # the critic may reply with non-numeric text (ValueError). Skip the
            # item visibly instead of aborting the whole run.
            print(f"Skipping unparsable critic score for {model}: {score!r}")

        # Random pause between critic calls to stay under the rate limit.
        sleep_time = random.uniform(0.5, 2.0)
        time.sleep(sleep_time)
    scores[model] = model_scores

100%|██████████| 20/20 [00:31<00:00,  1.59s/it]
100%|██████████| 20/20 [00:43<00:00,  2.16s/it]
100%|██████████| 20/20 [00:54<00:00,  2.72s/it]


In [19]:
# One row per model: the mean of its critic scores.
rows = [
    {"model": model_name, "score": np.mean(values)}
    for model_name, values in scores.items()
]
pd.DataFrame(rows)

Unnamed: 0,model,score
0,gemma-7b-it,8.95
1,llama3-8b-8192,8.85
2,mixtral-8x7b-32768,9.15


## Critic Model

In [None]:
class GroqCriticModel(DeepEvalBaseLLM):
    """DeepEval LLM adapter that answers prompts through the ``predict`` helper."""

    def __init__(self, model: str):
        # NOTE(review): the base-class __init__ is not called here — confirm
        # DeepEvalBaseLLM does not require it.
        self.model = model

    def load_model(self):
        # Nothing to load: every request goes through predict().
        pass

    def generate(self, prompt: str) -> str:
        """Return the reply of ``self.model`` to ``prompt`` via ``predict``."""
        # NOTE(review): predict() returns None after a handled API error, so
        # the ``str`` annotation does not hold on that path.
        return predict(prompt, self.model)

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; it calls the synchronous ``generate`` directly."""
        return self.generate(prompt)

    def get_model_name(self):
        """Return the model identifier given at construction."""
        return self.model

In [None]:
# Judge instance shared by all DeepEval metrics below.
critic_model = GroqCriticModel(CRITIC_MODEL)

## Metrics

In [None]:
# Wrap one prediction as a DeepEval test case; the same passage is supplied
# as both `context` and `retrieval_context`.
# NOTE(review): `prediction` is a notebook global that the scoring loop also
# uses as its loop variable, so which record this is depends on which cell
# bound the name last.
test_case = LLMTestCase(
    input=prediction.question,
    actual_output=prediction.answer,
    context=[prediction.context],
    retrieval_context=[prediction.context],
)

In [None]:
# Answer-relevancy metric judged by the critic model.
relevancy_metric = AnswerRelevancyMetric(
    threshold=0.7, model=critic_model, include_reason=True
)

# measure() fills in .score and .reason for the test case; both are printed.
relevancy_metric.measure(test_case)
print(relevancy_metric.score, relevancy_metric.reason)

Output()

1.0 The score is 1.00 because the output perfectly addresses the question and provides accurate information without any irrelevant statements.


In [None]:
# Faithfulness metric judged by the critic model.
faithfulness_metric = FaithfulnessMetric(
    threshold=0.7, model=critic_model, include_reason=True
)

# measure() fills in .score and .reason for the test case; both are printed.
faithfulness_metric.measure(test_case)
print(faithfulness_metric.score, faithfulness_metric.reason)

Output()

1.0 The score is 1.00 because there are no contradictions found, meaning the actual output perfectly aligns with the retrieval context - great job!


In [None]:
# Hallucination metric judged by the critic model.
# NOTE(review): unlike the two metrics above, lower appears to be better here
# (the evaluation loop treats scores above 0.5 as failures) — confirm.
hallucination_metric = HallucinationMetric(
    threshold=0.5, model=critic_model, include_reason=True
)

# measure() fills in .score and .reason for the test case; both are printed.
hallucination_metric.measure(test_case)
print(hallucination_metric.score, hallucination_metric.reason)

Output()

0.0 The score is 0.00 because the actual output does not contradict the context and aligns with the provided information.


## Evaluation

In [None]:
@dataclass
class ModelEvaluation:
    """Per-model container for metric scores and the reasons behind poor results."""

    # Name of the evaluated model.
    model: str
    # Metric scores, one entry per evaluated prediction.
    relevancy: List[float] = field(default_factory=list)
    faithfulness: List[float] = field(default_factory=list)
    hallucination: List[float] = field(default_factory=list)

    # Reason texts kept by the evaluation loop: relevancy or faithfulness
    # below 0.5, hallucination above 0.5.
    no_relevancy_reasons: List[str] = field(default_factory=list)
    no_faithfulness_reasons: List[str] = field(default_factory=list)
    hallucination_reasons: List[str] = field(default_factory=list)

In [None]:
# Run the three DeepEval metrics on every prediction of every model and
# collect the scores (plus the reasons for poor results) per model.
evaluations = []
for model, model_predictions in predictions.items():
    evaluation = ModelEvaluation(model)
    for prediction in tqdm(model_predictions):
        test_case = LLMTestCase(
            input=prediction.question,
            actual_output=prediction.answer,
            context=[prediction.context],
            retrieval_context=[prediction.context],
        )
        try:
            # All three metrics are measured before anything is stored, so a
            # failure in any of them leaves the score lists aligned.
            relevancy_metric.measure(test_case)
            faithfulness_metric.measure(test_case)
            hallucination_metric.measure(test_case)

            evaluation.relevancy.append(relevancy_metric.score)
            evaluation.faithfulness.append(faithfulness_metric.score)
            evaluation.hallucination.append(hallucination_metric.score)

            if relevancy_metric.score < 0.5:
                evaluation.no_relevancy_reasons.append(relevancy_metric.reason)
            if faithfulness_metric.score < 0.5:
                evaluation.no_faithfulness_reasons.append(faithfulness_metric.reason)
            if hallucination_metric.score > 0.5:
                evaluation.hallucination_reasons.append(hallucination_metric.reason)
        except Exception as e:
            # Still best-effort, but the dropped item is now reported, and
            # KeyboardInterrupt is no longer swallowed by a bare except.
            print(f"Skipping evaluation for {model} ({prediction.question!r}): {e!r}")

        # Pause after failures too: a failed call is when backing off matters most.
        sleep_time = random.uniform(0.5, 2.0)
        time.sleep(sleep_time)
    evaluations.append(evaluation)

  0%|          | 0/20 [00:00<?, ?it/s]

Output()

Output()

Output()

  5%|▌         | 1/20 [00:09<02:52,  9.06s/it]

Output()

Output()

Output()

 10%|█         | 2/20 [00:25<04:04, 13.61s/it]

Output()

Output()

Output()

 15%|█▌        | 3/20 [01:10<07:53, 27.86s/it]

Output()

Output()

Output()

 20%|██        | 4/20 [01:48<08:27, 31.70s/it]

Output()

Output()

Output()

 25%|██▌       | 5/20 [02:23<08:16, 33.12s/it]

Output()

Output()

Output()

 30%|███       | 6/20 [03:14<09:07, 39.12s/it]

Output()

Output()

Output()

 35%|███▌      | 7/20 [03:52<08:23, 38.76s/it]

Output()

Output()

Output()

 40%|████      | 8/20 [04:41<08:22, 41.91s/it]

Output()

Output()

Output()

 45%|████▌     | 9/20 [05:14<07:10, 39.12s/it]

Output()

Output()

Output()

 50%|█████     | 10/20 [05:55<06:38, 39.90s/it]

Output()

Output()

Output()

 55%|█████▌    | 11/20 [06:43<06:21, 42.38s/it]

Output()

Output()

Output()

 60%|██████    | 12/20 [07:18<05:19, 39.89s/it]

Output()

Output()

Output()

 65%|██████▌   | 13/20 [07:57<04:38, 39.79s/it]

Output()

Output()

Output()

 70%|███████   | 14/20 [08:36<03:57, 39.56s/it]

Output()

Output()

Output()

 75%|███████▌  | 15/20 [09:17<03:19, 39.84s/it]

Output()

Output()

Output()

 80%|████████  | 16/20 [10:00<02:42, 40.74s/it]

Output()

Output()

Output()

 85%|████████▌ | 17/20 [10:40<02:02, 40.74s/it]

Output()

Output()

Output()

 90%|█████████ | 18/20 [11:15<01:17, 38.81s/it]

Output()

Output()

Output()

 95%|█████████▌| 19/20 [11:58<00:40, 40.25s/it]

Output()

Output()

Output()

100%|██████████| 20/20 [12:43<00:00, 38.17s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Output()

Output()

Output()

  5%|▌         | 1/20 [00:32<10:16, 32.43s/it]

Output()

Output()

Output()

 10%|█         | 2/20 [01:27<13:40, 45.56s/it]

Output()

Output()

Output()

 15%|█▌        | 3/20 [02:13<13:02, 46.01s/it]

Output()

Output()

Output()

 20%|██        | 4/20 [03:01<12:26, 46.66s/it]

Output()

Output()

Output()

 25%|██▌       | 5/20 [03:44<11:20, 45.33s/it]

Output()

Output()

Output()

 30%|███       | 6/20 [04:05<08:41, 37.27s/it]

Output()

Output()

Output()

 35%|███▌      | 7/20 [04:48<08:26, 38.95s/it]

Output()

Output()

Output()

 40%|████      | 8/20 [05:35<08:18, 41.55s/it]

Output()

Output()

Output()

 45%|████▌     | 9/20 [06:16<07:34, 41.34s/it]

Output()

Output()

Output()

 50%|█████     | 10/20 [06:52<06:38, 39.88s/it]

Output()

Output()

Output()

 55%|█████▌    | 11/20 [07:32<05:57, 39.70s/it]

Output()

Output()

Output()

 60%|██████    | 12/20 [08:15<05:25, 40.68s/it]

Output()

Output()

Output()

 65%|██████▌   | 13/20 [08:55<04:44, 40.61s/it]

Output()

Output()

Output()

 70%|███████   | 14/20 [09:23<03:40, 36.74s/it]

Output()

Output()

Output()

 75%|███████▌  | 15/20 [09:59<03:02, 36.45s/it]

Output()

Output()

Output()

 80%|████████  | 16/20 [10:41<02:33, 38.27s/it]

Output()

Output()

Output()

 85%|████████▌ | 17/20 [11:23<01:57, 39.22s/it]

Output()

Output()

Output()

 90%|█████████ | 18/20 [12:05<01:20, 40.08s/it]

Output()

Output()

Output()

 95%|█████████▌| 19/20 [12:50<00:41, 41.68s/it]

Output()

Output()

Output()

100%|██████████| 20/20 [13:31<00:00, 40.57s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Output()

Output()

Output()

  5%|▌         | 1/20 [00:33<10:34, 33.38s/it]

Output()

Output()

Output()

 10%|█         | 2/20 [01:36<15:17, 50.99s/it]

Output()

Output()

Output()

 15%|█▌        | 3/20 [02:19<13:20, 47.07s/it]

Output()

Output()

Output()

 20%|██        | 4/20 [02:56<11:32, 43.28s/it]

Output()

Output()

Output()

 25%|██▌       | 5/20 [03:38<10:40, 42.69s/it]

Output()

Output()

Output()

 30%|███       | 6/20 [04:25<10:17, 44.08s/it]

Output()

Output()

Output()

 35%|███▌      | 7/20 [05:01<09:00, 41.55s/it]

Output()

Output()

Output()

 40%|████      | 8/20 [05:43<08:20, 41.74s/it]

Output()

Output()

Output()

 45%|████▌     | 9/20 [06:34<08:09, 44.49s/it]

Output()

Output()

Output()

 50%|█████     | 10/20 [07:09<06:57, 41.74s/it]

Output()

Output()

Output()

 55%|█████▌    | 11/20 [07:42<05:50, 38.96s/it]

Output()

Output()

Output()

 60%|██████    | 12/20 [08:22<05:14, 39.25s/it]

Output()

Output()

Output()

 65%|██████▌   | 13/20 [09:02<04:37, 39.62s/it]

Output()

Output()

Output()

 70%|███████   | 14/20 [09:44<04:02, 40.39s/it]

Output()

Output()

Output()

 75%|███████▌  | 15/20 [10:25<03:22, 40.40s/it]

Output()

Output()

Output()

 80%|████████  | 16/20 [11:17<02:55, 43.98s/it]

Output()

Output()

Output()

 85%|████████▌ | 17/20 [11:59<02:09, 43.30s/it]

Output()

Output()

Output()

 90%|█████████ | 18/20 [12:36<01:22, 41.46s/it]

Output()

Output()

Output()

 95%|█████████▌| 19/20 [13:20<00:42, 42.32s/it]

Output()

Output()

Output()

100%|██████████| 20/20 [13:58<00:00, 41.95s/it]


In [None]:
# Inspect the collected scores and reasons of the entry at index 1.
evaluations[1]

ModelEvaluation(model='llama3-8b-8192', relevancy=[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], faithfulness=[1.0, 1.0, 1.0, 1.0, 1.0, 1, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], hallucination=[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0], no_relevancy_reasons=["The score is 0.00 because the actual output did not provide any relevant information about the company's monetization strategy for its film and television content, and instead included an unclear statement that did not address the input question."], no_faithfulness_reasons=[], hallucination_reasons=['The score is 1.00 because the actual output lacks crucial information about DJS-002 and only partially mentions its uses, resulting in significant hallucinations.', 'The score is 1.00 because the actual output provides incomplete information without mentioning the total future lease payments, wh

In [None]:
# Pickle the evaluation results to disk (save_list reports failures instead of raising).
save_list(evaluations, "evaluations.pkl")

## Report

In [None]:
# One summary row per model: the mean of each metric over its scored items.
rows = [
    {
        "model": result.model,
        "relevancy": np.mean(result.relevancy),
        "faithfulness": np.mean(result.faithfulness),
        "hallucination": np.mean(result.hallucination),
    }
    for result in evaluations
]

In [None]:
# Render the per-model metric averages as a table.
pd.DataFrame(rows)

Unnamed: 0,model,relevancy,faithfulness,hallucination
0,gemma-7b-it,1.0,1.0,0.0
1,llama3-8b-8192,0.95,0.975,0.4
2,mixtral-8x7b-32768,1.0,0.982456,0.0


In [None]:
# Summary prompt template; filled with str.format() using the keys
# relevancy, faithfulness and hallucination (each a newline-joined reason list).
report_prompt = """
Summarize the most common reasons in a paragraph based on the texts:

{relevancy}

{faithfulness}

{hallucination}

If the reasons are empty - conclude that the model did great.
"""

In [None]:
# Ask the critic model for a one-paragraph summary of each model's collected
# reasons; reports maps model name -> summary text.
reports = {}
for evaluation in tqdm(evaluations):
    reason_blocks = {
        "relevancy": "\n".join(evaluation.no_relevancy_reasons),
        "faithfulness": "\n".join(evaluation.no_faithfulness_reasons),
        "hallucination": "\n".join(evaluation.hallucination_reasons),
    }
    summary_request = report_prompt.format(**reason_blocks)
    reports[evaluation.model] = predict(summary_request, CRITIC_MODEL)

100%|██████████| 3/3 [00:01<00:00,  2.57it/s]


In [None]:
# Print each model name, its summary, and a 50-dash separator line.
for model, report in reports.items():
    # Model name followed by one blank line.
    print(model, end="\n\n")
    display(report)
    # Blank line, separator, blank line.
    print("", "-" * 50, "", sep="\n")

gemma-7b-it



'Since there are no texts provided, I conclude that the model did great!'


--------------------------------------------------

llama3-8b-8192



'The most common reasons for low scores are the lack of relevant information, incomplete answers, omission of crucial details, and hallucinations. The models often failed to provide specific details, ignored important context, and omitted essential information, leading to incomplete or misleading responses. Additionally, the models sometimes provided unclear or partial answers, failed to address the input question, or included irrelevant information, resulting in a significant mismatch with the context.'


--------------------------------------------------

mixtral-8x7b-32768



'There are no texts provided, so I assume the reasons are empty. Therefore, I conclude that the model did great!'


--------------------------------------------------



## References

- [DeepEval](https://github.com/confident-ai/deepeval)
- [ragas](https://github.com/explodinggradients/ragas/)
- [Giskard](https://github.com/Giskard-AI/giskard)
- [financial-qa-10K Dataset](https://huggingface.co/datasets/virattt/financial-qa-10K)