In [None]:
import os
from typing import Dict, Optional, Tuple

import wandb
import weave
from dotenv import load_dotenv

load_dotenv()

import instructor
from openai import OpenAI
from groq import Groq
from pydantic import BaseModel
from llama_index.core import (
    ServiceContext, StorageContext, load_index_from_storage
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [None]:
# Initialise Weave for the `groq-rag` project before any `weave.op` is invoked.
weave.init(project_name="groq-rag")

# Download the W&B artifact whose local directory is used below as the
# persist directory of the stored index.
artifact = wandb.Api().artifact(
    "geekyrakshit/groq-rag/ncert-flamingoes-prose-embeddings:latest"
)
artifact_dir = artifact.download()

# Embedding model used by the service context when the index is loaded/queried.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# `llm=None`: only embeddings are configured here; answer generation goes
# through the Groq client inside `EnglishStudentResponseAssistant`.
service_context = ServiceContext.from_defaults(
    llm=None,
    embed_model=embed_model,
)

# Rebuild the index from the downloaded directory.
storage_context = StorageContext.from_defaults(persist_dir=artifact_dir)
index = load_index_from_storage(
    storage_context,
    service_context=service_context,
)

# Retriever used by the assistant's `predict`; it asks for the 10 most
# similar nodes per query. (The misspelled name is kept because later cells
# reference it.)
retreival_engine = index.as_retriever(
    similarity_top_k=10,
    service_context=service_context,
)

In [None]:
class EnglishStudentResponseAssistant(weave.Model):
    """Retrieval-augmented "student" that answers questions via a Groq chat model.

    For every question the module-level `retreival_engine` is queried, the text
    of the top-ranked node is used as context, and the chat model is asked to
    answer within a word range derived from the marks the question carries.
    """

    # Model identifier sent with every Groq chat-completion request.
    model: str = "llama3-8b-8192"
    # Groq API client; created in `__init__`.
    _groq_client: Optional[Groq] = None

    def __init__(self, model: Optional[str] = None):
        """Create the assistant.

        Args:
            model: Groq model identifier. When ``None`` the class default
                (``"llama3-8b-8192"``) is kept.
        """
        super().__init__()
        self.model = model if model is not None else self.model
        # The key is read from the environment (populated by `load_dotenv()`
        # in the imports cell when a `.env` file is present).
        self._groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    @weave.op()
    def get_prompt(
        self, question: str, context: str, word_limit_min: int, word_limit_max: int
    ) -> Tuple[str, str]:
        """Build the prompts for one question.

        Args:
            question: The question to be answered.
            context: Retrieved passage to ground the answer in.
            word_limit_min: Lower bound of the requested answer length, in words.
            word_limit_max: Upper bound of the requested answer length, in words.

        Returns:
            A ``(system_prompt, user_prompt)`` tuple.
        """
        system_prompt = """
You are a student in a class and your teacher has asked you to answer the following question.
You have to write the answer in the given word limit."""
        user_prompt = f"""
We have provided context information below. 

---
{context}
---

Answer the following question within {word_limit_min}-{word_limit_max} words:

---
{question}
---"""
        return system_prompt, user_prompt

    @weave.op()
    def predict(self, question: str, total_marks: int) -> Dict[str, str]:
        """Answer `question` using retrieved context.

        Args:
            question: The question to answer; also used as the retrieval query.
            total_marks: Marks the question carries; selects the word range
                (below 3: 5-50 words, below 5: 50-100 words, otherwise 100-200).

        Returns:
            A dict with ``"response"`` (the model's answer) and ``"context"``
            (the retrieved passage that was placed in the prompt).

        Raises:
            IndexError: If the retriever returns no nodes for the question.
        """
        retrieved_nodes = retreival_engine.retrieve(question)
        if not retrieved_nodes:
            # Same exception type as plain indexing would raise, but with a
            # message that identifies the failing question.
            raise IndexError(f"No context retrieved for question: {question!r}")
        # Only the top-ranked node is used as context, although the retriever
        # is configured to return several.
        context = retrieved_nodes[0].node.text

        if total_marks < 3:
            word_limit_min = 5
            word_limit_max = 50
        elif total_marks < 5:
            word_limit_min = 50
            word_limit_max = 100
        else:
            word_limit_min = 100
            word_limit_max = 200

        system_prompt, user_prompt = self.get_prompt(
            question, context, word_limit_min, word_limit_max
        )
        chat_completion = self._groq_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    "content": user_prompt,
                },
            ],
            model=self.model,
        )
        return {
            "response": chat_completion.choices[0].message.content,
            "context": context,
        }

In [None]:
# Structured verdict the judge LLM is asked to produce; it is passed as
# `response_model` to the instructor-wrapped OpenAI client in
# `OpenaAIJudgeModel.compose_judgement`.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- a docstring on a pydantic model presumably becomes part of the
# schema handed to the LLM; confirm before converting these to a docstring.
class JudgeResponse(BaseModel):
    question: str  # the question, as extracted by the judge
    ground_truth_answer: str  # the reference ("ideal") answer
    assistant_answer: str  # the AI assistant's answer being graded
    marks: float  # marks awarded to the assistant's answer
    total_marks: int  # total marks for the question
    explanation: str  # how the assistant's answer was judged

In [None]:
def _resolve_total_marks(marks) -> int:
    """Return the maximum marks encoded by a dataset `marks` value.

    Range labels resolve to their upper bound (``"3-4"`` -> 4, ``"5-6"`` -> 6);
    a plain number, given as a string or an int, resolves to itself.

    Raises:
        ValueError: If the value cannot be parsed as a number or the resulting
            total is not positive (it is later used as a divisor).
    """
    if isinstance(marks, str):
        total_marks = int(marks.rsplit("-", 1)[-1].strip())
    else:
        total_marks = int(marks)
    if total_marks <= 0:
        raise ValueError(f"total marks must be positive, got {marks!r}")
    return total_marks


class OpenaAIJudgeModel(weave.Model):
    """LLM judge that grades an assistant's answer against a reference answer.

    The OpenAI client is wrapped with `instructor` so that the completion is
    returned as a `JudgeResponse` instance.
    """

    # OpenAI model identifier used for judging.
    model: str = "gpt-4"
    # instructor-wrapped OpenAI client; created in `__init__`.
    _openai_client: Optional[instructor.Instructor] = None

    def __init__(self, model: Optional[str] = None):
        """Create the judge.

        Args:
            model: OpenAI model identifier. When ``None`` the class default
                (``"gpt-4"``) is kept.
        """
        super().__init__()
        self.model = model if model is not None else self.model
        self._openai_client = instructor.from_openai(
            OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        )

    @weave.op()
    def compose_judgement(
        self,
        question: str,
        context: str,
        ground_truth_answer: str,
        assistant_answer: str,
        total_marks: int,
    ) -> JudgeResponse:
        """Ask the judge model to grade one answer.

        Args:
            question: The question that was put to the assistant.
            context: The passage the assistant was given as context.
            ground_truth_answer: The reference answer, treated as ideal.
            assistant_answer: The assistant's answer to be graded.
            total_marks: Maximum marks available for the question.

        Returns:
            The judge's verdict as a `JudgeResponse`.
        """
        # NOTE(review): the prompt text below contains typos ("expert in
        # teacher", "langugage", "Th AI"); it is left byte-for-byte unchanged
        # so that results stay comparable with earlier runs.
        system_prompt = """
You are an expert in teacher of English langugage and literature.
Given a question, a context, a ground truth answer and an answer from an AI assistant,
you have to judge the assistant's answer based on the following criteria and assign
a score between 0 and total marks:

1. how well the assistant answers the question with respect to the context.
2. how well the assistant's answer holds up in correctness and relevance to
    the ground truth answer (assuming the ground truth answer is perfect).

You have to extract the question, the ground truth answer, the assistant's answer,
the marks to be awarded to the assistant's answer, the total marks for the question,
and a detailed explanation as to how the assistant's answer was judged."""
        user_prompt = f"""
We have asked the following question to an AI assistant for total_marks={total_marks}:

---
{question}
---

We have provided context information below. 

---
{context}
---

Th AI assistant has responded with the following answer:

---
{assistant_answer}
---

An ideal answer to the question would be the following:

---
{ground_truth_answer}
---"""
        return self._openai_client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    "content": user_prompt,
                },
            ],
            model=self.model,
            response_model=JudgeResponse,
        )

    @weave.op()
    def predict(
        self,
        question: str,
        answer: str,
        marks: str,
        model_output: Dict[str, str],
    ) -> Dict[str, float]:
        """Score one assistant output; used as the evaluation's scorer.

        Args:
            question: The question from the dataset row.
            answer: The ground-truth answer from the dataset row.
            marks: Marks the question carries, e.g. ``"3-4"`` or ``"5-6"``;
                a range label counts as its upper bound.
            model_output: The assistant's output with ``"response"`` and
                ``"context"`` keys.

        Returns:
            A dict with the awarded ``"marks"``, the ``"fractional_marks"``
            (awarded / total) and the same ratio as a ``"percentage"``.

        Raises:
            ValueError: If `marks` cannot be resolved to a positive total.
        """
        # Previously any label other than "3-4"/"5-6" left `total_marks`
        # unbound and crashed with UnboundLocalError.
        total_marks = _resolve_total_marks(marks)
        judge_response = self.compose_judgement(
            question=question,
            context=model_output["context"],
            ground_truth_answer=answer,
            assistant_answer=model_output["response"],
            total_marks=total_marks,
        )
        fractional_marks = judge_response.marks / total_marks
        return {
            "marks": judge_response.marks,
            "fractional_marks": fractional_marks,
            "percentage": fractional_marks * 100,
        }

In [None]:
# Single assistant instance shared by every evaluation call.
assistant = EnglishStudentResponseAssistant()


@weave.op()
def get_assistant_prediction(question: str, marks: str) -> Dict[str, str]:
    """Adapt a dataset row to `assistant.predict`.

    Args:
        question: The question from the dataset row.
        marks: Marks the question carries. A range label such as ``"3-4"``
            or ``"5-6"`` is resolved to its upper bound (4 / 6) and a numeric
            string to its integer value; non-string values are passed through
            unchanged.

    Returns:
        The assistant's output dict (``"response"`` and ``"context"``).

    Raises:
        ValueError: If `marks` is a string that cannot be parsed as a number.
    """
    total_marks = marks
    if isinstance(marks, str):
        # Previously only "3-4" and "5-6" were mapped; any other string label
        # reached `predict` unconverted and failed on the `<` comparison.
        total_marks = int(marks.rsplit("-", 1)[-1].strip())
    return assistant.predict(question, total_marks)

In [None]:
# Judge instance; its `predict` op is the scorer below.
judge_model = OpenaAIJudgeModel()

# Fetch the question-bank dataset (version v1) through its Weave reference.
dataset = weave.ref("flamingos-prose-question-bank:v1").get()

# Pair the dataset with the judge as the only scorer.
evaluation = weave.Evaluation(
    scorers=[judge_model.predict],
    dataset=dataset,
)

In [None]:
# Run the evaluation with `get_assistant_prediction` as the function under
# test; `evaluation` was built above with the dataset and the judge's
# `predict` op as scorer. Top-level `await` is valid here because the code
# runs in a notebook cell; the awaited result is the cell's displayed output.
await evaluation.evaluate(get_assistant_prediction)