<a href="https://colab.research.google.com/github/wesslen/llm-evaluations/blob/main/notebooks/deepeval_openai_compatible_endpoints.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
# Install deepeval plus pytest, pytest-asyncio and responses with uv; `--system` targets the
# system Python environment rather than a virtualenv.
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases.
!uv pip install --system deepeval pytest pytest-asyncio responses

[2mUsing Python 3.10.12 environment at /usr[0m
[2K[2mResolved [1m87 packages[0m [2min 508ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/2)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/2)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/2)
[2mpytest-asyncio[0m [32m--------------------------[2m----[0m[0m 14.91 KiB/17.60 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/2)
[2mpytest-asyncio[0m [32m--------------------------[2m----[0m[0m 14.91 KiB/17.60 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/2)
[2mpytest-asyncio[0m [32m------------------------------[2m[0m[0m 17.60 KiB/17.60 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/2)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/2)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/2)
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/2)
[2K[2mPrepared [1m2 packages[0m [2min 22ms[0m[0m
[2K[2mInstalled [1m2 packages[0m [2min 1ms[0m[0m
 [32m+[39m [1m

In [31]:
from google.colab import userdata
# API key and endpoint URL are read through google.colab.userdata instead of being hardcoded here.
LLM_API_KEY = userdata.get('DSBA_LLAMA3_KEY')
LLM_ENDPOINT = userdata.get('LLAMA_BASE_URL')
# Model identifier passed to CustomOpenAILLM, which sends it as the "model" field of each request.
model_name = "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"

In [6]:
# Debugging aid: configure the root logger at DEBUG level. This applies to every library that
# logs through the standard logging module, not only to code defined in this notebook.
# NOTE(review): request URL, headers and payload are logged at DEBUG by CustomOpenAILLM —
# consider removing this cell (or raising the level) before sharing the notebook.
import logging
logging.basicConfig(level=logging.DEBUG)

In [57]:
from typing import Optional, Dict, Any, Union
import json
import httpx
import logging
from pydantic import BaseModel, Field
from deepeval.models import DeepEvalBaseLLM

class LLMRequestError(Exception):
    """
    Raised when a request to the LLM endpoint fails or returns an unusable response.

    Attributes:
        message: The bare, human-readable description passed by the caller.
        status_code: HTTP status code of the response, or None when no response
            was received (e.g. a connection error).
        response_text: Raw response body, or None when no response was received.
    """
    def __init__(self, message: str, status_code: Optional[int] = None, response_text: Optional[str] = None):
        self.message = message
        self.status_code = status_code
        self.response_text = response_text

        # Only mention the HTTP details that actually exist; transport-level
        # failures have neither, and "Status: None, Response: None" is just noise.
        details = []
        if status_code is not None:
            details.append(f"Status: {status_code}")
        if response_text is not None:
            details.append(f"Response: {response_text}")

        full_message = f"{message} - {', '.join(details)}" if details else message
        super().__init__(full_message)

class CustomOpenAILLM(DeepEvalBaseLLM):
    """
    A custom LLM class that implements OpenAI-compatible endpoints for DeepEval.

    Every call POSTs a single user message to ``{base_url}/chat/completions``
    and returns the text of the first choice. When a pydantic ``schema`` class
    is supplied, the reply is parsed as JSON and used to build a ``schema``
    instance instead.
    """
    def __init__(
        self,
        api_key: str,
        model_name: str = "gpt-3.5-turbo",
        base_url: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: int = 200,
        verify_ssl: bool = True,
        debug: bool = False,
        timeout: float = 30.0
    ):
        """
        Initialize the custom LLM.

        Args:
            api_key (str): API key for authentication (sent as a Bearer token)
            model_name (str): Name of the model to use
            base_url (Optional[str]): Base URL for the API endpoint; defaults to
                the public OpenAI URL. A trailing slash is stripped.
            temperature (float): Sampling temperature, clamped to [0.0, 1.0]
            max_tokens (int): Maximum tokens to generate.
                NOTE(review): a small limit may truncate JSON replies requested
                via ``schema`` — confirm against the metrics in use.
            verify_ssl (bool): Whether to verify SSL certificates
            debug (bool): Enable debug logging
            timeout (float): Per-request timeout in seconds
        """
        self.api_key = api_key
        self.model_name = model_name
        self.base_url = (base_url or "https://api.openai.com/v1").rstrip('/')
        self.temperature = min(max(temperature, 0.0), 1.0)  # Clamp between 0 and 1
        self.max_tokens = max_tokens
        self.verify_ssl = verify_ssl
        self.timeout = timeout

        # The logger is looked up by module name, so it is shared by every
        # instance. The level must be set explicitly in BOTH branches: otherwise
        # a debug instance created after a non-debug one stays at WARNING.
        self.logger = logging.getLogger(__name__)
        if debug:
            logging.basicConfig(level=logging.DEBUG)
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.WARNING)

    def load_model(self) -> 'CustomOpenAILLM':
        """Required by DeepEval; there is no local model to load, so return self."""
        return self

    def get_model_name(self) -> str:
        """Required by DeepEval; returns the configured model name."""
        return self.model_name

    def _prepare_request_payload(self, prompt: str) -> Dict[str, Any]:
        """Build the chat-completions JSON body with the prompt as a single user message."""
        return {
            "model": self.model_name,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": self.temperature,
            "max_tokens": self.max_tokens
        }

    def _prepare_headers(self) -> Dict[str, str]:
        """Build the request headers (Bearer authentication, JSON content type)."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

    def _completions_url(self) -> str:
        """Return the full chat-completions URL for the configured base URL."""
        return f"{self.base_url}/chat/completions"

    def _log_request(self, url: str, headers: Dict[str, str], payload: Dict[str, Any]) -> None:
        """Debug-log the outgoing request with the Authorization header redacted."""
        # Never write the API key to logs or notebook output.
        safe_headers = {
            name: ("<redacted>" if name.lower() == "authorization" else value)
            for name, value in headers.items()
        }
        self.logger.debug(f"Request URL: {url}")
        self.logger.debug(f"Request Headers: {safe_headers}")
        self.logger.debug(f"Request Payload: {payload}")

    def _handle_response(self, response: httpx.Response) -> str:
        """
        Validate the API response and extract the generated text.

        Returns:
            The ``message.content`` of the first choice, or its ``text`` field
            for completion-style responses.

        Raises:
            LLMRequestError: on a non-success status, a non-JSON body, a missing
                or empty ``choices`` list, or an unrecognised choice format.
        """
        # Check the status before parsing: error bodies are often not JSON
        # (e.g. an HTML gateway page) and should be reported as a failed
        # request, not as an invalid-JSON response.
        if not response.is_success:
            raise LLMRequestError(
                "Request failed",
                status_code=response.status_code,
                response_text=response.text
            )

        try:
            response_data = response.json()
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError both subclass ValueError.
            raise LLMRequestError("Invalid JSON response",
                                response.status_code,
                                response.text)

        self.logger.debug(f"Response data: {response_data}")

        choices = response_data.get("choices") if isinstance(response_data, dict) else None
        if not choices:
            raise LLMRequestError("No choices in response",
                                response.status_code,
                                response.text)

        first_choice = choices[0]
        if not isinstance(first_choice, dict):
            raise LLMRequestError("Unexpected response format",
                                response.status_code,
                                response.text)

        if "message" not in first_choice:
            # Try alternative response formats
            if "text" in first_choice:
                return first_choice["text"]
            raise LLMRequestError("Unexpected response format",
                                response.status_code,
                                response.text)

        message = first_choice["message"]
        content = message.get("content") if isinstance(message, dict) else None
        if not isinstance(content, str):
            # A missing or null content cannot be returned as the declared str.
            raise LLMRequestError("Unexpected response format",
                                response.status_code,
                                response.text)
        return content

    @staticmethod
    def _extract_json(text: str) -> Any:
        """
        Parse JSON from model output, tolerating code fences or surrounding prose.

        The whole text is tried first; if that fails, the span from the first
        ``{`` to the last ``}`` is parsed instead.

        Raises:
            json.JSONDecodeError: if neither attempt yields valid JSON.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            start = text.find("{")
            end = text.rfind("}")
            if start == -1 or end <= start:
                raise
            return json.loads(text[start:end + 1])

    def _parse_output(self, response_text: str, schema: Optional[BaseModel]) -> Union[str, BaseModel]:
        """Return the raw text, or a ``schema`` instance built from the JSON in it."""
        if not schema:
            return response_text
        try:
            json_result = self._extract_json(response_text)
        except json.JSONDecodeError as exc:
            raise ValueError("Model output is not valid JSON") from exc
        return schema(**json_result)

    def generate(self, prompt: str, schema: Optional[BaseModel] = None) -> Union[str, BaseModel]:
        """
        Generate a response from the LLM.

        Args:
            prompt (str): Text sent as the single user message
            schema (Optional[BaseModel]): pydantic model class; when given, the
                reply is parsed as JSON and returned as an instance of it

        Returns:
            The reply text, or a ``schema`` instance when ``schema`` is given.

        Raises:
            LLMRequestError: on transport errors or unusable API responses
            ValueError: when ``schema`` is given and the reply contains no valid JSON
        """
        self.logger.debug(f"Generating response for prompt: {prompt}")

        url = self._completions_url()
        headers = self._prepare_headers()
        payload = self._prepare_request_payload(prompt)
        self._log_request(url, headers, payload)

        try:
            with httpx.Client(verify=self.verify_ssl) as client:
                response = client.post(
                    url,
                    headers=headers,
                    json=payload,
                    timeout=self.timeout
                )
                response_text = self._handle_response(response)
                return self._parse_output(response_text, schema)

        except httpx.RequestError as e:
            raise LLMRequestError(f"Request failed: {str(e)}") from e

    async def a_generate(self, prompt: str, schema: Optional[BaseModel] = None) -> Union[str, BaseModel]:
        """
        Generate a response from the LLM asynchronously.

        Same arguments, return value and exceptions as ``generate``; the request
        is sent with ``httpx.AsyncClient``.
        """
        self.logger.debug(f"Generating async response for prompt: {prompt}")

        url = self._completions_url()
        headers = self._prepare_headers()
        payload = self._prepare_request_payload(prompt)
        self._log_request(url, headers, payload)

        try:
            async with httpx.AsyncClient(verify=self.verify_ssl) as client:
                response = await client.post(
                    url,
                    headers=headers,
                    json=payload,
                    timeout=self.timeout
                )
                response_text = self._handle_response(response)
                return self._parse_output(response_text, schema)

        except httpx.RequestError as e:
            raise LLMRequestError(f"Async request failed: {str(e)}") from e

In [58]:
# Client for an OpenAI-compatible endpoint, with certificate verification
# switched off (verify_ssl=False).
llm_no_ssl = CustomOpenAILLM(
    base_url=LLM_ENDPOINT,
    model_name=model_name,
    api_key=LLM_API_KEY,
    verify_ssl=False,
)

In [59]:
# Smoke test: one plain-text round trip through the endpoint (no schema).
llm_no_ssl.generate(prompt="What day is it?")

"I'm not currently able to share the date."

In [74]:
from deepeval import evaluate
from deepeval.metrics import PromptAlignmentMetric
from deepeval.test_case import LLMTestCase

# Test case first: `actual_output` is a placeholder — swap in the real output
# of your LLM application.
test_case = LLMTestCase(
    actual_output="We offer a 30-day full refund at no extra cost.",
    input="What if these shoes don't fit?",
)

# Judge the output against the prompt instruction, using the custom endpoint as evaluator.
metric = PromptAlignmentMetric(
    model=llm_no_ssl,
    include_reason=True,
    prompt_instructions=["Reply in all uppercase"],
)

metric.measure(test_case)
print(metric.score, metric.reason, sep="\n")

Output()

0.0
The score is 0.00 because the LLM output does not contain the input in all uppercase letters.


In [61]:
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import AnswerRelevancyMetric

# Question/answer pair to score, along with the context it was answered from.
test_case = LLMTestCase(
    retrieval_context=["Joe Biden serves as the current president of America."],
    actual_output="Joe Biden",
    input="Who is the current president of the United States of America?",
)

# Evaluate with the custom endpoint as judge, using a threshold of 0.5.
answer_relevancy_metric = AnswerRelevancyMetric(model=llm_no_ssl, threshold=0.5)

answer_relevancy_metric.measure(test_case)
print(answer_relevancy_metric.score, answer_relevancy_metric.reason, sep="\n")

Output()

1.0
The score is 1.00 because the response directly answers the question without any irrelevant information.


In [64]:
from deepeval.metrics import ToxicityMetric
from deepeval.test_case import LLMTestCase


# `actual_output` is a placeholder — swap in the real output of your LLM application.
test_case = LLMTestCase(
    actual_output="Sarah always meant well, but you couldn't help but sigh when she volunteered for a project.",
    input="How is Sarah as a person?",
)

# NOTE(review): the recorded run of this cell ended in
# `ValueError: Model output is not valid JSON`, which CustomOpenAILLM raises
# when a schema is requested and the judge's reply cannot be parsed as JSON.
metric = ToxicityMetric(model=llm_no_ssl)

metric.measure(test_case)
print(metric.score, metric.reason, sep="\n")

Output()

ValueError: Model output is not valid JSON

In [63]:
def test_endpoint():
    """
    Send one test prompt to the configured endpoint and print the outcome.

    On success the reply is printed. On failure the error is printed together
    with whatever HTTP details the exception carries; nothing is re-raised, so
    this is a manual diagnostic rather than an asserting test.
    """
    llm = CustomOpenAILLM(
        api_key=LLM_API_KEY,
        model_name=model_name,
        base_url=LLM_ENDPOINT,
        verify_ssl=False
    )

    try:
        response = llm.generate("This is a test prompt")
        print("Success:", response)
    except Exception as e:
        print(f"Error: {str(e)}")

        # LLMRequestError carries `status_code` / `response_text` directly.
        # Errors that wrap an HTTP response object expose it as `.response`
        # instead, so fall back to that when the direct attributes are absent.
        status = getattr(e, 'status_code', None)
        body = getattr(e, 'response_text', None)
        http_response = getattr(e, 'response', None)
        if http_response is not None:
            if status is None:
                status = getattr(http_response, 'status_code', None)
            if body is None:
                body = getattr(http_response, 'text', None)

        if status is not None:
            print(f"Response status: {status}")
        if body is not None:
            print(f"Response body: {body}")

if __name__ == "__main__":
    test_endpoint()

Success: A blank slate! I'm ready to respond. How would you like this test to proceed?
