In [2]:
import csv
import json
import re
from pathlib import Path
from typing import Any, Dict, List

import ollama
import pandas as pd

class LanguageModelEvaluator:
    """Ask a chat model questions about a log file and grade its answers.

    The same client and the same model name are used both to answer the
    questions (``generate_response``) and to judge the answers against
    reference answers (``evaluate_response``).
    """

    # Inclusive range of scores the judge is instructed to return.
    MIN_SCORE = 1
    MAX_SCORE = 5

    # Chat models often decorate the requested format with markdown, e.g.
    # "**Score:** 4" or "Score: [4]"; these patterns tolerate that.
    _SCORE_PATTERN = re.compile(r"Score\s*:[\s*\[]*(\d+)")
    _FEEDBACK_PATTERN = re.compile(r"Feedback\s*:\**\s*(.*)", re.DOTALL)

    # Rubric handed to the judge for every prompt processed by evaluate().
    _SCORE_RUBRIC = """
Score 1: The response is inaccurate or irrelevant to the question.
Score 2: The response has some relevance but contains significant inaccuracies or omissions.
Score 3: The response is generally accurate but lacks some important details.
Score 4: The response is accurate and includes most of the important details.
Score 5: The response is comprehensive, accurate, and addresses all aspects of the question thoroughly.
"""

    def __init__(self, model_name: str, model_type: str = 'ollama'):
        """Create the client for ``model_type`` and remember ``model_name``.

        Args:
            model_name: Name passed as ``model=`` on every chat call.
            model_type: Backend selector; only ``'ollama'`` is supported.

        Raises:
            NotImplementedError: If ``model_type`` is not supported.
        """
        self.model_name = model_name
        self.model_type = model_type
        self.model = self._load_model()
        # We'll use the same model for evaluation
        self.evaluator_model = self.model

    def _load_model(self):
        """Return the client object for the configured backend."""
        if self.model_type == 'ollama':
            return ollama.Client()
        raise NotImplementedError(f"Model type {self.model_type} not supported")

    def generate_response(self, prompt: str, context: str = None) -> str:
        """Answer ``prompt`` with ``context`` supplied as the log content.

        Args:
            prompt: The question, sent as the user message.
            context: Log text embedded in the system message. ``None`` is
                treated as an empty log instead of being rendered as the
                literal text "None".

        Returns:
            The content of the model's reply message.

        Raises:
            NotImplementedError: If ``model_type`` is not supported.
        """
        if self.model_type != 'ollama':
            raise NotImplementedError(f"Model type {self.model_type} not supported")

        log_text = "" if context is None else context
        response = self.model.chat(model=self.model_name, messages=[
            {
                'role': 'system',
                'content': f"You are an AI assistant analyzing log files. Here's the log content:\n\n{log_text}\n\nAnswer the following question based on this log."
            },
            {
                'role': 'user',
                'content': prompt,
            }
        ])
        return response['message']['content']

    def evaluate_response(self, question: str, response: str, reference_answer: str, rubric: str) -> Dict[str, Any]:
        """Have the evaluator model grade ``response`` against ``reference_answer``.

        Args:
            question: The question that was asked.
            response: The answer being graded.
            reference_answer: The expected answer.
            rubric: Scoring rubric text inserted into the judge prompt.

        Returns:
            A dict with keys ``'score'`` (int within MIN_SCORE..MAX_SCORE, or
            ``None`` when no in-range score could be parsed), ``'feedback'``
            (text following "Feedback:", or ``None`` when absent) and
            ``'evaluation_text'`` (the judge's raw reply).
        """
        # Prepare the evaluation prompt
        evaluation_prompt = f"""
You are an expert evaluator. Your task is to evaluate the following response to a question based on the reference answer and the provided rubric. Provide a score and feedback.

Question: {question}

Response: {response}

Reference Answer: {reference_answer}

Rubric:
{rubric}

Please provide your evaluation as follows:
Score: [1-5]
Feedback: [Your detailed feedback]

Remember, the score should be an integer between 1 and 5.

Evaluation:
"""
        # Generate the evaluation with the configured model (self.model_name).
        eval_response = self.evaluator_model.chat(model=self.model_name, messages=[
            {
                'role': 'system',
                'content': "You are an expert evaluator."
            },
            {
                'role': 'user',
                'content': evaluation_prompt,
            }
        ])
        eval_text = eval_response['message']['content']

        # Extract the score; anything outside the rubric's range is treated
        # as unparseable rather than reported as a valid grade.
        score = None
        score_match = self._SCORE_PATTERN.search(eval_text)
        if score_match:
            candidate = int(score_match.group(1))
            if self.MIN_SCORE <= candidate <= self.MAX_SCORE:
                score = candidate

        # Everything after "Feedback:" (possibly spanning lines) is the feedback.
        feedback_match = self._FEEDBACK_PATTERN.search(eval_text)
        feedback = feedback_match.group(1).strip() if feedback_match else None

        return {
            'score': score,
            'feedback': feedback,
            'evaluation_text': eval_text
        }

    def read_log_file(self, file_path: str) -> str:
        """Read a ``.txt``, ``.log`` or ``.csv`` file into a single string.

        Text is decoded as UTF-8 with undecodable bytes replaced, so the
        result does not depend on the platform's default encoding and a
        stray byte in a log does not abort the run.

        Args:
            file_path: Path to the log file; the extension check is
                case-insensitive.

        Returns:
            The file content. For CSV input, each row's fields are joined
            with ``','`` and rows are joined with newlines.

        Raises:
            ValueError: If the file extension is not supported.
        """
        path = Path(file_path)
        file_extension = path.suffix.lower()

        if file_extension in ['.txt', '.log']:
            return path.read_text(encoding='utf-8', errors='replace')

        if file_extension == '.csv':
            # newline='' lets the csv module handle line endings itself, as
            # its documentation requires.
            with open(path, 'r', newline='', encoding='utf-8', errors='replace') as f:
                return '\n'.join(','.join(row) for row in csv.reader(f))

        raise ValueError(f"Unsupported file format: {file_extension}")

    def evaluate(self, log_file: str, ground_truth_file: str, prompts: List[str], metric_names: List[str]) -> Dict[str, Dict[str, Any]]:
        """Answer every prompt about ``log_file`` and grade each answer.

        Args:
            log_file: Path to the log, read with ``read_log_file``.
            ground_truth_file: Path to a JSON file; its top-level keys are
                looked up by metric name to get the reference answers.
            prompts: Questions to ask.
            metric_names: Ground-truth key for each prompt, in the same
                order as ``prompts``.

        Returns:
            A dict keyed by metric name; each value holds ``'score'``,
            ``'feedback'``, ``'model_response'`` and ``'reference_answer'``.

        Raises:
            ValueError: If ``prompts`` and ``metric_names`` differ in length,
                or if the log file extension is unsupported.
        """
        # zip() would silently drop the unmatched tail; fail loudly instead.
        if len(prompts) != len(metric_names):
            raise ValueError(
                f"prompts ({len(prompts)}) and metric_names ({len(metric_names)}) must have the same length"
            )

        log_content = self.read_log_file(log_file)

        with open(ground_truth_file, 'r', encoding='utf-8') as f:
            ground_truth = json.load(f)

        results = {}
        for prompt, metric_name in zip(prompts, metric_names):
            model_response = self.generate_response(prompt, log_content)
            # A metric missing from the ground truth is graded against an
            # empty reference answer.
            reference_answer = ground_truth.get(metric_name, "")
            print(f"Prompt: {prompt}")
            print(f"Model Response: {model_response}")
            print(f"Reference Answer: {reference_answer}")

            evaluation = self.evaluate_response(prompt, model_response, reference_answer, self._SCORE_RUBRIC)
            print(f"Score: {evaluation['score']}")
            print(f"Feedback: {evaluation['feedback']}\n")

            results[metric_name] = {
                'score': evaluation['score'],
                'feedback': evaluation['feedback'],
                'model_response': model_response,
                'reference_answer': reference_answer
            }

        return results


In [6]:
if __name__ == '__main__':
    # Evaluator backed by the 'llama3.1' model name.
    evaluator = LanguageModelEvaluator(model_name='llama3.1')

    # Questions to ask; metric_names[i] is the ground-truth key for prompts[i].
    prompts = ["Why did the system restart?", "Why did the system crash?"]
    metric_names = ["reason_for_restart", "reason_for_crash"]

    # Run the evaluation.
    # Alternative log input that can be swapped in for the first argument:
    #   './Mac_2k.log_structured.csv'
    results = evaluator.evaluate(
        './sample_log.txt',
        './ground_truth_qualitative.json',
        prompts,
        metric_names,
    )

    # Report every metric's outcome, one field per line.
    for metric, result in results.items():
        print(f"Metric: {metric}")
        print(f"Score: {result['score']}")
        print(f"Feedback: {result['feedback']}")
        print(f"Model Response: {result['model_response']}")
        print(f"Reference Answer: {result['reference_answer']}\n")


Prompt: Why did the system restart?
Model Response: Based on the log content, it appears that the system restarted due to a "System crash detected" at 10:00:01 is not present but there is an entry at 11:00:00 which indicates a possible cause for the restart. However, another System crash detected event occurred before this restart.

More specifically, it's likely that the system crashed after the CPU usage spike at 10:45:12 (when CPU usage reached 56%), and then restarted. This is followed by another crash just before midnight.
Reference Answer: The system restarted due to a scheduled maintenance update at 02:00 AM.
Score: 2
Feedback: The response attempts to address the question, but it's marred by significant inaccuracies. The response provides a lengthy analysis of system crashes and CPU usage spikes, which are not relevant to the actual cause of the restart (scheduled maintenance update). Instead of acknowledging that the system restarted due to the scheduled update at 02:00 AM, th

### Maybe use cosine similarity with sentence embeddings?

from sentence_transformers import SentenceTransformer, util

class LanguageModelEvaluator:
    # ... (other methods remain the same)
    # NOTE(review): as written, this sketch redefines LanguageModelEvaluator
    # with only the methods below; evaluate() relies on read_log_file() and
    # generate_response(), so merge these methods into the full class
    # rather than running this definition on its own.

    # Pre-trained sentence-transformers model used for the embeddings.
    SIMILARITY_MODEL_NAME = 'all-MiniLM-L6-v2'  # You can choose a different model

    # Encoder instance, created on first use and then reused.
    _similarity_model = None

    def _get_similarity_model(self):
        """Return the sentence encoder, constructing it only on first use."""
        if self._similarity_model is None:
            self._similarity_model = SentenceTransformer(self.SIMILARITY_MODEL_NAME)
        return self._similarity_model

    def evaluate_response_similarity(self, response: str, reference_answer: str) -> float:
        """Return the cosine similarity between the two texts' embeddings.

        Args:
            response: The model's answer.
            reference_answer: The expected answer.

        Returns:
            The cosine similarity of the two sentence embeddings as a float.
        """
        # Reuse one encoder across calls instead of constructing a new
        # SentenceTransformer for every prompt.
        model = self._get_similarity_model()
        # Encode the sentences
        embeddings = model.encode([response, reference_answer], convert_to_tensor=True)
        # Compute cosine similarity
        cosine_scores = util.cos_sim(embeddings[0], embeddings[1])
        return cosine_scores.item()

    def evaluate(self, log_file: str, ground_truth_file: str, prompts: List[str], metric_names: List[str]) -> Dict[str, Dict[str, Any]]:
        """Answer every prompt about ``log_file`` and score it by embedding similarity.

        Args:
            log_file: Path to the log, read with ``read_log_file``.
            ground_truth_file: Path to a JSON file; its top-level keys are
                looked up by metric name to get the reference answers.
            prompts: Questions to ask.
            metric_names: Ground-truth key for each prompt, in the same
                order as ``prompts``.

        Returns:
            A dict keyed by metric name; each value holds
            ``'similarity_score'``, ``'model_response'`` and
            ``'reference_answer'``.

        Raises:
            ValueError: If ``prompts`` and ``metric_names`` differ in length.
        """
        # zip() would silently drop the unmatched tail; fail loudly instead.
        if len(prompts) != len(metric_names):
            raise ValueError(
                f"prompts ({len(prompts)}) and metric_names ({len(metric_names)}) must have the same length"
            )

        log_content = self.read_log_file(log_file)

        with open(ground_truth_file, 'r', encoding='utf-8') as f:
            ground_truth = json.load(f)

        results = {}
        for prompt, metric_name in zip(prompts, metric_names):
            model_response = self.generate_response(prompt, log_content)
            # A metric missing from the ground truth is compared against an
            # empty reference answer.
            reference_answer = ground_truth.get(metric_name, "")
            print(f"Prompt: {prompt}")
            print(f"Model Response: {model_response}")
            print(f"Reference Answer: {reference_answer}")

            similarity_score = self.evaluate_response_similarity(model_response, reference_answer)
            print(f"Similarity Score: {similarity_score}\n")

            results[metric_name] = {
                'similarity_score': similarity_score,
                'model_response': model_response,
                'reference_answer': reference_answer
            }

        return results
