In [3]:
import csv
import json
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

import ollama
import pandas as pd

class LanguageModelEvaluator:
    """Ask a language model questions about a log file and grade its answers.

    The same chat client is used twice: once to answer each prompt with the
    log content supplied as context, and once as a judge that scores the
    answer against a reference answer using a 1-5 rubric.

    Args:
        model_name: Name of the model passed to the chat backend.
        model_type: Backend identifier; only ``'ollama'`` is supported.
        options: Optional backend options forwarded to every chat call
            (for Ollama, e.g. ``{'num_ctx': 8192}``). When omitted, chat
            calls are made exactly as before, with no ``options`` argument.

    Raises:
        NotImplementedError: If ``model_type`` is not supported.
    """

    # Inclusive bounds of the scoring rubric; parsed scores outside this
    # range are treated as unparseable.
    MIN_SCORE = 1
    MAX_SCORE = 5

    # Tolerate common judge formatting such as "Score: [4]", "**Score:** 4"
    # or "**Score**: 4" in addition to the plain "Score: 4".
    _SCORE_RE = re.compile(r"Score\**\s*:\s*[\*\[\s]*(\d+)")
    _FEEDBACK_RE = re.compile(r"Feedback\**\s*:\**\s*(.*)", re.DOTALL)

    def __init__(self, model_name: str, model_type: str = 'ollama',
                 options: Optional[Dict[str, Any]] = None):
        self.model_name = model_name
        self.model_type = model_type
        self.options = options
        self.model = self._load_model()
        # Using the same model for evaluation for now, can change later with our model
        self.evaluator_model = self.model

    def _load_model(self):
        """Create the chat client for the configured backend."""
        if self.model_type == 'ollama':
            return ollama.Client()
        raise NotImplementedError(f"Model type {self.model_type} not supported")

    def _chat(self, client, messages: List[Dict[str, str]]) -> str:
        """Send ``messages`` to ``client`` and return the reply text."""
        extra = {}
        if self.options:
            # Only pass options when the caller configured some, so the
            # default call is identical to a plain chat(model=..., messages=...).
            extra['options'] = self.options
        reply = client.chat(model=self.model_name, messages=messages, **extra)
        return reply['message']['content']

    def generate_response(self, prompt: str, context: Optional[str] = None) -> str:
        """Answer ``prompt`` with the model, using ``context`` as the log content.

        Args:
            prompt: The question to ask.
            context: Log text placed in the system message. When ``None``,
                the system message omits the log section instead of
                embedding the literal text "None".

        Returns:
            The model's reply text.

        Raises:
            NotImplementedError: If ``model_type`` is not supported.
        """
        if self.model_type != 'ollama':
            raise NotImplementedError(f"Model type {self.model_type} not supported")

        if context is None:
            system_content = "You are an AI assistant analyzing log files. Answer the following question."
        else:
            system_content = f"You are an AI assistant analyzing log files. Here's the log content:\n\n{context}\n\nAnswer the following question based on this log."

        return self._chat(self.model, [
            {'role': 'system', 'content': system_content},
            {'role': 'user', 'content': prompt},
        ])

    def evaluate_response(self, question: str, response: str, reference_answer: str, rubric: str) -> Dict[str, Any]:
        """Have the evaluator model grade ``response`` against ``reference_answer``.

        Args:
            question: The question that was asked.
            response: The answer being graded.
            reference_answer: The expected answer.
            rubric: Text describing what each score means.

        Returns:
            A dict with ``'score'`` (int within ``MIN_SCORE``..``MAX_SCORE``,
            or ``None`` when no valid score could be parsed), ``'feedback'``
            (str, or ``None`` when no feedback section was found) and
            ``'evaluation_text'`` (the raw judge reply).
        """
        # Prepare the evaluation prompt
        evaluation_prompt = f"""
You are an expert evaluator. Your task is to evaluate the following response to a question based on the reference answer and the provided rubric. Provide a score and feedback.

Question: {question}

Response: {response}

Reference Answer: {reference_answer}

Rubric:
{rubric}

Please provide your evaluation as follows, grading the response of the language model with the rubric relative to the reference answer:
Score: [1-5]
Feedback: [Your detailed feedback]

Remember, the score should be an integer between 1 and 5.

Evaluation:
"""
        eval_text = self._chat(self.evaluator_model, [
            {
                'role': 'system',
                'content': "You are an expert evaluator on the responses produced by language models."
            },
            {
                'role': 'user',
                'content': evaluation_prompt,
            }
        ])

        score, feedback = self._parse_evaluation(eval_text)
        return {
            'score': score,
            'feedback': feedback,
            'evaluation_text': eval_text
        }

    @classmethod
    def _parse_evaluation(cls, eval_text: str):
        """Extract ``(score, feedback)`` from the judge's reply; either may be ``None``."""
        score = None
        score_match = cls._SCORE_RE.search(eval_text)
        if score_match:
            candidate = int(score_match.group(1))
            # A number outside the rubric is not a usable grade.
            if cls.MIN_SCORE <= candidate <= cls.MAX_SCORE:
                score = candidate

        feedback_match = cls._FEEDBACK_RE.search(eval_text)
        feedback = feedback_match.group(1).strip() if feedback_match else None
        return score, feedback

    def read_log_file(self, file_path: str) -> str:
        """Read a ``.txt``, ``.log`` or ``.csv`` log file into a single string.

        CSV files are parsed with the ``csv`` module and each row is re-joined
        with plain commas (quoting is dropped), one row per line.

        Raises:
            ValueError: If the file extension is not supported.
        """
        path = Path(file_path)
        suffix = path.suffix.lower()

        # Log files often contain stray bytes; decode as UTF-8 and replace
        # undecodable bytes rather than failing on the platform default codec.
        if suffix in ('.txt', '.log'):
            return path.read_text(encoding='utf-8', errors='replace')
        if suffix == '.csv':
            # newline='' lets the csv module handle embedded newlines itself.
            with open(path, 'r', encoding='utf-8', errors='replace', newline='') as f:
                return '\n'.join(','.join(row) for row in csv.reader(f))
        raise ValueError(f"Unsupported file format: {suffix}")

    def evaluate(self, log_file: str, ground_truth_file: str, prompts: List[str], metric_names: List[str]) -> Dict[str, Dict[str, Any]]:
        """Answer every prompt about ``log_file`` and grade each answer.

        Args:
            log_file: Path to the log file (see ``read_log_file``).
            ground_truth_file: Path to a JSON object mapping metric names to
                reference answers. Metrics missing from it are graded
                against an empty reference answer.
            prompts: Questions to ask, one per metric.
            metric_names: Keys for the results, aligned with ``prompts``.

        Returns:
            A dict keyed by metric name; each value holds ``'score'``,
            ``'feedback'``, ``'model_response'`` and ``'reference_answer'``.

        Raises:
            ValueError: If ``prompts`` and ``metric_names`` differ in length
                (zip would otherwise silently drop the extras), or if the
                log file format is unsupported.
        """
        if len(prompts) != len(metric_names):
            raise ValueError(
                f"prompts ({len(prompts)}) and metric_names ({len(metric_names)}) must have the same length"
            )

        log_content = self.read_log_file(log_file)

        with open(ground_truth_file, 'r', encoding='utf-8') as f:
            ground_truth = json.load(f)

        # Define the scoring rubric
        score_rubric = """
Score 1: The response is inaccurate or irrelevant to the question.
Score 2: The response has some relevance but contains significant inaccuracies or omissions.
Score 3: The response is generally accurate but lacks some important details.
Score 4: The response is accurate and includes most of the important details.
Score 5: The response is comprehensive, accurate, and addresses all aspects of the question thoroughly.
"""

        results = {}
        for prompt, metric_name in zip(prompts, metric_names):
            model_response = self.generate_response(prompt, log_content)
            reference_answer = ground_truth.get(metric_name, "")
            print(f"Prompt: {prompt}")
            print(f"Model Response: {model_response}")
            print(f"Reference Answer: {reference_answer}")

            evaluation = self.evaluate_response(prompt, model_response, reference_answer, score_rubric)
            print(f"Score: {evaluation['score']}")
            print(f"Feedback: {evaluation['feedback']}\n")

            results[metric_name] = {
                'score': evaluation['score'],
                'feedback': evaluation['feedback'],
                'model_response': model_response,
                'reference_answer': reference_answer
            }

        return results

    def save_results(self, results: Dict[str, Dict[str, Any]], output_file: Optional[str] = None) -> str:
        """Write ``results`` plus run metadata to a JSON file.

        Args:
            results: The mapping returned by ``evaluate``.
            output_file: Destination path. Defaults to a timestamped file
                under ``./results/``.

        Returns:
            The path the results were written to.
        """
        if output_file is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f"./results/qualitative_evaluation_results_{timestamp}.json"

        # Path.parent is "." for a bare filename, so this also works when
        # output_file has no directory component (os.makedirs('') raises).
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)

        output = {
            "model_name": self.model_name,
            "model_type": self.model_type,
            "evaluation_time": datetime.now().isoformat(),
            "results": results
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2)

        print(f"Results saved to {output_file}")
        return output_file


In [4]:
if __name__ == '__main__':
    # Metric name -> question put to the model. Dict insertion order keeps
    # the derived prompt and metric-name lists aligned with each other.
    questions_by_metric = {
        "reason_for_restart": "Why did the system restart?",
        "reason_for_crash": "Why did the system crash?",
    }
    prompts = list(questions_by_metric.values())
    metric_names = list(questions_by_metric)

    evaluator = LanguageModelEvaluator(model_name='llama3.1')

    # Run the sample log against its qualitative ground truth.
    results = evaluator.evaluate(
        log_file='./sample_log.txt',
        ground_truth_file='./ground_truth_qualitative.json',
        prompts=prompts,
        metric_names=metric_names,
    )

    # Recap every metric after the run, then persist the results to disk.
    for metric in results:
        result = results[metric]
        print(f"Metric: {metric}")
        print(f"Score: {result['score']}")
        print(f"Feedback: {result['feedback']}")
        print(f"Model Response: {result['model_response']}")
        print(f"Reference Answer: {result['reference_answer']}\n")

    output_file = evaluator.save_results(results)

Prompt: Why did the system restart?
Model Response: According to the log, the system restarted twice:

1. The first restart occurred at 10:00:01 (initial system startup) is not relevant here.
2. The second restart occurred after a "System crash detected" event at 11:00:00.

This suggests that a system crash was the reason for the restart. However, there isn't any additional information in the log about the cause of the crash.
Reference Answer: The system restarted due to a scheduled maintenance update at 02:00 AM.
Score: 2
Feedback: The response has some relevance to the question as it mentions a system restart and provides details from the log. However, it is inaccurate in identifying the reason for the restart (system crash detected vs scheduled maintenance update). Additionally, it overlooks the timing mentioned in the reference answer (02:00 AM) and only focuses on the time of the second restart (11:00:00) which is not relevant to the question. The response also jumps to a conclusi

In [4]:
if __name__ == '__main__':
    # Metric name -> question put to the model. Dict insertion order keeps
    # the derived prompt and metric-name lists aligned with each other.
    questions_by_metric = {
        "system_performance": "What were the main system performance issues observed in the logs?",
        "network_connectivity": "Describe the network connectivity problems encountered by the system.",
        "battery_status": "Summarize the battery status and power management events in the logs.",
        "application_errors": "What were the most significant application errors or crashes reported in the logs?",
        "security_events": "Identify and explain any security-related events or issues in the logs.",
    }
    prompts = list(questions_by_metric.values())
    metric_names = list(questions_by_metric)

    evaluator = LanguageModelEvaluator(model_name='llama3.1')

    # Run the structured Mac log against its qualitative ground truth.
    results = evaluator.evaluate(
        log_file='./Mac_2k.log_structured.csv',
        ground_truth_file='./mac_ground_truth_qualitative.json',
        prompts=prompts,
        metric_names=metric_names,
    )

    # Recap every metric after the run.
    for metric in results:
        result = results[metric]
        print(f"Metric: {metric}")
        print(f"Score: {result['score']}")
        print(f"Feedback: {result['feedback']}")
        print(f"Model Response: {result['model_response']}")
        print(f"Reference Answer: {result['reference_answer']}\n")


Prompt: What were the main system performance issues observed in the logs?
Model Response: I don’t have information about specific system performance issues from logs. If you could provide more details or context, I would be happy to try and help with a more general inquiry on how to analyze log data for system performance issues.
Reference Answer: The system experienced several performance issues, including slow response times, unexpected restarts, and occasional crashes. There were multiple instances of high thermal pressure and memory pressure, which could contribute to performance degradation.
Score: 2
Feedback: The response shows some relevance to the question by acknowledging a lack of specific information about system performance issues from logs. However, it fails to provide any actual details or insights about the observed issues, which is crucial for addressing the question. While the response suggests a willingness to help with general inquiries on log data analysis, it does

KeyboardInterrupt: 