 ### Evaluation of model with ground truth

In [9]:
import json
import re
import ollama
import csv
from typing import List, Dict
from pathlib import Path

class LanguageModelEvaluator:
    """Score a language model's numeric answers about a log file against ground truth.

    The evaluator sends the full log text plus one question at a time to the
    model, pulls the last number out of each reply, and compares it with the
    expected value stored in a JSON ground-truth file.
    """

    # Matches a comma-grouped number ("1,234", "1,234.5") or a plain
    # integer/decimal ("42", "45.33"). The grouped form is tried first so that
    # "1,234" is read as one value instead of the two numbers 1 and 234.
    _NUMBER_PATTERN = re.compile(r'\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?')

    def __init__(self, model_name: str, model_type: str = 'ollama'):
        """Store the model settings and create the backend client.

        Args:
            model_name: Name passed to the backend as the ``model`` argument.
            model_type: Backend identifier; only ``'ollama'`` is supported.

        Raises:
            NotImplementedError: If ``model_type`` is not supported.
        """
        self.model_name = model_name
        self.model_type = model_type
        self.model = self._load_model()

    def _load_model(self):
        """Return the client object for the configured backend.

        Raises:
            NotImplementedError: If ``self.model_type`` is not ``'ollama'``.
        """
        if self.model_type == 'ollama':
            return ollama.Client()
        else:
            raise NotImplementedError(f"Model type {self.model_type} not supported")

    def generate_response(self, prompt: str, context: str) -> str:
        """Ask the model one question about the given log text.

        Args:
            prompt: The question, sent as the user message.
            context: The log content, embedded in the system message.

        Returns:
            The text content of the model's reply.

        Raises:
            NotImplementedError: If ``self.model_type`` is not ``'ollama'``.
        """
        if self.model_type == 'ollama':
            response = self.model.chat(model=self.model_name, messages=[
                {
                    'role': 'system',
                    'content': f"You are an AI assistant analyzing log files. Here's the log content:\n\n{context}\n\nAnswer the following question based on this log and remember to return a number at the end where applicable:"
                },
                {
                    'role': 'user',
                    'content': prompt,
                }
            ])
            return response['message']['content']
        else:
            raise NotImplementedError(f"Model type {self.model_type} not supported")

    def extract_numeric_value(self, text: str) -> float:
        """Return the last number that appears in ``text``.

        The system prompt asks the model to finish with a number, so the last
        match is taken as the answer. Thousands separators are accepted, so
        "1,234" yields 1234.0.

        Raises:
            ValueError: If ``text`` contains no digits at all.
        """
        matches = self._NUMBER_PATTERN.findall(text)
        if matches:
            # Drop grouping commas before conversion; float() rejects them.
            return float(matches[-1].replace(',', ''))
        else:
            raise ValueError(f"No numeric value found in the text: {text}")

    def calculate_accuracy(self, predicted: float, actual: float) -> float:
        """Return a 0-100 score based on the relative error of ``predicted``.

        An exact match scores 100. When ``actual`` is zero only an exact match
        scores; anything else is 0. Otherwise the score is
        ``(1 - |predicted - actual| / |actual|) * 100`` clamped to [0, 100].
        """
        if predicted == actual:
            return 100.0
        elif actual == 0:
            return 0.0 if predicted != 0 else 100.0
        else:
            # Divide by the magnitude of the expected value: with a signed
            # denominator a negative ground truth produced a negative error,
            # which the clamp below then reported as 100% for any prediction.
            error = abs(predicted - actual) / abs(actual)
            accuracy = max(0.0, 1 - error) * 100
            return min(accuracy, 100.0)

    def read_log_file(self, file_path: str) -> str:
        """Load a log file as a single string.

        ``.txt`` and ``.log`` files are returned as-is. ``.csv`` files are
        parsed with the csv module and each row is re-joined with commas, one
        row per line.

        Raises:
            ValueError: If the file extension is not ``.txt``, ``.log`` or ``.csv``.
        """
        file_path = Path(file_path)
        file_extension = file_path.suffix.lower()

        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding; undecodable bytes are replaced rather
        # than aborting the whole evaluation.
        if file_extension in ['.txt', '.log']:
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                return f.read()
        elif file_extension == '.csv':
            # newline='' lets the csv module handle line endings itself.
            with open(file_path, 'r', encoding='utf-8', errors='replace', newline='') as f:
                return '\n'.join(','.join(row) for row in csv.reader(f))
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    def evaluate(self, log_file: str, ground_truth_file: str, prompts: List[str], metric_names: List[str]) -> Dict[str, float]:
        """Run every prompt against the log and score each answer.

        Args:
            log_file: Path to the log file (see ``read_log_file`` for formats).
            ground_truth_file: Path to a JSON object mapping metric name to
                the expected numeric value.
            prompts: Questions to ask, one per metric.
            metric_names: Ground-truth keys, paired with ``prompts`` by position.

        Returns:
            Mapping of metric name to accuracy (0-100). A metric whose reply
            contains no number is scored 0.0.
        """
        log_content = self.read_log_file(log_file)

        with open(ground_truth_file, 'r', encoding='utf-8') as f:
            ground_truth = json.load(f)

        results = {}
        for prompt, metric_name in zip(prompts, metric_names):
            model_response = self.generate_response(prompt, log_content)
            print(f"Prompt: {prompt}")
            print(f"Model response: {model_response}")
            try:
                extracted_value = self.extract_numeric_value(model_response)
                ground_truth_value = ground_truth[metric_name]
                print(f"Extracted value: {extracted_value}")
                print(f"Ground truth value: {ground_truth_value}")
                accuracy = self.calculate_accuracy(extracted_value, ground_truth_value)
            except ValueError as e:
                print(f"Error processing {metric_name}: {str(e)}")
                # Assign here so the summary line below never reads an unbound
                # name (first prompt) or a stale value from the previous metric.
                accuracy = 0.0
            results[metric_name] = accuracy
            print(f"Calculated accuracy: {accuracy}%\n")

        return results

In [2]:
# from language_model_evaluator import LanguageModelEvaluator
def main():
    """Evaluate the model on the sample log and print one accuracy line per metric."""
    # Ground-truth key -> question put to the model for that key.
    questions_by_metric = {
        "laptop_crashes": "How many times did the laptop crash in the past hour?",
        "avg_cpu_usage": "What was the average CPU usage percentage?",
        "total_network_requests": "How many total network requests were made?",
    }

    scorer = LanguageModelEvaluator("llama3.1")  # Replace with the model you want to use
    scores = scorer.evaluate(
        "sample_log.txt",
        "ground_truth.json",
        list(questions_by_metric.values()),
        list(questions_by_metric),
    )

    print("Evaluation Results:")
    for name in scores:
        print(f"{name}: {scores[name]:.2f}% accuracy")


if __name__ == "__main__":
    main()

Prompt: How many times did the laptop crash in the past hour?
Model response: According to the log, there were two system crashes detected in the past hour:

1. At 2024-09-25 11:00:00
2. At 2024-09-25 11:59:59

So, the laptop crashed twice in the past hour. The answer is 2.
Extracted value: 2.0
Ground truth value: 2
Calculated accuracy: 100.0%

Prompt: What was the average CPU usage percentage?
Model response: To calculate the average CPU usage, I will add up all the CPU usage values (42% + 38% + 56%) and divide by the total number of values.

Average CPU usage = (42 + 38 + 56) / 3
= 136 / 3
= 45.33%

So, the average CPU usage percentage was approximately **45.33%.**
Extracted value: 45.33
Ground truth value: 45.3
Calculated accuracy: 99.93377483443709%

Prompt: How many total network requests were made?
Model response: To find the total number of network requests, we need to add up the values from each relevant entry in the log. There are three entries:

- 45 (2024-09-25 11:15:18)
- 3

In [1]:
# from language_model_evaluator import LanguageModelEvaluator
def main():
    """Evaluate the model on the structured Mac log and print per-metric accuracy."""
    model_name = "llama3.1"  # Replace with the model you want to use
    log_file = "Mac_2k.log_structured.csv"
    ground_truth_file = "mac_ground_truth.json"

    # (ground-truth key, question) pairs. Keeping each question next to its key
    # means the two lists handed to evaluate() cannot drift out of alignment.
    # NOTE(review): the first question asks for an explanation, not a count;
    # the evaluator scores whatever number appears last in the reply. Confirm
    # that a 'test' entry exists in the ground-truth file.
    cases = [
        ('test', "Can you explain the first log to me?"),
        ('E189', "How many times did the event with eventid E189 occur?"),
        ('E188', "How many times did the event with eventid E188 occur?"),
        ('E120', "How many times did the event with eventid E120 occur?"),
        ('E203', "How many times did the event with eventid E203 occur?"),
        ('E323', "How many times did the event with eventid E323 occur?"),
        ('kernel', "How many times did the event with component kernel occur?"),
        ('com.apple.cts', "How many times did the event with component com.apple.cts occur?"),
        ('corecaptured', "How many times did the event with component corecaptured occur?"),
        ('QQ', "How many times did the event with component QQ occur?"),
        ('Microsoft Word', "How many times did the event with component Microsoft Word occur?"),
        ('authorMacBook-Pro', "How many times did the event with the user authorMacBook-Pro occur?"),
    ]
    metric_names = [metric for metric, _ in cases]
    prompts = [question for _, question in cases]

    evaluator = LanguageModelEvaluator(model_name)
    results = evaluator.evaluate(log_file, ground_truth_file, prompts, metric_names)

    print("Evaluation Results:")
    for metric, accuracy in results.items():
        print(f"{metric}: {accuracy:.2f}% accuracy")

if __name__ == "__main__":
    main()

NameError: name 'LanguageModelEvaluator' is not defined

In [None]:
from pandasai import Agent
from regular_agent.agent_ai import Agent_Ai

def pandas_legend(self):
    """Build and return a pandasai ``Agent`` over ``self.df``.

    The agent uses the ``llm`` attribute of the object returned by
    ``self.get_llm()`` together with the fixed settings below.
    """
    agent_settings = {
        "llm": self.get_llm().llm,
        "open_charts": False,
        "enable_cache": False,
        "save_charts": True,
        "max_retries": 3,
    }
    # Replace self.df with the log df
    return Agent(self.df, config=agent_settings)

In [3]:
# !pip install pandasai
!pip install regular_agent.agent_ai

ERROR: Could not find a version that satisfies the requirement regular_agent.agent_ai (from versions: none)
ERROR: No matching distribution found for regular_agent.agent_ai
