 ### Evaluation of model with ground truth

In [25]:
import json
import re
from typing import List, Dict
import pandas as pd
from datetime import datetime
import os
from graph_manager import Graph, global_graph

class LanguageModelEvaluator:
    def __init__(self):
        if global_graph is None:
            self.graph = Graph.create_graph()
        else:
            self.graph = global_graph

    def generate_response(self, prompt: str) -> str:
        response = self.graph.run(prompt)
        if isinstance(response, bytes):
            return response.decode('utf-8')
        return str(response)

    def extract_numeric_value(self, text: str) -> float:
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        matches = re.findall(r'\d+(?:\.\d+)?', text)
        if matches:
            return float(matches[-1])
        else:
            raise ValueError(f"No numeric value found in the text: {text}")
    
    def calculate_accuracy(self, predicted: float, actual: float) -> float:
        if predicted == actual:
            return 100.0
        elif actual == 0:
            return 0.0 if predicted != 0 else 100.0
        else:
            error = abs(predicted - actual) / actual
            accuracy = max(0, (1 - error)) * 100
            return min(accuracy, 100.0)

    def evaluate(self, ground_truth_file: str, prompts: List[str], metric_names: List[str]) -> Dict[str, Dict]:
        with open(ground_truth_file, 'r') as f:
            ground_truth = json.load(f)

        results = {}
        for prompt, metric_name in zip(prompts, metric_names):
            model_response = self.generate_response(prompt)
            print(f"Prompt: {prompt}")
            print(f"Model response: {model_response}")
            try:
                extracted_value = self.extract_numeric_value(model_response)
                ground_truth_value = ground_truth[metric_name]
                print(f"Extracted value: {extracted_value}")
                print(f"Ground truth value: {ground_truth_value}")
                accuracy = self.calculate_accuracy(extracted_value, ground_truth_value)
                results[metric_name] = {
                    "prompt": prompt,
                    "model_response": model_response,
                    "extracted_value": extracted_value,
                    "ground_truth_value": ground_truth_value,
                    "accuracy": accuracy
                }
            except ValueError as e:
                print(f"Error processing {metric_name}: {str(e)}")
                results[metric_name] = {
                    "prompt": prompt,
                    "model_response": model_response,
                    "error": str(e),
                    "accuracy": 0.0
                }
            print(f"Calculated accuracy: {results[metric_name]['accuracy']}%\n")

        return results

    def save_results(self, results: Dict[str, Dict], output_file: str = None):
        if output_file is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f"./results/quantitative_evaluation_results_{timestamp}.json"
        
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        
        output = {
            "evaluation_time": datetime.now().isoformat(),
            "results": results
        }

        with open(output_file, 'w') as f:
            json.dump(output, f, indent=2)

        print(f"Results saved to {output_file}")
        return output_file

In [6]:
# from language_model_evaluator import LanguageModelEvaluator
def main():
    model_name = "llama3.1" 
    log_file = "sample_log.txt"
    ground_truth_file = "ground_truth.json"
    prompts = [
        "How many times did the laptop crash in the past hour?",
        "What was the average CPU usage percentage?",
        "How many total network requests were made?"
    ]
    metric_names = ["laptop_crashes", "avg_cpu_usage", "total_network_requests"]

    evaluator = LanguageModelEvaluator(model_name)
    results = evaluator.evaluate(log_file, ground_truth_file, prompts, metric_names)

    output_file = evaluator.save_results(results)

    print("\nEvaluation Results:")
    for metric, data in results.items():
        print(f"{metric}: {data['accuracy']:.2f}% accuracy")

if __name__ == "__main__":
    main()

Prompt: How many times did the laptop crash in the past hour?
Model response: According to the log, there were two system crashes detected within the last hour:

1. [2024-09-25 10:00:01] to [2024-09-25 11:00:00]: No crashes detected
2. [2024-09-25 11:00:00] System crash detected (first one)
3. [2024-09-25 11:59:59] System crash detected (second one)

Therefore, the laptop crashed **twice** within the last hour.
Extracted value: 59.0
Ground truth value: 2
Calculated accuracy: 0%

Prompt: What was the average CPU usage percentage?
Model response: To calculate the average CPU usage, we need to add up all the CPU usage percentages and divide by the total number of readings.

Here are the CPU usage readings:
- 42%
- 38%
- 56%

Total CPU usage: 42 + 38 + 56 = 136

There were 3 CPU usage readings. To find the average, we divide the total by the number of readings:

Average CPU usage: 136 / 3 = 45.33%

So, the average CPU usage percentage was approximately 45.33%.
Extracted value: 45.33
Ground

In [26]:
def main():
    ground_truth_file = "mac_ground_truth.json"
    prompts = [
        "How many times did the event with eventid E189 occur?",
        "How many times did the event with eventid E188 occur?",
        "How many times did the event with eventid E120 occur?",
        "How many times did the event with eventid E203 occur?",
        "How many times did the event with eventid E323 occur?",
        "How many times did the event with component kernel occur?",
        "How many times did the event with component com.apple.cts occur?",
        "How many times did the event with component corecaptured occur?",
        "How many times did the event with component QQ occur?",
        "How many times did the event with component Microsoft Word occur?",
        "How many times did the event with the user authorMacBook-Pro occur?",
    ]
    metric_names = ['E189', 'E188', 'E120', 'E203', 'E323', 'kernel', 'com.apple.cts', 'corecaptured', 'QQ', 'Microsoft Word', 'authorMacBook-Pro']

    evaluator = LanguageModelEvaluator()
    results = evaluator.evaluate(ground_truth_file, prompts, metric_names)

    print("Evaluation Results:")
    for metric, result in results.items():
        print(f"{metric}: {result['accuracy']:.2f}% accuracy")

    evaluator.save_results(results)

    return results

# Run the main function
evaluation_results = main()

[STAGE] Try again agent: No
[STAGE] Router Agent
ROUTER AGENT OUTPUT:  Thought: The user question is asking for a specific count of events in the dataframe, which can be achieved through data manipulation using Python and pandas.

Answer: yes.
[STAGE] Router summary agent
ROUTER SUMMARY AGENT OUTPUT:  no. 

the question is asking for a specific count of occurrences, which implies it's looking for a detailed answer rather than a high-level overview.
[STAGE] Pandas AI agent
Prompt: How many times did the event with eventid E189 occur?
Model response: 0


TypeError: cannot use a string pattern on a bytes-like object

In [None]:
from graph_manager import Graph, global_graph
# Creates global graph if not yet created
if global_graph is None:
    graph = Graph.create_graph()
else:
    graph = global_graph

result = graph.run("You are amazing!")

In [19]:
# from language_model_evaluator import LanguageModelEvaluator
# from graph_manager import Graph, global_graph
# import pandas as pd


def main():
    model_name = "pandasai" 
    log_file = "Mac_2k.log_structured.csv"
    ground_truth_file = "mac_ground_truth.json"
    prompts = [
        "How many times did the event with eventid E189 occur?",
        "How many times did the event with eventid E188 occur?",
        "How many times did the event with eventid E120 occur?",
        "How many times did the event with eventid E203 occur?",
        "How many times did the event with eventid E323 occur?",
        "How many times did the event with component kernel occur?",
        "How many times did the event with component com.apple.cts occur?",
        "How many times did the event with component corecaptured occur?",
        "How many times did the event with component QQ occur?",
        "How many times did the event with component Microsoft Word occur?",
        "How many times did the event with the user authorMacBook-Pro occur?",
    ]
    metric_names = ['E189', 'E188', 'E120', 'E203', 'E323', 'kernel', 'com.apple.cts', 'corecaptured', 'QQ', 'Microsoft Word', 'authorMacBook-Pro']

    evaluator = LanguageModelEvaluator(model_name, model_type='pandasai')
    
    # Load the data
    df = pd.read_csv(log_file)
    evaluator.set_pandas_agent(df)
    
    results = evaluator.evaluate(log_file, ground_truth_file, prompts, metric_names)

    print("Evaluation Results:")
    for metric, result in results.items():
        print(f"{metric}: {result['accuracy']:.2f}% accuracy")

    # Graph setup
    global global_graph
    if global_graph is None:
        global_graph = Graph.create_graph()

    # # Use the graph object
    # graph_result = global_graph.run("You are amazing!")
    # print("Graph result:", graph_result)

    return results

In [24]:
# Run the main function
# evaluation_results, graph_output = main()