In [32]:
import torch
print(torch.cuda.is_available())


False


In [19]:
!nvidia-smi


Wed Oct  2 00:14:58 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.99                 Driver Version: 555.99         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1650 Ti   WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   65C    P8              1W /   50W |       0MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [42]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from difflib import SequenceMatcher

# Download necessary NLTK data files
# nltk.download('punkt')

class Evaluator:
    def __init__(self):
        pass

    def evaluate(self, response, reference_answer):
        scores = {}
        scores['Accuracy'] = self.calculate_accuracy(response, reference_answer)
        scores['Relevance'] = self.calculate_relevance(response, reference_answer)
        scores['Clarity'] = self.calculate_clarity(response)
        scores['Conciseness'] = self.calculate_conciseness(response, reference_answer)
        return scores

    def calculate_accuracy(self, response, reference):
        # Use BLEU score as a proxy for accuracy
        reference_tokens = [word_tokenize(reference.lower())]
        response_tokens = word_tokenize(response.lower())
        bleu_score = sentence_bleu(reference_tokens, response_tokens)
        return bleu_score

    def calculate_relevance(self, response, reference):
        # Use sequence matching ratio
        matcher = SequenceMatcher(None, response.lower(), reference.lower())
        relevance_score = matcher.ratio()
        return relevance_score

    def calculate_clarity(self, response):
        # Assume clarity is inversely proportional to average word length
        words = word_tokenize(response)
        if not words:
            return 0
        avg_word_length = sum(len(word) for word in words) / len(words)
        clarity_score = max(0, 1 - (avg_word_length - 4) / 10)
        return clarity_score

    def calculate_conciseness(self, response, reference):
        # Compare lengths of response and reference
        response_length = len(response)
        reference_length = len(reference)
        if reference_length == 0:
            return 0
        length_ratio = response_length / reference_length
        conciseness_score = max(0, 1 - abs(length_ratio - 1))
        return conciseness_score

def load_model_and_tokenizer(model_name, device='cpu'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.to(device)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def generate_responses(model, tokenizer, log_data_list, question_list, device='cpu', max_length=512):
    input_texts = [f"{log_data}\n\nUser Question: {question}\n\nAssistant:" 
                   for log_data, question in zip(log_data_list, question_list)]
    inputs = tokenizer(input_texts, return_tensors='pt', padding=True, truncation=True).to(device)

    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    responses = []
    for i, output in enumerate(outputs):
        response = tokenizer.decode(output, skip_special_tokens=True)
        response = response[len(input_texts[i]):].strip()
        responses.append(response)

    return responses

def main():
    # Device configuration
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Model name (change as needed)
    model_name = 'gpt2'  # Replace with 'Llama-3.1', 'Mistral', etc.

    # Load model and tokenizer
    model, tokenizer = load_model_and_tokenizer(model_name, device)

    # Initialize evaluator
    evaluator = Evaluator()

    # Sample data (replace with your actual log data and questions)
    data = [
        {
            'log_data': '2023-10-02 12:34:56 ERROR 1234 Process failed due to unexpected input.',
            'question': 'What caused this system error?',
            'reference_answer': 'The error was caused by an unexpected input that led the process to fail.'
        },
        {
            'log_data': '2023-10-02 12:35:00 WARNING 5678 Disk space running low on server.',
            'question': 'What does this log entry mean?',
            'reference_answer': 'This log indicates that the disk space on the server is running low.'
        }
        # Add more entries as needed
    ]

    # Collect log data, questions, and reference answers
    log_data_list = [entry['log_data'] for entry in data]
    question_list = [entry['question'] for entry in data]
    reference_answers = [entry['reference_answer'] for entry in data]

    # Generate responses
    responses = generate_responses(model, tokenizer, log_data_list, question_list, device)

    # Evaluate and print results
    for i, response in enumerate(responses):
        reference_answer = reference_answers[i]
        scores = evaluator.evaluate(response, reference_answer)

        print(f"Question: {question_list[i]}")
        print(f"Response: {response}")
        print("Evaluation Scores:")
        for criterion, score in scores.items():
            print(f"  {criterion}: {score:.2f}")
        print("\n")

if __name__ == '__main__':
    main()


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Question: What caused this system error?
Response: A new screen has opened.

User Question: Could there be any further issues, please let me know.

Assistant: I'm running Java 3.

User Question: Could you provide a way to prevent this using Java-X?

Assistant:

Yes. For an alternative Java 3 implementation of Java, please read section "JVM: JRE" in the JRE FAQ

Computer:

I can't figure out any other way to handle the error log for this system log.

Computer: What was the error log?

Computer: Sorry, I don't understand the format. I tried opening the log of the system which looks something like this:

This is an error. The system and system.

"User" is either the user or username.

"System" is either System 2.

If I'm creating new jobs in your background, these are jobs created by your background processes.

"Job" is, after all, the current process in the background. Since all processes are running, this means that the current and all others started in the foreground.

"System" is Syst

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [43]:
import torch
import ollama
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from difflib import SequenceMatcher

class Evaluator:
    def __init__(self):
        pass

    def evaluate(self, response, reference_answer):
        scores = {}
        scores['Accuracy'] = self.calculate_accuracy(response, reference_answer)
        scores['Relevance'] = self.calculate_relevance(response, reference_answer)
        scores['Clarity'] = self.calculate_clarity(response)
        scores['Conciseness'] = self.calculate_conciseness(response, reference_answer)
        return scores

    def calculate_accuracy(self, response, reference):
        # Use BLEU score as a proxy for accuracy
        reference_tokens = [word_tokenize(reference.lower())]
        response_tokens = word_tokenize(response.lower())
        bleu_score = sentence_bleu(reference_tokens, response_tokens)
        return bleu_score

    def calculate_relevance(self, response, reference):
        # Use sequence matching ratio
        matcher = SequenceMatcher(None, response.lower(), reference.lower())
        relevance_score = matcher.ratio()
        return relevance_score

    def calculate_clarity(self, response):
        # Assume clarity is inversely proportional to average word length
        words = word_tokenize(response)
        if not words:
            return 0
        avg_word_length = sum(len(word) for word in words) / len(words)
        clarity_score = max(0, 1 - (avg_word_length - 4) / 10)
        return clarity_score

    def calculate_conciseness(self, response, reference):
        # Compare lengths of response and reference
        response_length = len(response)
        reference_length = len(reference)
        if reference_length == 0:
            return 0
        length_ratio = response_length / reference_length
        conciseness_score = max(0, 1 - abs(length_ratio - 1))
        return conciseness_score

def load_model_and_tokenizer(model_name, model_type='ollama'):
    if model_type == 'ollama':
        return ollama.Client(), None
    else:
        raise NotImplementedError(f"Model type {model_type} not supported")

def generate_responses_ollama(model, log_data_list, question_list):
    responses = []
    for log_data, question in zip(log_data_list, question_list):
        context = f"You are an AI assistant analyzing log files. Here's the log content:\n\n{log_data}\n\nAnswer the following question based on this log."
        response = model.chat(model='llama3.1', messages=[
            {'role': 'system', 'content': context},
            {'role': 'user', 'content': question}
        ])
        responses.append(response['message']['content'].strip())
    return responses

def main():
    # Load Llama 3.1 model using Ollama client
    model, _ = load_model_and_tokenizer('llama3.1')

    # Initialize evaluator
    evaluator = Evaluator()

    # Sample data (replace with your actual log data and questions)
    data = [
        {
            'log_data': '2023-10-02 12:34:56 ERROR 1234 Process failed due to unexpected input.',
            'question': 'What caused this system error?',
            'reference_answer': 'The error was caused by an unexpected input that led the process to fail.'
        },
        {
            'log_data': '2023-10-02 12:35:00 WARNING 5678 Disk space running low on server.',
            'question': 'What does this log entry mean?',
            'reference_answer': 'This log indicates that the disk space on the server is running low.'
        }
    ]

    # Collect log data, questions, and reference answers
    log_data_list = [entry['log_data'] for entry in data]
    question_list = [entry['question'] for entry in data]
    reference_answers = [entry['reference_answer'] for entry in data]

    # Generate responses using Llama 3.1
    responses = generate_responses_ollama(model, log_data_list, question_list)

    # Evaluate and print results
    for i, response in enumerate(responses):
        reference_answer = reference_answers[i]
        scores = evaluator.evaluate(response, reference_answer)

        print(f"Question: {question_list[i]}")
        print(f"Response: {response}")
        print("Evaluation Scores:")
        for criterion, score in scores.items():
            print(f"  {criterion}: {score:.2f}")
        print("\n")

if __name__ == '__main__':
    main()


Question: What caused this system error?
Response: According to the log, the error (Process failed due to unexpected input) was caused by an unexpected input in process ID 1234. This suggests that there was some kind of invalid or unusual data being processed by the system at that time, which resulted in the failure of process 1234.
Evaluation Scores:
  Accuracy: 0.10
  Relevance: 0.34
  Clarity: 0.98
  Conciseness: 0.00


Question: What does this log entry mean?
Evaluation Scores:
  Accuracy: 0.12
  Relevance: 0.31
  Clarity: 0.99
  Conciseness: 0.00




In [2]:
import json
import ollama
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from difflib import SequenceMatcher
from pathlib import Path

class Evaluator:
    def __init__(self):
        pass

    def evaluate(self, response, reference_answer):
        scores = {}
        scores['Accuracy'] = self.calculate_accuracy(response, reference_answer)
        scores['Relevance'] = self.calculate_relevance(response, reference_answer)
        scores['Clarity'] = self.calculate_clarity(response)
        scores['Conciseness'] = self.calculate_conciseness(response, reference_answer)
        return scores

    def calculate_accuracy(self, response, reference):
        # Use BLEU score as a proxy for accuracy
        reference_tokens = [word_tokenize(reference.lower())]
        response_tokens = word_tokenize(response.lower())
        bleu_score = sentence_bleu(reference_tokens, response_tokens)
        # Ensure score is within [0, 1]
        return min(max(bleu_score, 0), 1)

    def calculate_relevance(self, response, reference):
        # Use sequence matching ratio
        matcher = SequenceMatcher(None, response.lower(), reference.lower())
        relevance_score = matcher.ratio()
        # Ensure score is within [0, 1]
        return min(max(relevance_score, 0), 1)

    def calculate_clarity(self, response):
        # Assume clarity is inversely proportional to average word length
        words = word_tokenize(response)
        if not words:
            return 0
        avg_word_length = sum(len(word) for word in words) / len(words)
        # Ensure clarity is bounded between 0 and 1, inversely proportional to word length (ideal word length ~ 4-5)
        clarity_score = max(0, 1 - abs(avg_word_length - 4) / 10)
        return min(max(clarity_score, 0), 1)

    def calculate_conciseness(self, response, reference):
        # Compare lengths of response and reference
        response_length = len(response)
        reference_length = len(reference)
        if reference_length == 0:
            return 0
        length_ratio = response_length / reference_length
        # Ensure conciseness is bounded between 0 and 1, where values close to 1 are ideal
        conciseness_score = max(0, 1 - abs(length_ratio - 1))
        return min(max(conciseness_score, 0), 1)

def load_model_and_tokenizer(model_name, model_type='ollama'):
    if model_type == 'ollama':
        return ollama.Client(), None
    else:
        raise NotImplementedError(f"Model type {model_type} not supported")

def generate_responses_ollama(model, log_data, question_list):
    responses = []
    for question in question_list:
        context = f"You are an AI assistant analyzing log files. Here's the log content:\n\n{log_data}\n\nAnswer the following question based on this log."
        response = model.chat(model='llama3.1', messages=[
            {'role': 'system', 'content': context},
            {'role': 'user', 'content': question}
        ])
        responses.append(response['message']['content'].strip())
    return responses

def read_log_file(log_file_path):
    with open(log_file_path, 'r') as file:
        return file.read()

def load_ground_truth(ground_truth_path):
    with open(ground_truth_path, 'r') as f:
        return json.load(f)

def main():
    # Load Llama 3.1 model using Ollama client
    model, _ = load_model_and_tokenizer('llama3.1')

    # Initialize evaluator
    evaluator = Evaluator()

    # Load the log file content once
    log_file_path = 'sample_log.txt'
    log_data = read_log_file(log_file_path)

    # Load the ground truth answers from the JSON file
    ground_truth_path = 'ground_truth_qualitative.json'
    ground_truth = load_ground_truth(ground_truth_path)

    # List of questions
    question_list = [
        "What was the CPU usage right before the first system crash?",
        "How many times did the system crash?",
        "What was the system doing at 10:00:01?",
        "How many network requests were made between 11:15:18 and 11:45:55?",
        "At what time was the system restarted after the first crash?",
        "What is the average CPU usage recorded in the log?",
        "What was the last recorded event in the log?",
        "What is the pattern in the network requests between 11:15:18 and 11:45:55?"
    ]

    # Generate responses using Llama 3.1
    responses = generate_responses_ollama(model, log_data, question_list)

    # Evaluate and print results
    for i, response in enumerate(responses):
        question = question_list[i]
        reference_answer = ground_truth.get(f"question_{i + 1}", "")  # Fetch the reference answer by question index
        scores = evaluator.evaluate(response, reference_answer)

        print(f"Question: {question}")
        print(f"Response: {response}")
        print(f"Reference Answer: {reference_answer}")
        print("Evaluation Scores:")
        for criterion, score in scores.items():
            print(f"  {criterion}: {score:.2f}")
        print("\n")

if __name__ == '__main__':
    main()




Question: What was the CPU usage right before the first system crash?
Response: According to the log, the CPU usage right before the first system crash (which occurred at 2024-09-25 11:00:00) was reported as 56% at 10:45:12.
Reference Answer: The CPU usage was 56% before the first system crash at 11:00:00.
Evaluation Scores:
  Accuracy: 0.19
  Relevance: 0.55
  Clarity: 0.98
  Conciseness: 0.00


Question: How many times did the system crash?
Response: Based on the log, the system crashed twice:

1. [2024-09-25 10:45:12] - CPU usage spiked to 56% and then a crash was detected
2. [2024-09-25 11:59:59] - Another system crash was detected, likely related to the previous restart attempt
Reference Answer: The system crashed twice: once at 11:00:00 and again at 11:59:59.
Evaluation Scores:
  Accuracy: 0.08
  Relevance: 0.32
  Clarity: 1.00
  Conciseness: 0.00


Question: What was the system doing at 10:00:01?
Response: According to the log, at 10:00:01 the system was "startup". This suggests

In [10]:
import json
import ollama
from pathlib import Path

# Import Prometheus Evaluator LM
from prometheus_eval import PrometheusEval
from prometheus_eval.vllm import VLLM


def load_model_and_tokenizer(model_name, model_type='ollama'):
    if model_type == 'ollama':
        return ollama.Client(), None
    else:
        raise NotImplementedError(f"Model type {model_type} not supported")

def generate_responses_ollama(model, log_data, question_list):
    responses = []
    for question in question_list:
        context = f"You are an AI assistant analyzing log files. Here's the log content:\n\n{log_data}\n\nAnswer the following question based on this log."
        response = model.chat(model='llama3.1', messages=[
            {'role': 'system', 'content': context},
            {'role': 'user', 'content': question}
        ])
        responses.append(response['message']['content'].strip())
    return responses

def read_log_file(log_file_path):
    with open(log_file_path, 'r') as file:
        return file.read()

def load_ground_truth(ground_truth_path):
    with open(ground_truth_path, 'r') as f:
        return json.load(f)

def main():
    # Load Llama 3.1 model using Ollama client for generating responses
    model, _ = load_model_and_tokenizer('llama3.1')

    # Initialize Prometheus Evaluator using VLLM and force it to use CPU
    prometheus_model = VLLM(model="prometheus-eval/prometheus-7b-v2.0", tensor_parallel_size=1, dtype="float32", device="cpu")
    judge = PrometheusEval(model=prometheus_model)

    # Load the log file content once
    log_file_path = 'sample_log.txt'
    log_data = read_log_file(log_file_path)

    # Load the ground truth answers from the JSON file
    ground_truth_path = 'ground_truth_qualitative.json'
    ground_truth = load_ground_truth(ground_truth_path)

    # List of questions
    question_list = [
        "What was the CPU usage right before the first system crash?",
        "How many times did the system crash?",
        "What was the system doing at 10:00:01?",
        "How many network requests were made between 11:15:18 and 11:45:55?",
        "At what time was the system restarted after the first crash?",
        "What is the average CPU usage recorded in the log?",
        "What was the last recorded event in the log?",
        "What is the pattern in the network requests between 11:15:18 and 11:45:55?"
    ]

    # Generate responses using Llama 3.1
    responses = generate_responses_ollama(model, log_data, question_list)

    # Evaluate and print results
    for i, response in enumerate(responses):
        question = question_list[i]
        reference_answer = ground_truth.get(f"question_{i + 1}", "")  # Fetch the reference answer by question index

        # Create a rubric template (or use a predefined rubric)
        score_rubric = """
        Score 1: The response is inaccurate or incomplete.
        Score 2: The response is somewhat relevant but lacks important information.
        Score 3: The response is relevant and covers some of the important details.
        Score 4: The response is relevant and covers most of the important details with clarity.
        Score 5: The response is accurate, relevant, clear, and complete.
        """

        # Use Prometheus Evaluator to evaluate the response
        feedback, score = judge.single_absolute_grade(
            instruction=question,
            response=response,
            reference_answer=reference_answer,
            rubric=score_rubric
        )

        print(f"Question: {question}")
        print(f"Response: {response}")
        print(f"Reference Answer: {reference_answer}")
        print(f"Feedback: {feedback}")
        print(f"Score: {score}")
        print("\n")

if __name__ == '__main__':
    main()



INFO 10-02 01:26:51 config.py:1648] Upcasting torch.bfloat16 to torch.float32.


ValueError: cannot find context for 'fork'

In [3]:
# !pip install prometheus-eval ollama


