In [1]:
import requests
import time
import os
import json
import pickle
from tqdm import tqdm,trange
from openai import OpenAI
import pandas as pd
from typing import List, Tuple, Optional, Dict, Any
from sklearn.metrics import accuracy_score
import demjson3
import re

In [None]:
class SmartJSONExtractor:
    """"""
    
    def __init__(self):
        # 
        self.json_patterns = [
            r'```json\s*\n(.*?)\n```',  # 
            r'```\s*json\s*\n(.*?)\n```',  # 
            r'```\s*\n(.*?)\n```',  # ```
            r'```json(.*?)```',  # 
            r'```(.*?)```',  # 
            r'`(.*?)`',  # 
        ]
        
        # 
        self.object_patterns = [
            r'(\{(?:[^{}]|{[^{}]*})*\})',  # 
            r'(\[(?:[^\[\]]|\[[^\[\]]*\])*\])',  # 
        ]
    
    def extract_json(self, text: str) -> Optional[Dict[Any, Any]]:

        json_obj = self._extract_from_code_blocks(text)
        if json_obj:
            return json_obj

        json_obj = self._extract_from_text(text)
        if json_obj:
            return json_obj
        
        json_obj = self._extract_with_fixes(text)
        if json_obj:
            return json_obj
        
        return None
    
    def _extract_from_code_blocks(self, text: str) -> Optional[Dict[Any, Any]]:

        for pattern in self.json_patterns:
            matches = re.findall(pattern, text, re.DOTALL)
            for match in matches:
                json_obj = self._try_parse_json(match.strip())
                if json_obj:
                    return json_obj
        return None
    
    def _extract_from_text(self, text: str) -> Optional[Dict[Any, Any]]:

        for pattern in self.object_patterns:
            matches = re.findall(pattern, text, re.DOTALL)
            for match in matches:
                json_obj = self._try_parse_json(match.strip())
                if json_obj:
                    return json_obj
        return None
    
    def _extract_with_fixes(self, text: str) -> Optional[Dict[Any, Any]]:

        fixes = [
            lambda x: x.replace('""', '"'),  # 
            lambda x: re.sub(r'(\w+):', r'"\1":', x),  # 
            lambda x: re.sub(r':\s*([^",\[\]{}]+)(?=\s*[,}])', r': "\1"', x),  # 
            lambda x: x.replace("'", '"'),  # 
            lambda x: re.sub(r',\s*}', '}', x),  #
            lambda x: re.sub(r',\s*]', ']', x),  
        ]
        
        for fix in fixes:
            try:
                fixed_text = fix(text)

                json_obj = self._extract_from_code_blocks(fixed_text)
                if json_obj:
                    return json_obj
                json_obj = self._extract_from_text(fixed_text)
                if json_obj:
                    return json_obj
            except:
                continue
        
        return None
    
    def _try_parse_json(self, text: str) -> Optional[Dict[Any, Any]]:

        try:
            return json.loads(text)
        except json.JSONDecodeError:
            return None
    
    def extract_all_json(self, text: str) -> List[Dict[Any, Any]]:

        json_objects = []
        

        for pattern in self.json_patterns:
            matches = re.findall(pattern, text, re.DOTALL)
            for match in matches:
                json_obj = self._try_parse_json(match.strip())
                if json_obj:
                    json_objects.append(json_obj)
        

        for pattern in self.object_patterns:
            matches = re.findall(pattern, text, re.DOTALL)
            for match in matches:
                json_obj = self._try_parse_json(match.strip())
                if json_obj and json_obj not in json_objects:
                    json_objects.append(json_obj)
        
        return json_objects

def extract_categories(text):

    pattern = r'\[([A-Z](?:\.\d+)*)\]'
    

    matches = re.findall(pattern, text)
    
    return matches

def extract_categories_plus(text):

    categories = []

    pattern1 = r'\[([A-Z](?:\.\d+)*)\]'
    matches1 = re.findall(pattern1, text)
    categories.extend(matches1)
    
    pattern2 = r'([A-Z](?:\.\d+)*)(?=\[)'
    matches2 = re.findall(pattern2, text)
    categories.extend(matches2)

    pattern3 = r'([A-Z](?:\.\d+)*)(?=\s+(?![[\]]))'
    matches3 = re.findall(pattern3, text)

    for match in matches3:

        match_pos = text.find(match)
        if match_pos != -1:
            after_match = text[match_pos + len(match):match_pos + len(match) + 1]
            if after_match != '[': 
                categories.append(match)
    

    seen = set()
    unique_categories = []
    for category in categories:
        if category not in seen:
            seen.add(category)
            unique_categories.append(category)
    
    return unique_categories

In [3]:
## analyze the accuracy of filtering issues by LLM

model_list = ["gpt-4o-mini", "deepseek-chat","claude-3-7-sonnet-20250219","o3-2025-04-16","gemini-2.5-flash"]

for model in model_list:
    print(f'Processing {model}...')
    df = pd.read_csv(f'./res/Filteration_{model}.csv')

    pred_labels = []
    # scan each data, if the Accept in LLM_classification,  then append 1 to pred_labels, otherwise append 0
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        if 'Accept' in row['LLM_classification']:
            pred_labels.append(1)
        else:
            pred_labels.append(0)

    # take the 'label' column from df to a list, and caculate the accuracy of the pred_labels, wri
    label_list = df['label'].tolist()
    accuracy = accuracy_score(label_list, pred_labels)

    print(f'Accuracy: {accuracy:.4f}')

Processing gpt-4o-mini...


100%|██████████| 500/500 [00:00<00:00, 29266.13it/s]


Accuracy: 0.7120
Processing deepseek-chat...


100%|██████████| 500/500 [00:00<00:00, 29623.30it/s]


Accuracy: 0.7500
Processing claude-3-7-sonnet-20250219...


100%|██████████| 500/500 [00:00<00:00, 31768.29it/s]

Accuracy: 0.7760
Processing o3-2025-04-16...



100%|██████████| 500/500 [00:00<00:00, 32222.24it/s]


Accuracy: 0.7560
Processing gemini-2.5-flash...


100%|██████████| 500/500 [00:00<00:00, 32021.99it/s]

Accuracy: 0.7840





In [None]:
## analyze the accuracy of labeling bugs by LLM
# extractor = SmartJSONExtractor()

model_list = ["gpt-4o-mini", "deepseek-chat","claude-3-7-sonnet-20250219","o3-2025-04-16","gemini-2.5-flash"]



for model in model_list:
    print(f"Processing labeling results of {model}...")

    symptom_correctness = {
    '0':[],
    '1':[],
    '2':[],
    'leaf':[]
    }

    root_cause_correctness = {
        '0':[],
        '1':[],
        '2':[],
        'leaf':[]
    }
    df = pd.read_csv(f'./res/Labeling_{model}.csv')

    symptom_pred_ids = []
    root_cause_pred_ids = []

    for idx, row in df.iterrows():
        llm_cls = str(row['LLM_classification'])

        split_idx = llm_cls.find('"root_cause"')
        if split_idx == -1:

            symptom_part = ""
            root_cause_part = ""
        else:
            symptom_part = llm_cls[:split_idx]
            root_cause_part = llm_cls[split_idx:]

        if model == 'gemini-2.5-flash':
            symptom_ids = extract_categories_plus(symptom_part)
            root_cause_ids = extract_categories_plus(root_cause_part)
        else:
            symptom_ids = extract_categories(symptom_part)
            root_cause_ids = extract_categories(root_cause_part)

        symptom_ids_sorted = sorted(symptom_ids, key=len)
        root_cause_ids_sorted = sorted(root_cause_ids, key=len)
        symptom_pred_ids.append(symptom_ids_sorted)
        root_cause_pred_ids.append(root_cause_ids_sorted)

        gt_symptom_id = row['symptom_id']
        gt_root_cause_id = row['root_causes_id']

        level_symptom = gt_symptom_id.count('.')
        level_root_cause = gt_root_cause_id.count('.')

        for i in range(len(symptom_ids_sorted)):
            cur_level = int(symptom_ids_sorted[i].count('.'))
            if cur_level >  level_symptom:
                break
            if symptom_ids_sorted[i] in gt_symptom_id:
                symptom_correctness[str(cur_level)].append(1)
            else:
                symptom_correctness[str(cur_level)].append(0)

        for i in range(len(root_cause_ids_sorted)):
            cur_level = root_cause_ids_sorted[i].count('.')
            if cur_level >  level_root_cause:
                break
            if root_cause_ids_sorted[i] in gt_root_cause_id:
                root_cause_correctness[str(cur_level)].append(1)
            else:
                root_cause_correctness[str(cur_level)].append(0)
        
        symptom_correctness['leaf'].append(symptom_correctness[str(level_symptom)][-1])
        root_cause_correctness['leaf'].append(root_cause_correctness[str(level_root_cause)][-1])

    for level in symptom_correctness.keys():
        if len(symptom_correctness[level]) > 0:
            symptom_correctness[level] = sum(symptom_correctness[level]) / len(symptom_correctness[level])
        else:
            symptom_correctness[level] = 'NaN'
    for level in root_cause_correctness.keys():
        if len(root_cause_correctness[level]) > 0:
            root_cause_correctness[level] = sum(root_cause_correctness[level]) / len(root_cause_correctness[level])
        else:
            root_cause_correctness[level] = 'NaN'
    print(f"Symptom correctness: {symptom_correctness}")
    print(f"Root cause correctness: {root_cause_correctness}")
    print(f"--------------------------------")
    





    

Processing labeling results of gpt-4o-mini...
Symptom correctness: {'0': 0.6746666666666666, '1': 0.5260869565217391, '2': 0.543859649122807, 'leaf': 0.5740223463687151}
Root cause correctness: {'0': 0.5493333333333333, '1': 0.34782608695652173, '2': 'NaN', 'leaf': 0.33519553072625696}
--------------------------------
Processing labeling results of deepseek-chat...
Symptom correctness: {'0': 0.7650918635170604, '1': 0.6345291479820628, '2': 0.6090909090909091, 'leaf': 0.6480446927374302}
Root cause correctness: {'0': 0.6150627615062761, '1': 0.4393939393939394, '2': 'NaN', 'leaf': 0.4064245810055866}
--------------------------------
Processing labeling results of claude-3-7-sonnet-20250219...
Symptom correctness: {'0': 0.7568681318681318, '1': 0.5995762711864406, '2': 0.7204301075268817, 'leaf': 0.6730205278592375}
Root cause correctness: {'0': 0.6524926686217009, '1': 0.5070866141732283, '2': 'NaN', 'leaf': 0.47214076246334313}
--------------------------------
Processing labeling resu

  matches3 = re.findall(pattern3, text)
