In [1]:
from typing import Dict

In [6]:
import csv


def generate_csv_metrics(enhanced_summary):
    """Write one CSV row of metrics per benchmark task.

    Args:
        enhanced_summary: Mapping holding a "results" list of per-task dicts and
            a "benchmark_metadata" mapping whose "timestamp" becomes part of the
            output file name.

    Returns:
        The name of the CSV file written to the current working directory, or
        None when there is nothing to export or the export fails.
    """
    task_results = enhanced_summary.get("results")
    if not task_results:
        print("No results to export to CSV")
        return

    run_stamp = enhanced_summary["benchmark_metadata"]["timestamp"]
    csv_filename = f"nlweb_mcp_enhanced_metrics_{run_stamp}.csv"

    # CSV column name -> key read from each result dict (absent keys become 0)
    numeric_columns = {
        "task_completion_rate": "task_completion_rate",
        "precision": "precision",
        "recall": "recall",
        "f1_score": "f1_score",
        "prompt_tokens": "prompt_tokens",
        "completion_tokens": "completion_tokens",
        "execution_time": "execution_time_seconds",
    }
    fieldnames = ["task_category", "task_id", *numeric_columns]

    csv_data = []
    for entry in task_results:
        task_id = entry.get("task_id", "")
        # The category is everything in the task id before the "_Task" marker
        record = {"task_category": task_id.split("_Task")[0], "task_id": task_id}
        for column, source_key in numeric_columns.items():
            record[column] = entry.get(source_key, 0)
        csv_data.append(record)

    try:
        with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(csv_data)

        print(f"📊 CSV metrics exported to: {csv_filename}")
        return csv_filename
    except Exception as e:
        # Best effort: report the problem and signal failure with None
        print(f"Failed to generate CSV: {e}")
        return None

In [7]:
import json

# Load a previously saved benchmark summary from disk (path is relative to the
# notebook's working directory) ...
with open("../results/v1/mcp/gpt4.1/hybrid_execution_history_20250725_104451.json", "r") as f:
    enhanced_summary = json.load(f)

# ... and export its per-task metrics; the returned CSV file name is the cell's output
generate_csv_metrics(enhanced_summary)

📊 CSV metrics exported to: nlweb_mcp_enhanced_metrics_20250725_104451.csv


'nlweb_mcp_enhanced_metrics_20250725_104451.csv'

In [2]:
# 1) Configuration: base URL of each shop, keyed by the placeholder name that
#    fill_urls() substitutes when it appears as "{{KEY}}" in a text
URLS = {
    "URL_1": "https://webmall-1.informatik.uni-mannheim.de",
    "URL_2": "https://webmall-2.informatik.uni-mannheim.de",
    "URL_3": "https://webmall-3.informatik.uni-mannheim.de",
    "URL_4": "https://webmall-4.informatik.uni-mannheim.de"
}


def normalize_url(url: str) -> str:
    """Return *url* in comparison form: trailing slashes stripped, then lowercased."""
    without_trailing_slash = url.rstrip('/')
    return without_trailing_slash.lower()


def fill_urls(text_or_list, urls: Dict[str, str]):
    """Replace every ``{{KEY}}`` placeholder with the matching value from *urls*.

    Args:
        text_or_list: A single string, or a list of strings, containing placeholders.
        urls: Mapping from placeholder name (without braces) to replacement text.

    Returns:
        A new list of substituted strings when a list was passed, otherwise the
        substituted string. Placeholders without an entry in *urls* are left as-is.
    """
    def _substitute(template: str) -> str:
        filled = template
        for placeholder, replacement in urls.items():
            filled = filled.replace("{{" + placeholder + "}}", replacement)
        return filled

    if not isinstance(text_or_list, list):
        return _substitute(text_or_list)
    return [_substitute(item) for item in text_or_list]

In [3]:
def calculation_results(benchmark_solutions, model_solution):
    """
    Score model answers against benchmark answers, task by task.

    Args:
        benchmark_solutions: Sequence with one collection of expected answers per task
            (None counts as an empty collection)
        model_solution: Sequence with one collection of model answers per task
            (None counts as an empty collection)

    Returns:
        dict: task_completion_rate (share of tasks whose answer set matches exactly),
        avg_precision and avg_recall (means over tasks), and f1_score (harmonic mean
        of the two averages). All values are 0.0 for empty input.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    n_tasks = len(benchmark_solutions)
    if n_tasks != len(model_solution):
        raise ValueError("benchmark_solutions and model_solution must have the same length")

    if n_tasks == 0:
        return {
            'task_completion_rate': 0.0,
            'avg_precision': 0.0,
            'avg_recall': 0.0,
            'f1_score': 0.0
        }

    exact_matches = []
    precisions = []
    recalls = []

    for expected, predicted in zip(benchmark_solutions, model_solution):
        expected = set() if expected is None else set(expected)
        predicted = set() if predicted is None else set(predicted)
        n_common = len(expected & predicted)

        # A task counts as completed only when both answer sets are identical
        exact_matches.append(1 if expected == predicted else 0)
        # Precision and recall fall back to 0.0 when their denominator is empty
        precisions.append(n_common / len(predicted) if len(predicted) > 0 else 0.0)
        recalls.append(n_common / len(expected) if len(expected) > 0 else 0.0)

    task_completion_rate = sum(exact_matches) / n_tasks
    avg_precision = sum(precisions) / n_tasks
    avg_recall = sum(recalls) / n_tasks

    # F1 is derived from the averaged precision/recall, guarding against 0/0
    denominator = avg_precision + avg_recall
    f1_score = 2 * (avg_precision * avg_recall) / denominator if denominator > 0 else 0.0

    return {
        'task_completion_rate': task_completion_rate,
        'avg_precision': avg_precision,
        'avg_recall': avg_recall,
        'f1_score': f1_score
    }

In [4]:
import pandas as pd
import json

In [5]:
# Benchmark results JSON loaded by the next cell (path is relative to the notebook's working directory)
log_file = "./rag/v2/results/v2/benchmark_v2_improved_results_20250722_215549.json"

In [6]:
# Load the per-task results. If pandas cannot read the file directly, fall back
# to plain JSON and build the frame from its "results" list.
try:
    df = pd.read_json(log_file)
except Exception:
    with open(log_file, "r") as f:
        data = json.load(f)
    df = pd.DataFrame(data["results"])

# The category is the task id without its trailing "_Task<number>" part
task_category = df["task_id"].str.replace(r'_Task\d+$', '', regex=True)

# Put the category in the first column (replacing any existing "category" column)
df = df.drop(columns="category", errors="ignore")
df.insert(0, "category", task_category)
df.head()


Unnamed: 0,category,task_id,user_task,metrics,parsed_urls,db_urls_found,db_urls_missing,db_coverage,tool_history,total_searches,...,additional_urls,missing_urls,parsed_model_response,model_response,task_category,evaluation_urls,cart_checkout_urls,prompt_tokens,completion_tokens,total_tokens
0,Webmall_Single_Product_Search,Webmall_Single_Product_Search_Task1,\nFind all offers for the AMD Ryzen 9 5900X.\n,"{'task_completion_rate': 1, 'avg_precision': 1...",[https://webmall-1.informatik.uni-mannheim.de/...,[https://webmall-2.informatik.uni-mannheim.de/...,[],1.0,"[{'tool_name': 'search_products', 'tool_args':...",1,...,[],[],[https://webmall-1.informatik.uni-mannheim.de/...,"[\n ""https://webmall-1.informatik.uni-mannhei...",Webmall_Single_Product_Search,[https://webmall-1.informatik.uni-mannheim.de/...,[],15379,110,15489
1,Webmall_Single_Product_Search,Webmall_Single_Product_Search_Task2,\nFind all offers for the Canon EOS R5 Mark II.\n,"{'task_completion_rate': 1, 'avg_precision': 1...",[https://webmall-3.informatik.uni-mannheim.de/...,[https://webmall-2.informatik.uni-mannheim.de/...,[],1.0,"[{'tool_name': 'search_products', 'tool_args':...",1,...,[],[],[https://webmall-3.informatik.uni-mannheim.de/...,"[\n ""https://webmall-3.informatik.uni-mannhei...",Webmall_Single_Product_Search,[https://webmall-3.informatik.uni-mannheim.de/...,[],29566,259,29825
2,Webmall_Single_Product_Search,Webmall_Single_Product_Search_Task3,\nFind all offers for the Samsung Galaxy A25 a...,"{'task_completion_rate': 1, 'avg_precision': 1...",[https://webmall-1.informatik.uni-mannheim.de/...,[https://webmall-1.informatik.uni-mannheim.de/...,[],1.0,"[{'tool_name': 'search_products', 'tool_args':...",2,...,[],[],[https://webmall-1.informatik.uni-mannheim.de/...,"[\n ""https://webmall-1.informatik.uni-mannhei...",Webmall_Single_Product_Search,[https://webmall-1.informatik.uni-mannheim.de/...,[],60841,493,61334
3,Webmall_Single_Product_Search,Webmall_Single_Product_Search_Task4,\nFind all offers for the Asus ROG Ryujin II A...,"{'task_completion_rate': 1, 'avg_precision': 1...",[https://webmall-3.informatik.uni-mannheim.de/...,[https://webmall-3.informatik.uni-mannheim.de/...,[],1.0,"[{'tool_name': 'search_products', 'tool_args':...",1,...,[],[],[https://webmall-3.informatik.uni-mannheim.de/...,"[\n ""https://webmall-3.informatik.uni-mannhei...",Webmall_Single_Product_Search,[https://webmall-3.informatik.uni-mannheim.de/...,[],76235,654,76889
4,Webmall_Single_Product_Search,Webmall_Single_Product_Search_Task5,\nFind all offers for the Kingston 1TB NV2 M.2...,"{'task_completion_rate': 1, 'avg_precision': 1...",[https://webmall-1.informatik.uni-mannheim.de/...,[https://webmall-3.informatik.uni-mannheim.de/...,[],1.0,"[{'tool_name': 'search_products', 'tool_args':...",1,...,[],[],[https://webmall-1.informatik.uni-mannheim.de/...,"[\n ""https://webmall-1.informatik.uni-mannhei...",Webmall_Single_Product_Search,[https://webmall-1.informatik.uni-mannheim.de/...,[],93929,798,94727


In [7]:
task_set = pd.read_json("../task_sets.json")

# Each row's 'tasks' cell holds a list of task dicts; flatten them into one
# record per task with its id and expected answers
task_rows = [
    {'task_id': task['id'], "correct_answer": task['correct_answer']["answers"]}
    for tasks_in_set in task_set['tasks']
    for task in tasks_in_set
]

tasks_df = pd.DataFrame(task_rows)


In [8]:
# Helper to robustly parse a JSON list of URLs from a string, even if there is extra data or formatting issues
def safe_parse_url_list(s):
    """Extract a set of normalized URLs from a model response.

    Parsing is attempted in this order:
      1. *s* is already a list/tuple/set of URLs (e.g. the loader decoded it).
      2. *s* is a string that is a JSON array as a whole.
      3. *s* is a string that contains a JSON array somewhere; the first
         (shortest) ``[...]`` span is tried.

    Args:
        s: Raw model response. Anything that is neither a string nor a
            list-like (None, NaN, numbers, ...) yields an empty set.

    Returns:
        set[str]: URLs passed through normalize_url(). Non-string entries of
        the parsed array are skipped. Empty when nothing could be parsed.
    """
    import re

    def _as_url_set(candidate):
        # Only a JSON array counts as an answer; None means "not a list"
        if not isinstance(candidate, list):
            return None
        return {normalize_url(item) for item in candidate if isinstance(item, str)}

    if isinstance(s, (list, tuple, set)):
        return _as_url_set(list(s))
    if not isinstance(s, str):
        return set()

    # First choice: the whole string is one JSON document
    try:
        urls = _as_url_set(json.loads(s))
        if urls is not None:
            return urls
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        pass

    # Fallback: pull the first JSON array out of surrounding text
    match = re.search(r'(\[.*?\])', s, re.DOTALL)
    if match:
        try:
            urls = _as_url_set(json.loads(match.group(1)))
            if urls is not None:
                return urls
        except ValueError:
            pass

    # Nothing parseable was found
    return set()



In [9]:
# Relies on fill_urls, normalize_url, safe_parse_url_list and URLS from earlier cells.

def _expected_urls(answers):
    """Fill the {{URL_n}} placeholders and normalize each expected URL.

    Model answers are normalized (lowercased, trailing slash removed) when they
    are parsed, so the expected answers must go through the same normalization
    or URLs differing only in case / trailing slash would never match.
    """
    filled = fill_urls(answers, URLS)
    if isinstance(filled, str):
        # A single answer string is treated as a one-element answer list
        filled = [filled]
    return [normalize_url(url) for url in filled]


tasks_df["correct_answer"] = tasks_df["correct_answer"].apply(_expected_urls)

# Build a mapping from task_id to correct_answer for fast lookup
taskid_to_answer = dict(zip(tasks_df["task_id"], tasks_df["correct_answer"]))

# Add the column to df based on the id of the task, handling missing ids gracefully
def get_correct_answer_for_id(task_id):
    """Return the expected answers for *task_id*, or an empty list for unknown ids."""
    return taskid_to_answer.get(task_id, [])

df["correct_answers"] = df["task_id"].apply(get_correct_answer_for_id)

# Parse the model answers into a list of normalized URL strings. The response is
# read from "model_response" when that column exists, otherwise from "raw_response".
# safe_parse_url_list already returns normalized URLs, so no second pass is needed.
response_column = "model_response" if "model_response" in df.columns else "raw_response"
df["parser_model_answers"] = df[response_column].apply(lambda raw: list(safe_parse_url_list(raw)))

In [None]:
# Compute metrics for each row: every task is scored on its own by wrapping its
# expected and parsed answers in single-element lists
metrics = df.apply(lambda row: calculation_results([row["correct_answers"]], [row["parser_model_answers"]]), axis=1)
# Convert metrics (a Series of dicts) to a DataFrame
metrics_df = pd.DataFrame(list(metrics))

# Disabled alternative: compute per-row prompt_tokens and completion_tokens by differencing.
# NOTE(review): presumably meant for logs whose token counts are running totals — confirm before enabling.
#prompt_tokens = df["prompt_tokens"].astype(int).diff().fillna(df["prompt_tokens"].iloc[0]).astype(int)
#completion_tokens = df["completion_tokens"].astype(int).diff().fillna(df["completion_tokens"].iloc[0]).astype(int)



# Prepare DataFrame in the requested order; the df columns are re-indexed from 0
# so they line up row by row with metrics_df
ordered_metrics = pd.DataFrame({
    "category": df["category"].reset_index(drop=True),
    "task_id": df["task_id"].reset_index(drop=True),
    "task_completion_rate": metrics_df["task_completion_rate"],
    "avg_precision": metrics_df["avg_precision"],
    "avg_recall": metrics_df["avg_recall"],
    "f1_score": metrics_df["f1_score"],
    # Disabled alternative: the differenced token counts from above
    #"prompt_tokens": prompt_tokens.reset_index(drop=True),
    #"completion_tokens": completion_tokens.reset_index(drop=True)
    "prompt_tokens": df["prompt_tokens"].reset_index(drop=True),
    "completion_tokens": df["completion_tokens"].reset_index(drop=True)
})

# Side effect: copies the table to the system clipboard
ordered_metrics.to_clipboard()

In [79]:
ordered_metrics

Unnamed: 0,category,task_id,task_completion_rate,avg_precision,avg_recall,f1_score,prompt_tokens,completion_tokens
0,Webmall_Add_To_Cart,Webmall_Add_To_Cart_Task1,0.0,0.0,0.0,0.0,90062,220
1,Webmall_Add_To_Cart,Webmall_Add_To_Cart_Task2,0.0,0.0,0.0,0.0,119197,244
2,Webmall_Add_To_Cart,Webmall_Add_To_Cart_Task3,0.0,0.0,0.0,0.0,92469,236
3,Webmall_Add_To_Cart,Webmall_Add_To_Cart_Task4,0.0,0.0,0.0,0.0,159256,281
4,Webmall_Add_To_Cart,Webmall_Add_To_Cart_Task5,0.0,0.0,0.0,0.0,103487,212
5,Webmall_Add_To_Cart,Webmall_Add_To_Cart_Task6,0.0,0.0,0.0,0.0,99901,244
6,Webmall_Add_To_Cart,Webmall_Add_To_Cart_Task7,0.0,0.0,0.0,0.0,91854,212
7,Webmall_Checkout,Webmall_Checkout_Task1,1.0,1.0,1.0,1.0,2967,83
8,Webmall_Checkout,Webmall_Checkout_Task2,1.0,1.0,1.0,1.0,2993,115
9,Webmall_Checkout,Webmall_Checkout_Task3,1.0,1.0,1.0,1.0,112208,476


In [56]:
# Score each category as a whole by passing all of its tasks to calculation_results.
# Only the two answer columns are selected before apply(), so the grouping column
# is not part of the frames handed to the lambda; this avoids the pandas warning
# about apply() operating on the grouping columns.
category_metrics = df.groupby("category")[["correct_answers", "parser_model_answers"]].apply(
    lambda group: calculation_results(group["correct_answers"], group["parser_model_answers"])
)

# category_metrics is a Series with category as index and dicts as values.
# Convert to DataFrame with dict keys as columns; the index is already named
# "category", so reset_index() yields the "category" column directly.
results_df = pd.DataFrame(list(category_metrics.values), index=category_metrics.index).reset_index()
results_df

  category_metrics = df.groupby("category").apply(


Unnamed: 0,category,task_completion_rate,avg_precision,avg_recall,f1_score
0,Webmall_Add_To_Cart,0.0,0.0,0.0,0.0
1,Webmall_Checkout,1.0,1.0,1.0,1.0


In [57]:
# Check if metrics are in separate columns or nested
has_metrics_column = 'metrics' in df.columns
has_separate_metrics = all(col in df.columns for col in ['accuracy', 'precision', 'recall'])

if has_metrics_column and not has_separate_metrics:
    # Original format with nested metrics: every row carries a dict in 'metrics',
    # so each metric is pulled out of the dicts and averaged per category
    category_metrics = df.groupby('category').agg({
        'metrics': lambda x: {
            'accuracy': x.apply(lambda m: m['accuracy']).mean(),
            'precision': x.apply(lambda m: m['precision']).mean(),
            'recall': x.apply(lambda m: m['recall']).mean(),
            'f1_score': x.apply(lambda m: m['f1_score']).mean()
        },
        'prompt_tokens': 'mean',
        'completion_tokens': 'mean'
    }).reset_index()

    # Calculate completion rate (share of tasks with accuracy = 1.0). Grouping a
    # boolean Series by the category column gives the same result as
    # groupby('category').apply(...) on the full frame without triggering the
    # pandas warning about apply() operating on the grouping columns.
    completion_rates = (
        df['metrics'].apply(lambda m: m['accuracy']).eq(1.0)
        .groupby(df['category'])
        .mean()
        .reset_index()
    )
    completion_rates.columns = ['category', 'completion_rate']

    # Create a DataFrame with separate columns for each metric
    metrics_df = pd.DataFrame([
        {
            'category': row['category'],
            'accuracy': row['metrics']['accuracy'],
            'precision': row['metrics']['precision'],
            'recall': row['metrics']['recall'],
            'f1_score': row['metrics']['f1_score'],
            'prompt_tokens': row['prompt_tokens'],
            'completion_tokens': row['completion_tokens']
        }
        for _, row in category_metrics.iterrows()
    ])

elif has_separate_metrics:
    # New format with separate metric columns
    agg_dict = {
        'accuracy': 'mean',
        'precision': 'mean',
        'recall': 'mean',
        'prompt_tokens': 'mean',
        'completion_tokens': 'mean'
    }

    # Average f1_score if it exists; otherwise it is derived further below
    if 'f1_score' in df.columns:
        agg_dict['f1_score'] = 'mean'

    # Add total_tokens if it exists
    if 'total_tokens' in df.columns:
        agg_dict['total_tokens'] = 'mean'

    category_metrics = df.groupby('category').agg(agg_dict).reset_index()

    # Calculate f1_score from the averaged precision/recall if it doesn't exist
    if 'f1_score' not in category_metrics.columns:
        category_metrics['f1_score'] = 2 * (category_metrics['precision'] * category_metrics['recall']) / (category_metrics['precision'] + category_metrics['recall'])
        category_metrics['f1_score'] = category_metrics['f1_score'].fillna(0)  # 0/0 yields NaN

    # Calculate completion rate (share of tasks with accuracy = 1.0); see the
    # nested branch for why this is not done with groupby(...).apply(...)
    completion_rates = (
        df['accuracy'].eq(1.0)
        .groupby(df['category'])
        .mean()
        .reset_index()
    )
    completion_rates.columns = ['category', 'completion_rate']

    metrics_df = category_metrics
else:
    raise ValueError("Could not find metrics in expected format")

# Add NLWeb token usage if available: one extra set of columns per category,
# merged into metrics_df at the end of this block
if "nlweb_token_usage" in df.columns:
    # Get all unique models across all categories
    all_models = set()
    for _, row in df.iterrows():
        if 'nlweb_token_usage' in row and 'by_model' in row['nlweb_token_usage']:
            all_models.update(row['nlweb_token_usage']['by_model'].keys())

    # Calculate average NLWeb token usage by category
    nlweb_data = []
    for category in metrics_df['category']:
        category_data = df[df['category'] == category]

        # Initialize row data with per-task means; a missing 'summary' or
        # missing counter counts as 0
        row_data = {
            'category': category,
            'nlweb_total_calls': category_data['nlweb_token_usage'].apply(lambda n: n.get('total_calls', 0)).mean(),
            'nlweb_total_prompt_tokens': category_data['nlweb_token_usage'].apply(lambda n: n.get('summary', {}).get('total_prompt_tokens', 0)).mean(),
            'nlweb_total_completion_tokens': category_data['nlweb_token_usage'].apply(lambda n: n.get('summary', {}).get('total_completion_tokens', 0)).mean(),
            'nlweb_total_tokens': category_data['nlweb_token_usage'].apply(lambda n: n.get('summary', {}).get('total_tokens', 0)).mean()
        }

        # Add individual model token usage
        for model in all_models:
            model_prompt_tokens = []
            model_completion_tokens = []
            model_total_tokens = []
            model_call_count = []

            # Collect the counters only from tasks whose usage lists this model
            for _, task_row in category_data.iterrows():
                if ('nlweb_token_usage' in task_row and 
                    'by_model' in task_row['nlweb_token_usage'] and 
                    model in task_row['nlweb_token_usage']['by_model']):

                    model_data = task_row['nlweb_token_usage']['by_model'][model]
                    model_prompt_tokens.append(model_data.get('prompt_tokens', 0))
                    model_completion_tokens.append(model_data.get('completion_tokens', 0))
                    model_total_tokens.append(model_data.get('total_tokens', 0))
                    model_call_count.append(model_data.get('call_count', 0))

            # Use the requested column naming convention: '-' and '.' in the
            # model name become '_'
            model_name_clean = model.replace('-', '_').replace('.', '_')
            if model_prompt_tokens:  # Only add if we have data for this model
                # Averages are taken over the tasks that used the model, not
                # over all tasks of the category
                row_data[f'nl_web_usage_{model_name_clean}_prompt'] = sum(model_prompt_tokens) / len(model_prompt_tokens)
                row_data[f'nl_web_usage_{model_name_clean}_output'] = sum(model_completion_tokens) / len(model_completion_tokens)
                row_data[f'nl_web_usage_{model_name_clean}_total'] = sum(model_total_tokens) / len(model_total_tokens)
                row_data[f'nl_web_usage_{model_name_clean}_calls'] = sum(model_call_count) / len(model_call_count)
            else:
                # Model never used in this category: report zeros so every
                # category row has the same columns
                row_data[f'nl_web_usage_{model_name_clean}_prompt'] = 0
                row_data[f'nl_web_usage_{model_name_clean}_output'] = 0
                row_data[f'nl_web_usage_{model_name_clean}_total'] = 0
                row_data[f'nl_web_usage_{model_name_clean}_calls'] = 0

        nlweb_data.append(row_data)

    # Create NLWeb DataFrame and merge with main metrics
    nlweb_df = pd.DataFrame(nlweb_data)
    metrics_df = metrics_df.merge(nlweb_df, on='category', how='left')

# Attach each category's completion rate to the summary table
metrics_df = pd.merge(metrics_df, completion_rates, on='category')

# Persist the per-category summary as CSV in the working directory
metrics_df.to_csv('metrics_df.csv', index=False)

  completion_rates = df.groupby('category').apply(


In [58]:
metrics_df

Unnamed: 0,category,accuracy,precision,recall,f1_score,prompt_tokens,completion_tokens,completion_rate
0,Webmall_Add_To_Cart,0.928571,0.809524,0.928571,0.852381,108032.285714,235.571429,0.857143
1,Webmall_Checkout,1.0,1.0,1.0,1.0,106614.5,384.625,1.0


In [18]:
def _sum_usage_columns(frame, suffix):
    """Add up each `nl_web_usage_<model><suffix>` column of *frame* over its rows.

    Returns a dict keyed by the model part of the column name.
    """
    totals = {}
    for _, usage_row in frame.iterrows():
        for column in frame.columns:
            if column.startswith('nl_web_usage_') and column.endswith(suffix):
                model_name = column.replace('nl_web_usage_', '').replace(suffix, '')
                totals[model_name] = totals.get(model_name, 0) + usage_row[column]
    return totals


def _print_usage(heading, totals):
    """Print *heading* followed by one formatted line per model."""
    print(heading)
    for model, total_tokens in totals.items():
        print(f"{model}: {total_tokens:,.0f} tokens")


# Sum up the total tokens for each model across all categories.
# NOTE(review): the summed columns hold per-category averages, so these figures
# are sums of averages rather than raw token totals — confirm this is intended.
total_tokens_by_model = _sum_usage_columns(metrics_df, '_total')
_print_usage("Total tokens used by each model across all categories:", total_tokens_by_model)

# Sum up the input and output tokens for each model across all categories
total_input_tokens_by_model = _sum_usage_columns(metrics_df, '_prompt')
total_output_tokens_by_model = _sum_usage_columns(metrics_df, '_output')

_print_usage("\nTotal input tokens used by each model across all categories:", total_input_tokens_by_model)
_print_usage("\nTotal output tokens used by each model across all categories:", total_output_tokens_by_model)


Total tokens used by each model across all categories:
gpt_4_1_mini: 581,329 tokens
gpt_4_1: 14,576 tokens

Total input tokens used by each model across all categories:
gpt_4_1_mini: 518,246 tokens
gpt_4_1: 12,763 tokens

Total output tokens used by each model across all categories:
gpt_4_1_mini: 63,083 tokens
gpt_4_1: 1,812 tokens


In [59]:
# Generate a tool call statistics table
import json

# A tool output counts as successful when any of these keys holds a non-empty value
RESULT_KEYS = ('products', 'items', 'ecommerce_items', 'products_stock_info')
TOOL_STATS_COLUMNS = ['tool_name', 'mcp_server', 'count', 'successful_results', 'empty_results']


def _tool_call_has_results(tool_call):
    """Return True when the call's output contains a non-empty result collection.

    The output may be a JSON string or an already decoded dict. Anything else
    (missing output, invalid JSON, a non-object JSON value) counts as empty.
    """
    output = tool_call.get('tool_output')
    if isinstance(output, str):
        try:
            output = json.loads(output)
        except ValueError:  # invalid JSON
            return False
    if not isinstance(output, dict):
        return False
    return any(output.get(key) for key in RESULT_KEYS)


# Extract all tool calls from all tasks
all_tool_calls = [call for task_calls in df['tool_calls'] for call in task_calls]

# Count tool usage by name and server
tool_stats = {}
for tool_call in all_tool_calls:
    tool_name = tool_call['tool_name']
    mcp_server = tool_call['mcp_server']

    key = f"{tool_name} ({mcp_server})"
    stats = tool_stats.setdefault(key, {
        'tool_name': tool_name,
        'mcp_server': mcp_server,
        'count': 0,
        'successful_results': 0,
        'empty_results': 0
    })

    stats['count'] += 1
    if _tool_call_has_results(tool_call):
        stats['successful_results'] += 1
    else:
        stats['empty_results'] += 1

# Create DataFrame for tool statistics; the explicit column list keeps the
# table well-formed (and sortable) even when there are no tool calls at all
tool_stats_df = pd.DataFrame(list(tool_stats.values()), columns=TOOL_STATS_COLUMNS)
tool_stats_df = tool_stats_df.sort_values('count', ascending=False)


print("Tool Call Statistics:")
print(tool_stats_df.to_string(index=False))

# Display summary
print(f"\nTotal tool calls: {len(all_tool_calls)}")
print(f"Unique tools used: {len(tool_stats_df)}")


Tool Call Statistics:
            tool_name                   mcp_server  count  successful_results  empty_results
        ask_webmall_4    WebMall-4 (Hardware Cafe)     10                   0             10
        ask_webmall_2         WebMall-2 (TechTalk)     10                   0             10
        ask_webmall_1 WebMall-1 (E-Store Athletes)      9                   0              9
add_to_cart_webmall_4    WebMall-4 (Hardware Cafe)      9                   0              9
add_to_cart_webmall_1 WebMall-1 (E-Store Athletes)      9                   0              9
        ask_webmall_3       WebMall-3 (CamelCases)      8                   0              8
add_to_cart_webmall_2         WebMall-2 (TechTalk)      8                   0              8
add_to_cart_webmall_3       WebMall-3 (CamelCases)      7                   0              7
   checkout_webmall_4    WebMall-4 (Hardware Cafe)      7                   0              7
   checkout_webmall_2         WebMall-2 (TechTal

In [5]:
tool_stats_df

Unnamed: 0,tool_name,mcp_server,count,successful_results,empty_results
0,ask_webmall_1,WebMall-1 (E-Store Athletes),74,0,74
1,ask_webmall_2,WebMall-2 (TechTalk),74,0,74
2,ask_webmall_3,WebMall-3 (CamelCases),74,0,74
3,ask_webmall_4,WebMall-4 (Hardware Cafe),74,0,74
5,get_product_webmall_3,WebMall-3 (CamelCases),3,0,3
4,get_product_webmall_1,WebMall-1 (E-Store Athletes),2,0,2


In [6]:
len(all_tool_calls)

301

# Error Analysis

This section analyzes three types of errors across different evaluation approaches:
1. Cases where the retrieval system found all correct solutions but the model did not return all of them
2. Cases where the model found additional (incorrect) solutions
3. Cases where the retrieval system didn't retrieve all correct solutions

The analysis adapts to different data formats and metrics available (recall vs RAG efficiency).

In [8]:
# Error Analysis Function - Adapts to different data formats
def analyze_errors(df):
    """
    Detect the evaluation format of *df* and run the matching error analysis.

    The format is chosen from the columns present, in this order of priority:
    'rag_coverage' (RAG), 'mcp_metrics' (MCP), a dict-valued 'metrics' column or
    separate 'recall'/'precision' columns (standard), otherwise a basic analysis.

    Args:
        df: Results DataFrame with one row per task.

    Returns:
        Whatever the selected analysis function returns.
    """
    columns = df.columns

    # Check available columns to determine data format
    has_rag_coverage = 'rag_coverage' in columns
    has_mcp_metrics = 'mcp_metrics' in columns
    # The first row decides whether 'metrics' is nested; an empty frame has no
    # first row, so it is treated as not nested instead of raising IndexError
    has_nested_metrics = (
        'metrics' in columns
        and len(df) > 0
        and isinstance(df['metrics'].iloc[0], dict)
    )
    has_separate_metrics = 'recall' in columns and 'precision' in columns

    print("=== ERROR ANALYSIS ===")
    print("Data format detected: ", end="")

    if has_rag_coverage:
        print("RAG-based evaluation")
        return analyze_rag_errors(df)
    elif has_mcp_metrics:
        print("MCP-based evaluation")
        return analyze_mcp_errors(df)
    elif has_nested_metrics or has_separate_metrics:
        print("Standard metrics evaluation")
        return analyze_standard_errors(df)
    else:
        print("Unknown format - attempting basic analysis")
        return analyze_basic_errors(df)

def analyze_rag_errors(df):
    """Print a three-part error report for RAG-format results.

    Returns:
        tuple: (case1, case2, case3) DataFrames — tasks with full RAG coverage
        but imperfect accuracy, tasks with additional URLs, and tasks with
        incomplete RAG coverage.
    """
    separator = "-" * 60

    def _task_text(task):
        # The task text is read from 'task', falling back to 'user_task'
        return task.get('task', task.get('user_task', 'N/A'))

    print("\n=== CASE 1: RAG found all solutions but model missed some ===")

    full_coverage = df['rag_coverage'] == 1.0
    imperfect_accuracy = df['accuracy'] < 1.0
    case1 = df[full_coverage & imperfect_accuracy]
    print(f"Total cases: {len(case1)}")

    if len(case1) > 0:
        print("\nDetailed analysis:")
        # Every matching task is listed here (no row limit)
        for _, task in case1.iterrows():
            print(f"\nTask ID: {task['task_id']}")
            print(f"Task: {_task_text(task)[:100]}...")
            print(f"RAG Coverage: {task['rag_coverage']:.0%} | Model Accuracy: {task['accuracy']:.0%}")
            print(f"Missing URLs: {task['missing_urls']}")
            print(separator)

    print("\n=== CASE 2: Model found additional (incorrect) solutions ===")

    has_extra_urls = df['additional_urls'].apply(lambda urls: len(urls) > 0)
    case2 = df[has_extra_urls]
    print(f"Total cases: {len(case2)}")

    if len(case2) > 0:
        print("\nDetailed analysis:")
        # Only the first three matching tasks are shown
        for _, task in case2.head(3).iterrows():
            print(f"\nTask ID: {task['task_id']}")
            print(f"Task: {_task_text(task)[:100]}...")
            print(f"Accuracy: {task['accuracy']:.0%} | Precision: {task['precision']:.0%}")
            print(f"Additional URLs: {len(task['additional_urls'])}")
            print(separator)

    print("\n=== CASE 3: RAG didn't retrieve all correct solutions ===")

    case3 = df[df['rag_coverage'] < 1.0]
    print(f"Total cases: {len(case3)}")

    if len(case3) > 0:
        print("\nBreakdown by category:")
        # Per category: number of affected tasks, mean coverage, mean accuracy
        aggregations = {
            'rag_coverage': ['count', 'mean'],
            'accuracy': 'mean'
        }
        category_stats = case3.groupby('category').agg(aggregations).round(2)
        print(category_stats)

    return case1, case2, case3

def analyze_mcp_errors(df):
    """Print a three-part error report for MCP-format results.

    Expects dict-valued 'mcp_metrics' and 'metrics' columns. As a side effect,
    the columns 'mcp_recall' and 'mcp_precision' are added to *df* itself.

    Returns:
        tuple: (case1, case2, case3) DataFrames — tasks with full MCP recall but
        imperfect accuracy, tasks with additional URLs, and tasks with
        incomplete MCP recall.
    """
    def _task_text(row):
        # Same lookup as analyze_rag_errors: prefer 'task', fall back to
        # 'user_task', so frames that only carry 'user_task' do not print N/A
        return row.get('task', row.get('user_task', 'N/A'))

    print("\n=== CASE 1: MCP found all solutions but model missed some ===")

    # Calculate MCP efficiency (recall from MCP retrieval); missing keys count as 0
    df['mcp_recall'] = df['mcp_metrics'].apply(lambda x: x.get('recall', 0))
    df['mcp_precision'] = df['mcp_metrics'].apply(lambda x: x.get('precision', 0))

    # Filter for cases where MCP found everything but model didn't
    case1 = df[(df['mcp_recall'] == 1.0) & (df['metrics'].apply(lambda x: x.get('accuracy', 0)) < 1.0)]
    print(f"Total cases: {len(case1)}")

    if len(case1) > 0:
        print("\nDetailed analysis:")
        # Every matching task is listed here (no row limit)
        for idx, row in case1.iterrows():
            print(f"\nTask ID: {row['task_id']}")
            print(f"Task: {_task_text(row)[:100]}...")
            print(f"MCP Recall: {row['mcp_recall']:.0%} | Model Accuracy: {row['metrics']['accuracy']:.0%}")
            print(f"Missing URLs: {row['missing_urls']}")
            print(f"Model response: {row['raw_response']}")
            print("-" * 60)

    print("\n=== CASE 2: Model found additional (incorrect) solutions ===")

    # Filter for cases where model returned extra URLs
    case2 = df[df['additional_urls'].apply(lambda x: len(x) > 0)]
    print(f"Total cases: {len(case2)}")

    if len(case2) > 0:
        print("\nDetailed analysis:")
        # Only the first three matching tasks are shown
        for idx, row in case2.head(3).iterrows():
            print(f"\nTask ID: {row['task_id']}")
            print(f"Task: {_task_text(row)[:100]}...")
            print(f"Accuracy: {row['metrics']['accuracy']:.0%} | Precision: {row['metrics']['precision']:.0%}")
            print(f"Additional URLs: {len(row['additional_urls'])}")
            print("-" * 60)

    print("\n=== CASE 3: MCP didn't retrieve all correct solutions ===")

    # Filter for cases where MCP missed some correct answers
    case3 = df[df['mcp_recall'] < 1.0]
    print(f"Total cases: {len(case3)}")

    if len(case3) > 0:
        print("\nBreakdown by category:")
        # Per category: number of affected tasks, mean MCP recall, mean accuracy
        category_stats = case3.groupby('category').agg({
            'mcp_recall': ['count', 'mean'],
            'metrics': lambda x: x.apply(lambda m: m.get('accuracy', 0)).mean()
        }).round(2)
        print(category_stats)

    return case1, case2, case3

def analyze_standard_errors(df):
    """Analyze errors for results stored in the standard metrics format.

    Metrics are read from flat ``recall`` / ``accuracy`` / ``precision``
    columns when a ``recall`` column exists; otherwise they are read from the
    per-row ``metrics`` dict, with missing keys counted as 0.

    Args:
        df: DataFrame of task results. The code reads ``task_id``,
            ``category`` and ``additional_urls``; ``task`` and
            ``missing_urls`` are optional and printed as 'N/A' when absent.

    Returns:
        Tuple ``(case1, case2, case3)`` of row subsets of ``df``:
        perfect recall with imperfect accuracy, rows with additional URLs,
        and rows with imperfect recall.
    """
    print("\n=== CASE 1: Perfect recall but imperfect accuracy ===")

    # Get metric values (flat columns or nested in the 'metrics' dict)
    if 'recall' in df.columns:
        recall_values = df['recall']
        accuracy_values = df['accuracy']
        precision_values = df['precision']
    else:
        recall_values = df['metrics'].apply(lambda x: x.get('recall', 0))
        accuracy_values = df['metrics'].apply(lambda x: x.get('accuracy', 0))
        precision_values = df['metrics'].apply(lambda x: x.get('precision', 0))

    # Work on positional arrays from here on. iterrows() yields index
    # *labels*, so looking metrics up by position with those labels only
    # works for a default 0..n-1 index.
    recall_arr = recall_values.to_numpy(dtype=float)
    accuracy_arr = accuracy_values.to_numpy(dtype=float)
    precision_arr = precision_values.to_numpy(dtype=float)

    # Filter for cases where recall is perfect but accuracy isn't
    case1_mask = (recall_arr == 1.0) & (accuracy_arr < 1.0)
    case1 = df[case1_mask]
    print(f"Total cases: {len(case1)}")

    if len(case1) > 0:
        print("\nDetailed analysis:")
        # zip stops after the three previewed rows; the masked arrays are in
        # the same row order as case1.
        case1_preview = zip(
            case1.head(3).iterrows(),
            recall_arr[case1_mask],
            accuracy_arr[case1_mask],
        )
        for (_, row), recall, accuracy in case1_preview:
            print(f"\nTask ID: {row['task_id']}")
            print(f"Task: {row.get('task', 'N/A')[:100]}...")
            print(f"Recall: {recall:.0%} | Accuracy: {accuracy:.0%}")
            print(f"Missing URLs: {row.get('missing_urls', 'N/A')}")
            print("-" * 60)

    print("\n=== CASE 2: Model found additional (incorrect) solutions ===")

    # Filter for cases where model returned extra URLs
    case2_mask = df['additional_urls'].apply(lambda x: len(x) > 0).to_numpy(dtype=bool)
    case2 = df[case2_mask]
    print(f"Total cases: {len(case2)}")

    if len(case2) > 0:
        print("\nDetailed analysis:")
        case2_preview = zip(
            case2.head(3).iterrows(),
            accuracy_arr[case2_mask],
            precision_arr[case2_mask],
        )
        for (_, row), accuracy, precision in case2_preview:
            print(f"\nTask ID: {row['task_id']}")
            print(f"Task: {row.get('task', 'N/A')[:100]}...")
            print(f"Accuracy: {accuracy:.0%} | Precision: {precision:.0%}")
            print(f"Additional URLs: {len(row['additional_urls'])}")
            print("-" * 60)

    print("\n=== CASE 3: Imperfect recall ===")

    # Filter for cases where recall is imperfect
    case3_mask = recall_arr < 1.0
    case3 = df[case3_mask]
    print(f"Total cases: {len(case3)}")

    if len(case3) > 0:
        print("\nBreakdown by category:")
        # Attach the already-extracted metrics as real columns so the same
        # aggregation works for both the flat and the nested-'metrics' layout
        # (the nested layout has no 'recall'/'accuracy' columns to aggregate).
        category_stats = (
            case3.assign(
                recall=recall_arr[case3_mask],
                accuracy=accuracy_arr[case3_mask],
            )
            .groupby('category')
            .agg({'task_id': 'count', 'recall': 'mean', 'accuracy': 'mean'})
            .round(2)
        )
        print(category_stats)

    return case1, case2, case3

def analyze_basic_errors(df):
    """Fallback error analysis for result frames in an unrecognised format.

    Prints the column names, then the number of rows with a non-empty
    ``additional_urls`` / ``missing_urls`` entry for whichever of those
    columns is present.

    Args:
        df: DataFrame of task results.

    Returns:
        Always ``(None, None, None)``; the counts are only printed.
    """
    print("\nPerforming basic error analysis...")

    # List whatever columns the frame offers
    column_names = df.columns
    print(f"Available columns: {column_names.tolist()}")

    def _rows_with_urls(column):
        # Number of rows whose list-like entry in `column` has at least one item
        has_urls = df[column].apply(lambda urls: len(urls) > 0)
        return len(df[has_urls])

    if 'additional_urls' in column_names:
        print(f"Tasks with additional URLs: {_rows_with_urls('additional_urls')}")

    if 'missing_urls' in column_names:
        print(f"Tasks with missing URLs: {_rows_with_urls('missing_urls')}")

    return None, None, None

# Run the analysis. The three results are kept as notebook-level variables
# because the summary cell below passes them to print_error_summary.
case1, case2, case3 = analyze_errors(df)

=== ERROR ANALYSIS ===
Data format detected: MCP-based evaluation

=== CASE 1: MCP found all solutions but model missed some ===
Total cases: 15

Detailed analysis:

Task ID: Webmall_Single_Product_Search_Task3
Task: \nFind all offers for the Samsung Galaxy A25 and A35.\n...
MCP Recall: 100% | Model Accuracy: 75%
Missing URLs: ['https://webmall-2.informatik.uni-mannheim.de/product/samsung-galaxy-a35-5g-a356b-dual-sim-256gb-awesome-iceblue-android-14-smartphone']
Model response: ["https://webmall-1.informatik.uni-mannheim.de/product/smartphone-samsung-galaxy-a25-5g-6-128gb-yellow-sm-a256bzydeue/","https://webmall-2.informatik.uni-mannheim.de/product/samsung-galaxy-a25-5g-a256b-dual-sim-128gb-blue-black-android-14-0-smartphone/","https://webmall-2.informatik.uni-mannheim.de/product/samsung-galaxy-a25-5g-a256b-dual-sim-128gb-blue-android-14-0-smartphone/","https://webmall-3.informatik.uni-mannheim.de/product/samsung-galaxy-a35-5g-a356b-dual-sim-256gb-awesome-iceblue-android-14-smartphone/

In [20]:
# Summary statistics for error analysis
def print_error_summary(df, case1, case2, case3):
    """Print summary statistics for error analysis.

    Args:
        df: DataFrame with one row per analyzed task. Accuracy and precision
            are read from flat ``accuracy`` / ``precision`` columns when both
            exist, otherwise from the per-row ``metrics`` dict.
        case1: Rows with perfect retrieval but an imperfect model answer, or
            None when that case was not computed.
        case2: Rows where the model added incorrect URLs, or None.
        case3: Rows with imperfect retrieval, or None.

    Returns:
        None. Everything is printed.
    """
    print("\n=== ERROR ANALYSIS SUMMARY ===")

    total_tasks = len(df)

    def _pct(count):
        # An empty result frame would otherwise raise ZeroDivisionError.
        return count / total_tasks * 100 if total_tasks else 0.0

    # Handle case where cases might be None (for basic analysis)
    case1_count = len(case1) if case1 is not None else 0
    case2_count = len(case2) if case2 is not None else 0
    case3_count = len(case3) if case3 is not None else 0

    print(f"Total tasks analyzed: {total_tasks}")
    print(f"\nError breakdown:")
    print(f"Case 1 (Perfect retrieval, imperfect model): {case1_count} ({_pct(case1_count):.1f}%)")
    print(f"Case 2 (Model added incorrect URLs): {case2_count} ({_pct(case2_count):.1f}%)")
    print(f"Case 3 (Imperfect retrieval): {case3_count} ({_pct(case3_count):.1f}%)")

    # Calculate overlaps if possible (rows are matched by index label)
    if case1 is not None and case2 is not None:
        case1_and_2 = len(set(case1.index).intersection(set(case2.index)))
        print(f"\nError overlaps:")
        print(f"Tasks with both Case 1 & 2 errors: {case1_and_2}")

        if case3 is not None:
            case2_and_3 = len(set(case2.index).intersection(set(case3.index)))
            print(f"Tasks with both Case 2 & 3 errors: {case2_and_3}")

    # Perfect performance analysis
    try:
        # Try different ways to access accuracy and precision
        if 'accuracy' in df.columns and 'precision' in df.columns:
            perfect = len(df[(df['accuracy'] == 1.0) & (df['precision'] == 1.0)])
        elif 'metrics' in df.columns:
            perfect = len(df[
                (df['metrics'].apply(lambda x: x.get('accuracy', 0)) == 1.0) &
                (df['metrics'].apply(lambda x: x.get('precision', 0)) == 1.0)
            ])
        else:
            perfect = 0
    except (AttributeError, KeyError, TypeError, ValueError):
        # Only data-shape problems are treated as "format not supported";
        # anything else (e.g. KeyboardInterrupt) propagates.
        print("\nPerfect performance calculation not available for this data format")
    else:
        print(f"\nPerfect performance (100% accuracy & precision): {perfect} ({_pct(perfect):.1f}%)")

# Print the summary for df and the three case results computed by the
# error-analysis cell above
print_error_summary(df, case1, case2, case3)


=== ERROR ANALYSIS SUMMARY ===
Total tasks analyzed: 91

Error breakdown:
Case 1 (Perfect retrieval, imperfect model): 15 (16.5%)
Case 2 (Model added incorrect URLs): 52 (57.1%)
Case 3 (Imperfect retrieval): 29 (31.9%)

Error overlaps:
Tasks with both Case 1 & 2 errors: 14
Tasks with both Case 2 & 3 errors: 20

Perfect performance (100% accuracy & precision): 29 (31.9%)


In [None]:
# Additional Analysis: MCP vs Model Performance Comparison
def analyze_mcp_vs_model_performance(df):
    """
    Compare MCP retrieval performance vs model performance.
    Only available for datasets with MCP metrics.

    Args:
        df: DataFrame of task results. Reads ``category`` plus the per-row
            dicts in ``mcp_metrics`` and ``metrics``; a missing
            ``precision`` / ``recall`` key counts as 0.

    Returns:
        None. The comparison is printed. Prints a notice and returns early
        when ``mcp_metrics`` is absent or the frame has no rows.
    """
    if 'mcp_metrics' not in df.columns:
        print("MCP metrics not available in this dataset")
        return

    total_tasks = len(df)
    if total_tasks == 0:
        # The percentage lines below divide by the task count.
        print("No tasks available for MCP vs model comparison")
        return

    print("\n=== MCP VS MODEL PERFORMANCE COMPARISON ===")

    # Extract metrics
    mcp_precision = df['mcp_metrics'].apply(lambda x: x.get('precision', 0))
    mcp_recall = df['mcp_metrics'].apply(lambda x: x.get('recall', 0))
    model_precision = df['metrics'].apply(lambda x: x.get('precision', 0))
    model_recall = df['metrics'].apply(lambda x: x.get('recall', 0))

    # Overall comparison
    print(f"MCP Retrieval vs Model Performance:")
    print(f"  MCP Precision: {mcp_precision.mean():.3f} | Model Precision: {model_precision.mean():.3f}")
    print(f"  MCP Recall: {mcp_recall.mean():.3f} | Model Recall: {model_recall.mean():.3f}")

    # Category-wise comparison: average the already-extracted numeric metrics
    # per category instead of returning dicts from the aggregation.
    print(f"\nCategory-wise comparison:")
    category_means = (
        df[['category']]
        .assign(
            mcp_precision=mcp_precision.to_numpy(),
            mcp_recall=mcp_recall.to_numpy(),
            model_precision=model_precision.to_numpy(),
            model_recall=model_recall.to_numpy(),
        )
        .groupby('category')
        .mean()
    )

    for category, means in category_means.iterrows():
        print(f"\n{category}:")
        print(f"  MCP - Precision: {means['mcp_precision']:.3f}, Recall: {means['mcp_recall']:.3f}")
        print(f"  Model - Precision: {means['model_precision']:.3f}, Recall: {means['model_recall']:.3f}")

    # Cases where MCP performed better than model
    mcp_better_precision = df[mcp_precision > model_precision]
    mcp_better_recall = df[mcp_recall > model_recall]

    print(f"\nMCP advantages:")
    print(f"  Tasks where MCP had better precision: {len(mcp_better_precision)} ({len(mcp_better_precision)/total_tasks*100:.1f}%)")
    print(f"  Tasks where MCP had better recall: {len(mcp_better_recall)} ({len(mcp_better_recall)/total_tasks*100:.1f}%)")

# Run MCP vs Model analysis if applicable: the function itself prints a
# notice and returns early when df has no 'mcp_metrics' column
analyze_mcp_vs_model_performance(df)

In [13]:
def _score_task_answers(row):
    """Compare one task's model answers with the benchmark answers as sets."""
    benchmark_set = set(row['correct_answers'])
    model_set = set(row['parser_model_answers'])
    matched = benchmark_set & model_set

    return {
        'task_id': row['task_id'],
        'task_description': row['user_task'],
        # A task counts as complete only on an exact set match
        'task_completion': 1 if benchmark_set == model_set else 0,
        'precision': len(matched) / len(model_set) if model_set else 0.0,
        'recall': len(matched) / len(benchmark_set) if benchmark_set else 0.0,
        'benchmark_answers': sorted(benchmark_set),
        'model_answers': sorted(model_set),
        'missing_urls': sorted(benchmark_set - model_set),
        'additional_urls': sorted(model_set - benchmark_set),
        'benchmark_count': len(benchmark_set),
        'model_count': len(model_set),
    }


def _print_task_report(task):
    """Print the benchmark/model answer comparison for one scored task."""
    print(f"\n📋 TASK: {task['task_id']}")
    print(f"Description: {task['task_description'].strip()}")
    print(f"Completion: {task['task_completion']} | Precision: {task['precision']:.3f} | Recall: {task['recall']:.3f}")

    print(f"\n✅ BENCHMARK ANSWERS ({task['benchmark_count']} URLs):")
    for i, url in enumerate(task['benchmark_answers'], 1):
        print(f"  {i}. {url}")

    print(f"\n🤖 MODEL ANSWERS ({task['model_count']} URLs):")
    for i, url in enumerate(task['model_answers'], 1):
        print(f"  {i}. {url}")

    if task['missing_urls']:
        print(f"\n❌ MISSING URLs ({len(task['missing_urls'])}):")
        for i, url in enumerate(task['missing_urls'], 1):
            print(f"  {i}. {url}")

    if task['additional_urls']:
        print(f"\n➕ ADDITIONAL URLs ({len(task['additional_urls'])}):")
        for i, url in enumerate(task['additional_urls'], 1):
            print(f"  {i}. {url}")

    print("\n" + "-"*80)


def analyze_incomplete_tasks(df, category_name):
    """
    Analyze all incomplete tasks (completion score != 1) for a given category.

    A task is complete when the set of ``parser_model_answers`` equals the
    set of ``correct_answers``. Precision and recall are computed per task
    on those sets and are 0.0 when the respective denominator set is empty.

    Args:
        df: DataFrame containing task results. Reads ``category``,
            ``task_id``, ``user_task``, ``correct_answers`` and
            ``parser_model_answers``.
        category_name: Name of the category to analyze

    Returns:
        DataFrame with one row per incomplete task (empty when every task in
        the category is complete), or None when the category has no tasks.
    """
    # Filter for the specific category
    category_df = df[df['category'] == category_name]

    if len(category_df) == 0:
        print(f"No tasks found for category: {category_name}")
        return None

    # Score every task in the category
    results_df = pd.DataFrame(
        [_score_task_answers(row) for _, row in category_df.iterrows()]
    )

    # Filter for incomplete tasks only
    incomplete_tasks = results_df[results_df['task_completion'] != 1]

    print(f"=== INCOMPLETE TASKS ANALYSIS FOR {category_name} ===")
    print(f"Total tasks in category: {len(results_df)}")
    print(f"Incomplete tasks: {len(incomplete_tasks)}")
    print(f"Completion rate: {(len(results_df) - len(incomplete_tasks)) / len(results_df) * 100:.1f}%")

    if len(incomplete_tasks) == 0:
        print("All tasks in this category are complete!")
        # Return the (empty) incomplete frame so the result always means
        # "incomplete tasks", never the complete ones.
        return incomplete_tasks

    print("\n" + "="*80)

    # Display each incomplete task
    for _, task in incomplete_tasks.iterrows():
        _print_task_report(task)

    return incomplete_tasks

# Example usage function
def show_category_options(df):
    """Show available categories and their completion rates.

    A task counts as complete when the set of ``parser_model_answers``
    equals the set of ``correct_answers``.

    Args:
        df: DataFrame of task results. Reads ``category``, ``task_id``,
            ``correct_answers`` and ``parser_model_answers``.

    Returns:
        DataFrame with one row per category and the columns ``category``,
        ``total_tasks``, ``completed_tasks`` and ``completion_rate``.
    """
    print("Available categories:")

    # Compare answers row by row, positionally. Fetching the model answers by
    # index label inside the aggregation misaligns rows when labels repeat.
    completion_flags = [
        int(set(benchmark) == set(model))
        for benchmark, model in zip(df['correct_answers'], df['parser_model_answers'])
    ]

    category_stats = (
        df[['category', 'task_id']]
        .assign(is_complete=completion_flags)
        .groupby('category')
        .agg({'task_id': 'count', 'is_complete': 'sum'})
        .reset_index()
        .rename(columns={'task_id': 'total_tasks', 'is_complete': 'completed_tasks'})
    )
    category_stats['completion_rate'] = category_stats['completed_tasks'] / category_stats['total_tasks']

    for _, row in category_stats.iterrows():
        print(f"  {row['category']}: {row['completed_tasks']}/{row['total_tasks']} tasks complete ({row['completion_rate']:.1%})")

    return category_stats

# Show available categories with their completion counts; the returned
# per-category stats frame is kept in category_overview
print("=== CATEGORY OVERVIEW ===")
category_overview = show_category_options(df)

=== CATEGORY OVERVIEW ===
Available categories:
  Webmall_Add_To_Cart: 7/7 tasks complete (100.0%)
  Webmall_Best_Fit_Specific: 5/11 tasks complete (45.5%)
  Webmall_Best_Fit_Vague: 1/8 tasks complete (12.5%)
  Webmall_Cheapest_Best_Fit_Specific: 1/10 tasks complete (10.0%)
  Webmall_Cheapest_Best_Fit_Vague: 0/6 tasks complete (0.0%)
  Webmall_Cheapest_Product_Search: 7/10 tasks complete (70.0%)
  Webmall_Checkout: 0/8 tasks complete (0.0%)
  Webmall_EndToEnd: 6/8 tasks complete (75.0%)
  Webmall_Find_Compatible_Products: 0/5 tasks complete (0.0%)
  Webmall_Single_Product_Search: 10/12 tasks complete (83.3%)
  Webmall_Substitutes: 2/6 tasks complete (33.3%)


In [17]:
# Drill into one category; the frame returned by the call is rendered as the
# cell output below the printed per-task report
analyze_incomplete_tasks(df, "Webmall_Substitutes")

=== INCOMPLETE TASKS ANALYSIS FOR Webmall_Substitutes ===
Total tasks in category: 6
Incomplete tasks: 4
Completion rate: 33.3%


📋 TASK: Webmall_Substitutes_Task1
Description: \nFind the cheapest alternative for this item: https://webmall-3.informatik.uni-mannheim.de/product/arctic-liquid-freezer-iii-360mm-liquid-cpu-cooler-p12-pwm-pst-fans-pwm-controlled-pump .\n
Completion: 0 | Precision: 0.000 | Recall: 0.000

✅ BENCHMARK ANSWERS (1 URLs):
  1. https://webmall-2.informatik.uni-mannheim.de/product/arctic-liquid-freezer-iii-360mm-liquid-cpu-cooler-p12-pwm-pst-fans-amp-pwm-controlled-pump

🤖 MODEL ANSWERS (1 URLs):
  1. https://webmall-2.informatik.uni-mannheim.de/product/arctic-liquid-freezer-iii-360mm-liquid-cpu-cooler-p12-pwm-pst-fans-pwm-controlled-pump

❌ MISSING URLs (1):
  1. https://webmall-2.informatik.uni-mannheim.de/product/arctic-liquid-freezer-iii-360mm-liquid-cpu-cooler-p12-pwm-pst-fans-amp-pwm-controlled-pump

➕ ADDITIONAL URLs (1):
  1. https://webmall-2.informatik.uni

Unnamed: 0,task_id,task_description,task_completion,precision,recall,benchmark_answers,model_answers,missing_urls,additional_urls,benchmark_count,model_count
0,Webmall_Substitutes_Task1,\nFind the cheapest alternative for this item:...,0,0.0,0.0,[https://webmall-2.informatik.uni-mannheim.de/...,[https://webmall-2.informatik.uni-mannheim.de/...,[https://webmall-2.informatik.uni-mannheim.de/...,[https://webmall-2.informatik.uni-mannheim.de/...,1,1
2,Webmall_Substitutes_Task3,\nFind cheaper alternatives with at least the ...,0,0.333333,1.0,[https://webmall-4.informatik.uni-mannheim.de/...,[https://webmall-3.informatik.uni-mannheim.de/...,[],[https://webmall-3.informatik.uni-mannheim.de/...,1,3
3,Webmall_Substitutes_Task4,\nFind cheaper alternatives for this monitor h...,0,0.0,0.0,[https://webmall-1.informatik.uni-mannheim.de/...,[https://webmall-3.informatik.uni-mannheim.de/...,[https://webmall-1.informatik.uni-mannheim.de/...,[https://webmall-3.informatik.uni-mannheim.de/...,2,2
5,Webmall_Substitutes_Task6,\nFind cheaper alternatives to this keyboard t...,0,0.333333,0.333333,[https://webmall-1.informatik.uni-mannheim.de/...,[https://webmall-3.informatik.uni-mannheim.de/...,[https://webmall-1.informatik.uni-mannheim.de/...,[https://webmall-3.informatik.uni-mannheim.de/...,3,3


In [None]:
# Example usage - analyze incomplete tasks for a specific category
# Uncomment and run the line below to analyze a specific category:
# analyze_incomplete_tasks(df, "Webmall_Best_Fit_Specific")