In [9]:
import json
import pandas as pd
import os
from pathlib import Path

In [None]:
# Define paths
base_path = Path("main_evaluation")
output_path = Path("evaluation_metrics_by_test_types.csv")

# Define datasets and test types
datasets = ["AM", "DS", "MCS"]
test_types = ["closed_end", "opened_end", "multihop2"]

# Define metrics to extract
metrics = [
    "avg_exact_match",
    "avg_f1_score", 
    "avg_bleu4",
    "avg_meteor",
    "avg_rouge_l"
]

print(f"Datasets: {datasets}")
print(f"Test types: {test_types}")
print(f"Metrics: {metrics}")


Datasets: ['AM', 'DS', 'MCS']
Test types: ['closed_end', 'opened_end', 'multihop', 'multihop2']
Metrics: ['avg_exact_match', 'avg_f1_score', 'avg_bleu4', 'avg_meteor', 'avg_rouge_l']


In [11]:
def read_json_file(file_path):
    """Read JSON file and return parsed content"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

def extract_metrics_from_json(data, test_type):
    """Extract metrics for specific test type from JSON data"""
    try:
        # Navigate to the specific test type results
        individual_results = data.get("individual_results", {})
        test_data = individual_results.get(test_type, {})
        aggregate_metrics = test_data.get("aggregate_metrics", {})
        
        # Extract required metrics
        extracted = {}
        for metric in metrics:
            extracted[metric] = aggregate_metrics.get(metric, 0.0)
        
        return extracted
    except Exception as e:
        print(f"Error extracting metrics for {test_type}: {e}")
        return {metric: 0.0 for metric in metrics}

def process_dataset(dataset_name):
    """Process all test types for a specific dataset"""
    file_path = base_path / f"{dataset_name}_all_test_types_summary.json"
    
    if not file_path.exists():
        print(f"File not found: {file_path}")
        return []
    
    print(f"Processing {dataset_name}...")
    
    # Read JSON data
    data = read_json_file(file_path)
    if data is None:
        return []
    
    results = []
    
    # Extract metrics for each test type
    for test_type in test_types:
        metrics_data = extract_metrics_from_json(data, test_type)
        
        row = {
            "Dataset": dataset_name,
            "Test_Type": test_type,
            "Exact_Match": metrics_data["avg_exact_match"],
            "F1_Score": metrics_data["avg_f1_score"],
            "BLEU_4": metrics_data["avg_bleu4"],
            "METEOR": metrics_data["avg_meteor"],
            "ROUGE_L": metrics_data["avg_rouge_l"]
        }
        
        results.append(row)
        print(f"  {test_type}: EM={row['Exact_Match']:.4f}, F1={row['F1_Score']:.4f}")
    
    return results

print("Functions defined successfully!")


Functions defined successfully!


In [12]:
# Check if main_evaluation directory exists
if not base_path.exists():
    print(f"Directory not found: {base_path}")
    print("Available directories:")
    for item in Path(".").iterdir():
        if item.is_dir():
            print(f"  {item}")
else:
    print(f"Found directory: {base_path}")
    print("Available files:")
    for item in base_path.iterdir():
        if item.is_file() and item.suffix == '.json':
            print(f"  {item.name}")


Directory not found: main_evaluation
Available directories:
  output_logs


In [13]:
# Process all datasets
all_results = []

for dataset in datasets:
    dataset_results = process_dataset(dataset)
    all_results.extend(dataset_results)
    print(f"Completed {dataset}\n")

print(f"Total rows extracted: {len(all_results)}")


File not found: main_evaluation\AM_all_test_types_summary.json
Completed AM

File not found: main_evaluation\DS_all_test_types_summary.json
Completed DS

File not found: main_evaluation\MCS_all_test_types_summary.json
Completed MCS

Total rows extracted: 0


In [14]:
# Create DataFrame
df = pd.DataFrame(all_results)

# Display basic info
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nDataFrame preview:")
print(df)


DataFrame shape: (0, 0)
Columns: []

DataFrame preview:
Empty DataFrame
Columns: []
Index: []


In [15]:
# Save to CSV
df.to_csv(output_path, index=False, float_format='%.4f')
print(f"Data saved to: {output_path}")

# Verify file was created
if output_path.exists():
    file_size = output_path.stat().st_size
    print(f"File size: {file_size} bytes")
    
    # Read back and display first few lines
    print("\nFirst few lines of saved file:")
    with open(output_path, 'r') as f:
        for i, line in enumerate(f):
            if i < 5:
                print(line.strip())
            else:
                break


Data saved to: evaluation_metrics_by_test_types.csv
File size: 2 bytes

First few lines of saved file:



In [16]:
# Summary by dataset
print("=== AVERAGE METRICS BY DATASET ===")
summary_by_dataset = df.groupby('Dataset')[['Exact_Match', 'F1_Score', 'BLEU_4', 'METEOR', 'ROUGE_L']].mean()
print(summary_by_dataset.round(4))

print("\n=== AVERAGE METRICS BY TEST TYPE ===")
summary_by_test = df.groupby('Test_Type')[['Exact_Match', 'F1_Score', 'BLEU_4', 'METEOR', 'ROUGE_L']].mean()
print(summary_by_test.round(4))


=== AVERAGE METRICS BY DATASET ===


KeyError: 'Dataset'

In [None]:
# Create pivot tables for better visualization
print("=== EXACT MATCH BY DATASET AND TEST TYPE ===")
pivot_em = df.pivot(index='Test_Type', columns='Dataset', values='Exact_Match')
print(pivot_em.round(4))

print("\n=== F1 SCORE BY DATASET AND TEST TYPE ===")
pivot_f1 = df.pivot(index='Test_Type', columns='Dataset', values='F1_Score')
print(pivot_f1.round(4))

# Save summary tables
summary_by_dataset.to_csv("summary_by_dataset.csv", float_format='%.4f')
summary_by_test.to_csv("summary_by_test_type.csv", float_format='%.4f')
pivot_em.to_csv("exact_match_pivot.csv", float_format='%.4f')
pivot_f1.to_csv("f1_score_pivot.csv", float_format='%.4f')

print("\n=== ADDITIONAL FILES CREATED ===")
print("- summary_by_dataset.csv")
print("- summary_by_test_type.csv")
print("- exact_match_pivot.csv")
print("- f1_score_pivot.csv")


=== EXACT MATCH BY DATASET AND TEST TYPE ===
Dataset         AM   DS   MCS
Test_Type                    
closed_end  0.6731  0.6  0.76
multihop    0.0000  0.0  0.00
multihop2   0.0000  0.0  0.00
opened_end  0.0000  0.0  0.00

=== F1 SCORE BY DATASET AND TEST TYPE ===
Dataset         AM      DS     MCS
Test_Type                         
closed_end  0.7207  0.6795  0.8217
multihop    0.4199  0.4489  0.4973
multihop2   0.4637  0.4089  0.4332
opened_end  0.5658  0.3984  0.5546

=== ADDITIONAL FILES CREATED ===
- summary_by_dataset.csv
- summary_by_test_type.csv
- exact_match_pivot.csv
- f1_score_pivot.csv
