In [3]:
import pandas as pd
import json
import os
from pathlib import Path

# Collect every grader output file from the outputs folder.
# os.listdir returns names in arbitrary order, so sort them: this keeps the
# row order of the table reproducible from one run to the next.
outputs_folder = 'outputs'
files = sorted(f for f in os.listdir(outputs_folder) if f.endswith('.json'))

# Flatten each file into one record per (document, grader, metric).
data = []
for filename in files:
    filepath = os.path.join(outputs_folder, filename)

    # Only reading/decoding the file is guarded; a file that cannot be parsed
    # is reported and skipped so the remaining files are still loaded.
    try:
        # Decode as UTF-8 explicitly instead of relying on the platform default.
        with open(filepath, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        print(f"Error parsing {filename}")
        continue

    if not isinstance(json_data, dict):
        print(f"Skipping {filename}: top-level JSON value is not an object")
        continue

    # Metadata identifies who graded which document; a missing or null
    # section falls back to 'Unknown' for both fields.
    metadata = json_data.get('metadata') or {}
    grader = metadata.get('grader_name', 'Unknown')
    doc = metadata.get('document_name', 'Unknown')

    # Each entry under 'results' maps a metric name to its details. Only the
    # rating is carried into the table; 'evidence' and 'notes' are left out.
    results = json_data.get('results') or {}
    for metric, metric_data in results.items():
        if not isinstance(metric_data, dict):
            print(f"Skipping metric {metric!r} in {filename}: entry is not an object")
            continue
        data.append({
            'doc': doc,
            'grader': grader,
            'metric': metric,
            'rating': metric_data.get('rating', None),
        })

# Build the long-format table. Naming the columns explicitly keeps the schema
# stable even when no records were loaded.
summary_df = pd.DataFrame(data, columns=['doc', 'grader', 'metric', 'rating'])
print(summary_df)


       doc grader                       metric  rating
0   Oxford     VJ      Definition and Examples       4
1   Oxford     VJ                 Permited Use       2
2   Oxford     VJ               Prohibited Use       3
3   Oxford     VJ           Academic Integrity       3
4   Oxford     VJ                 Transparency       1
..     ...    ...                          ...     ...
75     UEA     MR               Accountability       2
76     UEA     MR                        Risks       1
77     UEA     MR  Copyright and Data Privacy        4
78     UEA     MR       Training and Resources       3
79     UEA     MR                Policy Review       3

[80 rows x 4 columns]


In [4]:
# Reshape the long ratings table into one row per (doc, metric) with one
# column per grader.
#
# This cell rebinds `summary_df`, so the pivot is applied only while the frame
# is still in long form (it still has the 'grader' and 'rating' columns).
# Re-running the cell on the already pivoted frame is then a no-op instead of
# a KeyError.
if {'grader', 'rating'}.issubset(summary_df.columns):
    # The pivot keeps a single rating per (doc, metric, grader); report any
    # duplicates so they are not dropped without notice.
    duplicate_rows = summary_df.duplicated(subset=['doc', 'metric', 'grader'], keep=False)
    if duplicate_rows.any():
        print(
            f"Warning: {int(duplicate_rows.sum())} rows share a (doc, metric, grader) key; "
            "only one rating per key is kept"
        )

    summary_df = summary_df.pivot_table(
        index=['doc', 'metric'],
        columns='grader',
        values='rating',
        aggfunc='first'
    ).reset_index()

# Drop the 'grader' label left on the column axis by the pivot
summary_df.columns.name = None

# Convert rating columns to nullable integers so ratings display as whole
# numbers and missing ratings display as <NA>
grader_columns = [col for col in summary_df.columns if col not in ['doc', 'metric']]
for col in grader_columns:
    summary_df[col] = summary_df[col].astype('Int64')

print(summary_df)


       doc                       metric    MR    VJ  sdsd
0       HW           Academic Integrity     1     3  <NA>
1       HW               Accountability     2     4  <NA>
2       HW  Copyright and Data Privacy      4     4  <NA>
3       HW      Definition and Examples     0     4  <NA>
4       HW                 Permited Use     4     2  <NA>
5       HW                Policy Review     3     2  <NA>
6       HW               Prohibited Use     3     3  <NA>
7       HW                        Risks     1     4  <NA>
8       HW       Training and Resources     3     3  <NA>
9       HW                 Transparency     2     1  <NA>
10  Oxford           Academic Integrity  <NA>     3  <NA>
11  Oxford               Accountability  <NA>     4  <NA>
12  Oxford  Copyright and Data Privacy   <NA>     4  <NA>
13  Oxford      Definition and Examples  <NA>     4  <NA>
14  Oxford                 Permited Use  <NA>     2  <NA>
15  Oxford                Policy Review  <NA>     2  <NA>
16  Oxford    