In [8]:
import json
import sys
import yaml
import re
# List of entity types we want to evaluate
ENTITY_TYPES = [
    "citation",
    "judges",
    "ConvCourtName",
    "ConvictPleaDate",
    "ConvictOffence",
    "AcquitOffence",
    "ConfessPleadGuilty",  # Adjusted entity name to match YAML sample
    "PleaPoint",
    "RemandDecision",
    "RemandCustodyTime",
    "SentCourtName",
    "SentenceReceived",
    "SentenceServed",
    "WhatAncilliary",
    "OffSex",
    "OffAgeOffence",
    "OffJobOffence",
    "OffHomeOffence",
    "OffMentalOffence",
    "OffIntoxOffence",
    "OffVicRelation",
    "VictimType",
    "VicNum",
    "VicSex",
    "VicAgeOffence",
    "VicJobOffence",
    "VicHomeOffence",
    "VicMentalOffence",
    "VicIntoxOffence",
    "ProsEvidTypeTrial",
    "DefEvidTypeTrial",
    "PreSentReport",
    "AggFactSent",
    "MitFactSent",
    "VicImpactStatement",
    "Appellant",
    "CoDefAccNum",
    "AppealAgainst",
    "AppealGround",
    "SentGuideWhich",
    "AppealOutcome",
    "ReasonQuashConv",
    "ReasonSentExcessNotLenient",
    "ReasonSentLenientNotExcess",
    "ReasonDismiss",
]

def is_empty(val):
    """Determine if a value is effectively empty or non-informative."""
    if val is None:
        return True
    if isinstance(val, str):
        cleaned = val.strip().lower()
        if cleaned in ["", "nan", "null", "-"]:
            return True
    return False

def compute_metrics_from_counts(tp, fp, fn):
    """Compute precision, recall, and F1 given counts of TP, FP, FN."""
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

def fix_single_quotes(yaml_text):
    """
    Fix single-quote issues in YAML and quote unquoted Exact values.
    """
    # Fix single-quoted strings
    pattern = re.compile(r"(^\s*[^:]+:\s*)'(.*?)'(\s*(?:\n|$))", re.MULTILINE)
    
    def replacer(match):
        prefix = match.group(1)
        content = match.group(2)
        suffix = match.group(3)
        fixed_content = content.replace("'", "''")
        return f"{prefix}'{fixed_content}'{suffix}"
    
    fixed_text = pattern.sub(replacer, yaml_text)
    
    # Wrap unquoted Exact: values in double quotes to handle problematic characters
    pattern_exact = re.compile(r"(^\s*Exact:\s*)([^\n]+)(\n|$)", re.MULTILINE)
    
    def replacer_exact(match):
        prefix = match.group(1)
        value = match.group(2)
        suffix = match.group(3)
        # If value isn't already quoted, wrap it in quotes
        if not (value.startswith('"') or value.startswith("'")):
            fixed_value = value.replace("'", "''")  # double single quotes inside value
            return f'{prefix}"{fixed_value}"{suffix}'
        else:
            return match.group(0)
    
    fixed_text = pattern_exact.sub(replacer_exact, fixed_text)
    return fixed_text

def parse_yaml_content(text):
    """
    Parse YAML from the provided text, fixing quote issues and standardizing empty values to 'nan'.
    """
    # Remove markdown fences if present
    if text.startswith("```yaml"):
        text = text.replace("```yaml", "", 1)
    if text.endswith("```"):
        text = text[:-len("```")]

    # Fix quoting issues in YAML content
    text_fixed = fix_single_quotes(text)

    # Parse the corrected YAML content
    yaml_data = yaml.safe_load(text_fixed)

    # Standardize empty or dash values to 'nan'
    def standardize_values(data):
        if isinstance(data, dict):
            return {
                key: standardize_values(value)
                for key, value in data.items()
            }
        elif isinstance(data, list):
            return [standardize_values(item) for item in data]
        elif data in ["-", ""]:
            return "nan"
        else:
            return data

    return standardize_values(yaml_data)


In [9]:
entity_stats = { entity: {"TP": 0, "FP": 0, "FN": 0} for entity in ENTITY_TYPES }

total_tp = total_fp = total_fn = 0

In [10]:
json_file_path='/home/stirunag/Downloads/sft_on_english_data/sft_on_english_data/predictions/llama_3.1_8b_instruct_fine_tuned_appealcourt_coded/outputs_42.json'

In [11]:
with open(json_file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

data[1]

{'answer': '```yaml\ncitation:\n  Comment: \'[2014] EWCA Crim 1594\'\n  Exact: \'On 12th June 2014, this court, differently constituted, considered a Reference by the Registrar of Criminal Appeals, pursuant to section 20 of the Criminal Appeal Act 1968, to make a summary determination whether an application for leave to appeal a confiscation order should be dismissed as frivolous or vexatious.\'\n\njudges:\n  Comment: \'PRESIDENT OF THE QUEEN\\\'S BENCH DIVISION (SIR BRIAN LEVESON), MR JUSTICE GLOBE, SIR RODERICK EVANS\'\n  Exact: \'PRESIDENT OF THE QUEEN\\\'S BENCH DIVISION: 1. On 12th June 2014, this court, differently constituted, considered a Reference by the Registrar of Criminal Appeals, pursuant to section 20 of the Criminal Appeal Act 1968, to make a summary determination whether an application for leave to appeal a confiscation order should be dismissed as frivolous or vexatious.\'\n\nConvCourtName:\n  Comment: \'Crown Court at Inner London, Stockport Magistrates\\\' Court\'\n

In [12]:
answer_text = data[1].get("answer", "")
gold_text = data[1].get("gold", "")

answer_yaml = parse_yaml_content(answer_text) or {}
gold_yaml = parse_yaml_content(gold_text) or {}

In [13]:
answer_yaml

{'citation': {'Comment': '[2014] EWCA Crim 1594',
  'Exact': 'On 12th June 2014, this court, differently constituted, considered a Reference by the Registrar of Criminal Appeals, pursuant to section 20 of the Criminal Appeal Act 1968, to make a summary determination whether an application for leave to appeal a confiscation order should be dismissed as frivolous or vexatious.'},
 'judges': {'Comment': "PRESIDENT OF THE QUEEN\\'S BENCH DIVISION (SIR BRIAN LEVESON), MR JUSTICE GLOBE, SIR RODERICK EVANS",
  'Exact': "PRESIDENT OF THE QUEEN\\'S BENCH DIVISION: 1. On 12th June 2014, this court, differently constituted, considered a Reference by the Registrar of Criminal Appeals, pursuant to section 20 of the Criminal Appeal Act 1968, to make a summary determination whether an application for leave to appeal a confiscation order should be dismissed as frivolous or vexatious."},
 'ConvCourtName': {'Comment': "Crown Court at Inner London, Stockport Magistrates\\' Court",
  'Exact': 'On 14th Jun

In [14]:
gold_yaml


{'citation': {'Comment': 'nan', 'Exact': '[2014] EWCA Crim 1594'},
 'judges': {'Comment': 'nan',
  'Exact': "[''MR JUSTICE GLOBE'', ''SIR RODERICK EVANS'']",
  'ConvCourtName': {'Comment': 'nan', 'Exact': 'Crown Court at Inner London'},
  'ConvictPleaDate': {'Comment': 'nan', 'Exact': '14th June 2011'},
  'ConvictOffence': {'Comment': 'nan',
   'Exact': 'conspiracy to supply controlled drugs of Class A'},
  'AcquitOffence': {'Comment': 'nan', 'Exact': 'nan'},
  'Confess/Plead': {'Comment': 'nan', 'Exact': 'nan'},
  'PleaPoint': {'Comment': 'nan', 'Exact': 'nan'},
  'RemandDecision': {'Comment': 'nan', 'Exact': 'nan'},
  'RemandCustodyTime': {'Comment': 'nan', 'Exact': 'nan'},
  'SentCourtName': {'Comment': 'nan', 'Exact': 'Crown Court at Inner London'},
  'Sentence': {'Comment': 'nan', 'Exact': 'nan'},
  'SentServe': {'Comment': 'nan', 'Exact': 'nan'},
  'WhatAncilliary': {'Comment': 'nan', 'Exact': 'nan'},
  'OffSex': {'Comment': 'male', 'Exact': 'Vincent Graham,'},
  'OffAgeOffence':

In [18]:
from difflib import SequenceMatcher

def similarity(a, b):
    """Compute similarity ratio between two strings."""
    return SequenceMatcher(None, a, b).ratio()

SIMILARITY_THRESHOLD = 0.5  # Adjust based on acceptable similarity level




Per-Entity Metrics:

citation                  P=0.000  R=0.000  F1=0.000  (TP=0, FP=5, FN=3)
judges                    P=0.000  R=0.000  F1=0.000  (TP=0, FP=5, FN=3)
ConvCourtName             P=0.000  R=0.000  F1=0.000  (TP=0, FP=5, FN=0)
ConvictPleaDate           P=0.000  R=0.000  F1=0.000  (TP=0, FP=5, FN=0)
ConvictOffence            P=0.000  R=0.000  F1=0.000  (TP=0, FP=5, FN=0)
AcquitOffence             P=0.000  R=0.000  F1=0.000  (TP=0, FP=0, FN=0)
ConfessPleadGuilty        P=0.000  R=0.000  F1=0.000  (TP=0, FP=5, FN=0)
PleaPoint                 P=0.000  R=0.000  F1=0.000  (TP=0, FP=5, FN=0)
RemandDecision            P=0.000  R=0.000  F1=0.000  (TP=0, FP=2, FN=0)
RemandCustodyTime         P=0.000  R=0.000  F1=0.000  (TP=0, FP=0, FN=0)
SentCourtName             P=0.000  R=0.000  F1=0.000  (TP=0, FP=0, FN=0)
SentenceReceived          P=0.000  R=0.000  F1=0.000  (TP=0, FP=0, FN=0)
SentenceServed            P=0.000  R=0.000  F1=0.000  (TP=0, FP=0, FN=0)
WhatAncilliary            P=0.

In [20]:
# Process each record
for record in data:
    answer_text = record.get("answer", "")
    gold_text = record.get("gold", "")

    # Parse YAML content from answer and gold fields
    try:
        answer_yaml = parse_yaml_content(answer_text) or {}
    except Exception as e:
        print(f"Error parsing answer YAML: {e}")
        answer_yaml = {}
    try:
        gold_yaml = parse_yaml_content(gold_text) or {}
    except Exception as e:
        print(f"Error parsing gold YAML: {e}")
        gold_yaml = {}

    for entity in ENTITY_TYPES:
        # Extract "Comment" and "Exact" values
        gold_entity = gold_yaml.get(entity, {}) if isinstance(gold_yaml, dict) else {}
        pred_entity = answer_yaml.get(entity, {}) if isinstance(answer_yaml, dict) else {}
    
        gold_comment = gold_entity.get("Comment") if isinstance(gold_entity, dict) else None
        pred_comment = pred_entity.get("Comment") if isinstance(pred_entity, dict) else None
    
        gold_exact = gold_entity.get("Exact") if isinstance(gold_entity, dict) else None
        pred_exact = pred_entity.get("Exact") if isinstance(pred_entity, dict) else None
    
        # Initialize stats for the entity
        if entity not in entity_stats:
            entity_stats[entity] = {"TP": 0, "FP": 0, "FN": 0}
    
        # Function to update TP, FP, FN based on match
        def update_metrics(gold, pred):
            gold_present = not is_empty(gold)
            pred_present = not is_empty(pred)
            if gold_present:
                if pred_present:
                    # Check for similarity
                    if similarity(str(gold).strip(), str(pred).strip()) >= SIMILARITY_THRESHOLD:
                        entity_stats[entity]["TP"] += 1
                        return 1, 0, 0  # TP
                    else:
                        # Similarity below threshold
                        return 0, 1, 1  # FP, FN
                else:
                    # Gold present but prediction missing
                    return 0, 0, 1  # FN
            else:
                if pred_present:
                    # Prediction made but nothing in gold
                    return 0, 1, 0  # FP
                # Neither present: do nothing
                return 0, 0, 0
    
        # Update metrics for "Comment"
        tp, fp, fn = update_metrics(gold_comment, pred_comment)
        entity_stats[entity]["TP"] += tp
        entity_stats[entity]["FP"] += fp
        entity_stats[entity]["FN"] += fn
        total_tp += tp
        total_fp += fp
        total_fn += fn
    
        # Update metrics for "Exact"
        tp, fp, fn = update_metrics(gold_exact, pred_exact)
        entity_stats[entity]["TP"] += tp
        entity_stats[entity]["FP"] += fp
        entity_stats[entity]["FN"] += fn
        total_tp += tp
        total_fp += fp
        total_fn += fn
    
    # Print per-entity metrics
    print("Per-Entity Metrics:\n")
    for entity in ENTITY_TYPES:
        stats = entity_stats[entity]
        precision, recall, f1 = compute_metrics_from_counts(stats["TP"], stats["FP"], stats["FN"])
        print(f"{entity:25s} P={precision:.3f}  R={recall:.3f}  F1={f1:.3f}  (TP={stats['TP']}, FP={stats['FP']}, FN={stats['FN']})")
    
# Compute overall micro-averaged metrics
overall_precision, overall_recall, overall_f1 = compute_metrics_from_counts(total_tp, total_fp, total_fn)
print("\nOverall (Micro-Averaged) Metrics:")
print(f"Precision = {overall_precision:.3f}")
print(f"Recall    = {overall_recall:.3f}")
print(f"F1        = {overall_f1:.3f}")
    
    


Per-Entity Metrics:

citation                  P=0.017  R=0.017  F1=0.017  (TP=2, FP=118, FN=118)
judges                    P=0.160  R=0.155  F1=0.157  (TP=20, FP=105, FN=109)
ConvCourtName             P=0.000  R=0.000  F1=0.000  (TP=0, FP=117, FN=0)
ConvictPleaDate           P=0.000  R=0.000  F1=0.000  (TP=0, FP=111, FN=0)
ConvictOffence            P=0.000  R=0.000  F1=0.000  (TP=0, FP=117, FN=0)
AcquitOffence             P=0.000  R=0.000  F1=0.000  (TP=0, FP=12, FN=0)
ConfessPleadGuilty        P=0.000  R=0.000  F1=0.000  (TP=0, FP=115, FN=0)
PleaPoint                 P=0.000  R=0.000  F1=0.000  (TP=0, FP=93, FN=0)
RemandDecision            P=0.000  R=0.000  F1=0.000  (TP=0, FP=62, FN=0)
RemandCustodyTime         P=0.000  R=0.000  F1=0.000  (TP=0, FP=13, FN=0)
SentCourtName             P=0.000  R=0.000  F1=0.000  (TP=0, FP=72, FN=0)
SentenceReceived          P=0.000  R=0.000  F1=0.000  (TP=0, FP=100, FN=0)
SentenceServed            P=0.000  R=0.000  F1=0.000  (TP=0, FP=81, FN=0)
WhatA

In [22]:
import pandas as pd

# Initialize lists to store data for the DataFrame
data = []

# Populate rows for human evaluation
for entity in ENTITY_TYPES:
    # Extract gold and predicted values
    gold_entity = gold_yaml.get(entity, {}) if isinstance(gold_yaml, dict) else {}
    pred_entity = answer_yaml.get(entity, {}) if isinstance(answer_yaml, dict) else {}

    # Extract values for 'Exact' and 'Comment'
    gold_exact = gold_entity.get("Exact", "nan")
    gold_comment = gold_entity.get("Comment", "nan")
    pred_exact = pred_entity.get("Exact", "nan")
    pred_comment = pred_entity.get("Comment", "nan")

    # Add row to the data list
    data.append({
        "Entity Type": entity,
        "Gold (Human) - Exact": gold_exact,
        "Gold (Human) - Comment": gold_comment,
        "AI - Exact": pred_exact,
        "AI - Comment": pred_comment,
    })

# Create a DataFrame
df = pd.DataFrame(data)

# Save to CSV for human evaluation
file_path = "data/human_evaluation.csv"
df.to_csv(file_path, index=False)


OSError: Cannot save file into a non-existent directory: 'data'

In [24]:
pwd

'/home/stirunag/work/github/JuDDGES/nbs/Data'