# PC

In [4]:
import torch
from torch_geometric.utils import to_networkx
from torch_geometric.data import Data
import networkx as nx
import statistics
from pathlib import Path

PT_FILES_DIR = Path("PC legalBERT_processed_graph_data_for_joint_prediction")  # Change to your directory path

def load_data(file_path):
    return torch.load(file_path, weights_only=False)

def get_graph_stats(data):
    G_nx = to_networkx(data, to_undirected=False)

    indegrees = dict(G_nx.in_degree())
    outdegrees = dict(G_nx.out_degree())

    # Average shortest path length on largest SCC if graph not strongly connected
    if nx.is_strongly_connected(G_nx):
        avg_path_len = nx.average_shortest_path_length(G_nx)
    else:
        largest_cc = max(nx.strongly_connected_components(G_nx), key=len)
        subgraph = G_nx.subgraph(largest_cc)
        avg_path_len = nx.average_shortest_path_length(subgraph)

    lengths = []
    for _, target_dict in nx.all_pairs_shortest_path_length(G_nx):
        lengths.extend(target_dict.values())

    median_path_len = statistics.median(lengths)
    mean_path_len = statistics.mean(lengths)

    return indegrees, outdegrees, median_path_len, mean_path_len, avg_path_len

def print_graph_stats(indegrees, outdegrees, median_path_len, mean_path_len, avg_path_len):
    print("Indegrees:", indegrees)
    print("Outdegrees:", outdegrees)
    print(f"Median shortest path length: {median_path_len:.4f}")
    print(f"Mean shortest path length: {mean_path_len:.4f}")
    print(f"Average shortest path length: {avg_path_len:.4f}")
    print("-------------------------------------------------------")

def main():
    pt_files = list(PT_FILES_DIR.glob("*.pt"))
    print(f"Found {len(pt_files)} .pt files")

    # Accumulators for statistics
    all_indegrees = []
    all_outdegrees = []
    median_path_lengths = []
    mean_path_lengths = []
    avg_path_lengths = []

    for file in pt_files:
        print(f"Processing file: {file.name}")
        data = load_data(file)
        indegrees, outdegrees, median_path_len, mean_path_len, avg_path_len = get_graph_stats(data)
        print_graph_stats(indegrees, outdegrees, median_path_len, mean_path_len, avg_path_len)

        # Accumulate average in/outdegree for summary (mean per graph)
        if indegrees:
            all_indegrees.append(sum(indegrees.values()) / len(indegrees))
        if outdegrees:
            all_outdegrees.append(sum(outdegrees.values()) / len(outdegrees))
        median_path_lengths.append(median_path_len)
        mean_path_lengths.append(mean_path_len)
        avg_path_lengths.append(avg_path_len)

    # Compute averages over all graphs
    avg_indegree = sum(all_indegrees) / len(all_indegrees) if all_indegrees else 0
    avg_outdegree = sum(all_outdegrees) / len(all_outdegrees) if all_outdegrees else 0
    avg_median_path_len = sum(median_path_lengths) / len(median_path_lengths) if median_path_lengths else 0
    avg_mean_path_len = sum(mean_path_lengths) / len(mean_path_lengths) if mean_path_lengths else 0
    avg_avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths) if avg_path_lengths else 0

    print("\n====== Dataset-wide averages ======")
    print(f"Average indegree across graphs: {avg_indegree:.4f}")
    print(f"Average outdegree across graphs: {avg_outdegree:.4f}")
    print(f"Average of median shortest path length: {avg_median_path_len:.4f}")
    print(f"Average of mean shortest path length: {avg_mean_path_len:.4f}")
    print(f"Average of average shortest path length: {avg_avg_path_len:.4f}")

if __name__ == "__main__":
    main()


Found 40 .pt files
Processing file: R2021_World Duty Free v. Commission.pt
Indegrees: {0: 1, 1: 1, 2: 0, 3: 1, 4: 1, 5: 1, 6: 1, 7: 29, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 0, 23: 0, 24: 0, 25: 1, 26: 1, 27: 1, 28: 1, 29: 0, 30: 1, 31: 0, 32: 0, 33: 0, 34: 1, 35: 1, 36: 1, 37: 2, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 2, 44: 2, 45: 0, 46: 0, 47: 1, 48: 1, 49: 1, 50: 4, 51: 1, 52: 0, 53: 0, 54: 0, 55: 1, 56: 1, 57: 1, 58: 1, 59: 1, 60: 1, 61: 1, 62: 1, 63: 1, 64: 1, 65: 1, 66: 1, 67: 2, 68: 1, 69: 1, 70: 1, 71: 2, 72: 1, 73: 1, 74: 3, 75: 0, 76: 0, 77: 1, 78: 1, 79: 2, 80: 1, 81: 2, 82: 2, 83: 1, 84: 1, 85: 29, 86: 0, 87: 1, 88: 1, 89: 1, 90: 2, 91: 1, 92: 1, 93: 1, 94: 1, 95: 1, 96: 29, 97: 1, 98: 1, 99: 1, 100: 1, 101: 1, 102: 1, 103: 1, 104: 1, 105: 1, 106: 1, 107: 1, 108: 0, 109: 1, 110: 1, 111: 1, 112: 1, 113: 0, 114: 1, 115: 1, 116: 1, 117: 30, 118: 1, 119: 1, 120: 1, 121: 0, 122: 0, 123: 1, 124: 29, 125: 0, 126: 1, 12

# PCNA

In [5]:
import torch
from torch_geometric.utils import to_networkx
from torch_geometric.data import Data
import networkx as nx
import statistics
from pathlib import Path

PT_FILES_DIR = Path("PCNA finetuned_legalBERT_pcna_processed_graph_data_for_joint_prediction_csv")  # Change to your directory path

def load_data(file_path):
    return torch.load(file_path, weights_only=False)

def get_graph_stats(data):
    G_nx = to_networkx(data, to_undirected=False)

    indegrees = dict(G_nx.in_degree())
    outdegrees = dict(G_nx.out_degree())

    # Average shortest path length on largest SCC if graph not strongly connected
    if nx.is_strongly_connected(G_nx):
        avg_path_len = nx.average_shortest_path_length(G_nx)
    else:
        largest_cc = max(nx.strongly_connected_components(G_nx), key=len)
        subgraph = G_nx.subgraph(largest_cc)
        avg_path_len = nx.average_shortest_path_length(subgraph)

    lengths = []
    for _, target_dict in nx.all_pairs_shortest_path_length(G_nx):
        lengths.extend(target_dict.values())

    median_path_len = statistics.median(lengths)
    mean_path_len = statistics.mean(lengths)

    return indegrees, outdegrees, median_path_len, mean_path_len, avg_path_len

def print_graph_stats(indegrees, outdegrees, median_path_len, mean_path_len, avg_path_len):
    print("Indegrees:", indegrees)
    print("Outdegrees:", outdegrees)
    print(f"Median shortest path length: {median_path_len:.4f}")
    print(f"Mean shortest path length: {mean_path_len:.4f}")
    print(f"Average shortest path length: {avg_path_len:.4f}")
    print("-------------------------------------------------------")

def main():
    pt_files = list(PT_FILES_DIR.glob("*.pt"))
    print(f"Found {len(pt_files)} .pt files")

    # Accumulators for statistics
    all_indegrees = []
    all_outdegrees = []
    median_path_lengths = []
    mean_path_lengths = []
    avg_path_lengths = []

    for file in pt_files:
        print(f"Processing file: {file.name}")
        data = load_data(file)
        indegrees, outdegrees, median_path_len, mean_path_len, avg_path_len = get_graph_stats(data)
        print_graph_stats(indegrees, outdegrees, median_path_len, mean_path_len, avg_path_len)

        # Accumulate average in/outdegree for summary (mean per graph)
        if indegrees:
            all_indegrees.append(sum(indegrees.values()) / len(indegrees))
        if outdegrees:
            all_outdegrees.append(sum(outdegrees.values()) / len(outdegrees))
        median_path_lengths.append(median_path_len)
        mean_path_lengths.append(mean_path_len)
        avg_path_lengths.append(avg_path_len)

    # Compute averages over all graphs
    avg_indegree = sum(all_indegrees) / len(all_indegrees) if all_indegrees else 0
    avg_outdegree = sum(all_outdegrees) / len(all_outdegrees) if all_outdegrees else 0
    avg_median_path_len = sum(median_path_lengths) / len(median_path_lengths) if median_path_lengths else 0
    avg_mean_path_len = sum(mean_path_lengths) / len(mean_path_lengths) if mean_path_lengths else 0
    avg_avg_path_len = sum(avg_path_lengths) / len(avg_path_lengths) if avg_path_lengths else 0

    print("\n====== Dataset-wide averages ======")
    print(f"Average indegree across graphs: {avg_indegree:.4f}")
    print(f"Average outdegree across graphs: {avg_outdegree:.4f}")
    print(f"Average of median shortest path length: {avg_median_path_len:.4f}")
    print(f"Average of mean shortest path length: {avg_mean_path_len:.4f}")
    print(f"Average of average shortest path length: {avg_avg_path_len:.4f}")

if __name__ == "__main__":
    main()


Found 40 .pt files
Processing file: R2021_World Duty Free v. Commission.pt
Indegrees: {0: 2, 1: 1, 2: 0, 3: 1, 4: 1, 5: 1, 6: 1, 7: 2, 8: 1, 9: 2, 10: 2, 11: 0, 12: 0, 13: 0, 14: 3, 15: 5, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 3, 22: 1, 23: 0, 24: 2, 25: 2, 26: 1, 27: 0, 28: 1, 29: 1, 30: 2, 31: 1, 32: 1, 33: 1, 34: 1, 35: 0, 36: 1, 37: 1, 38: 0, 39: 2, 40: 3, 41: 1, 42: 2, 43: 1, 44: 1, 45: 3, 46: 1, 47: 0, 48: 1, 49: 0, 50: 1, 51: 0, 52: 0, 53: 0, 54: 1, 55: 0, 56: 3, 57: 2, 58: 0, 59: 0, 60: 0, 61: 1, 62: 0, 63: 1, 64: 1, 65: 0, 66: 1, 67: 0, 68: 2, 69: 1, 70: 0, 71: 1, 72: 1, 73: 1, 74: 0, 75: 3, 76: 1, 77: 1, 78: 2, 79: 1, 80: 1, 81: 1, 82: 1, 83: 0, 84: 1, 85: 2, 86: 1, 87: 1, 88: 0, 89: 1, 90: 1, 91: 1, 92: 0, 93: 0, 94: 0, 95: 0, 96: 0, 97: 2, 98: 1, 99: 2, 100: 2, 101: 1, 102: 1, 103: 2, 104: 1, 105: 0, 106: 1, 107: 1, 108: 1, 109: 1, 110: 1, 111: 0, 112: 1, 113: 1, 114: 0, 115: 2, 116: 2, 117: 1, 118: 2, 119: 0, 120: 1, 121: 0, 122: 1, 123: 0, 124: 2, 125: 1, 126: 0, 127: 1,

# PCNA Individual nodes (Premise, Conclusion, and Non-argumentative)

In [6]:
import torch
from torch_geometric.utils import to_networkx
from torch_geometric.data import Data
import networkx as nx
import statistics
from pathlib import Path

PT_FILES_DIR = Path("PCNA finetuned_legalBERT_pcna_processed_graph_data_for_joint_prediction_csv")

# Node label to name mapping (adjust if needed)
label_to_type = {0: "Premise", 1: "Conclusion", 2: "Non-argumentative"}

def load_data(file_path):
    return torch.load(file_path, weights_only=False)

def get_type_stats(G_nx, node_labels):
    # node_labels is a tensor or list of node types indexed by node idx
    # Create dictionaries for degrees and shortest path lengths per node type
    
    # Compute degrees
    indegrees = dict(G_nx.in_degree())
    outdegrees = dict(G_nx.out_degree())
    
    # Prepare containers keyed by node type
    type_indegrees = {0: [], 1: [], 2: []}
    type_outdegrees = {0: [], 1: [], 2: []}
    type_shortest_paths = {0: [], 1: [], 2: []}
    
    # Assign degree values per node type
    for node, ntype in enumerate(node_labels):
        type_indegrees[ntype].append(indegrees.get(node, 0))
        type_outdegrees[ntype].append(outdegrees.get(node, 0))
    
    # Compute all pairs shortest path lengths
    for source, target_dict in nx.all_pairs_shortest_path_length(G_nx):
        source_type = node_labels[source]
        for dist in target_dict.values():
            type_shortest_paths[source_type].append(dist)

    # Calculate statistics per node type
    stats = {}
    for ntype in [0, 1, 2]:
        indeg_list = type_indegrees[ntype]
        outdeg_list = type_outdegrees[ntype]
        sp_list = type_shortest_paths[ntype]
        stats[ntype] = {
            "avg_indegree": statistics.mean(indeg_list) if indeg_list else 0,
            "avg_outdegree": statistics.mean(outdeg_list) if outdeg_list else 0,
            "median_shortest_path": statistics.median(sp_list) if sp_list else 0,
            "mean_shortest_path": statistics.mean(sp_list) if sp_list else 0,
        }
    return stats

def print_type_stats(stats, file_name):
    print(f"Stats for file: {file_name}")
    for ntype, metrics in stats.items():
        print(f"  {label_to_type[ntype]} nodes:")
        print(f"    Avg. Indegree: {metrics['avg_indegree']:.4f}")
        print(f"    Avg. Outdegree: {metrics['avg_outdegree']:.4f}")
        print(f"    Median Shortest Path Length: {metrics['median_shortest_path']:.4f}")
        print(f"    Mean Shortest Path Length: {metrics['mean_shortest_path']:.4f}")
    print("-------------------------------------------------------")

def main():
    pt_files = list(PT_FILES_DIR.glob("*.pt"))
    print(f"Found {len(pt_files)} .pt files")

    # Accumulators for averages across dataset per node type
    accum_stats = {
        0: {"avg_indegree": [], "avg_outdegree": [], "median_shortest_path": [], "mean_shortest_path": []},
        1: {"avg_indegree": [], "avg_outdegree": [], "median_shortest_path": [], "mean_shortest_path": []},
        2: {"avg_indegree": [], "avg_outdegree": [], "median_shortest_path": [], "mean_shortest_path": []},
    }

    for file in pt_files:
        data = load_data(file)
        G_nx = to_networkx(data, to_undirected=False)
        node_labels = data.y.tolist()  # Assuming y is in node order

        stats = get_type_stats(G_nx, node_labels)
        print_type_stats(stats, file.name)

        # Accumulate dataset stats
        for ntype in [0, 1, 2]:
            for key in accum_stats[ntype]:
                accum_stats[ntype][key].append(stats[ntype][key])

    # Print average stats across dataset
    print("\n===== Dataset-wide averages per node type =====")
    for ntype in [0, 1, 2]:
        print(f"{label_to_type[ntype]} nodes:")
        for key, values in accum_stats[ntype].items():
            avg_val = statistics.mean(values) if values else 0
            print(f"  Average {key.replace('_', ' ').capitalize()}: {avg_val:.4f}")
        print()

if __name__ == "__main__":
    main()


Found 40 .pt files
Stats for file: R2021_World Duty Free v. Commission.pt
  Premise nodes:
    Avg. Indegree: 1.0204
    Avg. Outdegree: 1.1327
    Median Shortest Path Length: 2.0000
    Mean Shortest Path Length: 2.9304
  Conclusion nodes:
    Avg. Indegree: 1.7500
    Avg. Outdegree: 0.0000
    Median Shortest Path Length: 0.0000
    Mean Shortest Path Length: 0.0000
  Non-argumentative nodes:
    Avg. Indegree: 0.8684
    Avg. Outdegree: 0.7632
    Median Shortest Path Length: 2.0000
    Mean Shortest Path Length: 2.6105
-------------------------------------------------------
Stats for file: R2013_Telefónica SA v European Commission.pt
  Premise nodes:
    Avg. Indegree: 1.5333
    Avg. Outdegree: 1.8000
    Median Shortest Path Length: 3.0000
    Mean Shortest Path Length: 2.9708
  Conclusion nodes:
    Avg. Indegree: 0.0000
    Avg. Outdegree: 1.0000
    Median Shortest Path Length: 0.5000
    Mean Shortest Path Length: 0.5000
  Non-argumentative nodes:
    Avg. Indegree: 1.0000
