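# Graph data preparation

Builds one `torch_geometric` `Data` graph per case file from the argument-relation CSV and saves each graph as a `.pt` file.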

In [1]:
import os
from pathlib import Path
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from torch_geometric.data import Data
from tqdm import tqdm


  from scipy.sparse import csr_matrix, issparse
  return torch._C._show_config()


In [2]:
# Input CSV with the relation rows that the later cells turn into graphs.
CSV_PATH = Path("filtered_all_removed_conclusion_source.csv")
# Local checkpoint directory (presumably the same model as the Hub id
# suyamoonpathak/legalbert-pcna-finetuned -- confirm).
MODEL_DIR = Path("best_model_legalbert_pc")
# Destination folder for the serialized graphs; created up front, parents included.
OUTPUT_DIR = Path("GAUSSIAN_graph_data_for_joint_prediction_csv")
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [3]:
# Hugging Face Hub id of the fine-tuned checkpoint, named once so the tokenizer
# and the encoder are always loaded from the same place.
MODEL_ID = 'suyamoonpathak/legalbert-pcna-finetuned'

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID)
# Prefer the GPU when one is visible; everything below also works on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The model is only used for inference in this notebook, so evaluation mode
# (dropout disabled) is requested explicitly instead of relying on the loader.
model = model.to(device).eval()


2025-09-20 22:58:29.648549: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-20 22:58:29.710436: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758389309.730146  406523 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758389309.739085  406523 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1758389309.765801  406523 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [4]:
# Integer edge-type id for every relation label found in the CSV
# (kept compatible with the convention of the previous code).
relation_to_edge_type = dict(zip(("support", "attack", "no-relation"), range(3)))

# Class index of each node type; used as the node-classification target.
node_type_to_label = dict(zip(("premise", "conclusion", "non-argumentative"), range(3)))

# Ranking used when one text shows up with several types (as source and/or
# target): the type with the highest number wins.
node_type_priority = dict(
    zip(("conclusion", "premise", "non-argumentative"), (3, 2, 1))
)


In [5]:
def generate_embeddings(texts, batch_size=4):
    """Encode texts with the module-level ``model`` and return first-token vectors.

    Each batch is tokenized (padded, truncated to 512 tokens), run through the
    model without gradient tracking, and the position-0 slice of
    ``last_hidden_state`` is collected on the CPU.

    Args:
        texts: Sequence of strings to encode.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A CPU tensor with one row per input text. For empty input a tensor of
        shape ``(0, model.config.hidden_size)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(texts) == 0:
        # torch.cat rejects an empty list, so return an empty matrix instead.
        return torch.empty((0, model.config.hidden_size))

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        # list() lets any sliceable sequence (tuple, Series, ...) be passed in.
        batch = list(texts[i:i+batch_size])
        inputs = tokenizer(batch,
                           padding=True,
                           truncation=True,
                           max_length=512,
                           return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Move each batch off the device right away so GPU memory stays bounded.
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu())
    return torch.cat(embeddings, dim=0)

def generate_raw_embeddings_from_word_embeddings(texts, batch_size=8):
    """Return the static word-embedding vector of each text's first token.

    The texts are tokenized exactly as in ``generate_embeddings`` but the
    transformer layers are skipped: token ids are looked up directly in
    ``model.embeddings.word_embeddings`` and position 0 is kept.

    NOTE(review): if the tokenizer prepends the same special token (e.g. [CLS])
    to every input, every returned row is the same vector -- confirm that this
    is the intended baseline.

    Args:
        texts: Sequence of strings.
        batch_size: Number of texts per lookup; must be at least 1.

    Returns:
        A CPU tensor with one row per input text; an empty ``(0, dim)`` tensor
        for empty input.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embedding_layer = model.embeddings.word_embeddings
    if len(texts) == 0:
        # torch.cat rejects an empty list, so return an empty matrix instead.
        # assumes the layer is an nn.Embedding exposing embedding_dim -- TODO confirm
        return torch.empty((0, embedding_layer.embedding_dim))

    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = list(texts[i:i+batch_size])
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        # Token ids produced by the tokenizer for this batch
        input_ids = inputs['input_ids']

        with torch.no_grad():
            # Plain table lookup: no positional information, no attention
            word_embeddings = embedding_layer(input_ids)

        # Keep only the vector at sequence position 0
        embeddings.append(word_embeddings[:, 0, :].cpu())

    return torch.cat(embeddings, dim=0)


def generate_gaussian_embeddings(texts, batch_size=4, generator=None):
    """Return standard-normal noise shaped like the model's sentence embeddings.

    The content of ``texts`` is ignored; only its length determines the number
    of rows. The noise is created directly on the CPU in a single call, which
    avoids the earlier device round trip and no longer shadows the global
    ``device``.

    Args:
        texts: Sequence whose length gives the number of rows.
        batch_size: Unused; kept so existing calls keep working.
        generator: Optional CPU ``torch.Generator`` for reproducible noise.
            When omitted, the global torch RNG is used.

    Returns:
        A CPU tensor of shape ``(len(texts), model.config.hidden_size)``;
        zero rows for empty input.

    Raises:
        ValueError: If the model config does not provide ``hidden_size``.
    """
    hidden_size = getattr(model.config, "hidden_size", None)
    if hidden_size is None:
        raise ValueError("hidden_size must be specified to match the model's output dimensionality.")

    return torch.randn(len(texts), hidden_size, generator=generator, device="cpu")

In [6]:
def determine_node_types(source_infos, target_infos):
    """
    Resolve a single node type for every text.

    Both arguments map a text to the set of types it was seen with (as a
    source and as a target respectively). The types of a text are merged and
    the one ranked highest in ``node_type_priority`` is kept:
    conclusion > premise > non-argumentative.
    A text without any type of positive priority becomes "non-argumentative".
    """
    def rank(type_name):
        # Types missing from the priority table rank lowest.
        return node_type_priority.get(type_name, 0)

    resolved = {}
    for text in set([*source_infos, *target_infos]):
        candidates = source_infos.get(text, set()).union(target_infos.get(text, set()))
        winner = max(candidates, key=rank, default=None)
        if winner is None or rank(winner) <= 0:
            # nothing recognised for this text
            winner = "non-argumentative"
        resolved[text] = winner
    return resolved


In [7]:
# Number of distinct case files the CSV is expected to describe.
EXPECTED_NUM_FILES = 40
# Columns read by the graph-building cell; checked here so a malformed CSV
# fails immediately with a clear message.
REQUIRED_COLUMNS = (
    'file_name',
    'source_text', 'source_type',
    'target_text', 'target_type',
    'relation',
)

df = pd.read_csv(CSV_PATH)

missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{CSV_PATH} is missing required columns: {missing_columns}")

# unique() keeps first-appearance order, so cases are processed in CSV order.
file_names = df['file_name'].unique()
assert len(file_names) == EXPECTED_NUM_FILES, f"Expected {EXPECTED_NUM_FILES} files, found {len(file_names)}"


In [None]:
def build_case_graph(sub_df, file_name):
    """Build the ``Data`` graph for the rows belonging to one case file.

    Nodes are the distinct texts appearing in ``source_text`` / ``target_text``;
    every row contributes one directed edge source -> target typed by its
    ``relation``.

    Args:
        sub_df: Rows of the CSV for a single ``file_name``.
        file_name: Case identifier, stored on the graph as ``xml_file``.

    Returns:
        A ``Data`` object with ``x`` (embedding concatenated with a one-hot of
        the node type), ``edge_index``, ``edge_type``, ``y`` and ``xml_file``.

    Raises:
        ValueError: If a row carries a relation missing from
            ``relation_to_edge_type``.
    """
    # 1. Distinct node texts in first-seen order. A plain set() would order the
    #    nodes by string hash, which changes between kernels and makes the
    #    saved node indices non-reproducible.
    source_texts = sub_df['source_text'].tolist()
    target_texts = sub_df['target_text'].tolist()
    unique_texts = list(dict.fromkeys(source_texts + target_texts))
    text_to_idx = {text: idx for idx, text in enumerate(unique_texts)}

    # 2. Collect every type each text was annotated with (a text can occur in
    #    several rows, as source and as target).
    source_type_map = {}
    target_type_map = {}
    for src_text, src_type, tgt_text, tgt_type in zip(
        source_texts, sub_df['source_type'], target_texts, sub_df['target_type']
    ):
        source_type_map.setdefault(src_text, set()).add(src_type.strip().lower())
        target_type_map.setdefault(tgt_text, set()).add(tgt_type.strip().lower())

    # 3. One final type per node, chosen by priority.
    node_types = determine_node_types(source_type_map, target_type_map)

    # 4. Node embeddings.
    # NOTE(review): OUTPUT_DIR is named "GAUSSIAN_..." but this calls
    # generate_embeddings, not generate_gaussian_embeddings -- confirm which
    # feature set is meant to be written to that directory.
    embeddings = generate_embeddings(unique_texts)

    # 5. Labels and one-hot node-type features (unknown types fall back to 2).
    # NOTE(review): x contains a one-hot of the same type stored in y, so a
    # node classifier trained on x sees its own target -- confirm intended.
    node_labels = torch.tensor(
        [node_type_to_label.get(node_types[text], 2) for text in unique_texts],
        dtype=torch.long,
    )
    node_features_type = torch.nn.functional.one_hot(
        node_labels, num_classes=3
    ).to(torch.get_default_dtype())
    node_features = torch.cat([embeddings, node_features_type], dim=1)

    # 6. Edges: one per CSV row.
    edge_indices = []
    edge_types = []
    for src_text, tgt_text, rel in zip(source_texts, target_texts, sub_df['relation']):
        rel = rel.strip().lower()
        if rel not in relation_to_edge_type:
            raise ValueError(f"Unknown relation {rel!r} in file {file_name}")
        if src_text in text_to_idx and tgt_text in text_to_idx:
            edge_indices.append([text_to_idx[src_text], text_to_idx[tgt_text]])
            edge_types.append(relation_to_edge_type[rel])
        else:
            print(f"Warning: Missing node index for edge {src_text} -> {tgt_text} in file {file_name}")

    # Explicit reshape keeps the (2, num_edges) layout even with zero edges.
    edge_index = (
        torch.tensor(edge_indices, dtype=torch.long)
        .reshape(len(edge_indices), 2)
        .t()
        .contiguous()
    )
    edge_type = torch.tensor(edge_types, dtype=torch.long)

    return Data(
        x=node_features,
        edge_index=edge_index,
        edge_type=edge_type,
        y=node_labels,
        xml_file=file_name
    )


all_data = []

for file_name in tqdm(file_names, desc="Processing cases"):
    data = build_case_graph(df[df['file_name'] == file_name], file_name)

    # Save the graph as <file_name>.pt inside OUTPUT_DIR
    output_path = OUTPUT_DIR / f"{file_name}.pt"
    torch.save(data, output_path)
    all_data.append(data)




100%|██████████| 40/40 [00:00<00:00, 32902.95it/s]?it/s]
100%|██████████| 19/19 [00:00<00:00, 27460.98it/s]
100%|██████████| 25/25 [00:00<00:00, 30822.34it/s]
100%|██████████| 28/28 [00:00<00:00, 34100.03it/s]
100%|██████████| 20/20 [00:00<00:00, 25906.76it/s]
100%|██████████| 23/23 [00:00<00:00, 23488.92it/s]
100%|██████████| 26/26 [00:00<00:00, 31139.89it/s]00, 55.20it/s]
100%|██████████| 33/33 [00:00<00:00, 31579.29it/s]
100%|██████████| 32/32 [00:00<00:00, 34213.03it/s]
100%|██████████| 21/21 [00:00<00:00, 25500.98it/s]
100%|██████████| 32/32 [00:00<00:00, 29865.98it/s]
100%|██████████| 22/22 [00:00<00:00, 41583.91it/s]
100%|██████████| 22/22 [00:00<00:00, 39585.88it/s]:00, 56.58it/s]
100%|██████████| 19/19 [00:00<00:00, 43287.22it/s]
100%|██████████| 37/37 [00:00<00:00, 39021.69it/s]
100%|██████████| 39/39 [00:00<00:00, 41391.16it/s]
100%|██████████| 32/32 [00:00<00:00, 36393.09it/s]
100%|██████████| 26/26 [00:00<00:00, 30936.71it/s]
100%|██████████| 20/20 [00:00<00:00, 27530.71it

In [9]:
# Report how many graphs were built, then insist that all 40 expected cases are present.
processed_count = len(all_data)
print(f"Processed and saved {processed_count} files.")

assert processed_count == 40, (
    f"Warning: processed file count mismatch, expected 40 but got {processed_count}"
)

Processed and saved 40 files.
