In [1]:

import os
import sys
# Anchor on the directory the kernel was in the first time this cell ran.
# os.chdir() below changes what os.getcwd() returns, so without this anchor
# every re-run of the cell would climb one directory higher.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
current_dir = _NOTEBOOK_START_DIR
parent_dir = os.path.dirname(current_dir)
# Put the parent directory first on the import path, without stacking a
# duplicate entry on each re-run.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
# Set the parent directory as the current directory; relative paths used
# afterwards are resolved against it.
os.chdir(parent_dir)

In [6]:
#!/usr/bin/env python3
import json
import os
from pathlib import Path

def process_dataset(input_path, output_path):
    """
    Process a dataset by removing the 'text' field from each document.

    The input JSON is expected to be an object mapping document ids to
    document objects. For every document, the 'text' key inside its
    'note_details' object (when present) is dropped; everything else is
    written out unchanged. The loaded input structure is not mutated.

    Args:
        input_path: Path to the input JSON file
        output_path: Path to save the processed JSON file. May be a bare
            filename (written to the current directory) or include
            directories, which are created if missing.

    Returns:
        True if the processed file was written, False if any error occurred
        (the error is printed rather than raised).
    """
    print(f"Processing {input_path}...")

    try:
        # Read the JSON file (JSON text is UTF-8 by specification, so do not
        # depend on the platform's locale encoding)
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        print(f"Dataset contains {len(data)} documents")

        # Process each document to remove the text field
        processed_data = {}
        total_annotations = 0

        for doc_id, doc_data in data.items():
            # Shallow copy of the document; nested objects are still shared
            # with `data`, so 'note_details' is rebuilt below instead of
            # being edited in place.
            processed_doc = dict(doc_data)

            # Remove the text field if it exists. Only dict-shaped
            # 'note_details' can carry a 'text' key; other shapes are
            # passed through untouched.
            note_details = processed_doc.get('note_details')
            if isinstance(note_details, dict) and 'text' in note_details:
                processed_doc['note_details'] = {
                    key: value
                    for key, value in note_details.items()
                    if key != 'text'
                }

            # Count annotations
            if 'annotations' in processed_doc:
                total_annotations += len(processed_doc['annotations'])

            # Store processed document
            processed_data[doc_id] = processed_doc

        print(f"Total annotations: {total_annotations}")

        # Make sure the output directory exists. dirname() is '' for a bare
        # filename, and os.makedirs('') raises, so skip it in that case.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Save the processed data
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(processed_data, f, indent=2)

        print(f"Processed dataset saved to {output_path}")
        return True

    except Exception as e:
        # Deliberately broad: callers rely on the boolean result so that one
        # bad file does not stop a batch run.
        print(f"Error processing {input_path}: {e}")
        return False

def main(files_to_process=None):
    """
    Strip the 'text' field from a batch of annotation datasets.

    Args:
        files_to_process: Optional iterable of (input_path, output_path)
            pairs. When omitted, the three project datasets listed below are
            used and their outputs are written under public_data/.

    Inputs that do not exist are reported and skipped. Each existing input
    is handed to process_dataset(); its success/failure result is not
    inspected here.
    """
    if files_to_process is None:
        # Define the files to process
        # NOTE(review): the first input is a machine-specific absolute path;
        # pass `files_to_process` explicitly to run this elsewhere.
        files_to_process = [
            ("/home/johnwu3/projects/rare_disease/workspace/repos/RareDiseaseMention/mimic_rare_disease_annotations.json", "public_data/rd_annos_public.json"),
            ("data/dataset/filtered_rd_annos_updated_adam.json", "public_data/filtered_rd_annos_public.json"),
            ("data/dataset/reannotated_rd_annos.json", "public_data/reannotated_rd_annos_public.json")
        ]
    else:
        # Materialize once: the pairs are iterated twice below.
        files_to_process = list(files_to_process)

    # Create the output directories up front (public_data/ for the defaults)
    for _, output_path in files_to_process:
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Process each file
    for input_path, output_path in files_to_process:
        if os.path.exists(input_path):
            process_dataset(input_path, output_path)
        else:
            print(f"File not found: {input_path}")

    print("Processing complete!")

# Entry-point guard: run the batch only when this code is executed directly
# (the recorded cell output shows main() running when the cell is executed),
# not when the definitions above are imported from another module.
if __name__ == "__main__":
    main()

Processing /home/johnwu3/projects/rare_disease/workspace/repos/RareDiseaseMention/mimic_rare_disease_annotations.json...
Dataset contains 312 documents
Total annotations: 1073
Processed dataset saved to public_data/rd_annos_public.json
Processing data/dataset/filtered_rd_annos_updated_adam.json...
Dataset contains 117 documents
Total annotations: 333
Processed dataset saved to public_data/filtered_rd_annos_public.json
File not found: data/dataset/reannotated_rd_annos.json
Processing complete!
