In [1]:
import json
from collections import defaultdict
from typing import Dict, Set, List

def analyze_jsonl_structure(file_path: str) -> Dict:
    """
    Analyzes the structure of a JSONL file and returns detailed information about its contents.

    Blank lines are ignored. Lines that are not valid JSON, or that are valid
    JSON but not an object (e.g. a bare list or number), are reported with a
    warning on stdout and contribute no fields; they are still counted in
    ``total_records``.

    Args:
        file_path (str): Path to the JSONL file

    Returns:
        Dict containing:
        - total_records: Number of non-blank lines in the file (including
          lines that could not be analyzed)
        - fields: Sorted list of all unique fields across all records
        - field_types: Dictionary mapping each field name to a sorted list of
          the Python type names observed for it ('null' for JSON null)
        - sample_record: First successfully parsed record (None if there is none)
        - field_presence: Percentage of total_records each field appears in
    """

    # Initialize tracking variables
    fields = set()
    field_types = defaultdict(set)
    field_count = defaultdict(int)
    total_records = 0
    sample_record = None

    # Read and analyze the file (JSON text is UTF-8 by specification)
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                # A blank line (typically a trailing newline) is not a record.
                continue

            total_records += 1
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"Warning: Invalid JSON on line {line_number}: {e}")
                continue

            if not isinstance(record, dict):
                # Only JSON objects have fields; anything else cannot be analyzed.
                print(
                    f"Warning: Line {line_number} is not a JSON object "
                    f"(got {type(record).__name__}); skipping"
                )
                continue

            # Keep the first record that actually parsed, even when earlier
            # lines were invalid.
            if sample_record is None:
                sample_record = record

            # Track fields and their types
            for field, value in record.items():
                fields.add(field)
                field_count[field] += 1
                field_types[field].add(
                    type(value).__name__ if value is not None else 'null'
                )

    # Calculate field presence percentages. field_count is empty whenever
    # total_records is 0, so the division never runs on an empty file.
    field_presence = {
        field: (count / total_records) * 100
        for field, count in field_count.items()
    }

    # Sorted lists make the report deterministic from run to run.
    field_types = {k: sorted(v) for k, v in field_types.items()}

    return {
        'total_records': total_records,
        'fields': sorted(fields),
        'field_types': field_types,
        'sample_record': sample_record,
        'field_presence': field_presence
    }

def print_analysis(analysis: Dict) -> None:
    """
    Writes a human-readable report of a JSONL structure analysis to stdout.

    The report lists the record count, the sample record as indented JSON,
    and then one section per field with its presence percentage and the
    data types observed for it.

    Args:
        analysis (Dict): Output from analyze_jsonl_structure
    """
    divider = "-" * 80

    # Header: record count followed by the pretty-printed sample record.
    record_total = analysis['total_records']
    print(f"Total records: {record_total}\n")

    print("Sample record:")
    sample_json = json.dumps(analysis['sample_record'], indent=2)
    print(sample_json)
    print("\nField analysis:")
    print(divider)

    # One section per field, each closed by a divider line.
    for name in analysis['fields']:
        share = analysis['field_presence'][name]
        observed = analysis['field_types'][name]
        print("Field: " + str(name))
        print("Present in: " + format(share, ".1f") + "% of records")
        print("Data types: " + ', '.join(observed))
        print(divider)

# Usage example: analyze the simulated-patient JSONL file and print the report.
# NOTE(review): file_path is an absolute path inside one user's home directory,
# so this cell only runs on a machine that has that exact layout.
if __name__ == "__main__":
    file_path = "/home/johnwu3/projects/rare_disease/workspace/repos/simulated_data/simulated_patients_formatted.jsonl"
    # Collect the per-field statistics, then write them to stdout.
    analysis = analyze_jsonl_structure(file_path)
    print_analysis(analysis)

Total records: 42680

Sample record:
{
  "disease_id": "254864",
  "true_genes": [
    "ENSG00000210194"
  ],
  "age": "Onset_Infant",
  "positive_phenotypes": {
    "HP:0001290": [
      "init_phenotypes"
    ],
    "HP:0001324": [
      "init_phenotypes"
    ],
    "HP:0003198": [
      "init_phenotypes"
    ],
    "HP:0003200": [
      "init_phenotypes"
    ],
    "HP:0003688": [
      "init_phenotypes"
    ],
    "HP:0009051": [
      "init_phenotypes"
    ],
    "HP:0008180": [
      "init_phenotypes"
    ],
    "HP:0011923": [
      "init_phenotypes"
    ],
    "HP:0000218": [
      "phenotype_distractor.1",
      "phenotype_distractor.3",
      "phenotype_distractor.6",
      "phenotype_distractor.7",
      "phenotype_distractor.8"
    ],
    "HP:0000316": [
      "noisy_phenotype"
    ],
    "HP:0000510": [
      "noisy_phenotype"
    ],
    "HP:0100257": [
      "noisy_phenotype"
    ],
    "HP:0100336": [
      "noisy_phenotype"
    ],
    "HP:0100495": [
      "noisy_phenoty

In [1]:
# interesting sanity check

import json
import os
from typing import Dict, List

def extract_subset_data(input_file: str, output_file: str) -> None:
    """
    Extracts specific fields from a JSONL file and creates a new dataset.

    Each output record keeps only 'disease_id', 'id', 'age' and
    'positive_phenotypes', in that order. Blank lines are ignored. Lines that
    are invalid JSON, are not JSON objects, or lack one of the required
    fields are reported with a warning on stdout and skipped, so one bad line
    does not abort the whole extraction.

    The input is read completely before the output is opened, and the output
    file's parent directory is created if it does not exist.

    Args:
        input_file (str): Path to input JSONL file
        output_file (str): Path to output JSONL file
    """

    # Fields copied into every output record, in output key order.
    required_fields = ('disease_id', 'id', 'age', 'positive_phenotypes')

    # Initialize list to store processed records
    processed_records = []

    # Read and process the input file
    with open(input_file, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                # A blank line (typically a trailing newline) is not a record.
                continue

            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"Warning: Invalid JSON on line {line_number}: {e}")
                continue

            if not isinstance(record, dict):
                print(
                    f"Warning: Line {line_number} is not a JSON object "
                    f"(got {type(record).__name__}); skipping"
                )
                continue

            missing = [name for name in required_fields if name not in record]
            if missing:
                print(
                    f"Warning: Line {line_number} is missing field(s) "
                    f"{', '.join(missing)}; skipping"
                )
                continue

            # Extract only the fields we want
            processed_records.append({name: record[name] for name in required_fields})

    # Make sure the destination directory exists before opening the file;
    # a bare filename has no directory component and needs nothing created.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the processed records to the output file, one JSON object per line
    with open(output_file, 'w', encoding='utf-8') as file:
        for record in processed_records:
            json.dump(record, file)
            file.write('\n')

    print(f"Processed {len(processed_records)} records")
    print(f"Output written to {output_file}")

# Example usage: reduce the simulated-patient file to the four fields kept by
# extract_subset_data and write the result as a new JSONL dataset.
# NOTE(review): input_file is an absolute path inside one user's home directory,
# while output_file is relative and resolves against the current working directory.
if __name__ == "__main__":
    input_file = "/home/johnwu3/projects/rare_disease/workspace/repos/simulated_data/simulated_patients_formatted.jsonl"  # Update with your input file path
    output_file = "data/dataset/rd_phenotype_simulated_data.jsonl"  # Update with your desired output file path
    extract_subset_data(input_file, output_file)

Processed 42680 records
Output written to data/dataset/rd_phenotype_simulated_data.jsonl


In [3]:
import json
import random
from typing import List, Dict, Tuple, Optional
from collections import defaultdict

class DataLoader:
    """
    In-memory loader for a JSONL dataset of phenotype records.

    Every record is reduced to its 'age', 'disease_id' and the list of keys
    of 'positive_phenotypes'. While loading, each distinct phenotype ID is
    assigned an integer index in order of first appearance.
    """

    def __init__(self, file_path: str, batch_size: Optional[int] = None, seed: Optional[int] = None):
        """
        Initialize the data loader.

        Args:
            file_path (str): Path to the JSONL file
            batch_size (int, optional): Size of batches to return
            seed (int, optional): Random seed for reproducibility
        """
        self.file_path = file_path
        self.batch_size = batch_size
        self.data = []
        self.phenotype_to_idx = {}  # Cache for phenotype IDs to indices

        # Set random seed if provided. This seeds the module-level `random`
        # generator, which is also what get_sample draws from.
        if seed is not None:
            random.seed(seed)

        # Load and process the data
        self._load_data()

    def _load_data(self) -> None:
        """Load and preprocess the data from the JSONL file, skipping blank lines."""
        with open(self.file_path, 'r', encoding='utf-8') as file:
            for line in file:
                stripped = line.strip()
                if not stripped:
                    # A blank line (typically a trailing newline) is not a record.
                    continue
                record = json.loads(stripped)

                # Extract relevant fields
                processed_record = {
                    'age': record['age'],
                    'disease_id': record['disease_id'],
                    # Get all positive phenotypes
                    'phenotypes': list(record['positive_phenotypes'].keys())
                }
                self.data.append(processed_record)

                # Update phenotype vocabulary: indices are handed out in
                # order of first appearance.
                for phenotype in processed_record['phenotypes']:
                    if phenotype not in self.phenotype_to_idx:
                        self.phenotype_to_idx[phenotype] = len(self.phenotype_to_idx)

    def _convert_phenotypes_to_indices(self, phenotypes: List[str]) -> List[int]:
        """Convert phenotype IDs to indices (KeyError for an ID not in the vocabulary)."""
        return [self.phenotype_to_idx[p] for p in phenotypes]

    def get_vocab_size(self) -> int:
        """Return the size of the phenotype vocabulary."""
        return len(self.phenotype_to_idx)

    def get_sample(self, num_samples: int = 1) -> List[Dict]:
        """
        Get random samples (without replacement) from the dataset.

        Args:
            num_samples (int): Number of samples to return

        Returns:
            List of dictionaries containing age, phenotypes, and disease_id

        Raises:
            ValueError: If num_samples is larger than the dataset (raised by
                random.sample)
        """
        return random.sample(self.data, num_samples)

    def get_batch(self) -> List[Dict]:
        """
        Get a batch of samples from the dataset.

        Returns:
            List of dictionaries containing age, phenotypes, and disease_id

        Raises:
            ValueError: If no batch size was given at initialization
        """
        if self.batch_size is None:
            raise ValueError("Batch size not specified during initialization")

        return self.get_sample(self.batch_size)

    def get_indexed_sample(self, num_samples: int = 1) -> List[Dict]:
        """
        Get random samples with phenotypes converted to indices.

        The returned dictionaries are copies of the stored records with an
        extra 'phenotype_indices' key; the records held in self.data are left
        untouched, so later get_sample/get_batch calls keep returning only
        age, phenotypes, and disease_id.

        Args:
            num_samples (int): Number of samples to return

        Returns:
            List of dictionaries with phenotypes as indices
        """
        return [
            {
                **sample,
                'phenotype_indices': self._convert_phenotypes_to_indices(sample['phenotypes']),
            }
            for sample in self.get_sample(num_samples)
        ]

# Example usage: load the extracted dataset and show what the loader returns.
if __name__ == "__main__":
    # Initialize the data loader (seed=42 seeds Python's global `random` module)
    loader = DataLoader("data/dataset/rd_phenotype_simulated_data.jsonl", batch_size=32, seed=42)
    
    # Get a single sample (returned as a one-element list)
    single_sample = loader.get_sample(1)
    print("\nSingle sample:")
    print(json.dumps(single_sample, indent=2))
    
    # Draw 3 random samples with indexed phenotypes; this call takes its own
    # count and does not use the batch_size=32 configured above.
    batch = loader.get_indexed_sample(3)
    print("\nBatch with indexed phenotypes:")
    print(json.dumps(batch, indent=2))
    
    # Print vocabulary size
    print(f"\nTotal number of unique phenotypes: {loader.get_vocab_size()}")


Single sample:
[
  {
    "age": "Onset_Adult",
    "disease_id": "2300",
    "phenotypes": [
      "HP:0001561",
      "HP:0100867",
      "HP:0100014",
      "HP:0001249",
      "HP:0004326",
      "HP:0002380",
      "HP:0002032",
      "HP:0012395",
      "HP:0001126",
      "HP:0009797"
    ]
  }
]

Batch with indexed phenotypes:
[
  {
    "age": "Onset_Infant",
    "disease_id": "485",
    "phenotypes": [
      "HP:0000162",
      "HP:0000175",
      "HP:0000488",
      "HP:0000545",
      "HP:0000926",
      "HP:0001373",
      "HP:0002652",
      "HP:0002758",
      "HP:0002983",
      "HP:0005280",
      "HP:0005930",
      "HP:0010306",
      "HP:0100625",
      "HP:0003016",
      "HP:0010674",
      "HP:0003071",
      "HP:0009103",
      "HP:0000309",
      "HP:0000518",
      "HP:0100256",
      "HP:0000365",
      "HP:0000508",
      "HP:0005268",
      "HP:0000568",
      "HP:0001947",
      "HP:0012587"
    ],
    "phenotype_indices": [
      1783,
      433,
      640