In [1]:
import os
import sys
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)
# Set the parent directory as the current directory
os.chdir(parent_dir)

In [4]:
import json
import re
import os
from typing import List, Dict, Any
from tqdm import tqdm  # For progress tracking

def extract_clinical_sentences(input_file: str) -> List[str]:
    """
    Extract sentences from clinical texts in a JSON file.
    
    Args:
        input_file: Path to JSON file with clinical notes
        
    Returns:
        List of sentences extracted from clinical texts
    """
    # Check if file exists
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")
    
    # Load JSON data
    try:
        with open(input_file, 'r') as f:
            data = json.load(f)
            
        # Handle different possible input formats
        if isinstance(data, dict):
            # Format is {case_id: case_data}
            # Convert string keys to integers if necessary
            if all(isinstance(k, str) for k in data.keys()):
                data = {int(k): v for k, v in data.items()}
        else:
            raise ValueError(f"Unsupported input format in {input_file}")
            
        print(f"Loaded clinical data with {len(data)} cases")
    except json.JSONDecodeError:
        raise ValueError(f"Invalid JSON format in file: {input_file}")
    
    all_sentences = []
    
    # Process each clinical case
    for case_id, case_data in tqdm(data.items(), desc="Processing clinical notes"):
        if not isinstance(case_data, dict):
            print(f"Warning: Case {case_id} data is not a dictionary. Skipping.")
            continue
            
        # Extract clinical text
        clinical_text = case_data.get("clinical_text", "")
        if not clinical_text:
            print(f"Warning: Case {case_id} missing 'clinical_text' field. Skipping.")
            continue
        
        # Split into sentences
        case_sentences = split_into_sentences(clinical_text)
        
        # Add to all sentences
        all_sentences.extend(case_sentences)
    
    # Remove duplicates while preserving order
    unique_sentences = []
    seen = set()
    for sentence in all_sentences:
        if sentence and sentence not in seen:
            seen.add(sentence)
            unique_sentences.append(sentence)
    
    print(f"Extracted {len(unique_sentences)} unique sentences from clinical texts")
    return unique_sentences

def split_into_sentences(text: str) -> List[str]:
    """
    Split clinical text into sentences using multiple delimiter rules.
    
    Args:
        text: Clinical text to split into sentences
        
    Returns:
        List of sentences
    """
    if not text or not isinstance(text, str):
        return []
    
    # First split by common sentence terminators while preserving them
    sentence_parts = []
    for part in re.split(r'([.!?])', text):
        if part.strip():
            if part in '.!?':
                if sentence_parts:
                    sentence_parts[-1] += part
            else:
                sentence_parts.append(part.strip())
    
    # Then handle other clinical note delimiters like line breaks and semicolons
    sentences = []
    for part in sentence_parts:
        # Split by semicolons and newlines
        for subpart in re.split(r'[;\n]', part):
            if subpart.strip():
                sentences.append(subpart.strip())
    
    return sentences

# Example usage
if __name__ == "__main__":
    # Path to your JSON file with clinical notes
    input_file = "data/dataset/mine_hpo.json"
    
    try:
        sentences = extract_clinical_sentences(input_file)
        print(f"Total sentences extracted: {len(sentences)}")
        
        # Print a few examples
        print("\nExample sentences:")
        for i, sentence in enumerate(sentences[:5]):
            print(f"{i+1}. {sentence}")
            
        # Optionally save to a text file
        output_file = "clinical_sentences.txt"
        with open(output_file, 'w') as f:
            for sentence in sentences:
                f.write(f"{sentence}\n")
        print(f"\nSaved all sentences to {output_file}")
        
    except Exception as e:
        print(f"Error: {e}")

Loaded clinical data with 116 cases


Processing clinical notes: 100%|██████████| 116/116 [00:00<00:00, 20485.86it/s]

Extracted 2210 unique sentences from clinical texts
Total sentences extracted: 2210

Example sentences:
1. A 44-year- old super-morbidly- obese man body mass index (BMI 63) underwent sleeve gastrectomy for weight loss and was found to have multiple adenomatous fundic gland polyps on final pathology.
2. Subsequent workup included esophagogastroduodenoscopy which revealed innumerable polyps of the remaining gastric fundus and body consistent with fundic gland polyps, normal duodenum without polyps, and Barrett’s oesophagus.
3. Colonoscopy was significant for innumerable polyps of varying sizes up to 1.
4. 5 cm throughout the colon, with relative rectal sparing.
5. Biopsies were consistent with tubular adenoma and hyperplastic polyps.

Saved all sentences to clinical_sentences.txt





In [None]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
import json
import re
from typing import List, Dict, Any
import os
import csv
from tqdm.notebook import tqdm

def extract_clinical_sentences(input_file: str) -> List[str]:
    """
    Extract sentences from clinical texts in a JSON file.
    
    Args:
        input_file: Path to JSON file with clinical notes
        
    Returns:
        List of sentences extracted from clinical texts
    """
    # Check if file exists
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")
    
    # Load JSON data
    try:
        with open(input_file, 'r') as f:
            data = json.load(f)
            
        # Handle different possible input formats
        if isinstance(data, dict):
            # Format is {case_id: case_data}
            # Convert string keys to integers if necessary
            if all(isinstance(k, str) for k in data.keys()):
                data = {int(k): v for k, v in data.items()}
        else:
            raise ValueError(f"Unsupported input format in {input_file}")
            
        print(f"Loaded clinical data with {len(data)} cases")
    except json.JSONDecodeError:
        raise ValueError(f"Invalid JSON format in file: {input_file}")
    
    all_sentences = []
    
    # Process each clinical case
    for case_id, case_data in tqdm(data.items(), desc="Processing clinical notes"):
        if not isinstance(case_data, dict):
            print(f"Warning: Case {case_id} data is not a dictionary. Skipping.")
            continue
            
        # Extract clinical text
        clinical_text = case_data.get("clinical_text", "")
        if not clinical_text:
            print(f"Warning: Case {case_id} missing 'clinical_text' field. Skipping.")
            continue
        
        # Split into sentences
        case_sentences = split_into_sentences(clinical_text)
        
        # Add to all sentences with case ID
        for sentence in case_sentences:
            all_sentences.append({
                'case_id': str(case_id),
                'sentence': sentence
            })
    
    # Remove duplicates while preserving order
    unique_sentences = []
    seen = set()
    for sentence_info in all_sentences:
        sentence = sentence_info['sentence']
        if sentence and sentence not in seen:
            seen.add(sentence)
            unique_sentences.append(sentence_info)
    
    print(f"Extracted {len(unique_sentences)} unique sentences from clinical texts")
    return unique_sentences

def split_into_sentences(text: str) -> List[str]:
    """
    Split clinical text into sentences using multiple delimiter rules.
    
    Args:
        text: Clinical text to split into sentences
        
    Returns:
        List of sentences
    """
    if not text or not isinstance(text, str):
        return []
    
    # First split by common sentence terminators while preserving them
    sentence_parts = []
    for part in re.split(r'([.!?])', text):
        if part.strip():
            if part in '.!?':
                if sentence_parts:
                    sentence_parts[-1] += part
            else:
                sentence_parts.append(part.strip())
    
    # Then handle other clinical note delimiters like line breaks and semicolons
    sentences = []
    for part in sentence_parts:
        # Split by semicolons and newlines
        for subpart in re.split(r'[;\n]', part):
            if subpart.strip():
                sentences.append(subpart.strip())
    
    return sentences

class PhenotypeAnnotator:
    def __init__(self, sentences, output_file='phenotype_annotations.csv'):
        """
        Initialize the phenotype annotator.
        
        Args:
            sentences: List of sentence dictionaries with case_id and sentence
            output_file: Path to output CSV file
        """
        self.sentences = sentences
        self.output_file = output_file
        self.current_index = 0
        self.annotations = []
        
        # Check if output file exists and load previous annotations
        self.load_existing_annotations()
        
        # Create widgets
        self.setup_widgets()
        
    def load_existing_annotations(self):
        """Load existing annotations if output file exists."""
        if os.path.exists(self.output_file):
            try:
                df = pd.read_csv(self.output_file)
                self.annotations = df.to_dict('records')
                
                # Find the highest index that has been annotated
                annotated_sentences = set(df['sentence'].tolist())
                for i, sentence_info in enumerate(self.sentences):
                    if sentence_info['sentence'] not in annotated_sentences:
                        self.current_index = i
                        break
                else:
                    # All sentences have been annotated
                    self.current_index = len(self.sentences)
                    
                print(f"Loaded {len(self.annotations)} previous annotations")
                print(f"Continuing from index {self.current_index}")
            except Exception as e:
                print(f"Error loading previous annotations: {e}")
    
    def setup_widgets(self):
        """Set up the Jupyter widgets for annotation."""
        # Sentence display
        self.sentence_display = widgets.HTML(
            value="<h3>Sentence:</h3><p style='background-color: #f0f0f0; padding: 10px;'></p>"
        )
        
        # Question 1: Does this sentence imply a phenotype?
        self.implies_phenotype = widgets.RadioButtons(
            options=[('Yes', True), ('No', False)],
            description='Does this sentence imply a phenotype not already described explicitly in text?',
            disabled=False
        )
        
        # Question 2: What text implies a phenotype?
        self.text_implies = widgets.Text(
            description='Text that implies:',
            disabled=True
        )
        
        # Question 3: What phenotype?
        self.phenotype = widgets.Text(
            description='Phenotype:',
            disabled=True
        )
        
        # Navigation buttons
        self.prev_button = widgets.Button(
            description='Previous',
            disabled=True,
            button_style='info'
        )
        self.next_button = widgets.Button(
            description='Save & Next',
            button_style='success'
        )
        self.save_button = widgets.Button(
            description='Save All',
            button_style='danger'
        )
        
        # Progress
        self.progress = widgets.IntProgress(
            value=0,
            min=0,
            max=len(self.sentences),
            description='Progress:',
            style={'bar_color': '#0080ff'}
        )
        
        # Set up callbacks
        self.implies_phenotype.observe(self._on_implies_phenotype_change, names='value')
        self.prev_button.on_click(self._on_prev_click)
        self.next_button.on_click(self._on_next_click)
        self.save_button.on_click(self._on_save_click)
        
        # Layout
        self.main_box = widgets.VBox([
            self.sentence_display,
            self.implies_phenotype,
            self.text_implies,
            self.phenotype,
            widgets.HBox([self.prev_button, self.next_button, self.save_button]),
            self.progress
        ])
    
    def _on_implies_phenotype_change(self, change):
        """Enable/disable text fields based on implies_phenotype selection."""
        if change['new']:  # If Yes
            self.text_implies.disabled = False
            self.phenotype.disabled = False
        else:  # If No
            self.text_implies.disabled = True
            self.phenotype.disabled = True
            # Clear the fields
            self.text_implies.value = ''
            self.phenotype.value = ''
    
    def _on_prev_click(self, b):
        """Handle previous button click."""
        if self.current_index > 0:
            self.current_index -= 1
            self._update_display()
    
    def _on_next_click(self, b):
        """Handle next button click."""
        self._save_current_annotation()
        if self.current_index < len(self.sentences) - 1:
            self.current_index += 1
            self._update_display()
        else:
            # End of sentences
            self._save_annotations()
            self.sentence_display.value = "<h3>Annotation Complete!</h3><p>All sentences have been annotated.</p>"
            self.implies_phenotype.disabled = True
            self.text_implies.disabled = True
            self.phenotype.disabled = True
            self.next_button.disabled = True
    
    def _on_save_click(self, b):
        """Handle save button click."""
        self._save_current_annotation()
        self._save_annotations()
        print(f"Annotations saved to {self.output_file}")
    
    def _save_current_annotation(self):
        """Save the current annotation."""
        if self.current_index < len(self.sentences):
            sentence_info = self.sentences[self.current_index]
            
            # Check if this sentence has already been annotated
            for i, annotation in enumerate(self.annotations):
                if annotation['sentence'] == sentence_info['sentence']:
                    # Update existing annotation
                    self.annotations[i] = {
                        'case_id': sentence_info['case_id'],
                        'sentence': sentence_info['sentence'],
                        'implies_phenotype': self.implies_phenotype.value,
                        'text_implies': self.text_implies.value if self.implies_phenotype.value else '',
                        'phenotype': self.phenotype.value if self.implies_phenotype.value else ''
                    }
                    break
            else:
                # Add new annotation
                self.annotations.append({
                    'case_id': sentence_info['case_id'],
                    'sentence': sentence_info['sentence'],
                    'implies_phenotype': self.implies_phenotype.value,
                    'text_implies': self.text_implies.value if self.implies_phenotype.value else '',
                    'phenotype': self.phenotype.value if self.implies_phenotype.value else ''
                })
    
    def _save_annotations(self):
        """Save all annotations to CSV file."""
        df = pd.DataFrame(self.annotations)
        df.to_csv(self.output_file, index=False)
    
    def _update_display(self):
        """Update the display for the current sentence."""
        if self.current_index < len(self.sentences):
            sentence_info = self.sentences[self.current_index]
            self.sentence_display.value = f"<h3>Sentence {self.current_index + 1}/{len(self.sentences)}:</h3>" \
                                        f"<p style='background-color: #f0f0f0; padding: 10px;'>{sentence_info['sentence']}</p>" \
                                        f"<p>Case ID: {sentence_info['case_id']}</p>"
            
            # Check if this sentence has been annotated before
            for annotation in self.annotations:
                if annotation['sentence'] == sentence_info['sentence']:
                    # Load existing annotation
                    self.implies_phenotype.value = annotation['implies_phenotype']
                    self.text_implies.value = annotation['text_implies']
                    self.phenotype.value = annotation['phenotype']
                    break
            else:
                # Reset fields
                self.implies_phenotype.value = False
                self.text_implies.value = ''
                self.phenotype.value = ''
            
            # Update UI based on implies_phenotype value
            self.text_implies.disabled = not self.implies_phenotype.value
            self.phenotype.disabled = not self.implies_phenotype.value
            
            # Update progress
            self.progress.value = self.current_index
            
            # Update button states
            self.prev_button.disabled = (self.current_index == 0)
            self.next_button.disabled = False
    
    def start(self):
        """Start the annotation process."""
        self._update_display()
        display(self.main_box)

# Example usage
def start_annotation(input_file, output_file='phenotype_annotations.csv'):
    """Start the annotation process for a clinical text file."""
    sentences = extract_clinical_sentences(input_file)
    annotator = PhenotypeAnnotator(sentences, output_file)
    annotator.start()
    return annotator

# You would use this in your notebook with:
annotator = start_annotation('data/dataset/mine_hpo.json', 'test_annotations.csv')

Loaded clinical data with 116 cases


Processing clinical notes:   0%|          | 0/116 [00:00<?, ?it/s]

Extracted 2210 unique sentences from clinical texts


VBox(children=(HTML(value="<h3>Sentence 1/2210:</h3><p style='background-color: #f0f0f0; padding: 10px;'>A 44-…

In [4]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
import json
import re
from typing import List, Dict, Any
import os
import csv
from tqdm.notebook import tqdm

def extract_clinical_sentences(input_file: str) -> List[str]:
    """
    Extract sentences from clinical texts in a JSON file.
    
    Args:
        input_file: Path to JSON file with clinical notes
        
    Returns:
        List of sentences extracted from clinical texts
    """
    # Check if file exists
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")
    
    # Load JSON data
    try:
        with open(input_file, 'r') as f:
            data = json.load(f)
            
        # Handle different possible input formats
        if isinstance(data, dict):
            # Format is {case_id: case_data}
            # Convert string keys to integers if necessary
            if all(isinstance(k, str) for k in data.keys()):
                data = {int(k): v for k, v in data.items()}
        else:
            raise ValueError(f"Unsupported input format in {input_file}")
            
        print(f"Loaded clinical data with {len(data)} cases")
    except json.JSONDecodeError:
        raise ValueError(f"Invalid JSON format in file: {input_file}")
    
    all_sentences = []
    
    # Process each clinical case
    for case_id, case_data in tqdm(data.items(), desc="Processing clinical notes"):
        if not isinstance(case_data, dict):
            print(f"Warning: Case {case_id} data is not a dictionary. Skipping.")
            continue
            
        # Extract clinical text
        clinical_text = case_data.get("clinical_text", "")
        if not clinical_text:
            print(f"Warning: Case {case_id} missing 'clinical_text' field. Skipping.")
            continue
        
        # Split into sentences
        case_sentences = split_into_sentences(clinical_text)
        
        # Add to all sentences with case ID
        for sentence in case_sentences:
            all_sentences.append({
                'case_id': str(case_id),
                'sentence': sentence
            })
    
    # Remove duplicates while preserving order
    unique_sentences = []
    seen = set()
    for sentence_info in all_sentences:
        sentence = sentence_info['sentence']
        if sentence and sentence not in seen:
            seen.add(sentence)
            unique_sentences.append(sentence_info)
    
    print(f"Extracted {len(unique_sentences)} unique sentences from clinical texts")
    return unique_sentences

def split_into_sentences(text: str) -> List[str]:
    """
    Split clinical text into sentences using multiple delimiter rules.
    
    Args:
        text: Clinical text to split into sentences
        
    Returns:
        List of sentences
    """
    if not text or not isinstance(text, str):
        return []
    
    # First split by common sentence terminators while preserving them
    sentence_parts = []
    for part in re.split(r'([.!?])', text):
        if part.strip():
            if part in '.!?':
                if sentence_parts:
                    sentence_parts[-1] += part
            else:
                sentence_parts.append(part.strip())
    
    # Then handle other clinical note delimiters like line breaks and semicolons
    sentences = []
    for part in sentence_parts:
        # Split by semicolons and newlines
        for subpart in re.split(r'[;\n]', part):
            if subpart.strip():
                sentences.append(subpart.strip())
    
    return sentences

class PhenotypeAnnotator:
    def __init__(self, sentences, output_file='phenotype_annotations.csv'):
        """
        Initialize the phenotype annotator.
        
        Args:
            sentences: List of sentence dictionaries with case_id and sentence
            output_file: Path to output CSV file
        """
        self.sentences = sentences
        self.output_file = output_file
        self.current_index = 0
        self.annotations = []
        
        # Check if output file exists and load previous annotations
        self.load_existing_annotations()
        
        # Create widgets
        self.setup_widgets()
        
    def load_existing_annotations(self):
        """Load existing annotations if output file exists."""
        if os.path.exists(self.output_file):
            try:
                df = pd.read_csv(self.output_file)
                self.annotations = df.to_dict('records')
                
                # Find the highest index that has been annotated
                annotated_sentences = set(df['sentence'].tolist())
                for i, sentence_info in enumerate(self.sentences):
                    if sentence_info['sentence'] not in annotated_sentences:
                        self.current_index = i
                        break
                else:
                    # All sentences have been annotated
                    self.current_index = len(self.sentences)
                    
                print(f"Loaded {len(self.annotations)} previous annotations")
                print(f"Continuing from index {self.current_index}")
            except Exception as e:
                print(f"Error loading previous annotations: {e}")
    
    def setup_widgets(self):
        """Set up the Jupyter widgets for annotation."""
        # Sentence display with increased height for better visibility
        self.sentence_display = widgets.HTML(
            value="<h3>Sentence:</h3><p style='background-color: #f0f0f0; padding: 15px; max-width: 100%; word-wrap: break-word;'></p>"
        )
        
        # Question 1: Does this sentence imply a phenotype?
        self.q1_label = widgets.HTML(
            value="<p style='font-weight: bold; margin-bottom: 5px;'>Question 1: Does this sentence or context imply a phenotype that doesn't already explicitly exist in the text?</p>"
        )
        self.implies_phenotype = widgets.RadioButtons(
            options=[('Yes', True), ('No', False)],
            description='Answer:',
            disabled=False,
            layout=widgets.Layout(width='300px')
        )
        
        # Question 2: What text implies a phenotype?
        self.q2_label = widgets.HTML(
            value="<p style='font-weight: bold; margin-bottom: 5px; margin-top: 15px;'>Question 2: What specific piece of text implies a phenotype?</p>"
        )
        self.text_implies = widgets.Text(
            description='Answer:',
            disabled=True,
            layout=widgets.Layout(width='80%')
        )
        
        # Question 3: What phenotype?
        self.q3_label = widgets.HTML(
            value="<p style='font-weight: bold; margin-bottom: 5px; margin-top: 15px;'>Question 3: What phenotype is implied?</p>"
        )
        self.phenotype = widgets.Text(
            description='Answer:',
            disabled=True,
            layout=widgets.Layout(width='80%')
        )
        
        # Navigation buttons with increased width
        button_layout = widgets.Layout(width='150px', height='40px')
        self.prev_button = widgets.Button(
            description='Previous',
            disabled=True,
            button_style='info',
            layout=button_layout
        )
        self.next_button = widgets.Button(
            description='Save & Next',
            button_style='success',
            layout=button_layout
        )
        self.save_button = widgets.Button(
            description='Save All',
            button_style='danger',
            layout=button_layout
        )
        
        # Progress bar with increased width
        self.progress = widgets.IntProgress(
            value=0,
            min=0,
            max=len(self.sentences),
            description='Progress:',
            style={'bar_color': '#0080ff'},
            layout=widgets.Layout(width='50%', height='30px')
        )
        
        # Set up callbacks
        self.implies_phenotype.observe(self._on_implies_phenotype_change, names='value')
        self.prev_button.on_click(self._on_prev_click)
        self.next_button.on_click(self._on_next_click)
        self.save_button.on_click(self._on_save_click)
        
        # Layout with better spacing and grouping
        self.main_box = widgets.VBox([
            self.sentence_display,
            widgets.HTML(value="<hr>"),  # Add a separator
            self.q1_label,
            self.implies_phenotype,
            self.q2_label,
            self.text_implies,
            self.q3_label,
            self.phenotype,
            widgets.HTML(value="<hr>"),  # Add a separator
            widgets.HBox([self.prev_button, self.next_button, self.save_button], 
                        layout=widgets.Layout(justify_content='space-around', margin='20px 0px')),
            self.progress
        ], layout=widgets.Layout(width='100%', padding='10px'))
    
    def _on_implies_phenotype_change(self, change):
        """Enable/disable text fields based on implies_phenotype selection."""
        if change['new']:  # If Yes
            self.text_implies.disabled = False
            self.phenotype.disabled = False
        else:  # If No
            self.text_implies.disabled = True
            self.phenotype.disabled = True
            # Clear the fields
            self.text_implies.value = ''
            self.phenotype.value = ''
    
    def _on_prev_click(self, b):
        """Handle previous button click."""
        if self.current_index > 0:
            self.current_index -= 1
            self._update_display()
    
    def _on_next_click(self, b):
        """Handle next button click."""
        self._save_current_annotation()
        if self.current_index < len(self.sentences) - 1:
            self.current_index += 1
            self._update_display()
        else:
            # End of sentences
            self._save_annotations()
            self.sentence_display.value = "<h3>Annotation Complete!</h3><p>All sentences have been annotated.</p>"
            self.implies_phenotype.disabled = True
            self.text_implies.disabled = True
            self.phenotype.disabled = True
            self.next_button.disabled = True
    
    def _on_save_click(self, b):
        """Handle save button click."""
        self._save_current_annotation()
        self._save_annotations()
        print(f"Annotations saved to {self.output_file}")
    
    def _save_current_annotation(self):
        """Save the current annotation."""
        if self.current_index < len(self.sentences):
            sentence_info = self.sentences[self.current_index]
            
            # Check if this sentence has already been annotated
            for i, annotation in enumerate(self.annotations):
                if annotation['sentence'] == sentence_info['sentence']:
                    # Update existing annotation
                    self.annotations[i] = {
                        'case_id': sentence_info['case_id'],
                        'sentence': sentence_info['sentence'],
                        'implies_phenotype': self.implies_phenotype.value,
                        'text_implies': self.text_implies.value if self.implies_phenotype.value else '',
                        'phenotype': self.phenotype.value if self.implies_phenotype.value else ''
                    }
                    break
            else:
                # Add new annotation
                self.annotations.append({
                    'case_id': sentence_info['case_id'],
                    'sentence': sentence_info['sentence'],
                    'implies_phenotype': self.implies_phenotype.value,
                    'text_implies': self.text_implies.value if self.implies_phenotype.value else '',
                    'phenotype': self.phenotype.value if self.implies_phenotype.value else ''
                })
    
    def _save_annotations(self):
        """Save all annotations to CSV file."""
        df = pd.DataFrame(self.annotations)
        df.to_csv(self.output_file, index=False)
    
    def _update_display(self):
        """Update the display for the current sentence."""
        if self.current_index < len(self.sentences):
            sentence_info = self.sentences[self.current_index]
            self.sentence_display.value = f"<h3>Sentence {self.current_index + 1}/{len(self.sentences)}:</h3>" \
                                        f"<p style='background-color: #f0f0f0; padding: 15px; max-width: 100%; word-wrap: break-word;'>{sentence_info['sentence']}</p>" \
                                        f"<p>Case ID: {sentence_info['case_id']}</p>"
            
            # Check if this sentence has been annotated before
            for annotation in self.annotations:
                if annotation['sentence'] == sentence_info['sentence']:
                    # Load existing annotation
                    self.implies_phenotype.value = annotation['implies_phenotype']
                    self.text_implies.value = annotation['text_implies']
                    self.phenotype.value = annotation['phenotype']
                    break
            else:
                # Reset fields
                self.implies_phenotype.value = False
                self.text_implies.value = ''
                self.phenotype.value = ''
            
            # Update UI based on implies_phenotype value
            self.text_implies.disabled = not self.implies_phenotype.value
            self.phenotype.disabled = not self.implies_phenotype.value
            
            # Update progress
            self.progress.value = self.current_index
            
            # Update button states
            self.prev_button.disabled = (self.current_index == 0)
            self.next_button.disabled = False
    
    def start(self):
        """Start the annotation process."""
        self._update_display()
        display(self.main_box)

# Example usage
def start_annotation(input_file, output_file='phenotype_annotations.csv', limit=None):
    """
    Start the annotation process for a clinical text file.
    
    Args:
        input_file: Path to JSON file with clinical notes
        output_file: Path to output CSV file
        limit: Optional limit on number of sentences to annotate (for testing)
    
    Returns:
        PhenotypeAnnotator instance
    """
    sentences = extract_clinical_sentences(input_file)
    
    # Optionally limit the number of sentences (for testing)
    if limit and limit > 0 and limit < len(sentences):
        print(f"Limiting to first {limit} sentences (out of {len(sentences)} total)")
        sentences = sentences[:limit]
        
    annotator = PhenotypeAnnotator(sentences, output_file)
    annotator.start()
    return annotator

# You would use this in your notebook with:
annotator = start_annotation('data/dataset/mine_hpo.json', 'test_annotations.csv')
# Or for testing with limited sentences:
# annotator = start_annotation('path/to/your/mine_hpo.json', 'your_annotations.csv', limit=20)

Loaded clinical data with 116 cases


Processing clinical notes:   0%|          | 0/116 [00:00<?, ?it/s]

Extracted 2210 unique sentences from clinical texts


VBox(children=(HTML(value="<h3>Sentence 1/2210:</h3><p style='background-color: #f0f0f0; padding: 15px; max-wi…