# Discontinuous NER Data Preparation

This notebook handles the preparation of discontinuous NER datasets with different positive/negative ratios.

In [2]:
import os
import json
import spacy
import random
from typing import List, Tuple, Dict
from pathlib import Path

In [None]:
PROJ_ROOT = Path.cwd().parent  
DATA_DIR = PROJ_ROOT / 'data' / 'discontinuous'
DATA_DIR.mkdir(parents=True, exist_ok=True)

print(f'Project root: {PROJ_ROOT}')
print(f'Data directory: {DATA_DIR}')

Project root: /Users/gareginmazmanyan/Documents/UOFA/CSC580/FinalProject/NER
Data directory: /Users/gareginmazmanyan/Documents/UOFA/CSC580/FinalProject/NER/data/discontinuous


In [None]:
# Load existing data
def load_existing_data():
    data_path = PROJ_ROOT / 'data' / 'tagged_general_sentences.json'
    with open(data_path, 'r') as f:
        return json.load(f)

general_data = load_existing_data()
print(f'Loaded {len(general_data)} sentences from existing data')

Loaded 47959 sentences from existing data


In [5]:
class DiscontinuousNERHandler:
    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')
        self.data_dir = DATA_DIR
    
    def create_discontinuous_example(self, text: str) -> Tuple[List[Tuple[str, str]], List[str]]:
        """Create a discontinuous NER example from input text"""
        doc = self.nlp(text)
        words_with_pos = []
        labels = []
        
        entities = [(e.text, e.label_, e.start_char, e.end_char) for e in doc.ents]
        
        for token in doc:
            words_with_pos.append((token.text, token.pos_))
            
            # Default to non-entity
            label = 'O'
            
            # Check if token is part of an entity
            for ent_text, ent_label, start, end in entities:
                if token.idx >= start and token.idx < end:
                    # Create discontinuous pattern for longer entities
                    if len(ent_text.split()) > 2 and random.random() < 0.4:
                        if token.idx > start and token.idx < end - len(token.text):
                            label = 'O' if random.random() < 0.5 else f'I-{ent_label}'
                        else:
                            label = f'B-{ent_label}' if token.idx == start else f'I-{ent_label}'
                    else:
                        label = f'B-{ent_label}' if token.idx == start else f'I-{ent_label}'
                    break
            
            labels.append(label)
        
        return words_with_pos, labels
    
    def prepare_dataset(self, texts: List[str], ratio: float) -> Tuple[List, List]:
        """Prepare train and validation datasets with given positive ratio"""
        processed_examples = []
        
        for text in texts:
            words_pos, labels = self.create_discontinuous_example(text)
            processed_examples.append((words_pos, labels))
        
        # Split into positive and negative examples
        positive = [ex for ex in processed_examples if any(l != 'O' for l in ex[1])]
        negative = [ex for ex in processed_examples if all(l == 'O' for l in ex[1])]
        
        # Calculate desired numbers
        total = len(processed_examples)
        num_positive = int(total * ratio)
        num_negative = total - num_positive
        
        # Sample examples
        sampled_positive = random.sample(positive, min(num_positive, len(positive)))
        sampled_negative = random.sample(negative, min(num_negative, len(negative)))
        
        # Combine and shuffle
        dataset = sampled_positive + sampled_negative
        random.shuffle(dataset)
        
        # Split into train/val
        split_idx = int(len(dataset) * 0.8)
        return dataset[:split_idx], dataset[split_idx:]
    
    def save_datasets(self, train_data: List, val_data: List, ratio: float):
        """Save datasets to files"""
        # Save training data
        train_path = self.data_dir / f'train_ratio_{ratio:.1f}.json'
        with open(train_path, 'w') as f:
            json.dump(train_data, f)
        
        # Save validation data
        val_path = self.data_dir / f'val_ratio_{ratio:.1f}.json'
        with open(val_path, 'w') as f:
            json.dump(val_data, f)
        
        print(f'Saved datasets with ratio {ratio:.1f}:')
        print(f'Training examples: {len(train_data)}')
        print(f'Validation examples: {len(val_data)}')

In [None]:
# Create datasets with different ratios
handler = DiscontinuousNERHandler()
ratios = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

processed_data = []
for item in general_data[:1000]: 
    # Create word-POS pairs
    word_pos_pairs = list(zip(item['tokens'], item['pos_tags']))
    # Get the corresponding NER tags
    ner_tags = item['ner_tags']
    # Add to processed data
    processed_data.append((word_pos_pairs, ner_tags))

print(f"Processed {len(processed_data)} sentences")
print("\nExample of processed data:")
example = processed_data[0]
print("\nWord-POS pairs:", example[0][:5], "...")  # Show first 5 pairs
print("NER tags:", example[1][:5], "...")  # Show first 5 tags

for ratio in ratios:
    train_data, val_data = handler.prepare_dataset([item['sentence'] for item in general_data[:1000]], ratio)
    handler.save_datasets(train_data, val_data, ratio)
    print(f"\nCreated dataset with ratio {ratio:.1f}")
    print(f"Training examples: {len(train_data)}")
    print(f"Validation examples: {len(val_data)}")
    print('-' * 50)

Processed 1000 sentences

Example of processed data:

Word-POS pairs: [('Thousands', 'NOUN'), ('of', 'ADP'), ('demonstrators', 'NOUN'), ('have', 'AUX'), ('marched', 'VERB')] ...
NER tags: ['B-CARDINAL', 'O', 'O', 'O', 'O'] ...
Saved datasets with ratio 0.2:
Training examples: 237
Validation examples: 60

Created dataset with ratio 0.2
Training examples: 237
Validation examples: 60
--------------------------------------------------
Saved datasets with ratio 0.3:
Training examples: 317
Validation examples: 80

Created dataset with ratio 0.3
Training examples: 317
Validation examples: 80
--------------------------------------------------
Saved datasets with ratio 0.4:
Training examples: 397
Validation examples: 100

Created dataset with ratio 0.4
Training examples: 397
Validation examples: 100
--------------------------------------------------
Saved datasets with ratio 0.5:
Training examples: 477
Validation examples: 120

Created dataset with ratio 0.5
Training examples: 477
Validation ex

In [None]:
# Verify the results
def inspect_dataset(ratio: float):
    train_path = DATA_DIR / f'train_ratio_{ratio:.1f}.json'
    with open(train_path, 'r') as f:
        data = json.load(f)
    
    print(f'\nExample from ratio {ratio:.1f} dataset:')
    example = random.choice(data)
    words = [word for word, _ in example[0]]
    labels = example[1]
    
    print('Text:', ' '.join(words))
    print('Labels:', labels)

for ratio in ratios:
    inspect_dataset(ratio)


Example from ratio 0.2 dataset:
Text: He said no outlawed organization will be allowed to collect donations and action will be taken against those who preach hate in mosques .
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-FAC', 'O']

Example from ratio 0.3 dataset:
Text: Officials say the aircraft was supporting a NATO mission in the country 's south , but there is no indication of enemy action causing the crash .
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Example from ratio 0.4 dataset:
Text: Japan 's Foreign Minister , Nobutaka Machimura , arrives Sunday in Beijing for talks with his Chinese counterpart , Li Zhaoxing , to discuss relations between the two countries .
Labels: ['B-GPE', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'B-DATE', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'B-