#### 1. Load and Parse the CoNLL Dataset

In [2]:
import pandas as pd
from collections import defaultdict

# Load the CoNLL data from a text file
def load_conll_data(file_path, encoding="utf-8"):
    """Read a CoNLL-style text file into a list of sentences.

    Every non-blank line is split on whitespace; the first column is taken as
    the token and the last column as its tag. One or more blank lines separate
    sentences. Lines with fewer than two columns are skipped.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list of sentences, each being a list of ``(token, tag)`` tuples.
    """
    sentences = []
    sentence = []

    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank line: close the current sentence (consecutive blank
                # lines do not produce empty sentences).
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            parts = line.split()
            if len(parts) >= 2:  # Need both a token and a tag
                sentence.append((parts[0], parts[-1]))  # Last column contains the tag

    # Keep the final sentence when the file does not end with a blank line
    if sentence:
        sentences.append(sentence)

    return sentences

# Load the dataset
# Relative path: resolved against the notebook's current working directory.
conll_file_path = "dataset_conll.txt"
# `sentences` is a list of sentences, each a list of (token, tag) tuples.
sentences = load_conll_data(conll_file_path)


#### 2. Convert to Sentence-Level Dataset with Multilabels

In [8]:
# Extract all entity types for each sentence
def extract_entity_types(sentence):
    """Return the distinct entity types tagged in one sentence.

    The scheme prefix (the part before the first ``-``, e.g. ``B``/``I``/``E``)
    is removed from every tag other than ``'O'``. Only the first hyphen is
    treated as the separator, so entity types that themselves contain a hyphen
    are kept intact.

    Args:
        sentence: Iterable of ``(token, tag)`` pairs.

    Returns:
        A sorted list of unique entity-type strings (empty if every tag is
        ``'O'``). Sorting keeps the multilabel order stable between runs.
    """
    entity_types = {tag.split('-', 1)[-1] for _, tag in sentence if tag != 'O'}
    return sorted(entity_types)

# Build one record per sentence, then turn the records into a DataFrame
data = []
for sentence in sentences:
    tokens = [tok for tok, _ in sentence]
    tags = [label for _, label in sentence]
    entity_types = extract_entity_types(sentence)  # Unique entity types -> multilabels
    # 'sentence' keeps the original (token, tag) pairs rather than joined text
    data.append(dict(
        sentence=sentence,
        tokens=tokens,
        tags=tags,
        entity_types=entity_types,
    ))

# A DataFrame is easier to manipulate than the list of dicts
df = pd.DataFrame(data)
df

Unnamed: 0,sentence,tokens,tags,entity_types
0,"[(A, B-Event), (passenger, I-Event), (aircraft...","[A, passenger, aircraft, is, approaching, ., D...","[B-Event, I-Event, I-Event, I-Event, E-Event, ...","[NonEvent, Event]"
1,"[(A, B-Event), (passenger, I-Event), (aircraft...","[A, passenger, aircraft, is, nearby, ., Fly, w...","[B-Event, I-Event, I-Event, I-Event, E-Event, ...","[NonEvent, Event]"
2,"[(Abnormal, B-Event), (compass, I-Event), (fun...","[Abnormal, compass, function, or, GPS, signal,...","[B-Event, I-Event, I-Event, I-Event, I-Event, ...",[Event]
3,"[(Accelerator, B-Event), (is, I-Event), (Over,...","[Accelerator, is, Over, Range, .]","[B-Event, I-Event, I-Event, E-Event, O]",[Event]
4,"[(Account, B-NonEvent), (not, I-NonEvent), (lo...","[Account, not, logged, in, ., Flight, altitude...","[B-NonEvent, I-NonEvent, I-NonEvent, E-NonEven...","[NonEvent, Event]"
...,...,...,...,...
573,"[(Your, B-Event), (aircraft, I-Event), (is, I-...","[Your, aircraft, is, at, the, boundary, of, an...","[B-Event, I-Event, I-Event, I-Event, I-Event, ...","[NonEvent, Event]"
574,"[(Your, B-Event), (aircraft, I-Event), (is, I-...","[Your, aircraft, is, at, the, boundary, of, an...","[B-Event, I-Event, I-Event, I-Event, I-Event, ...","[NonEvent, Event]"
575,"[(Your, B-Event), (aircraft, I-Event), (is, I-...","[Your, aircraft, is, flying, in, an, Altitude,...","[B-Event, I-Event, I-Event, I-Event, I-Event, ...","[NonEvent, Event]"
576,"[(Your, B-Event), (palm, I-Event), (is, I-Even...","[Your, palm, is, too, close, to, the, aircraft...","[B-Event, I-Event, I-Event, I-Event, I-Event, ...","[NonEvent, Event]"


#### 3. Encode Multilabels for Stratified Sampling

In [5]:
# %pip installs into the running kernel's environment; pinned to the release this notebook was run with.
%pip install iterative-stratification==0.1.9

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Collecting scipy (from iterative-stratification)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting scikit-learn (from iterative-stratification)
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting joblib>=1.2.0 (from scikit-learn->iterative-stratification)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->iterative-stratification)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ----------------- ---------------------- 4.7/11.0 MB 35.7 MB/s eta 0:00:01
   ---------------------------------------- 11.0/11.0 MB 36.2 MB/s e

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Convert the list of entity types to a multilabel binary indicator matrix
mlb = MultiLabelBinarizer()
entity_type_matrix = mlb.fit_transform(df['entity_types'])

# One indicator column per entity type, aligned to df's own index so the
# column-wise concat below pairs every row with its own sentence.
entity_type_df = pd.DataFrame(entity_type_matrix, columns=mlb.classes_, index=df.index)

# Drop indicator columns left over from an earlier run of this cell first;
# otherwise re-running it appends duplicate columns and df[mlb.classes_]
# would later return each label twice.
df = pd.concat(
    [df.drop(columns=list(mlb.classes_), errors="ignore"), entity_type_df],
    axis=1,
)


#### 4. Perform Stratified Sampling

In [10]:
# Multilabel-stratified shuffle split: one split, 30% held out, fixed seed
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) index pair
train_idx, test_idx = next(iter(msss.split(df, entity_type_matrix)))
train_df = df.iloc[train_idx]
test_df = df.iloc[test_idx]

# Share of sentences containing each entity type, per split
print("Train distribution:", train_df[mlb.classes_].mean())
print("Test distribution:", test_df[mlb.classes_].mean())


Train distribution: Event       0.908867
NonEvent    0.512315
dtype: float64
Test distribution: Event       0.924419
NonEvent    0.517442
dtype: float64


#### 5. Save the Split Data Back to CoNLL Format

In [11]:
def save_to_conll(dataframe, file_path, encoding="utf-8"):
    """Write the sentences of a DataFrame to a CoNLL-style text file.

    Each ``(token, tag)`` pair is written as ``"token tag"`` on its own line,
    and every sentence is followed by a blank line. An existing file at
    ``file_path`` is overwritten.

    Args:
        dataframe: DataFrame with a ``'sentence'`` column whose values are
            iterables of ``(token, tag)`` pairs.
        file_path: Destination path.
        encoding: Text encoding used for the output file. Defaults to UTF-8 so
            the file is written the same way on every platform.
    """
    with open(file_path, 'w', encoding=encoding) as file:
        # Only the 'sentence' column is needed, so iterate it directly
        for sentence in dataframe['sentence']:
            for token, tag in sentence:
                file.write(f"{token} {tag}\n")
            file.write("\n")  # Blank line between sentences

# Save the train and test sets
# Each split goes to its own file (relative path); save_to_conll opens the
# target in 'w' mode, so an existing file of the same name is overwritten.
save_to_conll(train_df, "train_conll_data.txt")
save_to_conll(test_df, "test_conll_data.txt")


#### 6. Verify the Result

In [12]:
# Per-label share of sentences in the full dataset and in each split
original_distribution = df[mlb.classes_].mean()
train_distribution = train_df[mlb.classes_].mean()
test_distribution = test_df[mlb.classes_].mean()

# The three sets of proportions are printed together for comparison
print("Original distribution:", original_distribution)
print("Train distribution:", train_distribution)
print("Test distribution:", test_distribution)


Original distribution: Event       0.913495
NonEvent    0.513841
dtype: float64
Train distribution: Event       0.908867
NonEvent    0.512315
dtype: float64
Test distribution: Event       0.924419
NonEvent    0.517442
dtype: float64
