In [1]:
import random

def read_conll_file(file_path):
    """Load a CoNLL file as a list of sentences.

    Each sentence is a list of the file's non-blank lines with surrounding
    whitespace stripped. One or more blank (or whitespace-only) lines mark
    the boundary between sentences.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file to read.

    Returns:
        A list of sentences, each a list of stripped line strings.
    """
    collected = []
    current = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            token_line = raw.strip()
            if token_line:
                current.append(token_line)
                continue
            # Blank line: close the running sentence, ignoring repeated blanks.
            if current:
                collected.append(current)
                current = []
    # The file may end without a trailing blank line.
    if current:
        collected.append(current)
    return collected

def split_data(sentences, train_ratio=0.8, val_ratio=0.1, seed=None):
    """Split the dataset into training, validation, and test sets.

    The sentences are shuffled on a private copy, so the caller's
    collection is left in its original order.

    Args:
        sentences: Iterable of sentences; each sentence stays intact and is
            assigned to exactly one of the three sets.
        train_ratio: Fraction of the sentences assigned to the training set.
        val_ratio: Fraction of the sentences assigned to the validation set.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A ``(train_set, val_set, test_set)`` tuple of lists. The test set
        receives whatever remains after the first two sets are cut.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    if train_ratio < 0 or val_ratio < 0:
        raise ValueError("train_ratio and val_ratio must be non-negative")
    # Small tolerance so float sums that should equal 1 are not rejected.
    if train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("train_ratio + val_ratio must not exceed 1")

    # Work on a copy: random.shuffle reorders its argument in place.
    shuffled = list(sentences)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    total = len(shuffled)
    train_end = int(train_ratio * total)
    val_end = train_end + int(val_ratio * total)

    train_set = shuffled[:train_end]
    val_set = shuffled[train_end:val_end]
    test_set = shuffled[val_end:]

    return train_set, val_set, test_set

def write_conll_file(sentences, file_path):
    """Serialize sentences to a UTF-8 CoNLL file.

    Every line of a sentence is written on its own row, and each sentence
    (including an empty one) is followed by a single blank line.

    Args:
        sentences: Iterable of sentences, each an iterable of line strings.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        for rows in sentences:
            out.writelines(row + '\n' for row in rows)
            # Blank separator line after each sentence.
            out.write('\n')

# Locations of the labelled source corpus and of the three output splits.
input_file_path = 'C:/Users/elbet/OneDrive/Desktop/Ten/week5/Telegram-based-e-commerce/labeled_data_conll.conll'  # Replace with your file path
train_file_path = 'train_data.conll'
val_file_path = 'val_data.conll'
test_file_path = 'test_data.conll'

# Load every sentence, then partition using split_data's default ratios.
sentences = read_conll_file(input_file_path)
train_set, val_set, test_set = split_data(sentences)

# Persist each partition to its own CoNLL file (train, then val, then test).
for _subset, _destination in (
    (train_set, train_file_path),
    (val_set, val_file_path),
    (test_set, test_file_path),
):
    write_conll_file(_subset, _destination)

print("Data has been split into train, validation, and test sets.")


Data has been split into train, validation, and test sets.
