In [18]:
import random
from tqdm.notebook import tqdm
from datasets import load_dataset
import concurrent.futures

In [19]:
def process_text(args):
    text, output_file = args
    characters = set(text)
    with open(output_file, "a", encoding="utf-8") as outfile:
        outfile.write(text + "\n")
    return characters

In [20]:
def process_batch(batch, output_file):
    vocab = set()
    args = [(text, output_file) for text in batch['text']]
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for characters in executor.map(process_text, args):
            vocab.update(characters)
    return vocab

In [21]:
def process_dataset_in_batches(dataset, output_file, batch_size=100000):
    vocab = set()
    total_batches = len(dataset) // batch_size + (1 if len(dataset) % batch_size != 0 else 0)
    
    for i in tqdm(range(0, len(dataset), batch_size), total=total_batches, desc="Processing batches"):
        batch = dataset.select(range(i, min(i + batch_size, len(dataset))))
        batch_vocab = process_batch(batch, output_file)
        vocab.update(batch_vocab)
    
    return vocab

In [22]:
dataset = load_dataset("openwebtext", trust_remote_code=True)

Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [23]:
# Calculate split sizes
total_samples = len(dataset['train'])
split_index = int(total_samples * 0.9)  # 90% for training

# Create train and validation splits
train_dataset = dataset['train'].select(range(split_index))
val_dataset = dataset['train'].select(range(split_index, total_samples))

In [24]:
output_file_train = "data/raw/OpenWebText/train.txt"
output_file_val = "data/raw/OpenWebText/val.txt"
vocab_file = "data/raw/OpenWebText/vocab.txt"

In [25]:
open(output_file_train, 'w').close()
open(output_file_val, 'w').close()

In [26]:
print("Processing training data...")
vocab_train = process_dataset_in_batches(train_dataset, output_file_train)

Processing training data...


Processing batches:   0%|          | 0/73 [00:00<?, ?it/s]

Process ForkProcess-55241:
Process ForkProcess-55234:
Process ForkProcess-55246:
Process ForkProcess-55236:
Process ForkProcess-55238:
Process ForkProcess-55237:
Process ForkProcess-55242:
Process ForkProcess-55245:
Process ForkProcess-55244:
Process ForkProcess-55240:
Process ForkProcess-55235:
Process ForkProcess-55243:
Process ForkProcess-55247:
Process ForkProcess-55239:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/tathagat/anaconda3/envs/Midterm-Bonus/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/tathagat/anaconda3

KeyboardInterrupt: 

: 

In [16]:
print("Processing validation data...")
vocab_val = process_dataset_in_batches(val_dataset, output_file_val)

Processing validation data...


  0%|          | 0/8013 [00:00<?, ?it/s]

In [17]:
# Combine vocabularies and write to vocab.txt
vocab = vocab_train.union(vocab_val)
with open(vocab_file, "w", encoding="utf-8") as vfile:
    for char in sorted(vocab):
        vfile.write(char + '\n')

print("Processing complete. Files created: output_train.txt, output_val.txt, vocab.txt")

Processing complete. Files created: output_train.txt, output_val.txt, vocab.txt
