# DataManager Development Notebook

Testing data loading with DataManager

In [11]:
%load_ext autoreload
%autoreload 2

from scaling_llms.data import DataManager, DATA_SOURCES

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create DataManager and Load Tiny Shakespeare

In [12]:
# Create DataManager
dm = DataManager(data_dir="../data", seed=1337)

# List available sources
print("Available data sources:")
print(dm.list_data_sources())

Available data sources:
['tiny_shakespeare', 'wikitext103', 'openwebtext']


In [13]:
# Load Tiny Shakespeare
train_iter, val_iter = dm.get_loaders(
    DATA_SOURCES.openwebtext,
    seq_len=128,
    batch_size=4,
    val_frac=0.1,
    as_iterable=True,
    num_workers=0  # Use 0 for notebook to avoid multiprocessing issues
)

print("Loaders created successfully!")

Downloading data:  21%|██▏       | 17/80 [03:11<11:48, 11.24s/files]


KeyboardInterrupt: 

In [8]:
# Get a batch from train iterator
x_batch, y_batch = next(train_iter)

print(f"Batch shapes: x={x_batch.shape}, y={y_batch.shape}")
print(f"\nFirst sequence (x):")
print(x_batch[0])
print(f"\nTarget sequence (y):")
print(y_batch[0])
print(f"\nDecoded text (first 100 chars):")
text = bytes(x_batch[0].tolist()).decode('utf-8', errors='replace')
print(text[:100])

Batch shapes: x=torch.Size([4, 128]), y=torch.Size([4, 128])

First sequence (x):
tensor([128, 148,  32, 105, 110,  32, 111, 110, 101,  32, 110, 105, 103, 104,
        116,  32,  44,  32,  98, 117, 116,  32, 104, 105, 115,  32,  97, 105,
        114,  99, 114,  97, 102, 116,  32, 119,  97, 115,  32, 100,  97, 109,
         97, 103, 101, 100,  32,  98, 121,  32, 114, 101, 116, 117, 114, 110,
         32, 102, 105, 114, 101,  32,  44,  32, 114, 101, 113, 117, 105, 114,
        105, 110, 103,  32,  97,  32, 102, 111, 114,  99, 101, 100,  32, 108,
         97, 110, 100, 105, 110, 103,  32,  46,  32,  72, 101,  32, 117, 115,
        101, 100,  32, 111, 110, 108, 121,  32,  50,  50,  32,  99,  97, 110,
        110, 111, 110,  32, 115, 104, 101, 108, 108, 115,  32, 116, 111,  32,
        100, 111])

Target sequence (y):
tensor([148,  32, 105, 110,  32, 111, 110, 101,  32, 110, 105, 103, 104, 116,
         32,  44,  32,  98, 117, 116,  32, 104, 105, 115,  32,  97, 105, 114,
         99, 114,  