# Quick Start

## Create Configuration File

First, create a configuration file to define your storage providers. The default configuration file is located at `~/.msc_config.yaml`, but you can specify a different path using the `MSC_CONFIG` environment variable.

```yaml
  profiles:
    swift-pdx:
      storage_provider:
        type: s3
        options:
          region_name: us-east-1
          endpoint_url: https://pdx.s8k.io
          base_path: webdataset_samples
      credentials_provider:
        type: S3Credentials
        options:
          access_key: "*****"
          secret_key: "*****"
    swift-pbss:
      storage_provider:
        type: s3
        options:
          region_name: us-east-1
          endpoint_url: https://pbss.s8k.io
          base_path: zarr_examples
      credentials_provider:
        type: S3Credentials
        options:
          access_key: "*****"
          secret_key: "*****"
```

## List Files

Once your configuration is in place, you can access files using the `msc.open` and `msc.glob` functions.

In [1]:
```python
import multistorageclient as msc

files = msc.glob("msc://swift-pdx/*.tar")
files[:10]
```

['msc://swift-pdx/dataset_000.tar',
 'msc://swift-pdx/dataset_001.tar',
 'msc://swift-pdx/dataset_002.tar',
 'msc://swift-pdx/dataset_003.tar',
 'msc://swift-pdx/dataset_004.tar',
 'msc://swift-pdx/dataset_005.tar',
 'msc://swift-pdx/dataset_006.tar',
 'msc://swift-pdx/dataset_007.tar',
 'msc://swift-pdx/dataset_008.tar',
 'msc://swift-pdx/dataset_009.tar']

## Open File - Read

In [2]:
```python
with msc.open("msc://swift-pdx/dataset_000.tar", "rb") as fp:
    content = fp.read()

print(f"File Size = {len(content)}, Content = {content[:80]}...")
```

File Size = 62986240, Content = b'././@PaxHeader\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'...


## Open File - Write

In [3]:
```python
# 32mb file
body = b"A" * 32 * 1024 * 1024

with msc.open("msc://swift-pdx/testfile.bin", "wb") as fp:
    fp.write(body)

with msc.open("msc://swift-pdx/testfile.bin", "rb") as fp:
    content = fp.read()

print(f"File Size = {len(content)}, Content = {content[:80]}...")
```

File Size = 33554432, Content = b'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'...


In [4]:
```python
msc.glob("msc://swift-pdx/*.bin")
```

['msc://swift-pdx/testfile.bin']

# Framework Integration

## Use Webdataset

In [None]:
```python
import webdataset as wds

w = wds.WebDataset("msc://swift-pdx/dataset_{000..010}.tar").shuffle(True)

for data in w:
    print(data)
    break
```

## Use Megatron-Energon

In [None]:
```python
from megatron.energon import get_train_dataset, get_loader

dataset_name = "msc://swift-pdx/"

train_loader = get_loader(
    get_train_dataset(
        dataset_name,
        batch_size=64,
        shuffle_buffer_size=None,
        max_samples_per_sequence=None,
    )
)

train_loader = iter(train_loader)
for i in range(10):
    _ = next(train_loader)
    print(f"Step = {i}")
    break
```

## Use Zarr

In [23]:
```python
import zarr
import numpy as np

# Create a Zarr array in your PBSS account under bucket zarr_examples
zarr_array = zarr.create(shape=(4, 4), dtype="float32", store="msc://swift-pbss/array.zarr", overwrite=True)

print(f'zarr_array = {zarr_array}')

# Open a Zarr array in your PBSS account under bucket zarr_examples
zarr_array_opened = zarr.open("msc://swift-pbss/array.zarr")

print(f'zarr_array_opened = {zarr_array_opened}')

# Create a Zarr group with 2 arrays in your PBSS account under bucket zarr_examples
zarr_group = zarr.open_group("msc://swift-pbss/group.zarr", mode="w")
zarr_group.create_dataset("array1", shape=(4, 4), dtype="float32", data=np.eye(4), overwrite=True)
zarr_group.create_dataset("array2", shape=(8, 8), dtype="float64", overwrite=True)

print(f'zarr_group = {zarr_group}')
print(f'zarr_group.array1: {zarr_group["array1"][:]}')

# Open a Zarr group in your PBSS account under bucket zarr_examples
zarr_group_opened = zarr.open("msc://swift-pbss/group.zarr")

print(f'zarr_group_opened structure: {zarr_group_opened}')
print(f'zarr_group_opened.array1: {zarr_group_opened["array1"][:]}')
```

zarr_array = <zarr.core.Array (4, 4) float32>
zarr_array_opened = <zarr.core.Array (4, 4) float32>
zarr_group = <zarr.hierarchy.Group '/'>
zarr_group.array1: [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
zarr_group_opened structure: <zarr.hierarchy.Group '/'>
zarr_group_opened.array1: [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
