In [None]:
%%capture
!pip install datasets
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset

# Mount Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the processing cells below write their
# outputs under '/content/drive/My Drive'. force_remount=True is passed so the
# mount is redone even if one already exists (per the Colab API -- TODO confirm).
drive.mount('/content/drive', force_remount=True)

KeyboardInterrupt: ignored

# Load Conceptual Captions

In [None]:
# Number of examples to take from the start of each split
train_size = 100000
val_size = 1000
# The split strings slice the first `train_size` / `val_size` rows; one dataset
# is returned per entry of `split`, in the same order.
dataset_train, dataset_val = load_dataset("conceptual_captions", split=[f"train[:{train_size}]", f"validation[:{val_size}]"])

In [None]:
# Number of dataset rows handled per processing batch. Used as the default
# `batch_size` of process_and_save_dataset and read by delete_batch_data to
# work out how many batch folders exist.
batch_size = 1000

# Helpers

In [None]:
# Check fetch rate
def print_summary(dataset, key, action = "Fetched"):
    """Print how many rows of ``dataset`` have a non-None value under ``key``.

    Args:
        dataset: Sized iterable of rows that can be indexed with ``key``.
        key: Field to inspect in every row.
        action: Verb that starts the printed line.

    The percentage is reported as 0 when ``dataset`` is empty.
    """
    n_present = 0
    for row in dataset:
        if row[key] is not None:
            n_present += 1
    n_total = len(dataset)

    if n_total > 0:
        pct = n_present / n_total * 100
    else:
        pct = 0

    print(f"{action} {n_present} out of {n_total} images ({pct:.2f}%)")

## Fetch Image Helpers

In [None]:
import requests
from PIL import Image
from io import BytesIO
from functools import partial
from concurrent.futures import ThreadPoolExecutor
from requests.adapters import HTTPAdapter

In [None]:
# One shared requests.Session for every download made by fetch_single_image.
session = requests.Session()
# The same adapter (pool_connections=100, pool_maxsize=100) is mounted for
# both URL schemes.
adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100)
session.mount('http://', adapter)
session.mount('https://', adapter)

In [None]:
from datasets.utils.file_utils import get_datasets_user_agent

# User-agent string sent as the "user-agent" header of every image request
# made by fetch_single_image.
USER_AGENT = get_datasets_user_agent()

In [None]:
def fetch_single_image(image_url, timeout=None, retries=0):
    """Download one image and open it with PIL, retrying failed requests.

    Args:
        image_url: URL to request through the shared ``session``.
        timeout: Forwarded to ``session.get``.
        retries: Extra attempts made after a failed request, so at most
            ``retries + 1`` requests are sent.

    Returns:
        The object returned by ``Image.open``, or ``None`` when every request
        failed or the downloaded payload could not be opened as an image.
    """
    for _ in range(retries + 1):
        try:
            response = session.get(
                image_url, headers={"user-agent": USER_AGENT},
                timeout=timeout
            )
            response.raise_for_status()
        except Exception:
            # Request-level failure (connection error, timeout, HTTP error
            # status): move on to the next attempt instead of giving up.
            continue

        try:
            return Image.open(BytesIO(response.content))
        except Exception:
            # A payload that cannot be opened is treated as final rather
            # than downloaded again.
            return None

    # Every attempt failed.
    return None

def fetch_images(batch, num_threads, timeout=None, retries=0):
    """Download every URL in ``batch["image_url"]`` using a thread pool.

    Args:
        batch: Mapping with an ``"image_url"`` entry holding the URLs; it is
            modified in place.
        num_threads: Maximum number of worker threads.
        timeout: Forwarded to ``fetch_single_image`` for each URL.
        retries: Forwarded to ``fetch_single_image`` for each URL.

    Returns:
        The same ``batch`` object with a new ``"image"`` entry: one result of
        ``fetch_single_image`` per URL, in URL order.
    """
    def download(url):
        return fetch_single_image(url, timeout=timeout, retries=retries)

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        results = pool.map(download, batch["image_url"])
        batch["image"] = list(results)

    return batch

## Embed Image Helpers

In [None]:
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoImageProcessor, BeitModel, ViTModel

In [None]:
# Load the image processor and the ViT encoder from one checkpoint id so the
# preprocessing always matches the weights.
VIT_CHECKPOINT = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(VIT_CHECKPOINT)
model = ViTModel.from_pretrained(VIT_CHECKPOINT)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [None]:
import os
from datasets import Dataset

def embed_single_image(image, image_processor, model, device):
    """Embed one image as the mean of the model's last hidden states.

    Args:
        image: Image accepted by ``image_processor``. Objects exposing a
            string ``mode`` other than ``"RGB"`` (PIL-style images such as
            grayscale, palette or RGBA) are converted to RGB first.
        image_processor: Called as ``image_processor(image, return_tensors="pt")``;
            its result is moved to ``device`` and unpacked into ``model``.
        model: Callable whose output exposes ``last_hidden_state``.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        A CPU tensor holding the average over the token axis of
        ``last_hidden_state`` for this image, or ``None`` if any step raises.
    """
    try:
        # Downloaded images come in many PIL modes; normalise them to
        # 3-channel RGB, which the ViT checkpoint used in this notebook expects.
        mode = getattr(image, "mode", None)
        if isinstance(mode, str) and mode != "RGB":
            image = image.convert("RGB")

        inputs = image_processor(image, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)

        # Assumes last_hidden_state is (1, tokens, hidden) for a single image:
        # take batch item 0 and average over the tokens. The result is moved
        # to the CPU so callers never hold on to device memory.
        return outputs.last_hidden_state[0].mean(dim=0).cpu()
    except Exception:
        # Best effort: an image that cannot be embedded yields None.
        return None


def embed_images_and_save(batch, image_processor, model, device):
    """Embed every row of ``batch`` and collect the results in a new Dataset.

    Despite its name this function writes nothing to disk; it only builds and
    returns the dataset.

    Args:
        batch: Iterable of rows with ``'image_url'``, ``'caption'`` and
            ``'image'`` entries.
        image_processor: Forwarded to ``embed_single_image``.
        model: Forwarded to ``embed_single_image``.
        device: Forwarded to ``embed_single_image``.

    Returns:
        ``Dataset.from_dict`` result with the columns ``image_url``,
        ``caption`` and ``image_embedding``. The embedding is ``None`` for
        rows whose image is ``None``, and otherwise whatever
        ``embed_single_image`` returned.
    """
    def embed_or_none(picture):
        # Rows without an image are kept, just without an embedding.
        if picture is None:
            return None
        return embed_single_image(picture, image_processor, model, device)

    columns = {
        'image_url': [],
        'caption': [],
        'image_embedding': [],
    }
    for row in batch:
        url, text, picture = row['image_url'], row['caption'], row['image']
        columns['image_url'].append(url)
        columns['caption'].append(text)
        columns['image_embedding'].append(embed_or_none(picture))

    # Hand the column lists to HuggingFace to build the Dataset
    return Dataset.from_dict(columns)

# Process Data

In [None]:
# Project folder inside '/content/drive/My Drive'; passed as
# `drive_base_folder` to the processing functions below.
drive_folder = "MIT 6.8610"

In [None]:
# Download settings, handed to fetch_images as keyword arguments
num_threads = 40
fetch_fn_kwargs = dict(
    num_threads=num_threads,
    timeout=5,
    retries=1,
)

# Use the GPU when one is available; the model is only moved in that case
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    model.to(device)

In [None]:
import math
from datasets import load_from_disk, concatenate_datasets
import time
import shutil
from tqdm import tqdm

def process_and_save_dataset(
    dataset,
    drive_base_folder,
    batch_folder,
    agg_folder,
    file_name_base,
    image_processor,
    model,
    device,
    batch_size=batch_size,
    fetch_fn_kwargs=fetch_fn_kwargs,
    skip_existing=False,
):
    """Fetch and embed ``dataset`` batch by batch, then save one aggregate.

    Each batch of ``batch_size`` rows is downloaded with ``fetch_images``,
    embedded with ``embed_images_and_save`` and written to
    ``/content/drive/My Drive/<drive_base_folder>/<batch_folder>/<file_name_base>_batch_<i>.hf``.
    Once all batches exist they are reloaded, concatenated and written to
    ``/content/drive/My Drive/<drive_base_folder>/<agg_folder>/<file_name_base>_<rows>_<timestamp>.hf``.

    Args:
        dataset: Dataset supporting ``len`` and ``select``.
        drive_base_folder: Project folder inside the Drive root.
        batch_folder: Sub-folder receiving the per-batch datasets.
        agg_folder: Sub-folder receiving the aggregated dataset.
        file_name_base: Prefix of every file name written.
        image_processor: Forwarded to ``embed_images_and_save``.
        model: Forwarded to ``embed_images_and_save``.
        device: Forwarded to ``embed_images_and_save``.
        batch_size: Rows per batch.
        fetch_fn_kwargs: Keyword arguments for ``fetch_images``.
        skip_existing: When True, a batch whose saved dataset can already be
            loaded from disk is not recomputed, so an interrupted run can be
            resumed. Only meaningful if the earlier run used the same
            ``dataset``, ``batch_size`` and ``file_name_base``. Defaults to
            False, which recomputes every batch.

    Returns:
        Path of the aggregated dataset that was written.
    """
    total_items = len(dataset)
    num_batches = math.ceil(total_items / batch_size)

    folder_path_batches = os.path.join(
        '/content/drive/My Drive',
        f"{drive_base_folder}/{batch_folder}"
    )

    def get_batch_file_path(batch_num):
        file_name = f"{file_name_base}_batch_{batch_num}.hf"
        return os.path.join(folder_path_batches, file_name)

    def batch_already_saved(path):
        # Probe by loading: a missing or unreadable folder (for instance one
        # left behind by an interrupted save) counts as "not saved".
        try:
            load_from_disk(path)
        except Exception:
            return False
        return True

    for i in tqdm(range(num_batches), desc="Processing batches"):
        full_file_path_batch = get_batch_file_path(i)
        if skip_existing and batch_already_saved(full_file_path_batch):
            print(f"Batch {i+1}/{num_batches} already saved, skipping.")
            continue

        # Process data in batches
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, total_items)
        indices = range(start_idx, end_idx)
        batch = dataset.select(indices)

        print(f"Fetching images for batch {i+1}/{num_batches}...")
        batch_with_images = batch.map(
            fetch_images,
            batched=True,
            batch_size=100,
            fn_kwargs=fetch_fn_kwargs
        )
        print_summary(batch_with_images, key="image", action="Fetched")

        print(f"Embedding images for batch {i+1}/{num_batches}...")
        batch_with_embeddings = embed_images_and_save(
            batch_with_images,
            image_processor,
            model,
            device
        )

        batch_with_embeddings.save_to_disk(full_file_path_batch)

    # Aggregating datasets: reload every batch from disk, in batch order
    datasets = []

    for batch_num in range(num_batches):
        full_file_path_batch = get_batch_file_path(batch_num)
        batch_data = load_from_disk(full_file_path_batch)
        datasets.append(batch_data)

    aggregated_dataset = concatenate_datasets(datasets)
    # aggregated_dataset = aggregated_dataset.filter(lambda item: item["image_embedding"] is not None)

    # The timestamp keeps aggregates of repeated runs from overwriting each other
    ts = time.time()
    file_name_agg_dataset = f"{file_name_base}_{total_items}_{ts}.hf"
    folder_path_agg = os.path.join(
        '/content/drive/My Drive',
        f"{drive_base_folder}/{agg_folder}"
    )
    full_file_path = os.path.join(folder_path_agg, file_name_agg_dataset)

    aggregated_dataset.save_to_disk(full_file_path)
    return full_file_path

In [None]:
def delete_batch_data(
    dataset,
    file_name_base,
    drive_base_folder,
    batch_folder,
    batch_size=None,
):
    """Delete the per-batch dataset folders written for ``dataset``.

    Folder names are rebuilt as ``<file_name_base>_batch_<i>.hf`` inside
    ``/content/drive/My Drive/<drive_base_folder>/<batch_folder>`` for every
    batch index implied by ``len(dataset)`` and ``batch_size``. Paths that do
    not exist or are not directories are skipped.

    Args:
        dataset: Sized dataset the batches were produced from.
        file_name_base: Prefix used when the batch folders were written.
        drive_base_folder: Project folder inside the Drive root.
        batch_folder: Sub-folder holding the batch folders.
        batch_size: Rows per batch used when the batches were written. When
            None (the default) the notebook-level ``batch_size`` is used.
    """
    print("Deleting batch folders...")

    if batch_size is None:
        # Backward-compatible default: the global defined earlier in the notebook.
        batch_size = globals()["batch_size"]

    total_items = len(dataset)
    num_batches = math.ceil(total_items / batch_size)

    folder_path_batches = os.path.join(
        '/content/drive/My Drive',
        f"{drive_base_folder}/{batch_folder}"
    )

    for batch_num in range(num_batches):
        dataset_name = f"{file_name_base}_batch_{batch_num}.hf"
        batch_folder_path = os.path.join(folder_path_batches, dataset_name)

        if os.path.exists(batch_folder_path) and os.path.isdir(batch_folder_path):
            shutil.rmtree(batch_folder_path)
            print(f"Deleted folder: {batch_folder_path}")

# Process Train Dataset

In [None]:
# Size of the training subset and the number of batches it splits into.
# Neither value is passed to the call below, which recomputes both internally.
total_items_train = len(dataset_train)
num_batches_train = math.ceil(total_items_train / batch_size)

# Fetch, embed and save the training subset. Per-batch datasets go to
# "Train Data Batches" and the aggregated dataset to "Train Data Agg", both
# inside `drive_folder` on the mounted Drive.
process_and_save_dataset(
    dataset=dataset_train,
    drive_base_folder=drive_folder,
    batch_folder = "Train Data Batches",
    agg_folder = "Train Data Agg",
    file_name_base="train_data",
    image_processor=image_processor,
    model=model,
    device=device,
    batch_size=batch_size,
    fetch_fn_kwargs=fetch_fn_kwargs
)

Processing batches:   0%|          | 0/100 [00:00<?, ?it/s]

Fetching images for batch 1/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 837 out of 1000 images (83.70%)
Embedding images for batch 1/100...


It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:   1%|          | 1/100 [02:20<3:51:04, 140.05s/it]

Fetching images for batch 2/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 821 out of 1000 images (82.10%)
Embedding images for batch 2/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:   2%|▏         | 2/100 [04:19<3:28:36, 127.72s/it]

Fetching images for batch 3/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 828 out of 1000 images (82.80%)
Embedding images for batch 3/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:   3%|▎         | 3/100 [06:25<3:25:17, 126.99s/it]

Fetching images for batch 4/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 804 out of 1000 images (80.40%)
Embedding images for batch 4/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:   4%|▍         | 4/100 [08:29<3:21:18, 125.82s/it]

Fetching images for batch 5/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 823 out of 1000 images (82.30%)
Embedding images for batch 5/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:   5%|▌         | 5/100 [10:29<3:15:58, 123.78s/it]

Fetching images for batch 6/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 823 out of 1000 images (82.30%)
Embedding images for batch 6/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:   6%|▌         | 6/100 [12:21<3:07:48, 119.88s/it]

Fetching images for batch 7/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 846 out of 1000 images (84.60%)
Embedding images for batch 7/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:   7%|▋         | 7/100 [14:20<3:05:02, 119.39s/it]

Fetching images for batch 8/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 844 out of 1000 images (84.40%)
Embedding images for batch 8/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:   8%|▊         | 8/100 [16:13<3:00:02, 117.42s/it]

Fetching images for batch 9/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 840 out of 1000 images (84.00%)
Embedding images for batch 9/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:   9%|▉         | 9/100 [18:11<2:58:38, 117.79s/it]

Fetching images for batch 10/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 838 out of 1000 images (83.80%)
Embedding images for batch 10/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  10%|█         | 10/100 [20:17<3:00:07, 120.09s/it]

Fetching images for batch 11/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 827 out of 1000 images (82.70%)
Embedding images for batch 11/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  11%|█         | 11/100 [22:16<2:57:36, 119.74s/it]

Fetching images for batch 12/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 822 out of 1000 images (82.20%)
Embedding images for batch 12/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  12%|█▏        | 12/100 [24:12<2:54:17, 118.84s/it]

Fetching images for batch 13/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 833 out of 1000 images (83.30%)
Embedding images for batch 13/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  13%|█▎        | 13/100 [26:07<2:50:38, 117.68s/it]

Fetching images for batch 14/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 830 out of 1000 images (83.00%)
Embedding images for batch 14/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  14%|█▍        | 14/100 [28:18<2:54:14, 121.56s/it]

Fetching images for batch 15/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 822 out of 1000 images (82.20%)
Embedding images for batch 15/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  15%|█▌        | 15/100 [30:20<2:52:33, 121.81s/it]

Fetching images for batch 16/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 842 out of 1000 images (84.20%)
Embedding images for batch 16/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  16%|█▌        | 16/100 [32:03<2:42:23, 116.00s/it]

Fetching images for batch 17/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 829 out of 1000 images (82.90%)
Embedding images for batch 17/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  17%|█▋        | 17/100 [34:11<2:45:25, 119.58s/it]

Fetching images for batch 18/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 813 out of 1000 images (81.30%)
Embedding images for batch 18/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  18%|█▊        | 18/100 [36:06<2:41:42, 118.32s/it]

Fetching images for batch 19/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 824 out of 1000 images (82.40%)
Embedding images for batch 19/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  19%|█▉        | 19/100 [38:15<2:43:49, 121.35s/it]

Fetching images for batch 20/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 822 out of 1000 images (82.20%)
Embedding images for batch 20/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  20%|██        | 20/100 [40:08<2:38:40, 119.00s/it]

Fetching images for batch 21/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 825 out of 1000 images (82.50%)
Embedding images for batch 21/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  21%|██        | 21/100 [42:04<2:35:38, 118.21s/it]

Fetching images for batch 22/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 849 out of 1000 images (84.90%)
Embedding images for batch 22/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  22%|██▏       | 22/100 [44:14<2:38:15, 121.74s/it]

Fetching images for batch 23/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 819 out of 1000 images (81.90%)
Embedding images for batch 23/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  23%|██▎       | 23/100 [46:17<2:36:31, 121.96s/it]

Fetching images for batch 24/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 816 out of 1000 images (81.60%)
Embedding images for batch 24/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  24%|██▍       | 24/100 [48:15<2:33:00, 120.80s/it]

Fetching images for batch 25/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 816 out of 1000 images (81.60%)
Embedding images for batch 25/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  25%|██▌       | 25/100 [50:12<2:29:41, 119.75s/it]

Fetching images for batch 26/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 827 out of 1000 images (82.70%)
Embedding images for batch 26/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  26%|██▌       | 26/100 [52:18<2:29:46, 121.43s/it]

Fetching images for batch 27/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 840 out of 1000 images (84.00%)
Embedding images for batch 27/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  27%|██▋       | 27/100 [54:24<2:29:33, 122.92s/it]

Fetching images for batch 28/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 824 out of 1000 images (82.40%)
Embedding images for batch 28/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  28%|██▊       | 28/100 [56:21<2:25:32, 121.29s/it]

Fetching images for batch 29/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 809 out of 1000 images (80.90%)
Embedding images for batch 29/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  29%|██▉       | 29/100 [58:22<2:23:13, 121.03s/it]

Fetching images for batch 30/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 824 out of 1000 images (82.40%)
Embedding images for batch 30/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  30%|███       | 30/100 [1:00:19<2:19:58, 119.98s/it]

Fetching images for batch 31/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 837 out of 1000 images (83.70%)
Embedding images for batch 31/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  31%|███       | 31/100 [1:02:30<2:21:30, 123.05s/it]

Fetching images for batch 32/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 828 out of 1000 images (82.80%)
Embedding images for batch 32/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  32%|███▏      | 32/100 [1:04:25<2:16:41, 120.61s/it]

Fetching images for batch 33/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 814 out of 1000 images (81.40%)
Embedding images for batch 33/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  33%|███▎      | 33/100 [1:06:30<2:16:11, 121.96s/it]

Fetching images for batch 34/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 829 out of 1000 images (82.90%)
Embedding images for batch 34/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  34%|███▍      | 34/100 [1:08:25<2:12:06, 120.11s/it]

Fetching images for batch 35/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 831 out of 1000 images (83.10%)
Embedding images for batch 35/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  35%|███▌      | 35/100 [1:10:31<2:11:48, 121.67s/it]

Fetching images for batch 36/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 822 out of 1000 images (82.20%)
Embedding images for batch 36/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  36%|███▌      | 36/100 [1:13:34<2:29:36, 140.26s/it]

Fetching images for batch 37/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 824 out of 1000 images (82.40%)
Embedding images for batch 37/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  37%|███▋      | 37/100 [1:16:08<2:31:31, 144.31s/it]

Fetching images for batch 38/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 846 out of 1000 images (84.60%)
Embedding images for batch 38/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  38%|███▊      | 38/100 [1:18:13<2:22:57, 138.35s/it]

Fetching images for batch 39/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 826 out of 1000 images (82.60%)
Embedding images for batch 39/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  39%|███▉      | 39/100 [1:20:35<2:21:55, 139.59s/it]

Fetching images for batch 40/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 848 out of 1000 images (84.80%)
Embedding images for batch 40/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  40%|████      | 40/100 [1:22:50<2:18:14, 138.24s/it]

Fetching images for batch 41/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 836 out of 1000 images (83.60%)
Embedding images for batch 41/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  41%|████      | 41/100 [1:25:18<2:18:42, 141.06s/it]

Fetching images for batch 42/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Fetched 839 out of 1000 images (83.90%)
Embedding images for batch 42/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  42%|████▏     | 42/100 [1:27:20<2:11:01, 135.54s/it]

Fetching images for batch 43/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 823 out of 1000 images (82.30%)
Embedding images for batch 43/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  43%|████▎     | 43/100 [1:29:42<2:10:23, 137.25s/it]

Fetching images for batch 44/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 817 out of 1000 images (81.70%)
Embedding images for batch 44/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  44%|████▍     | 44/100 [1:31:50<2:05:31, 134.49s/it]

Fetching images for batch 45/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 827 out of 1000 images (82.70%)
Embedding images for batch 45/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  45%|████▌     | 45/100 [1:34:00<2:02:08, 133.24s/it]

Fetching images for batch 46/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 838 out of 1000 images (83.80%)
Embedding images for batch 46/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  46%|████▌     | 46/100 [1:35:59<1:56:00, 128.89s/it]

Fetching images for batch 47/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 836 out of 1000 images (83.60%)
Embedding images for batch 47/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  47%|████▋     | 47/100 [1:38:17<1:56:15, 131.62s/it]

Fetching images for batch 48/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 831 out of 1000 images (83.10%)
Embedding images for batch 48/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  48%|████▊     | 48/100 [1:40:39<1:56:42, 134.67s/it]

Fetching images for batch 49/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 832 out of 1000 images (83.20%)
Embedding images for batch 49/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  49%|████▉     | 49/100 [1:43:00<1:56:12, 136.72s/it]

Fetching images for batch 50/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 800 out of 1000 images (80.00%)
Embedding images for batch 50/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  50%|█████     | 50/100 [1:45:14<1:53:11, 135.83s/it]

Fetching images for batch 51/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 845 out of 1000 images (84.50%)
Embedding images for batch 51/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  51%|█████     | 51/100 [1:47:22<1:49:00, 133.47s/it]

Fetching images for batch 52/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 834 out of 1000 images (83.40%)
Embedding images for batch 52/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  52%|█████▏    | 52/100 [1:49:34<1:46:28, 133.09s/it]

Fetching images for batch 53/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 823 out of 1000 images (82.30%)
Embedding images for batch 53/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  53%|█████▎    | 53/100 [1:52:01<1:47:32, 137.29s/it]

Fetching images for batch 54/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 836 out of 1000 images (83.60%)
Embedding images for batch 54/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  54%|█████▍    | 54/100 [1:54:03<1:41:45, 132.72s/it]

Fetching images for batch 55/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 840 out of 1000 images (84.00%)
Embedding images for batch 55/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  55%|█████▌    | 55/100 [1:56:16<1:39:32, 132.72s/it]

Fetching images for batch 56/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 817 out of 1000 images (81.70%)
Embedding images for batch 56/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  56%|█████▌    | 56/100 [1:58:39<1:39:32, 135.73s/it]

Fetching images for batch 57/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 815 out of 1000 images (81.50%)
Embedding images for batch 57/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  57%|█████▋    | 57/100 [2:00:52<1:36:49, 135.11s/it]

Fetching images for batch 58/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 823 out of 1000 images (82.30%)
Embedding images for batch 58/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  58%|█████▊    | 58/100 [2:03:06<1:34:19, 134.76s/it]

Fetching images for batch 59/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 829 out of 1000 images (82.90%)
Embedding images for batch 59/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  59%|█████▉    | 59/100 [2:05:30<1:33:57, 137.50s/it]

Fetching images for batch 60/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 831 out of 1000 images (83.10%)
Embedding images for batch 60/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  60%|██████    | 60/100 [2:07:54<1:32:53, 139.35s/it]

Fetching images for batch 61/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 833 out of 1000 images (83.30%)
Embedding images for batch 61/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  61%|██████    | 61/100 [2:10:12<1:30:25, 139.12s/it]

Fetching images for batch 62/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 816 out of 1000 images (81.60%)
Embedding images for batch 62/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  62%|██████▏   | 62/100 [2:12:40<1:29:43, 141.68s/it]

Fetching images for batch 63/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 834 out of 1000 images (83.40%)
Embedding images for batch 63/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  63%|██████▎   | 63/100 [2:15:32<1:33:00, 150.81s/it]

Fetching images for batch 64/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 464, in getresponse
    assert_header_parsing(httplib_response.msg)
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/response.py", line 88, in assert_header_parsing
    raise HeaderParsingError(defects=defects, unparsed_data=unparsed_data)
urllib3.exceptions.HeaderParsingError: [MissingHeaderBodySeparatorDefect()], unparsed data: 'Cache-Control : no-cache, no-store, must-revalidate\r\nStrict-Transport-Security: max-age=63072000; preload\r\nX-AspNet-Version: 4.0.30319\r\nSet-Cookie: LongSession=Currency=USD&Language=EN; path=/\r\nX-Powered-By: ASP.NET\r\nDate: Fri, 01 Dec 2023 22:20:39 GMT\r\nContent-Length: 14862\r\n\r\n'


Fetched 813 out of 1000 images (81.30%)
Embedding images for batch 64/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  64%|██████▍   | 64/100 [2:17:57<1:29:21, 148.92s/it]

Fetching images for batch 65/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 821 out of 1000 images (82.10%)
Embedding images for batch 65/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  65%|██████▌   | 65/100 [2:20:25<1:26:42, 148.64s/it]

Fetching images for batch 66/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 848 out of 1000 images (84.80%)
Embedding images for batch 66/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  66%|██████▌   | 66/100 [2:22:52<1:23:58, 148.20s/it]

Fetching images for batch 67/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 823 out of 1000 images (82.30%)
Embedding images for batch 67/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  67%|██████▋   | 67/100 [2:25:15<1:20:40, 146.67s/it]

Fetching images for batch 68/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 823 out of 1000 images (82.30%)
Embedding images for batch 68/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  68%|██████▊   | 68/100 [2:27:46<1:18:58, 148.08s/it]

Fetching images for batch 69/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 825 out of 1000 images (82.50%)
Embedding images for batch 69/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  69%|██████▉   | 69/100 [2:30:27<1:18:26, 151.83s/it]

Fetching images for batch 70/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 816 out of 1000 images (81.60%)
Embedding images for batch 70/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  70%|███████   | 70/100 [2:32:54<1:15:10, 150.36s/it]

Fetching images for batch 71/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 827 out of 1000 images (82.70%)
Embedding images for batch 71/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  71%|███████   | 71/100 [2:35:26<1:12:55, 150.88s/it]

Fetching images for batch 72/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 860 out of 1000 images (86.00%)
Embedding images for batch 72/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  72%|███████▏  | 72/100 [2:37:57<1:10:22, 150.80s/it]

Fetching images for batch 73/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 825 out of 1000 images (82.50%)
Embedding images for batch 73/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  73%|███████▎  | 73/100 [2:40:18<1:06:38, 148.09s/it]

Fetching images for batch 74/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 824 out of 1000 images (82.40%)
Embedding images for batch 74/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  74%|███████▍  | 74/100 [2:42:44<1:03:54, 147.49s/it]

Fetching images for batch 75/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 851 out of 1000 images (85.10%)
Embedding images for batch 75/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  75%|███████▌  | 75/100 [2:45:18<1:02:13, 149.34s/it]

Fetching images for batch 76/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 848 out of 1000 images (84.80%)
Embedding images for batch 76/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  76%|███████▌  | 76/100 [2:47:49<59:57, 149.91s/it]  

Fetching images for batch 77/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 834 out of 1000 images (83.40%)
Embedding images for batch 77/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  77%|███████▋  | 77/100 [2:50:25<58:04, 151.51s/it]

Fetching images for batch 78/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 822 out of 1000 images (82.20%)
Embedding images for batch 78/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  78%|███████▊  | 78/100 [2:53:06<56:36, 154.37s/it]

Fetching images for batch 79/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 823 out of 1000 images (82.30%)
Embedding images for batch 79/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  79%|███████▉  | 79/100 [2:56:00<56:10, 160.51s/it]

Fetching images for batch 80/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 838 out of 1000 images (83.80%)
Embedding images for batch 80/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  80%|████████  | 80/100 [2:58:51<54:29, 163.46s/it]

Fetching images for batch 81/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 810 out of 1000 images (81.00%)
Embedding images for batch 81/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  81%|████████  | 81/100 [3:01:26<50:58, 160.98s/it]

Fetching images for batch 82/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 821 out of 1000 images (82.10%)
Embedding images for batch 82/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  82%|████████▏ | 82/100 [3:04:24<49:51, 166.17s/it]

Fetching images for batch 83/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 828 out of 1000 images (82.80%)
Embedding images for batch 83/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  83%|████████▎ | 83/100 [3:06:59<46:07, 162.77s/it]

Fetching images for batch 84/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 837 out of 1000 images (83.70%)
Embedding images for batch 84/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  84%|████████▍ | 84/100 [3:09:39<43:11, 161.96s/it]

Fetching images for batch 85/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 831 out of 1000 images (83.10%)
Embedding images for batch 85/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  85%|████████▌ | 85/100 [3:12:23<40:36, 162.41s/it]

Fetching images for batch 86/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 819 out of 1000 images (81.90%)
Embedding images for batch 86/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  86%|████████▌ | 86/100 [3:14:59<37:29, 160.67s/it]

Fetching images for batch 87/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 827 out of 1000 images (82.70%)
Embedding images for batch 87/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  87%|████████▋ | 87/100 [3:17:49<35:25, 163.51s/it]

Fetching images for batch 88/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 795 out of 1000 images (79.50%)
Embedding images for batch 88/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  88%|████████▊ | 88/100 [3:20:56<34:05, 170.44s/it]

Fetching images for batch 89/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 840 out of 1000 images (84.00%)
Embedding images for batch 89/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  89%|████████▉ | 89/100 [3:23:38<30:46, 167.88s/it]

Fetching images for batch 90/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 830 out of 1000 images (83.00%)
Embedding images for batch 90/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  90%|█████████ | 90/100 [3:26:32<28:16, 169.69s/it]

Fetching images for batch 91/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 841 out of 1000 images (84.10%)
Embedding images for batch 91/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  91%|█████████ | 91/100 [3:29:10<24:55, 166.17s/it]

Fetching images for batch 92/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 839 out of 1000 images (83.90%)
Embedding images for batch 92/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  92%|█████████▏| 92/100 [3:31:45<21:42, 162.80s/it]

Fetching images for batch 93/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 820 out of 1000 images (82.00%)
Embedding images for batch 93/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  93%|█████████▎| 93/100 [3:34:29<19:02, 163.26s/it]

Fetching images for batch 94/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 820 out of 1000 images (82.00%)
Embedding images for batch 94/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  94%|█████████▍| 94/100 [3:37:18<16:29, 164.87s/it]

Fetching images for batch 95/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 464, in getresponse
    assert_header_parsing(httplib_response.msg)
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/response.py", line 88, in assert_header_parsing
    raise HeaderParsingError(defects=defects, unparsed_data=unparsed_data)
urllib3.exceptions.HeaderParsingError: [MissingHeaderBodySeparatorDefect()], unparsed data: 'proxy-revalidate\r\nAccept-Ranges: bytes\r\n\r\n'


Fetched 824 out of 1000 images (82.40%)
Embedding images for batch 95/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  95%|█████████▌| 95/100 [3:40:02<13:42, 164.60s/it]

Fetching images for batch 96/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 832 out of 1000 images (83.20%)
Embedding images for batch 96/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  96%|█████████▌| 96/100 [3:42:51<11:03, 165.96s/it]

Fetching images for batch 97/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 854 out of 1000 images (85.40%)
Embedding images for batch 97/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  97%|█████████▋| 97/100 [3:46:06<08:44, 174.81s/it]

Fetching images for batch 98/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 814 out of 1000 images (81.40%)
Embedding images for batch 98/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  98%|█████████▊| 98/100 [3:49:14<05:57, 178.76s/it]

Fetching images for batch 99/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 818 out of 1000 images (81.80%)
Embedding images for batch 99/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches:  99%|█████████▉| 99/100 [3:52:01<02:55, 175.05s/it]

Fetching images for batch 100/100...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Fetched 794 out of 1000 images (79.40%)
Embedding images for batch 100/100...


Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Processing batches: 100%|██████████| 100/100 [3:54:53<00:00, 140.93s/it]


Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
# Clean up the intermediate per-batch saves for the training split.
# The recorded output of this cell lists the removed folders, e.g.
# ".../MIT 6.8610/Train Data Batches/train_data_batch_0.hf".
# NOTE(review): presumably only safe to run after the aggregated training
# dataset has been written successfully -- confirm before re-running.
delete_batch_data(
    dataset=dataset_train,
    drive_base_folder=drive_folder,
    batch_folder="Train Data Batches",
    file_name_base="train_data",
)

Deleting batch folders...
Deleted folder: /content/drive/My Drive/MIT 6.8610/Train Data Batches/train_data_batch_0.hf
Deleted folder: /content/drive/My Drive/MIT 6.8610/Train Data Batches/train_data_batch_1.hf
Deleted folder: /content/drive/My Drive/MIT 6.8610/Train Data Batches/train_data_batch_2.hf
Deleted folder: /content/drive/My Drive/MIT 6.8610/Train Data Batches/train_data_batch_3.hf
Deleted folder: /content/drive/My Drive/MIT 6.8610/Train Data Batches/train_data_batch_4.hf
Deleted folder: /content/drive/My Drive/MIT 6.8610/Train Data Batches/train_data_batch_5.hf
Deleted folder: /content/drive/My Drive/MIT 6.8610/Train Data Batches/train_data_batch_6.hf
Deleted folder: /content/drive/My Drive/MIT 6.8610/Train Data Batches/train_data_batch_7.hf
Deleted folder: /content/drive/My Drive/MIT 6.8610/Train Data Batches/train_data_batch_8.hf
Deleted folder: /content/drive/My Drive/MIT 6.8610/Train Data Batches/train_data_batch_9.hf
Deleted folder: /content/drive/My Drive/MIT 6.8610/Tra

# Process Validation Dataset

In [None]:
# Size of the validation split and the number of `batch_size`-sized batches
# needed to cover it (rounded up, so the last batch may be partial).
# Neither value is passed to the call below; they are kept as notebook-level
# names.
total_items_val = len(dataset_val)
num_batches_val = math.ceil(total_items_val / batch_size)

# Process the validation split with the same image processor, model, device
# and fetch settings used elsewhere in this notebook.
# NOTE(review): presumably this runs the same fetch -> embed -> save-per-batch
# -> aggregate pipeline whose log appears for the training split -- confirm
# against the definition of `process_and_save_dataset`.
process_and_save_dataset(
    # What to process and how to batch it.
    dataset=dataset_val,
    batch_size=batch_size,
    fetch_fn_kwargs=fetch_fn_kwargs,
    # Embedding model configuration.
    image_processor=image_processor,
    model=model,
    device=device,
    # Output locations / naming on Drive.
    drive_base_folder=drive_folder,
    batch_folder="Val Data Batches",
    agg_folder="Val Data Agg",
    file_name_base="val_data",
)

In [None]:
# Clean up the intermediate per-batch saves for the validation split,
# mirroring the training clean-up call ("Val Data Batches" / "val_data").
# NOTE(review): presumably only safe to run after the aggregated validation
# dataset has been written successfully -- confirm before re-running.
delete_batch_data(
    dataset=dataset_val,
    drive_base_folder=drive_folder,
    batch_folder="Val Data Batches",
    file_name_base="val_data",
)

# Loading Data

## Standard Loading

In [None]:
# from datasets import load_from_disk

# def load_dataset_from_colab(dataset_name, train_data_folder):
#     drive_base_folder = "MIT 6.8610"

#     dataset_path = os.path.join(
#             '/content/drive/My Drive',
#             f"{drive_base_folder}/{train_data_folder}/{dataset_name}"
#     )
#     return load_from_disk(dataset_path)

In [None]:
# # Load data
# val_data_name = "val_data_10000_1700173986.82443.hf"
# train_data_name = "train_data_10000_1700173778.1307948.hf"

# val_data = load_dataset_from_colab(val_data_name, train_data_folder="Val Data Agg")
# train_data = load_dataset_from_colab(train_data_name, train_data_folder="Train Data Agg")

In [None]:
# val_data

In [None]:
#  train_data