In [None]:
from google.colab import drive
# Directory under which the mounted Drive appears; the later cells build
# their '/content/drive/MyDrive/...' paths on top of it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


In [None]:
import zipfile, os

# Source archive on Drive and the local directory it is unpacked into.
# `extract_path` is read again by the sampling cell further down.
zip_path = '/content/drive/MyDrive/AI final/data/images.zip'
extract_path = '/content/images_unzipped'

# Unpack every member; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("Done unzipping!")


Done unzipping!


In [None]:
import os, random, shutil
from glob import glob

# Copy a reproducible random 50% of the extracted images into `output_folder`,
# preserving each file's path relative to `input_folder`.
input_folder = extract_path
output_folder = '/content/images_sampled'
os.makedirs(output_folder, exist_ok=True)

# Fixed seed: without it every run picks a different subset, so re-running
# this cell would keep adding new files to `output_folder`.
SAMPLE_SEED = 42

# get every path containing a dot, recursively
all_files = glob(os.path.join(input_folder, '**', '*.*'), recursive=True)

# filter to regular files with an image extension; sort because glob's
# ordering is not guaranteed and a seeded sample needs a stable population order
valid_exts = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp']
image_files = sorted(
    f for f in all_files
    if os.path.isfile(f) and os.path.splitext(f)[1].lower() in valid_exts
)

print("Total images found:", len(image_files))

# 50% random sample (floor division, so an odd count rounds down)
sample_size = len(image_files) // 2
# A private Random instance keeps the global `random` state untouched.
sampled_images = random.Random(SAMPLE_SEED).sample(image_files, sample_size)

print("Sampling", sample_size, "images")

for f in sampled_images:
    rel_path = os.path.relpath(f, input_folder)
    dest_path = os.path.join(output_folder, rel_path)
    # Recreate the source sub-directory layout under the output folder.
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    shutil.copy2(f, dest_path)

print("Done sampling!")


Total images found: 44096
Sampling 22048 images
Done sampling!


In [None]:
# Path reported below; it matches the base name passed to make_archive
# plus the '.zip' suffix.
sample_zip_path = '/content/images_50percent.zip'

# Zip the contents of the sampled-images folder.
shutil.make_archive(
    base_name='/content/images_50percent',
    format='zip',
    root_dir=output_folder,
)

print("Created:", sample_zip_path)


Created: /content/images_50percent.zip


In [None]:
# Move the freshly built archive from local storage into the Drive data folder.
local_zip = '/content/images_50percent.zip'
drive_zip = '/content/drive/MyDrive/AI final/data/images_50percent.zip'
shutil.move(local_zip, drive_zip)

print("Saved to Google Drive!")


Saved to Google Drive!


In [None]:
import os, random, shutil
from glob import glob

# Copy a reproducible random subset of exactly TARGET_COUNT images from
# `input_folder` into `output_folder`, preserving relative paths.
input_folder = '/content/images_sampled'     # CHANGE if needed
output_folder = '/content/images_10k'
os.makedirs(output_folder, exist_ok=True)

# Number of images to keep, and the seed that makes the choice repeatable.
TARGET_COUNT = 10000
SAMPLE_SEED = 42

valid_exts = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp']

# Collect images: regular files with an image extension, sorted because glob's
# ordering is not guaranteed and a seeded sample needs a stable population order.
all_files = glob(os.path.join(input_folder, '**', '*.*'), recursive=True)
image_files = sorted(
    f for f in all_files
    if os.path.isfile(f) and os.path.splitext(f)[1].lower() in valid_exts
)

print("Images found:", len(image_files))

# Safety check: random.sample would raise a less helpful error otherwise.
if len(image_files) < TARGET_COUNT:
    raise ValueError("Not enough images — you only have {} images".format(len(image_files)))

# Sample exactly TARGET_COUNT; a private Random instance leaves the global
# `random` state untouched.
sampled_images = random.Random(SAMPLE_SEED).sample(image_files, TARGET_COUNT)

print("Sampling {:,} images...".format(TARGET_COUNT))

for f in sampled_images:
    rel_path = os.path.relpath(f, input_folder)
    dest_path = os.path.join(output_folder, rel_path)
    # Recreate the source sub-directory layout under the output folder.
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    shutil.copy2(f, dest_path)

print("Done! Saved {:,} random images.".format(TARGET_COUNT))


Images found: 22048
Sampling 10,000 images...
Done! Saved 10,000 random images.


In [None]:
import shutil
# The folder being zipped doubles as the archive base name, so the result
# is written next to it as '/content/images_10k.zip'.
images_10k_dir = '/content/images_10k'
shutil.make_archive(images_10k_dir, 'zip', images_10k_dir)

# move to Drive if you want (left as the last expression so the notebook
# displays the destination path)
shutil.move(
    '/content/images_10k.zip',
    '/content/drive/MyDrive/AI final/data/images_10k.zip',
)


'/content/drive/MyDrive/AI final/data/images_10k.zip'

In [2]:
from google.colab import drive
# Directory under which the mounted Drive appears; the cells below read
# from and write to '/content/drive/MyDrive/...'.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)



Mounted at /content/drive


In [3]:
import zipfile

# 10k-image archive on Drive and the local directory it is unpacked into.
# `extract_path` is read by the file-listing cell that follows.
zip_path = '/content/drive/MyDrive/AI final/data/images_10k.zip'
extract_path = '/content/images_10k'

# Unpack every member; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("Unzipped!")


Unzipped!


In [4]:
import os
from glob import glob

# Extensions (lower-case, with leading dot) that count as images.
valid_exts = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp']

# Every path under the extracted tree whose final component contains a dot.
all_files = glob(os.path.join(extract_path, '**', '*.*'), recursive=True)

# Keep only the base names of image files. A set is used, so two files with
# the same name in different sub-folders are counted once.
image_filenames = set()
for file_path in all_files:
    ext = os.path.splitext(file_path)[1]
    if ext.lower() in valid_exts:
        image_filenames.add(os.path.basename(file_path))

print("Image count:", len(image_filenames))


Image count: 10000


In [6]:
import json

# Load the full caption table from Drive. The filtering cell below calls
# `.items()` on the result and matches its keys against image file names.
captions_path = '/content/drive/MyDrive/AI final/data/captions.json'

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, so non-ASCII captions load the same way on every machine.
with open(captions_path, 'r', encoding='utf-8') as f:
    captions = json.load(f)

print("Total captions:", len(captions))


Total captions: 42544


In [7]:
# Keep only the caption entries whose key is the file name of an image that
# is present in the extracted folder; original key order is preserved.
filtered_captions = {}
for image_name, caption in captions.items():
    if image_name in image_filenames:
        filtered_captions[image_name] = caption

print("Filtered captions:", len(filtered_captions))


Filtered captions: 9653


In [8]:
# Local destination for the reduced caption file.
output_path = '/content/captions_10k.json'

# Serialise the filtered captions as pretty-printed JSON (4-space indent).
with open(output_path, 'w') as out_file:
    json.dump(filtered_captions, fp=out_file, indent=4)

print("Saved:", output_path)


Saved: /content/captions_10k.json


In [9]:
import shutil

# Move the filtered caption file from local storage into the Drive data folder.
local_captions = '/content/captions_10k.json'
drive_captions = '/content/drive/MyDrive/AI final/data/captions_10k.json'
shutil.move(local_captions, drive_captions)

print("Saved to Drive!")


Saved to Drive!


In [10]:
# Iterating a dict yields its keys, so this is the set of captioned file names.
valid_image_names = set(filtered_captions)
# Bare last expression: the notebook displays the count.
len(valid_image_names)


9653

In [11]:
import os, shutil
from glob import glob

# Copy every image that has a caption into one flat folder, dropping the
# sub-directory structure of `input_folder`.
input_folder = '/content/images_10k'
output_folder = '/content/images_9653'
os.makedirs(output_folder, exist_ok=True)

# Collect all actual image paths; sorted so that, if two files share a name,
# the one that gets kept is the same on every run.
all_image_paths = sorted(glob(os.path.join(input_folder, '**', '*.*'), recursive=True))

valid_exts = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp']

# Names already written to the flat output folder. Because sub-directories are
# discarded, a second file with the same base name would otherwise silently
# overwrite the first and be counted twice.
copied_names = set()
count = 0
for path in all_image_paths:
    fname = os.path.basename(path)
    if os.path.splitext(fname)[1].lower() not in valid_exts:
        continue
    if fname not in valid_image_names:
        continue
    if fname in copied_names:
        print("Skipping duplicate file name:", path)
        continue
    dest = os.path.join(output_folder, fname)
    shutil.copy2(path, dest)
    copied_names.add(fname)
    count += 1

print("Copied images:", count)

# Surface any captioned image that was not found on disk, so the image folder
# and the caption file cannot drift apart unnoticed.
missing_names = valid_image_names - copied_names
if missing_names:
    print("Captioned images not found on disk:", len(missing_names))


Copied images: 9653


In [12]:
import shutil
# The folder being zipped doubles as the archive base name, so the result
# is written next to it as '/content/images_9653.zip'.
images_9653_dir = '/content/images_9653'
shutil.make_archive(images_9653_dir, 'zip', images_9653_dir)

# Move to Drive (left as the last expression so the notebook displays the
# destination path)
shutil.move(
    src='/content/images_9653.zip',
    dst='/content/drive/MyDrive/AI final/data/images_9653.zip',
)


'/content/drive/MyDrive/AI final/data/images_9653.zip'