Extract "102flowers.tgz" to the "102flowers_extracted" folder inside the "102flowers_data" folder

In [1]:
import tarfile
import os

tgz_path = "../102flowers_data/102flowers.tgz"
extract_folder = "../102flowers_data/102flowers_extracted"

# create the destination folder if it doesn't exist
os.makedirs(extract_folder, exist_ok=True)

# extract the gzip-compressed tarball
with tarfile.open(tgz_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # Extraction filters are available on this Python: use the "data"
        # filter so archive members cannot be written outside
        # `extract_folder` (absolute paths, "..", unsafe links).
        tar.extractall(path=extract_folder, filter="data")
    else:
        # Older Python without extraction filters: fall back to the plain call.
        tar.extractall(path=extract_folder)

print(f"Extracted {tgz_path} to {extract_folder}")

# both `102flowers_data/102flowers.tgz` and `102flowers_data/102flowers_extracted/`
# should be in .gitignore so large files do not get pushed up to remote

Extracted ../102flowers_data/102flowers.tgz to ../102flowers_data/102flowers_extracted


Add "102flowers_extracted" to .gitignore

In [2]:
gitignore_path = "../.gitignore"
ignore_entry = "102flowers_data/102flowers_extracted/\n"

# read existing .gitignore content; a missing file is treated as empty so the
# cell also works on a fresh checkout that has no .gitignore yet
try:
    with open(gitignore_path, "r") as f:
        lines = f.readlines()
except FileNotFoundError:
    lines = []

# compare without line terminators so an entry sitting on the last line of a
# file that lacks a trailing newline is still recognised
wanted = ignore_entry.rstrip("\n")
already_listed = any(line.rstrip("\n") == wanted for line in lines)

# add entry if it's not already in the file
if not already_listed:
    with open(gitignore_path, "a") as f:
        # if the file does not end with a newline, terminate its last line
        # first; otherwise the new entry would be glued onto that pattern
        if lines and not lines[-1].endswith("\n"):
            f.write("\n")
        f.write(ignore_entry)
    print("Added '102flowers_data/102flowers_extracted/' to .gitignore")
else:
    print("Entry already exists in .gitignore")

Entry already exists in .gitignore


Load and examine "imagelabels.mat"

In [5]:
from scipy.io import loadmat
import numpy as np

# read the MATLAB file that holds the image labels
mat_path = "../102flowers_data/imagelabels.mat"
data = loadmat(mat_path)

# show which variables the .mat file contains
print("Keys in .mat file:", data.keys())

# fetch the label array; `.get` yields None when the key is absent
labels = data.get('labels')

if labels is None:
    print("Label key not found.")
else:
    # remove length-1 axes before inspecting the labels
    labels = np.squeeze(labels)
    print(f"Total labels: {len(labels)}")
    print(f"Label sample (first 20): {labels[:20]}")

    # a single np.unique call provides both the distinct labels and their counts
    unique_labels, counts = np.unique(labels, return_counts=True)
    print(f"Unique categories: {len(unique_labels)}")

    # per-category image counts
    print("\nImages per category:")
    for label, count in zip(unique_labels, counts):
        print(f"Category {label}: {count} images")


Keys in .mat file: dict_keys(['__header__', '__version__', '__globals__', 'labels'])
Total labels: 8189
Label sample (first 20): [77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77 77]
Unique categories: 102

Images per category:
Category 1: 40 images
Category 2: 60 images
Category 3: 40 images
Category 4: 56 images
Category 5: 65 images
Category 6: 45 images
Category 7: 40 images
Category 8: 85 images
Category 9: 46 images
Category 10: 45 images
Category 11: 87 images
Category 12: 87 images
Category 13: 49 images
Category 14: 48 images
Category 15: 49 images
Category 16: 41 images
Category 17: 85 images
Category 18: 82 images
Category 19: 49 images
Category 20: 56 images
Category 21: 40 images
Category 22: 59 images
Category 23: 91 images
Category 24: 42 images
Category 25: 41 images
Category 26: 41 images
Category 27: 40 images
Category 28: 66 images
Category 29: 78 images
Category 30: 85 images
Category 31: 52 images
Category 32: 45 images
Category 33: 46 images
Category 34: 4

In [6]:
# information from the 102flowers dataset meta data
# count images per category (`labels` comes from the cell that loads imagelabels.mat)
unique_labels, counts = np.unique(labels, return_counts=True)

# flower category names (indexed by label - 1)
# NOTE(review): the previous list had 106 entries with duplicates and was
# misaligned from label 44 onward. This list follows the dataset's published
# label -> name mapping; spot-check a few labels against the images
# (e.g. label 77 should be "passion flower").
category_names = [
    # labels 1-9
    "pink primrose", "hard-leaved pocket orchid", "canterbury bells", "sweet pea",
    "english marigold", "tiger lily", "moon orchid", "bird of paradise", "monkshood",
    # labels 10-19
    "globe thistle", "snapdragon", "colt's foot", "king protea", "spear thistle",
    "yellow iris", "globe-flower", "purple coneflower", "peruvian lily", "balloon flower",
    # labels 20-28
    "giant white arum lily", "fire lily", "pincushion flower", "fritillary", "red ginger",
    "grape hyacinth", "corn poppy", "prince of wales feathers", "stemless gentian",
    # labels 29-37
    "artichoke", "sweet william", "carnation", "garden phlox", "love in the mist",
    "mexican aster", "alpine sea holly", "ruby-lipped cattleya", "cape flower",
    # labels 38-47
    "great masterwort", "siam tulip", "lenten rose", "barbeton daisy", "daffodil",
    "sword lily", "poinsettia", "bolero deep blue", "wallflower", "marigold",
    # labels 48-57
    "buttercup", "oxeye daisy", "common dandelion", "petunia", "wild pansy",
    "primula", "sunflower", "pelargonium", "bishop of llandaff", "gaura",
    # labels 58-65
    "geranium", "orange dahlia", "pink-yellow dahlia", "cautleya spicata",
    "japanese anemone", "black-eyed susan", "silverbush", "californian poppy",
    # labels 66-76
    "osteospermum", "spring crocus", "bearded iris", "windflower", "tree poppy",
    "gazania", "azalea", "water lily", "rose", "thorn apple", "morning glory",
    # labels 77-87
    "passion flower", "lotus", "toad lily", "anthurium", "frangipani",
    "clematis", "hibiscus", "columbine", "desert-rose", "tree mallow", "magnolia",
    # labels 88-97
    "cyclamen", "watercress", "canna lily", "hippeastrum", "bee balm",
    "ball moss", "foxglove", "bougainvillea", "camellia", "mallow",
    # labels 98-102
    "mexican petunia", "bromelia", "blanket flower", "trumpet creeper", "blackberry lily",
]

# sanity checks on the lookup table itself: one unique name per dataset category
assert len(category_names) == 102, f"expected 102 names, got {len(category_names)}"
assert len(set(category_names)) == len(category_names), "duplicate category names"

print("\nImages per flower category:")
for label, count in zip(unique_labels, counts):
    # labels are 1-based; anything outside the table is reported as "Unknown"
    name = category_names[label - 1] if 1 <= label <= len(category_names) else "Unknown"
    print(f"{label:03d} - {name}: {count} images")


Images per flower category:
001 - pink primrose: 40 images
002 - hard-leaved pocket orchid: 60 images
003 - canterbury bells: 40 images
004 - sweet pea: 56 images
005 - english marigold: 65 images
006 - tiger lily: 45 images
007 - moon orchid: 40 images
008 - bird of paradise: 85 images
009 - monkshood: 46 images
010 - globe thistle: 45 images
011 - snapdragon: 87 images
012 - colt's foot: 87 images
013 - king protea: 49 images
014 - spear thistle: 48 images
015 - yellow iris: 49 images
016 - globe-flower: 41 images
017 - purple coneflower: 85 images
018 - peruvian lily: 82 images
019 - balloon flower: 49 images
020 - giant white arum lily: 56 images
021 - fire lily: 40 images
022 - pincushion flower: 59 images
023 - fritillary: 91 images
024 - red ginger: 42 images
025 - grape hyacinth: 41 images
026 - corn poppy: 41 images
027 - prince of wales feathers: 40 images
028 - stemless gentian: 66 images
029 - artichoke: 78 images
030 - sweet william: 85 images
031 - carnation: 52 images
0