In [8]:
# Importing necessary libraries
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

In [9]:
# Configuration: input locations, output layout, and breed cut-off
TOP_N = 100  # intended number of most frequent breeds to keep

# Inputs
RAW_IMAGES = "raw/train"       # directory holding the source <id>.jpg files
LABELS_CSV = "raw/labels.csv"  # CSV read into the labels DataFrame

# Outputs: one sub-directory per split underneath BASE_DIR
BASE_DIR = "data"
TRAIN_DIR, VAL_DIR = (os.path.join(BASE_DIR, split) for split in ("train", "val"))

In [10]:
# Create both split directories up front; exist_ok makes re-running this cell harmless
for split_dir in (TRAIN_DIR, VAL_DIR):
    os.makedirs(split_dir, exist_ok=True)

In [15]:
# Read the labels CSV and keep only rows belonging to the TOP_N most frequent breeds
labels_raw = pd.read_csv(LABELS_CSV)

# value_counts() orders breeds by descending frequency, so the first TOP_N
# index entries are the most common breeds. (Without the head() call the
# index holds every breed and the isin() filter below would keep all rows.)
top_breeds = labels_raw['breed'].value_counts().head(TOP_N).index

# Filter from the untouched raw frame so re-running this cell is idempotent
df = labels_raw[labels_raw['breed'].isin(top_breeds)]

df.head()

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [16]:
# Hold out 20% of the rows for validation, stratifying on breed; the fixed
# random_state makes the split repeatable
breed_labels = df["breed"]
train_df, val_df = train_test_split(
    df,
    stratify=breed_labels,
    random_state=42,
    test_size=0.2,
)

In [17]:
# Copy images to the train and validation directories
def copy_images(df_subset, dest_dir, src_dir=None):
    """Copy each image listed in ``df_subset`` into a per-breed folder under ``dest_dir``.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Must provide an ``id`` column (image file stem) and a ``breed`` column
        (used as the sub-folder name).
    dest_dir : str
        Root output directory; ``<dest_dir>/<breed>/<id>.jpg`` is written for
        every row.
    src_dir : str, optional
        Directory holding the source ``<id>.jpg`` files. Defaults to the
        notebook-level ``RAW_IMAGES`` constant.

    Raises
    ------
    FileNotFoundError
        Propagated from ``shutil.copy`` when a source image is missing.
    """
    if src_dir is None:
        src_dir = RAW_IMAGES

    created_folders = set()
    # Walk the two needed columns directly instead of iterrows(), which
    # builds a full Series object for every row.
    for img_id, breed in zip(df_subset['id'], df_subset['breed']):
        dst_folder = os.path.join(dest_dir, breed)
        # Only touch the filesystem once per breed folder, not once per image
        if dst_folder not in created_folders:
            os.makedirs(dst_folder, exist_ok=True)
            created_folders.add(dst_folder)
        filename = f"{img_id}.jpg"
        shutil.copy(os.path.join(src_dir, filename), os.path.join(dst_folder, filename))

In [18]:
# Populate each split directory in turn, announcing progress before each copy
copy_jobs = [
    ("Copying training images...", train_df, TRAIN_DIR),
    ("Copying validation images...", val_df, VAL_DIR),
]
for message, subset, target_dir in copy_jobs:
    print(message)
    copy_images(subset, target_dir)
print("Dataset preparation complete.")

Copying training images...
Copying validation images...
Dataset preparation complete.
