In [1]:
import os

# Root folder of the project on this machine.
# Set the A5_PROJECT_ROOT environment variable to run the notebook on another PC
# without editing this cell; the original location is kept as the fallback.
PROJECT_ROOT = os.environ.get("A5_PROJECT_ROOT", r"C:\Users\Omen\Desktop\A5 ia conda")

In [2]:
import os
import pandas as pd

def get_image_data(base_path, relative_path, label_name):
    """
    Scan one folder (non-recursively) and return its image files with a label.

    Args:
        base_path (str): The root project path (varies by PC).
        relative_path (str): Folder to scan, relative to ``base_path``
            (e.g., 'Dataset Livrable 1/Photos').
        label_name (str): The tag assigned to every image found
            (e.g., 'photo', 'schematic').

    Returns:
        pd.DataFrame: One row per image file with columns ['path', 'label'],
        ordered by file name. Both columns are present even when the folder
        is missing or holds no images, so the result can always be
        concatenated and indexed by column name.
    """
    columns = ['path', 'label']
    valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')

    # 1. Construct the full path with the separator of the current OS
    target_directory = os.path.join(base_path, relative_path)

    # 2. Safety Check
    # isdir (not exists): a regular file at this location cannot be listed either.
    if not os.path.isdir(target_directory):
        print(f"⚠️ Warning: Directory not found at: {target_directory}")
        return pd.DataFrame(columns=columns)

    # 3. Collect image files
    # Directory listing order is arbitrary, so sort the names: a seeded shuffle
    # downstream is only reproducible if the input order is stable.
    # is_file() skips sub-folders whose name happens to end with an image extension.
    with os.scandir(target_directory) as entries:
        image_names = sorted(
            entry.name
            for entry in entries
            if entry.is_file() and entry.name.lower().endswith(valid_extensions)
        )

    # 4. Label
    records = [
        {'path': os.path.join(target_directory, name), 'label': label_name}
        for name in image_names
    ]

    # Passing the columns explicitly keeps the schema when no image was found.
    df = pd.DataFrame(records, columns=columns)
    print(f"✅ Loaded {len(df)} images labeled as '{label_name}' from {relative_path}")
    return df

In [3]:
# Load Photos (Label = 'photo')
# os.path.join inserts the separator of the current OS; a literal backslash in
# the relative path would only resolve on Windows.
df_photos = get_image_data(
    base_path=PROJECT_ROOT,
    relative_path=os.path.join("Dataset Livrable 1", "Photo"),
    label_name="photo",
)

✅ Loaded 9993 images labeled as 'photo' from Dataset Livrable 1\Photo


In [4]:
# Preview the photo DataFrame built in the previous cell.
df_photos

Unnamed: 0,path,label
0,C:\Users\Omen\Desktop\A5 ia conda\Dataset Livr...,photo
1,C:\Users\Omen\Desktop\A5 ia conda\Dataset Livr...,photo
2,C:\Users\Omen\Desktop\A5 ia conda\Dataset Livr...,photo
3,C:\Users\Omen\Desktop\A5 ia conda\Dataset Livr...,photo
4,C:\Users\Omen\Desktop\A5 ia conda\Dataset Livr...,photo
...,...,...
9988,C:\Users\Omen\Desktop\A5 ia conda\Dataset Livr...,photo
9989,C:\Users\Omen\Desktop\A5 ia conda\Dataset Livr...,photo
9990,C:\Users\Omen\Desktop\A5 ia conda\Dataset Livr...,photo
9991,C:\Users\Omen\Desktop\A5 ia conda\Dataset Livr...,photo


In [5]:
# Load Schematics (Label = 'other')
# os.path.join inserts the separator of the current OS; a literal backslash in
# the relative path would only resolve on Windows.
df_schematics = get_image_data(
    base_path=PROJECT_ROOT,
    relative_path=os.path.join("Dataset Livrable 1", "Schematics"),
    label_name="other",
)

✅ Loaded 10000 images labeled as 'other' from Dataset Livrable 1\Schematics


In [6]:
# Load Paintings (Label = 'other')
# os.path.join inserts the separator of the current OS; a literal backslash in
# the relative path would only resolve on Windows.
df_painting = get_image_data(
    base_path=PROJECT_ROOT,
    relative_path=os.path.join("Dataset Livrable 1", "Painting"),
    label_name="other",
)

✅ Loaded 9999 images labeled as 'other' from Dataset Livrable 1\Painting


In [7]:
# Load Sketches (Label = 'other')
# os.path.join inserts the separator of the current OS; a literal backslash in
# the relative path would only resolve on Windows.
df_sketch = get_image_data(
    base_path=PROJECT_ROOT,
    relative_path=os.path.join("Dataset Livrable 1", "Sketch"),
    label_name="other",
)

✅ Loaded 1406 images labeled as 'other' from Dataset Livrable 1\Sketch


In [8]:
# Load Text images (Label = 'other')
# os.path.join inserts the separator of the current OS; a literal backslash in
# the relative path would only resolve on Windows.
df_text = get_image_data(
    base_path=PROJECT_ROOT,
    relative_path=os.path.join("Dataset Livrable 1", "Text"),
    label_name="other",
)

✅ Loaded 10000 images labeled as 'other' from Dataset Livrable 1\Text


In [9]:
# 1. List all the dataframes you want to combine
all_dfs = [df_photos, df_schematics, df_painting, df_sketch, df_text]

# get_image_data returns an empty DataFrame when a folder is missing.
# Leave those out of the concatenation, and stop with a clear message if nothing
# was loaded at all (otherwise the 'label' lookup below fails with a KeyError).
loaded_dfs = [df for df in all_dfs if not df.empty]
if not loaded_dfs:
    raise ValueError(
        f"No images were loaded; check that the dataset folders exist under {PROJECT_ROOT}"
    )

# 2. Concatenate them into one big DataFrame
# ignore_index=True creates a fresh index from 0 to N-1
master_df = pd.concat(loaded_dfs, axis=0, ignore_index=True)

# 3. Randomize (Shuffle) the dataset
# frac=1 means "return 100% of the rows", but in random order;
# random_state fixes the seed so the shuffle is the same on every run.
master_df = master_df.sample(frac=1, random_state=42).reset_index(drop=True)

# 4. Check the balance (Important for your Report!)
print("--- Class Distribution ---")
print(master_df['label'].value_counts())

# 5. Export to Excel
# We construct the path using PROJECT_ROOT so it saves in your project folder
output_excel_path = os.path.join(PROJECT_ROOT, "dataset_metadata.xlsx")

# index=False prevents pandas from writing the row numbers (0, 1, 2...) into the Excel file
# NOTE(review): writing .xlsx needs an Excel engine such as openpyxl installed — confirm in the environment.
master_df.to_excel(output_excel_path, index=False)

print(f"\n✅ Dataset successfully exported to: {output_excel_path}")

--- Class Distribution ---
label
other    31405
photo     9993
Name: count, dtype: int64

✅ Dataset successfully exported to: C:\Users\Omen\Desktop\A5 ia conda\dataset_metadata.xlsx
