# SmartSign RAG: Data Ingestion & Preprocessing

This notebook downloads the GTSRB (German Traffic Sign Recognition Benchmark) dataset, extracts one sample image for each traffic sign class, and builds the image catalog used for multimodal indexing.

Ingestion & Data Preparation

In [None]:
import os
import shutil
import pandas as pd
from glob import glob

# 1. Kaggle API Configuration
# Point the Kaggle client at the current directory for its configuration
# (the `kaggle.json` credentials file is expected to live next to this code).
# NOTE(review): this is set before the import below on the assumption that the
# client reads KAGGLE_CONFIG_DIR when it is loaded -- confirm.
os.environ['KAGGLE_CONFIG_DIR'] = "."

try:
    from kaggle.api.kaggle_api_extended import KaggleApi
except ImportError:
    print("Error: Please install kaggle library via 'pip install kaggle'")
    # `exit()` is an interactive-shell helper injected by the `site` module and
    # terminates with a *success* status; raise SystemExit with a non-zero
    # code so callers and shells can see that the setup failed.
    raise SystemExit(1)

def _list_class_dirs(train_path):
    """Return the numeric class folder names inside *train_path*, in numeric order."""
    names = [
        name
        for name in os.listdir(train_path)
        # isdecimal() only accepts characters that int() can parse, and the
        # isdir() check keeps stray digit-named files out of the class list.
        if name.isdecimal() and os.path.isdir(os.path.join(train_path, name))
    ]
    # Sort by value so that class 2 comes before class 10.
    return sorted(names, key=int)


def _find_train_dir():
    """Return the first 'Train'/'train' directory below the CWD that holds class folders, or None."""
    candidates = glob("**/Train", recursive=True) + glob("**/train", recursive=True)
    for candidate in candidates:
        if os.path.isdir(candidate) and _list_class_dirs(candidate):
            return candidate
    return None


def setup_data():
    """Prepare one sample image per GTSRB class and write the image catalog.

    Steps:
      1. Look for an already-extracted training directory below the current
         working directory; only if none exists, authenticate with Kaggle and
         download + unzip the dataset into the current directory.
      2. For every numeric class folder, copy its alphabetically first image
         to ``data/samples/class_<id><ext>`` (``<id>`` has leading zeros
         stripped, ``<ext>`` is the source file's own extension).
      3. Write ``data/image_catalog.csv`` with the columns ``class_id`` and
         ``image_path``, ordered by numeric class id.

    Progress and errors are reported with ``print``; the function returns
    ``None`` in every case. Exceptions raised by Kaggle authentication are not
    caught here.
    """
    dataset = "meowmeowmeowmeowmeow/gtsrb-german-traffic-sign"
    supported_exts = ('.png', '.jpg', '.jpeg', '.ppm')

    # 2. Download and Unzip Dataset -- only when no extracted copy is present.
    # Checking for the extracted folders (rather than for the .zip archive,
    # which is not expected to remain on disk after `unzip=True`) avoids
    # re-downloading the dataset and needing credentials on every run.
    train_path = _find_train_dir()
    if train_path is None:
        # Authenticate with Kaggle
        api = KaggleApi()
        api.authenticate()

        print(f"Downloading dataset: {dataset}...")
        try:
            api.dataset_download_files(dataset, path='.', unzip=True)
            print("Download and extraction complete.")
        except Exception as e:
            print(f"Error downloading dataset: {e}")
            return
        train_path = _find_train_dir()

    # 4. Locate the 'Train' folder (handling nested structures)
    if train_path is None:
        print("Error: Could not find 'Train' directory. Check your dataset structure.")
        return
    print(f"Source data found at: {train_path}")

    # 3. Create Output Directory
    output_dir = "data/samples"
    os.makedirs(output_dir, exist_ok=True)

    # 5. Extract One Sample per Class
    sample_mapping = []
    classes = _list_class_dirs(train_path)

    print(f"Copying samples for {len(classes)} classes...")

    for cls in classes:
        cls_dir = os.path.join(train_path, cls)
        # Find images with supported extensions
        images = [f for f in os.listdir(cls_dir) if f.lower().endswith(supported_exts)]
        if not images:
            print(f"Warning: no images found for class '{cls}'; skipping.")
            continue

        first_image = min(images)  # Take the first image alphabetically
        src_path = os.path.join(cls_dir, first_image)

        # Strip leading zeros from the class id ('00014' -> '14').
        clean_id = str(int(cls))
        # The file is copied byte-for-byte, not converted, so keep the source
        # extension: image readers that pick a decoder from the file name
        # would otherwise treat e.g. PPM data as PNG.
        extension = os.path.splitext(first_image)[1].lower()
        dest_path = os.path.join(output_dir, f"class_{clean_id}{extension}")

        shutil.copy(src_path, dest_path)
        sample_mapping.append({"class_id": clean_id, "image_path": dest_path})

    if not sample_mapping:
        print(f"Error: No sample images found under '{train_path}'.")
        return

    # 6. Save Catalog for LangChain
    # Explicit columns keep the CSV header stable regardless of the row data.
    df = pd.DataFrame(sample_mapping, columns=["class_id", "image_path"])
    df.to_csv("data/image_catalog.csv", index=False)

    print("-" * 30)
    print(f"Success! {len(sample_mapping)} images saved to '{output_dir}'")
    print("Catalog created: 'data/image_catalog.csv'")

# Run the ingestion only when this code is executed as the main program,
# not when it is imported from another module.
if __name__ == "__main__":
    setup_data()

Visualization

In [None]:
# Preview the first few catalog entries as an image grid to verify the
# ingestion output, then print the matching catalog rows.
catalog_path = "data/image_catalog.csv"

if os.path.exists(catalog_path):
    # `plt` and `mpimg` are not imported by the preceding cells, so bring them
    # into scope here; they are only needed when there is something to plot.
    import matplotlib.image as mpimg
    import matplotlib.pyplot as plt

    df_preview = pd.read_csv(catalog_path)

    # Display the first 10 signs for verification
    num_samples = 10
    fig, axes = plt.subplots(2, 5, figsize=(20, 8))
    axes = axes.flatten()

    # Hide every axis up front so grid slots without an image stay blank when
    # the catalog holds fewer than `num_samples` rows.
    for ax in axes:
        ax.axis('off')

    # zip() stops at the shorter of (grid slots, catalog rows).
    preview_rows = df_preview.head(num_samples).itertuples(index=False)
    for ax, row in zip(axes, preview_rows):
        ax.imshow(mpimg.imread(row.image_path))
        ax.set_title(f"Class ID: {row.class_id}")

    plt.tight_layout()
    plt.show()

    print("Sample mapping from CSV:")
    print(df_preview.head(10))
else:
    print("Error: image_catalog.csv not found. Run the ingestion cell first.")