In [1]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

In [3]:
def download_image(image_link, entity_name, output_folder, index, timeout=30):
    """
    Downloads an image and saves it in the appropriate folder based on entity_name.

    The image bytes are written to
    ``<output_folder>/<entity_name>/image_<index>.jpg``. The ``.jpg`` extension
    is always used, whatever the URL or response looks like.

    This is a best-effort helper: any exception raised while creating the
    folder, fetching the URL, or writing the file is caught and reported with
    ``print``; nothing is raised to the caller.

    Parameters:
    - image_link: URL of the image to be downloaded.
    - entity_name: The entity name to categorize the image (used as the subfolder name).
    - output_folder: Path to the main folder where images will be saved.
    - index: Index to ensure unique filenames.
    - timeout: Value forwarded to ``requests.get`` as its ``timeout`` argument,
      so a stalled server cannot block the calling thread indefinitely.

    Returns:
    - None
    """
    try:
        # Create a subfolder for each entity_name if it doesn't exist.
        # exist_ok=True avoids the check-then-create race: when several threads
        # handle the same entity at once, a separate os.path.exists() check can
        # pass in two threads and the slower makedirs() would then fail.
        entity_folder = os.path.join(output_folder, entity_name)
        os.makedirs(entity_folder, exist_ok=True)

        # Download the image; without a timeout requests may wait forever.
        response = requests.get(image_link, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Save the image
        image_filename = os.path.join(entity_folder, f"image_{index}.jpg")
        with open(image_filename, 'wb') as file:
            file.write(response.content)
    except Exception as e:
        # Deliberately broad: one bad link must not abort the whole batch.
        print(f"Error downloading {image_link}: {e}")

In [5]:
def download_images_by_entity(image_links, entity_names, output_folder, max_workers=10):
    """
    Downloads images and categorizes them into separate folders based on entity_name.

    Each (link, name) pair is handed to ``download_image`` on a thread pool,
    together with its position in the input, which is used as the filename
    index. Links and names are paired positionally with ``zip``, so if the two
    lists differ in length the extra items of the longer one are ignored.

    Parameters:
    - image_links: List of image URLs.
    - entity_names: List of entity names corresponding to each image.
    - output_folder: Path to the main folder where images will be saved.
    - max_workers: Maximum number of threads to use for downloading images.

    Returns:
    - None (the function returns once every submitted download has finished).
    """
    # exist_ok=True creates the folder only when it is missing, without a
    # separate (racy) os.path.exists() check.
    os.makedirs(output_folder, exist_ok=True)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(download_image, image_link, entity_name, output_folder, index)
            for index, (image_link, entity_name) in enumerate(zip(image_links, entity_names))
        ]

        # Wait for all futures to complete. download_image prints its own
        # errors instead of raising, so result() normally just blocks until the
        # task is done; it would re-raise only an exception that escaped the worker.
        for future in as_completed(futures):
            future.result()


In [7]:
# Example usage: read the sample CSV and fetch every listed image,
# grouped into one subfolder per entity name.
if __name__ == "__main__":
    DATASET_FOLDER = '../dataset/'

    # The CSV supplies one image URL and one entity name per row.
    sample_csv_path = os.path.join(DATASET_FOLDER, 'sample_test.csv')
    sample_test = pd.read_csv(sample_csv_path)

    # Plain Python lists, in row order, so links and names stay aligned.
    image_links = sample_test['image_link'].tolist()
    entity_names = sample_test['entity_name'].tolist()

    download_images_by_entity(
        image_links=image_links,
        entity_names=entity_names,
        output_folder='../images',
    )