In [1]:
import os
import glob
import requests
from loguru import logger
from typing import List, Dict
from hcmus.core import appconfig

[32m2025-06-09 11:37:22.812[0m | [1mINFO    [0m | [36mhcmus.core.appconfig[0m:[36m<module>[0m:[36m7[0m - [1mLoad DotEnv: True[0m


In [2]:
# Root folder that is searched recursively for images to upload.
INPUT_FOLDER = "/Volumes/Cucumber/Scripts/20250608_resized"
# Despite its name, this value is passed as the `dataset_name` key that is
# looked up in appconfig.LABEL_STUDIO_PROJECT_MAPPING; it is not a numeric id.
PROJECT_ID = "train"

In [3]:
def chunk_list(lst, chunk_size):
    """Split ``lst`` into consecutive slices of at most ``chunk_size`` items.

    The last slice holds whatever remains and may be shorter than
    ``chunk_size``. An empty ``lst`` produces an empty list.
    """
    starts = range(0, len(lst), chunk_size)
    return [lst[start:start + chunk_size] for start in starts]

In [4]:
def list_all_images(folder_path: str) -> List[str]:
    """Recursively collect image paths found under ``folder_path``.

    One glob is evaluated per extension (jpg, jpeg and png, each in lower and
    upper case) and every pattern is logged before it runs. Matches are
    concatenated in extension order, so the result is not globally sorted.
    """
    matches: List[str] = []
    for suffix in ("jpg", "jpeg", "png", "JPG", "JPEG", "PNG"):
        glob_pattern = f"{folder_path}/**/*.{suffix}"
        logger.info(glob_pattern)
        matches += glob.glob(glob_pattern, recursive=True)
    return matches

In [13]:
# Gather every image under INPUT_FOLDER; sorting gives a stable upload order.
files = list_all_images(INPUT_FOLDER)
files = sorted(files)
logger.info(f"Number of images: {len(files)}")
# Split the file list into batches of 50 paths for upload_multiple_files.
chunks = chunk_list(files, 50)
logger.info(f"Number of chunks: {len(chunks)}")

[32m2025-06-09 11:39:43.360[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Volumes/Cucumber/Scripts/20250608_resized/**/*.jpg[0m
[32m2025-06-09 11:39:43.363[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Volumes/Cucumber/Scripts/20250608_resized/**/*.jpeg[0m
[32m2025-06-09 11:39:43.365[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Volumes/Cucumber/Scripts/20250608_resized/**/*.png[0m
[32m2025-06-09 11:39:43.367[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Volumes/Cucumber/Scripts/20250608_resized/**/*.JPG[0m
[32m2025-06-09 11:39:43.368[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Volumes/Cucumber/Scripts/20250608_resized/**/*.JPEG[0m
[32m2025-06-09 11:39:43.369[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Volumes/Cucumber/Scripts/20250608

In [15]:
def get_filename_from_path(path: str) -> str:
    """Return the final component of ``path`` (empty when it ends with a separator)."""
    _directory, filename = os.path.split(path)
    return filename

In [16]:
def get_label_studio_headers() -> Dict[str, str]:
    """Build the HTTP headers that authenticate a request to Label Studio.

    The token is read from ``appconfig.LABEL_STUDIO_API_KEY`` on every call.
    """
    token = appconfig.LABEL_STUDIO_API_KEY
    return {"Authorization": f"Token {token}"}

In [17]:
def get_uploaded_tasks(dataset_name: str):
    """Collect the image reference of every task already in a Label Studio project.

    Pages through ``/api/projects/<id>/tasks/`` until the server returns an
    empty page or a non-200 response.

    Args:
        dataset_name: Key into ``appconfig.LABEL_STUDIO_PROJECT_MAPPING`` that
            selects the project to query.

    Returns:
        A set holding the ``data["image"]`` string of each task that has one.
        Tasks whose image is missing, null or empty are left out.

    Raises:
        KeyError: If ``dataset_name`` is not in the project mapping.
        requests.RequestException: If a request fails or times out.
    """
    uploaded_files = set()
    page = 1
    page_size = 100  # You can increase this if needed
    # Without a timeout, requests waits forever on a stalled server and the
    # notebook kernel hangs; fail loudly instead.
    timeout_seconds = 30
    headers = get_label_studio_headers()
    project_id = appconfig.LABEL_STUDIO_PROJECT_MAPPING[dataset_name]
    endpoint = f"{appconfig.LABEL_STUDIO_URL}/api/projects/{project_id}/tasks/"
    logger.info(f"Endpoint: {endpoint}")
    while True:
        response = requests.get(
            endpoint,
            headers=headers,
            params={"page": page, "page_size": page_size},
            timeout=timeout_seconds,
        )

        if response.status_code != 200:
            # NOTE(review): pagination stops here, so the returned set may be
            # partial and later uploads could duplicate existing tasks.
            logger.warning("Error fetching tasks:" + response.text)
            break

        tasks = response.json()
        if not tasks:  # Stop when no more tasks are returned
            break

        for task in tasks:
            image = task["data"].get("image")
            # Keep only real references: upload_file runs `filename in entry`
            # on each element, which raises TypeError for None.
            if isinstance(image, str) and image:
                uploaded_files.add(image)
        page += 1  # Move to the next page

    return uploaded_files

In [9]:
def upload_file(project_id: int, file_path: str, uploaded_tasks):
    """Upload one image to a Label Studio project unless it is already there.

    Args:
        project_id: Label Studio project id used to build the import URL.
        file_path: Local path of the image to upload.
        uploaded_tasks: Iterable of image references of existing tasks; the
            file is skipped when its base name occurs inside any of them.

    Returns:
        The decoded JSON response when the server answers 201. ``None`` when
        the file is skipped, cannot be read, the request fails, or the server
        answers with any other status.
    """
    filename = get_filename_from_path(file_path)
    # NOTE(review): this is a substring match, so a short name such as
    # "10.jpeg" is also treated as present when "IMG_0810.jpeg" exists.
    # Non-string entries are ignored so a stray None cannot raise TypeError.
    if any(isinstance(task, str) and filename in task for task in uploaded_tasks):
        logger.info(f"File already exists: {filename}")
        return None

    headers = get_label_studio_headers()
    endpoint = f"{appconfig.LABEL_STUDIO_URL}/api/projects/{project_id}/import"
    try:
        # open() is inside the try so one unreadable file is logged and
        # skipped rather than aborting the remaining uploads.
        with open(file_path, "rb") as f:
            response = requests.post(
                endpoint,
                files={"file": f},
                headers=headers,
                # (connect, read) seconds; without it a stalled server would
                # block the notebook indefinitely.
                timeout=(10, 120),
            )
    except Exception as e:
        logger.warning(e)
        return None

    if response.status_code == 201:
        print(f"Uploaded: {file_path}")
        return response.json()

    print(f"Error uploading {file_path}: {response.text}")
    return None

def upload_multiple_files(chunk: List[str], uploaded_tasks, dataset_name: str):
    """Upload each path in ``chunk`` to the project mapped to ``dataset_name``.

    Files are sent one at a time through ``upload_file`` and its return
    values are discarded. The project id is looked up in
    ``appconfig.LABEL_STUDIO_PROJECT_MAPPING`` once per call.
    """
    target_project = appconfig.LABEL_STUDIO_PROJECT_MAPPING[dataset_name]
    for image_path in chunk:
        upload_file(target_project, image_path, uploaded_tasks)

In [18]:
# Fetch the image references of tasks already in the project so that the
# upload step can skip files that are present.
uploaded_tasks = get_uploaded_tasks(PROJECT_ID)
logger.info(f"Number of uploaded tasks: {len(uploaded_tasks)}")

[32m2025-06-09 11:39:58.965[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_uploaded_tasks[0m:[36m9[0m - [1mEndpoint: http://jimica.ddns.net:8080/api/projects/1/tasks/[0m
[32m2025-06-09 11:40:06.119[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mNumber of uploaded tasks: 2811[0m


In [19]:
# Upload batch by batch. uploaded_tasks is the snapshot fetched earlier and is
# not refreshed inside this loop, so only files matching that snapshot are skipped.
for chunk in chunks:
    upload_multiple_files(chunk, uploaded_tasks, PROJECT_ID)

Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0810.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0811.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0812.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0813.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0814.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0815.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0816.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0817.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0818.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0819.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0820.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0821.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0822.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0823.jpeg
Uploaded: /Volumes/Cucumber/Scripts/20250608_resized/IMG_0824.