In [1]:
import os
import glob
import requests
from loguru import logger
from typing import List, Dict
from hcmus.core import appconfig

[32m2025-06-01 20:07:43.237[0m | [1mINFO    [0m | [36mhcmus.core.appconfig[0m:[36m<module>[0m:[36m7[0m - [1mLoad DotEnv: True[0m


In [2]:
# Root folder that is searched (recursively, via list_all_images) for images to upload.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
INPUT_FOLDER = "/Users/keith/Downloads/resized"
# Despite the name, this is the dataset key that is looked up in
# appconfig.LABEL_STUDIO_PROJECT_MAPPING to obtain the Label Studio project id.
PROJECT_ID = "train-shelves"

In [3]:
def chunk_list(lst, chunk_size):
    """Split ``lst`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        lst: A sliceable sequence with a length (e.g. a list of file paths).
        chunk_size: Maximum number of items per chunk; must be >= 1.

    Returns:
        A list of slices of ``lst`` in the original order. The last slice may
        be shorter than ``chunk_size``. An empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. A negative size would
            otherwise produce an empty result and silently drop every item.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [lst[start:start + chunk_size] for start in range(0, len(lst), chunk_size)]

In [4]:
def list_all_images(folder_path: str) -> List[str]:
    """Recursively collect image file paths below ``folder_path``.

    Files are matched by extension (jpg, jpeg, png in lower and upper case).
    Each glob pattern is logged at INFO level before it is evaluated.

    Args:
        folder_path: Directory to search. It is treated as a literal path:
            glob metacharacters in it (``[``, ``*``, ``?``) are escaped.

    Returns:
        Matching paths without duplicates, in the order they were first found.
        A folder that does not exist or holds no images yields an empty list.
    """
    extensions = ["jpg", "jpeg", "png", "JPG", "JPEG", "PNG"]
    # Escape the folder so names such as "shots[1]" are not read as patterns.
    safe_root = glob.escape(folder_path)

    found = []
    for ext in extensions:
        pattern = f"{safe_root}/**/*.{ext}"
        logger.info(pattern)
        found.extend(glob.glob(pattern, recursive=True))

    # Where glob matching ignores case, "*.jpg" and "*.JPG" hit the same file;
    # dict.fromkeys drops the repeats while keeping first-seen order.
    return list(dict.fromkeys(found))

In [5]:
# Number of file paths per chunk handed to the upload loop.
CHUNK_SIZE = 50

# Discover every image under INPUT_FOLDER, then split the paths into chunks.
files = list_all_images(INPUT_FOLDER)
logger.info(f"Number of images: {len(files)}")
chunks = chunk_list(files, CHUNK_SIZE)
logger.info(f"Number of chunks: {len(chunks)}")

[32m2025-06-01 20:07:43.799[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Users/keith/Downloads/resized/**/*.jpg[0m
[32m2025-06-01 20:07:43.802[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Users/keith/Downloads/resized/**/*.jpeg[0m
[32m2025-06-01 20:07:43.803[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Users/keith/Downloads/resized/**/*.png[0m
[32m2025-06-01 20:07:43.804[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Users/keith/Downloads/resized/**/*.JPG[0m
[32m2025-06-01 20:07:43.804[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Users/keith/Downloads/resized/**/*.JPEG[0m
[32m2025-06-01 20:07:43.805[0m | [1mINFO    [0m | [36m__main__[0m:[36mlist_all_images[0m:[36m6[0m - [1m/Users/keith/Downloads/resized/**/*.PNG[0m
[32m2025-06-01 20:07:43.805[0m | [1mINFO    [0

In [6]:
def get_filename_from_path(path: str) -> str:
    """Return the final component of ``path`` (empty if it ends in a separator)."""
    _, tail = os.path.split(path)
    return tail

In [7]:
def get_label_studio_headers() -> Dict[str, str]:
    """Build the HTTP headers that carry the Label Studio API token.

    The token is read from ``appconfig.LABEL_STUDIO_API_KEY`` on every call.
    """
    token = appconfig.LABEL_STUDIO_API_KEY
    return {"Authorization": f"Token {token}"}

In [8]:
def get_uploaded_tasks(dataset_name: str, timeout: float = 30.0):
    """Fetch the image references of all tasks in a Label Studio project.

    Pages through ``/api/projects/<id>/tasks/`` until a page comes back empty
    or a non-200 response is received.

    Args:
        dataset_name: Key into ``appconfig.LABEL_STUDIO_PROJECT_MAPPING`` that
            selects the project id.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises a timeout error. Without it a stalled server would block
            the notebook indefinitely.

    Returns:
        A set with the ``image`` value of every fetched task. Tasks whose
        ``data`` has no (or an empty) ``image`` entry are skipped. If a request
        returns a non-200 status, a warning is logged and the values collected
        so far are returned.

    Raises:
        KeyError: If ``dataset_name`` is not in the project mapping.
    """
    uploaded_files = set()
    page = 1
    page_size = 100  # You can increase this if needed
    headers = get_label_studio_headers()
    project_id = appconfig.LABEL_STUDIO_PROJECT_MAPPING[dataset_name]
    endpoint = f"{appconfig.LABEL_STUDIO_URL}/api/projects/{project_id}/tasks/"
    logger.info(f"Endpoint: {endpoint}")
    while True:
        response = requests.get(
            endpoint,
            headers=headers,
            params={"page": page, "page_size": page_size},
            timeout=timeout,
        )

        if response.status_code != 200:
            logger.warning("Error fetching tasks:" + response.text)
            break

        tasks = response.json()
        if not tasks:  # Stop when no more tasks are returned
            break

        for task in tasks:
            image = task["data"].get("image")
            # Skip tasks without an image so "" never ends up in the result
            # (it would inflate the count and can never match a filename).
            if image:
                uploaded_files.add(image)
        page += 1  # Move to the next page

    return uploaded_files

In [9]:
def upload_file(project_id: int, file_path: str, uploaded_tasks, timeout: float = 60.0):
    """Upload one file to a Label Studio project unless it is already there.

    Args:
        project_id: Label Studio project id used in the import endpoint.
        file_path: Local path of the file to upload.
        uploaded_tasks: Iterable of strings describing already-imported tasks
            (as returned by ``get_uploaded_tasks``).
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error. Without it a stalled server would block
            the whole upload loop.

    Returns:
        The decoded JSON body of the response when the server answers 201.
        ``None`` when the file is skipped as already uploaded, when the request
        raises (the exception is logged as a warning), or when the server
        answers with any other status.
    """
    filename = get_filename_from_path(file_path)
    # NOTE(review): this is a substring test, so "1.jpg" is also treated as
    # present when only ".../11.jpg" was uploaded — confirm that is acceptable.
    if any(filename in task for task in uploaded_tasks):
        logger.info(f"File already exists: {filename}")
        return None

    headers = get_label_studio_headers()
    endpoint = f"{appconfig.LABEL_STUDIO_URL}/api/projects/{project_id}/import"
    with open(file_path, "rb") as f:
        try:
            response = requests.post(
                endpoint,
                files={"file": f},
                headers=headers,
                timeout=timeout,
            )
        except Exception as e:
            # Best effort: log and move on so one failure does not stop the batch.
            logger.warning(e)
            return None

    if response.status_code == 201:
        print(f"Uploaded: {file_path}")
        return response.json()

    print(f"Error uploading {file_path}: {response.text}")
    return None

def upload_multiple_files(chunk: List[str], uploaded_tasks, dataset_name: str):
    """Upload every path in ``chunk`` to the project mapped to ``dataset_name``.

    The project id is resolved once from
    ``appconfig.LABEL_STUDIO_PROJECT_MAPPING``; each path is then passed to
    ``upload_file`` in order. Nothing is returned.
    """
    target_project = appconfig.LABEL_STUDIO_PROJECT_MAPPING[dataset_name]
    for image_path in chunk:
        upload_file(target_project, image_path, uploaded_tasks)

In [10]:
# Fetch the image references already present in the target project so the
# upload loop can skip files that were imported earlier. PROJECT_ID is passed
# as the dataset name that get_uploaded_tasks resolves to a project id.
uploaded_tasks =  get_uploaded_tasks(PROJECT_ID)
logger.info(f"Number of uploaded tasks: {len(uploaded_tasks)}")

[32m2025-06-01 20:07:43.825[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_uploaded_tasks[0m:[36m9[0m - [1mEndpoint: http://jimica.ddns.net:8080/api/projects/10/tasks/[0m
[32m2025-06-01 20:07:43.869[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mNumber of uploaded tasks: 0[0m


In [11]:
# Upload every discovered image, chunk by chunk. Within a chunk the files are
# still sent one request at a time (upload_multiple_files loops over upload_file).
# uploaded_tasks is the snapshot fetched earlier; it is not refreshed during this run.
for chunk in chunks:
    upload_multiple_files(chunk, uploaded_tasks, PROJECT_ID)

Uploaded: /Users/keith/Downloads/resized/1748079909426.jpg
Uploaded: /Users/keith/Downloads/resized/_storage_emulated_0_Android_data_com.miui.gallery_cache_SecurityShare_1748079907598.jpg
Uploaded: /Users/keith/Downloads/resized/IMG_20250521_094726.jpg
Uploaded: /Users/keith/Downloads/resized/1748079907740.jpg
Uploaded: /Users/keith/Downloads/resized/IMG_20250521_140417.jpg
Uploaded: /Users/keith/Downloads/resized/IMG_20250521_140429.jpg
Uploaded: /Users/keith/Downloads/resized/IMG_20250521_140438.jpg
Uploaded: /Users/keith/Downloads/resized/IMG_20250521_094656.jpg
Uploaded: /Users/keith/Downloads/resized/1748079907631.jpg
Uploaded: /Users/keith/Downloads/resized/1748079909436.jpg
Uploaded: /Users/keith/Downloads/resized/IMG_20250521_094828.jpg
Uploaded: /Users/keith/Downloads/resized/1748079508449.jpg
Uploaded: /Users/keith/Downloads/resized/1748079508458.jpg
Uploaded: /Users/keith/Downloads/resized/1748079908212.jpg
Uploaded: /Users/keith/Downloads/resized/1748079908171.jpg
Uploaded: