In [1]:
import os
import glob
import requests
from loguru import logger
from typing import List, Dict
from hcmus.core import appconfig

[32m2025-04-12 15:07:15.025[0m | [1mINFO    [0m | [36mhcmus.core.appconfig[0m:[36m<module>[0m:[36m7[0m - [1mLoad DotEnv: True[0m


In [2]:
def chunk_list(lst, chunk_size):
    result = []
    for i in range(0, len(lst), chunk_size):
        result.append(lst[i:i + chunk_size])
    return result

In [3]:
def list_all_images(folder_path: str) -> List[str]:
    image_files = []
    extensions = ["jpg", "png", "JPG", "PNG"]
    for ext in extensions:
        image_files.extend(glob.glob(f"{folder_path}/**/*.{ext}", recursive=True))

    return image_files

In [4]:
def get_filename_from_path(path: str) -> str:
    return os.path.basename(path)

In [5]:
def get_label_studio_headers() -> Dict[str, str]:
    headers = {
        "Authorization": f"Token {appconfig.LABEL_STUDIO_API_KEY}"
    }
    return headers

In [6]:
def get_uploaded_tasks(dataset_name: str):
    """Fetch all tasks from Label Studio with pagination"""
    uploaded_files = set()
    page = 1
    page_size = 100  # You can increase this if needed
    headers = get_label_studio_headers()
    project_id = appconfig.LABEL_STUDIO_PROJECT_MAPPING[dataset_name]
    endpoint = f"{appconfig.LABEL_STUDIO_URL}/api/projects/{project_id}/tasks/"
    logger.info(f"Endpoint: {endpoint}")
    while True:
        response = requests.get(
            endpoint,
            headers=headers,
            params={"page": page, "page_size": page_size},
        )

        if response.status_code != 200:
            logger.warning("Error fetching tasks:" + response.text)
            break

        tasks = response.json()
        if not tasks:  # Stop when no more tasks are returned
            break

        uploaded_files.update(task["data"].get("image", "") for task in tasks)
        page += 1  # Move to the next page

    return uploaded_files

In [7]:
def upload_file(project_id: int, file_path: str, uploaded_tasks):
    filename = get_filename_from_path(file_path)
    for task in uploaded_tasks:
        if filename not in task: continue
        logger.info(f"File already exists: {filename}")
        return

    headers = get_label_studio_headers()
    with open(file_path, "rb") as f:
        files = {"file": f}
        endpoint = f"{appconfig.LABEL_STUDIO_URL}/api/projects/{project_id}/import"
        try:
            response = requests.post(
                endpoint,
                files=files,
                headers=headers,
            )
        except Exception as e:
            logger.warning(e)
            return None


    if response.status_code == 201:
        print(f"Uploaded: {file_path}")
        return response.json()
    else:
        print(f"Error uploading {file_path}: {response.text}")
        return None

def upload_multiple_files(chunk: List[str], uploaded_tasks, dataset_name: str):
    project_id = appconfig.LABEL_STUDIO_PROJECT_MAPPING[dataset_name]
    for file_path in chunk:
        upload_file(project_id, file_path, uploaded_tasks)

In [8]:
files = list_all_images(appconfig.IMPORT_DATA_DIR)
chunks = chunk_list(files, 50)
logger.info(f"Number of chunks: {len(chunks)}")

[32m2025-04-12 15:07:16.163[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mNumber of chunks: 8[0m


In [9]:
uploaded_tasks =  get_uploaded_tasks("validation")
logger.info(f"Number of uploaded tasks: {len(uploaded_tasks)}")

[32m2025-04-12 15:07:16.167[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_uploaded_tasks[0m:[36m9[0m - [1mEndpoint: http://jimica.ddns.net:8080/api/projects/7/tasks/[0m
[32m2025-04-12 15:07:16.255[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mNumber of uploaded tasks: 0[0m


In [10]:
for chunk in chunks:
    upload_multiple_files(chunk, uploaded_tasks, "validation")

Uploaded: /Volumes/Cucumber/Projects/item-identification/local/shelves/20250303_124140.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/shelves/20250224_131256.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/shelves/20250224_132039.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/shelves/z6354732198691_76593582e8bb118d1837563c2d037387.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/shelves/20250303_125048.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/shelves/20250224_132549.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/shelves/20250224_133131.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/shelves/20250224_131929.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/shelves/20250224_124505.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/shelves/IMG_20250303_123318_345.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identifi