In [1]:
import os
import glob
import requests
from loguru import logger
from typing import List, Dict
from hcmus.core import pconf

[32m2025-03-08 11:32:13.522[0m | [1mINFO    [0m | [36mhcmus.core.pconf[0m:[36m<module>[0m:[36m5[0m - [1mLoad DotEnv: True[0m


In [2]:
def chunk_list(lst, chunk_size):
    """Split ``lst`` into consecutive slices of at most ``chunk_size`` items.

    Slices are taken in order; the last one holds the remainder and may be
    shorter than ``chunk_size``. An empty ``lst`` produces an empty list.
    """
    starts = range(0, len(lst), chunk_size)
    return [lst[start:start + chunk_size] for start in starts]

In [12]:
def list_all_images(folder_path: str) -> List[str]:
    """Recursively list the image files stored under ``folder_path``.

    Files are matched by extension (``jpg``/``png`` in lower and upper case)
    at any depth below the folder.

    Args:
        folder_path: Directory to search. It is treated as a literal path,
            so characters such as ``[`` or ``*`` in it are not interpreted
            as glob wildcards.

    Returns:
        The matching paths, each listed once, grouped by extension in the
        order of the extension list below.
    """
    extensions = ["jpg", "png", "JPG", "PNG"]

    # Escape the folder so that only the "**/*.ext" suffix acts as a pattern;
    # unescaped, a directory such as "data[1]" would match nothing.
    root = glob.escape(str(folder_path))

    matches = []
    for ext in extensions:
        matches.extend(glob.glob(f"{root}/**/*.{ext}", recursive=True))

    # Where glob matching is case-insensitive, "jpg" and "JPG" hit the same
    # file; dict.fromkeys drops the repeats while preserving order.
    return list(dict.fromkeys(matches))

In [13]:
def get_filename_from_path(path: str) -> str:
    """Return the last component of ``path``.

    The result is empty when ``path`` is empty or ends with a separator.
    """
    _directory, filename = os.path.split(path)
    return filename

In [14]:
def get_label_studio_headers() -> Dict[str, str]:
    """Build the HTTP headers sent with every Label Studio request.

    Returns:
        A new dict whose ``Authorization`` entry has the form
        ``Token <key>``, with the key read from ``pconf.LABEL_STUDIO_API_KEY``.
    """
    api_key = pconf.LABEL_STUDIO_API_KEY
    return {"Authorization": f"Token {api_key}"}

In [36]:
def get_uploaded_tasks(page_size: int = 100, timeout: float = 30.0):
    """Collect the image references of all tasks in the Label Studio project.

    Pages through ``/api/projects/<id>/tasks/`` until an empty page, a
    non-200 response, or a repeated page is seen.

    Args:
        page_size: Number of tasks requested per page.
        timeout: Seconds to wait for the server on each request before
            ``requests`` raises a timeout error instead of blocking forever.

    Returns:
        A set with the non-empty string ``data["image"]`` value of every task
        fetched. If a request fails part-way the set holds only what was
        fetched so far (a warning is logged).
    """
    uploaded_files = set()
    headers = get_label_studio_headers()
    endpoint = f"{pconf.LABEL_STUDIO_URL}/api/projects/{pconf.LABEL_STUDIO_PROJECT_ID}/tasks/"

    page = 1
    previous_tasks = None
    while True:
        response = requests.get(
            endpoint,
            headers=headers,
            params={"page": page, "page_size": page_size},
            timeout=timeout,
        )

        if response.status_code != 200:
            logger.warning("Error fetching tasks:" + response.text)
            break

        tasks = response.json()
        # NOTE(review): presumably some task-list responses wrap the page in
        # an object with a "tasks" key -- confirm against the server version.
        if isinstance(tasks, dict):
            tasks = tasks.get("tasks", [])

        if not tasks:  # An empty page means everything has been read.
            break

        # A server that ignores the "page" parameter would return the same
        # page forever; stop instead of looping endlessly.
        if tasks == previous_tasks:
            logger.warning(f"Page {page} repeats the previous page; stopping pagination")
            break
        previous_tasks = tasks

        for task in tasks:
            image = (task.get("data") or {}).get("image")
            # Skip tasks without a usable image reference: "" never matches a
            # filename and None would break substring checks on the result.
            if isinstance(image, str) and image:
                uploaded_files.add(image)

        page += 1

    return uploaded_files

In [37]:
def upload_file(file_path: str, uploaded_tasks, timeout: float = 60.0):
    """Import one local file into the Label Studio project unless already there.

    Args:
        file_path: Path of the file to upload.
        uploaded_tasks: Iterable of image reference strings of existing tasks.
            The file is skipped when its base name occurs inside any of them.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error instead of blocking forever.

    Returns:
        The decoded JSON body when the server answers 201, otherwise ``None``
        (also ``None`` when the file was skipped).
    """
    filename = get_filename_from_path(file_path)

    # NOTE(review): this is a substring test, so "1.jpg" is also considered
    # uploaded when "IMG_1.jpg" exists -- confirm that this is acceptable.
    if any(filename in task for task in uploaded_tasks):
        logger.info(f"File already exists: {filename}")
        return None

    headers = get_label_studio_headers()
    endpoint = f"{pconf.LABEL_STUDIO_URL}/api/projects/{pconf.LABEL_STUDIO_PROJECT_ID}/import"
    with open(file_path, "rb") as f:
        response = requests.post(
            endpoint,
            files={"file": f},
            headers=headers,
            timeout=timeout,
        )

    if response.status_code == 201:
        logger.info(f"Uploaded: {file_path}")
        return response.json()

    logger.warning(f"Error uploading {file_path}: {response.text}")
    return None

def upload_multiple_files(chunk: List[str], uploaded_tasks):
    """Upload every path in ``chunk``, in order, through ``upload_file``.

    ``uploaded_tasks`` is handed unchanged to each call; the individual
    results are discarded and nothing is returned.
    """
    for image_path in chunk:
        upload_file(image_path, uploaded_tasks)

In [38]:
# Collect every image under the configured import directory and split the
# paths into batches of 50; `chunks` is consumed by the upload loop below.
files = list_all_images(pconf.IMPORT_DATA_DIR) 
chunks = chunk_list(files, 50)
logger.info(f"Number of chunks: {len(chunks)}")

[32m2025-03-08 11:42:54.482[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mNumber of chunks: 32[0m


In [39]:
# Fetch the image references of the tasks that already exist in the project;
# upload_file uses them to skip files whose name is already present.
uploaded_tasks =  get_uploaded_tasks()
logger.info(f"Number of uploaded tasks: {len(uploaded_tasks)}")

[32m2025-03-08 11:42:56.393[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mNumber of uploaded tasks: 189[0m


In [None]:
# Upload batch by batch. `uploaded_tasks` is the collection fetched earlier;
# nothing in this loop refreshes it while the uploads run.
for chunk in chunks:
    upload_multiple_files(chunk, uploaded_tasks)

[32m2025-03-08 11:42:59.342[0m | [1mINFO    [0m | [36m__main__[0m:[36mupload_file[0m:[36m5[0m - [1mFile already exists: 20250303_122532.jpg[0m
[32m2025-03-08 11:42:59.344[0m | [1mINFO    [0m | [36m__main__[0m:[36mupload_file[0m:[36m5[0m - [1mFile already exists: IMG_20250224_131820.jpg[0m
[32m2025-03-08 11:42:59.344[0m | [1mINFO    [0m | [36m__main__[0m:[36mupload_file[0m:[36m5[0m - [1mFile already exists: 20250303_122533.jpg[0m
[32m2025-03-08 11:42:59.346[0m | [1mINFO    [0m | [36m__main__[0m:[36mupload_file[0m:[36m5[0m - [1mFile already exists: IMG_20250224_131944.jpg[0m
[32m2025-03-08 11:42:59.346[0m | [1mINFO    [0m | [36m__main__[0m:[36mupload_file[0m:[36m5[0m - [1mFile already exists: IMG_20250224_131945.jpg[0m
[32m2025-03-08 11:42:59.347[0m | [1mINFO    [0m | [36m__main__[0m:[36mupload_file[0m:[36m5[0m - [1mFile already exists: IMG_20250224_131951.jpg[0m
[32m2025-03-08 11:42:59.347[0m | [1mINFO    [0m | 

Uploaded: /Volumes/Cucumber/Projects/item-identification/local/Beverages - Juice/IMG_20250303_123358_428.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/Beverages - Juice/IMG_20250303_123419_655.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/Beverages - Juice/z6354735949699_11161808441deaa12fae74e3da93e146.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/Beverages - Juice/IMG_20250303_123253_613.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/Beverages - Juice/IMG_20250303_123245_319.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/Beverages - Juice/z6354735949715_976e3f26d07b9a68ea778941e1de69cb.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/Beverages - Juice/z6354735978681_325ce3c3fe38b5fcac78c637a453816b.jpg
Uploaded: /Volumes/Cucumber/Projects/item-identification/local/Beverages - Juice/z6354735965264_7e9047ce8bfc940ec341f1dd2e61c3a0.jpg
Uploaded: /Volumes/Cucumber/Proj