# Download the SKINgpt dataset to the local machine and generate JSON data.

In [None]:
import os
import json
import requests
import random
from urllib.parse import urlparse
from datetime import datetime


# Download every image referenced by output.json into IMAGE_DIR, then write a
# timestamped JSON file that pairs each image with a multiple-choice option
# string built from its own label plus randomly drawn distractor labels.

BASE_DIR     = "/home/william/dataset/skin/SKINgpt"
INPUT_JSON   = os.path.join(BASE_DIR, "output.json")
IMAGE_DIR    = os.path.join(BASE_DIR, "image")
OUT_JSON_DIR = BASE_DIR

# Number of wrong answers offered alongside the correct label.
NUM_DISTRACTORS = 3

os.makedirs(IMAGE_DIR, exist_ok=True)
os.makedirs(OUT_JSON_DIR, exist_ok=True)

with open(INPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# Only truthy labels are usable: entries with a missing/empty label are skipped
# in the loop below, and a None label would make sorted() fail on mixed types.
all_labels = sorted({item["label"] for item in data if item.get("label")})

# Uncomment to smoke-test on a small subset:
# data = data[:100]

# Browser-like request headers; identical for every request, so built once.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/115.0.0.0 Safari/537.36",
    "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
}

results = []
for item in data:
    url   = item.get("url")
    label = item.get("label")
    if not url or not label:
        continue

    filename = os.path.basename(urlparse(url).path)
    if not filename:
        # Without a basename, save_path would be IMAGE_DIR itself, which always
        # exists: the download would be skipped and an entry with an empty
        # image_name would be emitted. Report and skip instead.
        print(f"[Error] download fail: {url} -> URL path has no file name")
        continue
    save_path = os.path.join(IMAGE_DIR, filename)

    # Files already on disk are reused so the cell can be re-run cheaply.
    if not os.path.exists(save_path):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
            with open(save_path, "wb") as img_f:
                img_f.write(resp.content)
            print(f"Downloaded {filename}")
        except Exception as e:
            # Best-effort batch job: log the failure and move on to the next URL.
            print(f"[Error] download fail: {url} -> {e}")
            continue

    # random.sample raises ValueError when k exceeds the population size, so
    # cap k for datasets with fewer than NUM_DISTRACTORS + 1 distinct labels.
    other_labels = [candidate for candidate in all_labels if candidate != label]
    distractors = random.sample(other_labels, k=min(NUM_DISTRACTORS, len(other_labels)))
    choices = [label] + distractors
    random.shuffle(choices)

    # Rendered as "A:xxx. B:yyy. C:zzz. D:www."
    option_strs = [f"{chr(ord('A') + idx)}:{choice}" for idx, choice in enumerate(choices)]
    option_field = ". ".join(option_strs) + "."

    results.append({
        "image_name": filename,
        "label":      label,
        "url":        url,
        "option":     option_field
    })

# The timestamp in the file name keeps earlier runs from being overwritten.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
out_json  = os.path.join(OUT_JSON_DIR, f"{timestamp}_SKINgpt.json")
with open(out_json, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\nDone! Generated {len(results)} entries → {out_json}")


In [None]:
from urllib.parse import urlparse, parse_qs
import mimetypes

# Scratch cell: derive a file name for the current `url` / `resp` pair.
# NOTE(review): `url`, `resp`, `os` and `random` are not defined in this cell;
# presumably they are left over from running the download cell above - confirm.
parsed = urlparse(url)
filename = os.path.basename(parsed.path)

# A basename without an extension is replaced by "<imageId><ext>": the id comes
# from the URL's "imageId" query parameter (random 4-digit number if absent)
# and the extension is guessed from the response's Content-Type (".jpg" if
# nothing can be guessed).
if '.' not in filename:
    try:
        content_type = resp.headers.get("Content-Type", "")
        ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg"
        query = parse_qs(parsed.query)
        image_id = query.get("imageId", [str(random.randint(1000, 9999))])[0]
        filename = f"{image_id}{ext}"
    except Exception:
        # Was a bare `except:`, which would also swallow KeyboardInterrupt and
        # SystemExit; ordinary errors still fall back to a placeholder name.
        filename = f"unknown_{random.randint(1000,9999)}.jpg"


In [None]:
import os
import json
import requests
import random
from urllib.parse import urlparse
from datetime import datetime
from urllib.parse import urlparse, parse_qs
import mimetypes


# Download every image referenced by output.json into IMAGE_DIR ("image1"),
# naming extension-less URLs from their "imageId" query parameter and the
# response Content-Type, then write a timestamped JSON file that pairs each
# image with a multiple-choice option string.

BASE_DIR     = "/home/william/dataset/skin/SKINgpt"
INPUT_JSON   = os.path.join(BASE_DIR, "output.json")
IMAGE_DIR    = os.path.join(BASE_DIR, "image1")
OUT_JSON_DIR = BASE_DIR

# Number of wrong answers offered alongside the correct label.
NUM_DISTRACTORS = 3

os.makedirs(IMAGE_DIR, exist_ok=True)
os.makedirs(OUT_JSON_DIR, exist_ok=True)

with open(INPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# Only truthy labels are usable: entries with a missing/empty label are skipped
# in the loop below, and a None label would make sorted() fail on mixed types.
all_labels = sorted({item["label"] for item in data if item.get("label")})

# Uncomment to smoke-test on a small subset:
# data = data[:100]

# Browser-like request headers; identical for every request, so built once.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/115.0.0.0 Safari/537.36",
    "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
}

results = []
for item in data:
    url   = item.get("url")
    label = item.get("label")
    if not url or not label:
        continue

    parsed = urlparse(url)
    filename = os.path.basename(parsed.path)
    # A basename without an extension can only be completed from the response
    # headers, so such URLs are always fetched (their final name is unknown
    # until then and cannot be checked on disk beforehand).
    needs_header_name = '.' not in filename
    save_path = os.path.join(IMAGE_DIR, filename)

    if needs_header_name or not os.path.exists(save_path):
        try:
            # Request first: the extension must come from THIS response. The
            # previous code read resp.headers before resp was assigned for the
            # current URL, so it used a stale response or hit a NameError.
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()

            if needs_header_name:
                content_type = resp.headers.get("Content-Type", "")
                ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg"
                query = parse_qs(parsed.query)
                # NOTE: the random 4-digit fallback has only 9000 possible
                # values, so names may collide when "imageId" is absent.
                image_id = query.get("imageId", [str(random.randint(1000, 9999))])[0]
                filename = f"{image_id}{ext}"
                # Keep the on-disk path in sync with the recorded image_name.
                save_path = os.path.join(IMAGE_DIR, filename)

            with open(save_path, "wb") as img_f:
                img_f.write(resp.content)
            print(f"Downloaded {filename}")
        except Exception as e:
            # Best-effort batch job: log the failure and move on to the next URL.
            print(f"[Error] Downloaded fail: {url} -> {e}")
            continue

    # random.sample raises ValueError when k exceeds the population size, so
    # cap k for datasets with fewer than NUM_DISTRACTORS + 1 distinct labels.
    other_labels = [candidate for candidate in all_labels if candidate != label]
    distractors = random.sample(other_labels, k=min(NUM_DISTRACTORS, len(other_labels)))
    choices = [label] + distractors
    random.shuffle(choices)

    # Rendered as "A:xxx. B:yyy. C:zzz. D:www."
    option_strs = [f"{chr(ord('A') + idx)}:{choice}" for idx, choice in enumerate(choices)]
    option_field = ". ".join(option_strs) + "."

    results.append({
        "image_name": filename,
        "label":      label,
        "url":        url,
        "option":     option_field
    })

# The timestamp in the file name keeps earlier runs from being overwritten.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
out_json  = os.path.join(OUT_JSON_DIR, f"{timestamp}_SKINgpt.json")
with open(out_json, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\nDone! Generated {len(results)} entries → {out_json}")

In [None]:
import os
import json
import requests
import random
from urllib.parse import urlparse, parse_qs
from datetime import datetime
import mimetypes
import re

# Download every image referenced by output_df.json into IMAGE_DIR ("image3"),
# naming each file from the response headers / URL query, then write a
# timestamped multiple-choice QA JSON file.

# Configuration
BASE_DIR     = "/home/william/dataset/skin/SKINgpt"
INPUT_JSON   = os.path.join(BASE_DIR, "output_df.json")
IMAGE_DIR    = os.path.join(BASE_DIR, "image3")
OUT_JSON_DIR = BASE_DIR

# Number of wrong answers offered alongside the correct label.
NUM_DISTRACTORS = 3

# Ensure directories exist
os.makedirs(IMAGE_DIR, exist_ok=True)
os.makedirs(OUT_JSON_DIR, exist_ok=True)

# 1. Load input data
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# 2. Collect all distinct labels
# Only truthy labels are usable: entries with a missing/empty label are skipped
# in the loop below, and a None label would make sorted() fail on mixed types.
all_labels = sorted({item["label"] for item in data if item.get("label")})

# Loop invariants, built once: browser-like request headers and the pattern
# that extracts a quoted filename from a Content-Disposition header.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/115.0.0.0 Safari/537.36",
    "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
}
filename_pattern = re.compile(r'filename="([^"]+)"')

# 3. Download images and construct result entries
results = []
for item in data:
    url   = item.get("url")
    label = item.get("label")
    if not url or not label:
        continue

    try:
        # Make request
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Extension guessed from Content-Type; ".jpg" when nothing can be guessed
        parsed = urlparse(url)
        content_type = response.headers.get("Content-Type", "")
        ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg"

        # Construct filename using Content-Disposition or query
        content_disposition = response.headers.get("Content-Disposition", "")
        match = filename_pattern.search(content_disposition)
        # The header is server-controlled: keep only the last path component so
        # a value such as "../../x" cannot place the file outside IMAGE_DIR.
        base_name = os.path.basename(match.group(1).replace("\\", "/")) if match else None

        if not base_name:
            # NOTE: the random 4-digit fallback has only 9000 possible values,
            # so names may collide when "imageId" is absent.
            query = parse_qs(parsed.query)
            image_id = query.get("imageId", [str(random.randint(1000, 9999))])[0]
            filename = f"{image_id}{ext}"
        elif base_name.lower().endswith(ext):
            # Case-insensitive so "x.JPG" is not turned into "x.JPG.jpg".
            filename = base_name
        else:
            filename = f"{base_name}{ext}"

        save_path = os.path.join(IMAGE_DIR, filename)

        # Save image
        with open(save_path, "wb") as img_f:
            img_f.write(response.content)
        print(f"Downloaded {filename}")

    except Exception as e:
        # Best-effort batch job: log the failure and move on to the next URL.
        print(f"[Error] Failed to download: {url} -> {e}")
        continue

    # Construct multiple-choice options
    # random.sample raises ValueError when k exceeds the population size, so
    # cap k for datasets with fewer than NUM_DISTRACTORS + 1 distinct labels.
    other_labels = [candidate for candidate in all_labels if candidate != label]
    distractors = random.sample(other_labels, k=min(NUM_DISTRACTORS, len(other_labels)))
    choices = [label] + distractors
    random.shuffle(choices)

    # Format as "A:xxx. B:yyy. C:zzz. D:www."
    option_strs = [f"{chr(ord('A') + idx)}:{choice}" for idx, choice in enumerate(choices)]
    option_field = ". ".join(option_strs) + "."

    # Append to result
    results.append({
        "image_name": filename,
        "answer":      label,
        "url":        url,
        "question":     option_field,
        "question_type": "multiple_choice_QA"
    })

# 4. Save output JSON with timestamp
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
out_json  = os.path.join(OUT_JSON_DIR, f"{timestamp}_SKINgpt_multiple_choice_QA.json")
with open(out_json, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\nDone! Generated {len(results)} entries → {out_json}")


In [None]:
import json
from datetime import datetime
import os
# Convert an earlier "<timestamp>_SKINgpt.json" export (label/option keys) into
# the multiple-choice QA layout (answer/question keys) and save it under a new
# timestamped file name next to the input.

# The paths below are relative, so show where they are resolved from.
print("Current working directory:", os.getcwd())

input_path = "../../../../dataset/skin/SKINgpt/20250717115404_SKINgpt.json"


def _as_qa_record(record):
    # Re-key one exported entry; keys missing from the input become None.
    return {
        "image_name": record.get("image_name"),
        "answer": record.get("label"),
        "question": record.get("option"),
        "url": record.get("url"),
        "question_type": "multiple_choice_QA"
    }


with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

converted = [_as_qa_record(record) for record in data]

# Timestamped name so previous conversions are not overwritten.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
output_path = f"../../../../dataset/skin/SKINgpt/{timestamp}_SKINgpt_multiple_choice_QA.json"

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(converted, f, ensure_ascii=False, indent=2)

print(f"Saved to: {output_path}")


# Download all images

In [None]:
import os
import json
import requests
import random
from urllib.parse import urlparse, parse_qs
from datetime import datetime
import mimetypes
import re

# Download every image referenced by output.json into IMAGE_DIR ("image3"),
# naming each file from the response headers, the URL path (dermaamin.com) or
# the URL query, then write a timestamped multiple-choice QA JSON file.

# Configuration
BASE_DIR       = "/home/william/dataset/skin/SKINgpt"
INPUT_JSON     = os.path.join(BASE_DIR, "output.json")  # or "output_df.json"
IMAGE_DIR      = os.path.join(BASE_DIR, "image3")
OUT_JSON_DIR   = BASE_DIR

# Number of wrong answers offered alongside the correct label.
NUM_DISTRACTORS = 3

os.makedirs(IMAGE_DIR, exist_ok=True)
os.makedirs(OUT_JSON_DIR, exist_ok=True)

# Load JSON data
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect all unique labels
# Only truthy labels are usable: entries with a missing/empty label are skipped
# in the loop below, and a None label would make sorted() fail on mixed types.
all_labels = sorted({item["label"] for item in data if item.get("label")})

# Loop invariants, built once: browser-like request headers and the pattern
# that extracts a quoted filename from a Content-Disposition header.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/115.0.0.0 Safari/537.36",
    "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
}
filename_pattern = re.compile(r'filename="([^"]+)"')

results = []

for item in data:
    url   = item.get("url")
    label = item.get("label")
    if not url or not label:
        continue

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Extension guessed from Content-Type; ".jpg" when nothing can be guessed
        parsed = urlparse(url)
        content_type = response.headers.get("Content-Type", "")
        ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) or ".jpg"

        content_disposition = response.headers.get("Content-Disposition", "")
        match = filename_pattern.search(content_disposition)
        # The header is server-controlled: keep only the last path component so
        # a value such as "../../x" cannot place the file outside IMAGE_DIR.
        base_name = os.path.basename(match.group(1).replace("\\", "/")) if match else None

        # Determine final filename
        if not base_name:
            if parsed.netloc == "www.dermaamin.com":
                # For this host the URL path's basename is used as the name.
                filename = os.path.basename(parsed.path)
            else:
                # NOTE: the random 4-digit fallback has only 9000 possible
                # values, so names may collide when "imageId" is absent.
                query = parse_qs(parsed.query)
                image_id = query.get("imageId", [str(random.randint(1000, 9999))])[0]
                filename = f"{image_id}{ext}"
        elif base_name.lower().endswith(ext):
            # Case-insensitive so "x.JPG" is not turned into "x.JPG.jpg".
            filename = base_name
        else:
            filename = f"{base_name}{ext}"

        save_path = os.path.join(IMAGE_DIR, filename)

        # Save image
        with open(save_path, "wb") as img_f:
            img_f.write(response.content)
        print(f"[✓] Downloaded: {filename}")

    except Exception as e:
        # Best-effort batch job: log the failure and move on to the next URL.
        print(f"[×] Failed to download: {url} -> {e}")
        continue

    # Generate multiple-choice options
    # random.sample raises ValueError when k exceeds the population size, so
    # cap k for datasets with fewer than NUM_DISTRACTORS + 1 distinct labels.
    other_labels = [candidate for candidate in all_labels if candidate != label]
    distractors = random.sample(other_labels, k=min(NUM_DISTRACTORS, len(other_labels)))
    choices = [label] + distractors
    random.shuffle(choices)

    # Rendered as "A:xxx. B:yyy. C:zzz. D:www."
    option_strs = [f"{chr(65+i)}:{c}" for i, c in enumerate(choices)]
    question_str = ". ".join(option_strs) + "."

    results.append({
        "image_name": filename,
        "answer": label,
        "url": url,
        "question": question_str,
        "question_type": "multiple_choice_QA"
    })

# Save new JSON
# The timestamp in the file name keeps earlier runs from being overwritten.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
out_json = os.path.join(OUT_JSON_DIR, f"{timestamp}_SKINgpt_multiple_choice_QA.json")

with open(out_json, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"Done! {len(results)} entries saved to {out_json}")
