# 1. Libraries

In [1]:
from pathlib import Path
from typing import List, Tuple
from io import BytesIO

import random
import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image
from tqdm import tqdm
from huggingface_hub import snapshot_download
import keras

import os
from dotenv import load_dotenv
from huggingface_hub import login

  from .autonotebook import tqdm as notebook_tqdm


# 2. Config

In [2]:
# Project root is taken as the directory three levels above the kernel's
# current working directory (parents[0] is the immediate parent).
# NOTE(review): this depends on where the kernel is started; it raises
# IndexError if the working directory is fewer than three levels deep.
PROJECT_ROOT = Path.cwd().parents[2]

In [None]:
# Root folder of the image dataset; collect_samples() walks its sub-directories.
DATA_DIR = PROJECT_ROOT.joinpath(
    "agents", "image_process", "data", "pediatric_skin_data"
)

# Every artifact written by this notebook (metadata and embedding CSVs) goes here.
OUTPUT_DIR = PROJECT_ROOT.joinpath("data", "outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# 3. Data Loader

In [4]:
def collect_samples(
    data_dir: Path,
    max_per_label: int = 1000,
    seed: int = 42,
) -> List[Tuple[str, int, str]]:
    """Randomly pick up to ``max_per_label`` images from every class folder.

    ``data_dir`` must contain one sub-directory per class, named
    ``<integer label>_<class name>`` (e.g. ``0_Eczema_Dermatitis``). Only the
    first underscore separates the label from the name, so class names may
    contain underscores themselves.

    Args:
        data_dir: Directory holding the class folders.
        max_per_label: Maximum number of images taken from each class.
        seed: Seed of the private random generator used for the shuffling.

    Returns:
        A list of ``(image_path, label_id, class_name)`` tuples, grouped by
        class folder in sorted folder-name order.

    Raises:
        ValueError: If a (non-hidden) class folder name does not follow the
            ``<integer label>_<class name>`` pattern.
    """
    image_suffixes = {".jpg", ".jpeg", ".png"}
    rng = random.Random(seed)
    samples: List[Tuple[str, int, str]] = []

    for class_dir in sorted(data_dir.iterdir()):
        # Ignore stray files and hidden folders (e.g. ``.ipynb_checkpoints``,
        # which Jupyter creates on its own and which is not a class folder).
        if not class_dir.is_dir() or class_dir.name.startswith("."):
            continue

        label_text, separator, class_name = class_dir.name.partition("_")
        name_error = (
            f"Class folder {class_dir.name!r} does not match "
            "'<integer label>_<class name>'"
        )
        if not separator:
            raise ValueError(name_error)
        try:
            label_id = int(label_text)
        except ValueError as exc:
            raise ValueError(name_error) from exc

        # iterdir() yields entries in arbitrary, filesystem-dependent order.
        # Sorting first makes the seeded shuffle below select the same images
        # on every machine and every run.
        image_paths = sorted(
            p
            for p in class_dir.iterdir()
            if p.is_file() and p.suffix.lower() in image_suffixes
        )

        rng.shuffle(image_paths)

        samples.extend(
            (str(img_path), label_id, class_name)
            for img_path in image_paths[:max_per_label]
        )

    return samples

In [5]:
# Draw at most 1000 images per class from the dataset folder.
samples = collect_samples(data_dir=DATA_DIR, max_per_label=1000)

# One row per sampled image: file path, integer label, readable class name.
metadata = pd.DataFrame(data=samples, columns=["path", "label", "class_name"])

# Keep a record of exactly which images were sampled.
metadata.to_csv(OUTPUT_DIR / "metadata.csv", index=False)

print("Total images:", len(metadata))
metadata.head()

Total images: 8000


Unnamed: 0,path,label,class_name
0,c:\Users\lammi\Downloads\medscreening\agents\i...,0,Eczema_Dermatitis
1,c:\Users\lammi\Downloads\medscreening\agents\i...,0,Eczema_Dermatitis
2,c:\Users\lammi\Downloads\medscreening\agents\i...,0,Eczema_Dermatitis
3,c:\Users\lammi\Downloads\medscreening\agents\i...,0,Eczema_Dermatitis
4,c:\Users\lammi\Downloads\medscreening\agents\i...,0,Eczema_Dermatitis


# 4. Embedding Extraction

In [6]:
# Load variables from a local .env file (if present) into the environment,
# so the Hugging Face token never has to be written into the notebook.
load_dotenv()

hf_token = os.environ.get("HF_TOKEN")

if hf_token:
    login(token=hf_token)
else:
    # Fail early: the model download below needs an authenticated session.
    raise RuntimeError("not found HF_TOKEN in environment variables")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to C:\Users\lammi\.cache\huggingface\token
Login successful


In [7]:
# Download (or reuse from the local Hugging Face cache) the files of the
# google/derm-foundation repository; the call returns the local folder path.
model_dir = snapshot_download("google/derm-foundation")

# Wrap the downloaded model's "serving_default" endpoint as a Keras layer.
layer = keras.layers.TFSMLayer(model_dir, call_endpoint="serving_default")

# Single-layer Sequential wrapper around the same layer.
# NOTE(review): encode_image_tf() calls `layer` directly; `model` is not used
# in the visible cells — confirm whether it is still needed.
model = keras.Sequential([layer])

Fetching 7 files: 100%|██████████| 7/7 [00:00<?, ?it/s]


In [8]:
def encode_image_tf(path: str):
    """Return the embedding of a single image file.

    The raw (still encoded) file bytes are packed into a serialized
    ``tf.train.Example`` under the ``"image/encoded"`` key, sent through the
    module-level ``layer`` as a batch of one, and the ``"embedding"`` output
    is returned as a NumPy array with singleton dimensions removed.

    Args:
        path: Location of the image file to embed.

    Returns:
        The squeezed NumPy embedding (6144 values per image in the recorded
        run, judging by the resulting DataFrame shape).
    """
    # File content exactly as stored on disk; no decoding or resizing here.
    raw_bytes = tf.io.read_file(path).numpy()

    encoded_feature = tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[raw_bytes])
    )
    example = tf.train.Example(
        features=tf.train.Features(feature={"image/encoded": encoded_feature})
    )

    # The layer is fed a batch of serialized examples; here the batch has one item.
    batch = tf.constant([example.SerializeToString()])
    output = layer(inputs=batch)

    return output["embedding"].numpy().squeeze()

In [9]:
# Embedding extraction is by far the slowest step of this notebook (the
# recorded run needed about 7.5 hours for 8000 images), so the result is cached
# on disk and reused as long as the list of source images has not changed.
EMBEDDINGS_CACHE = OUTPUT_DIR / "embeddings_cache.npz"
source_paths = metadata["path"].to_numpy(dtype=str)

X = y = None
if EMBEDDINGS_CACHE.exists():
    with np.load(EMBEDDINGS_CACHE) as cached:
        # Only trust the cache when it was built from exactly the same images,
        # in the same order, as the current metadata table.
        if np.array_equal(cached["source_paths"], source_paths):
            X, y = cached["X"], cached["y"]

if X is not None:
    # Keep the list variables defined, as they would be after a full run.
    embeddings = list(X)
    labels = y.tolist()
    print(f"Loaded {len(X)} cached embeddings from {EMBEDDINGS_CACHE}")
else:
    embeddings = []
    labels = []
    failed_paths = []

    # Iterating over the two needed columns avoids building a Series per row,
    # which is what DataFrame.iterrows() does.
    for path, label in tqdm(
        zip(metadata["path"], metadata["label"]),
        total=len(metadata),
    ):
        try:
            emb = encode_image_tf(path)
        except Exception as exc:
            # Best effort: an unreadable image is reported and skipped.
            failed_paths.append(path)
            print(f"Skip {path}: {exc}")
            continue
        embeddings.append(emb)
        labels.append(label)

    if not embeddings:
        # np.stack([]) would raise a much less helpful ValueError.
        raise ValueError(
            f"No embeddings were extracted ({len(failed_paths)} of "
            f"{len(metadata)} images failed)"
        )

    X = np.stack(embeddings)
    y = np.array(labels)

    np.savez_compressed(EMBEDDINGS_CACHE, X=X, y=y, source_paths=source_paths)

    if failed_paths:
        print(f"Skipped {len(failed_paths)} of {len(metadata)} images")

100%|██████████| 8000/8000 [7:32:57<00:00,  3.40s/it]  


In [10]:
# One row per image: the embedding values as columns, plus a trailing "label" column.
embedding_df = pd.DataFrame(X).assign(label=y)

print("Embedding DataFrame shape:", embedding_df.shape)

# Images per class, ordered by label id.
label_counts = embedding_df["label"].value_counts().sort_index()
print(label_counts)

Embedding DataFrame shape: (8000, 6145)
label
0    1000
1    1000
2    1000
3    1000
4    1000
5    1000
6    1000
7    1000
Name: count, dtype: int64


In [11]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 30% of the rows, stratified by label.
train_df, temp_df = train_test_split(
    embedding_df, test_size=0.30, stratify=embedding_df["label"], random_state=42
)

# Step 2: cut the held-out part in half, giving validation and test sets
# (70 / 15 / 15 overall), again stratified by label.
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label"], random_state=42
)

print("\nSplit sizes:")
for prefix, frame in (("Train:", train_df), ("Val:  ", val_df), ("Test: ", test_df)):
    print(prefix, len(frame))

# Class proportions per split, to confirm the stratification worked.
for header, frame in (
    ("\nTrain label distribution:", train_df),
    ("\nVal label distribution:", val_df),
    ("\nTest label distribution:", test_df),
):
    print(header)
    print(frame["label"].value_counts(normalize=True).sort_index())


Split sizes:
Train: 5600
Val:   1200
Test:  1200

Train label distribution:
label
0    0.125
1    0.125
2    0.125
3    0.125
4    0.125
5    0.125
6    0.125
7    0.125
Name: proportion, dtype: float64

Val label distribution:
label
0    0.125
1    0.125
2    0.125
3    0.125
4    0.125
5    0.125
6    0.125
7    0.125
Name: proportion, dtype: float64

Test label distribution:
label
0    0.125
1    0.125
2    0.125
3    0.125
4    0.125
5    0.125
6    0.125
7    0.125
Name: proportion, dtype: float64


In [16]:
# Make sure the output location is a Path and exists before writing.
OUTPUT_DIR = Path(OUTPUT_DIR)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Output file name -> split to store in it.
split_files = {
    "train_embeddings.csv": train_df,
    "val_embeddings.csv": val_df,
    "test_embeddings.csv": test_df,
}

for file_name, frame in split_files.items():
    frame.to_csv(OUTPUT_DIR / file_name, index=False)

print("\nSaved files:")
for file_name in split_files:
    print(OUTPUT_DIR / file_name)




Saved files:
c:\Users\lammi\Downloads\medscreening\data\outputs\train_embeddings.csv
c:\Users\lammi\Downloads\medscreening\data\outputs\val_embeddings.csv
c:\Users\lammi\Downloads\medscreening\data\outputs\test_embeddings.csv
