In [None]:
import os
import zipfile
import pandas as pd

# Metadata table; later cells read its `image_id` and `dx` columns.
df = pd.read_csv("HAM10000_metadata.csv")

# The two image archives and the flat directory every .jpg is extracted into.
img_path1 = "HAM10000_images_part_1.zip"
img_path2 = "HAM10000_images_part_2.zip"

root = "."
parent_dir = "DERMASHIELDER/combinedImages"

zipfile_path_list = [os.path.join(root, img_path1), os.path.join(root, img_path2)]

# exist_ok makes the cell safely re-runnable (no check-then-create race).
os.makedirs(parent_dir, exist_ok=True)

num_of_files = 0      # every archive member seen, image or not
extracted_files = 0   # members written to parent_dir during this run

for zip_path in zipfile_path_list:
    print(f"Processing zip file: {zip_path}")
    try:
        with zipfile.ZipFile(zip_path, 'r') as myzip:
            for zipinfo in myzip.infolist():
                # Re-encode the member name as cp437 and reinterpret it as UTF-8.
                # Names that are not representable in cp437 raise
                # UnicodeEncodeError; names whose bytes are not valid UTF-8 raise
                # UnicodeDecodeError. Either way keep the name as stored instead
                # of letting the error abort the whole archive.
                try:
                    filename = zipinfo.filename.encode('cp437').decode('utf-8')
                except (UnicodeEncodeError, UnicodeDecodeError):
                    filename = zipinfo.filename  # fallback to raw

                num_of_files += 1
                if not filename.endswith(".jpg"):
                    continue

                # Flatten: drop any folder prefix stored inside the archive.
                base_name = os.path.basename(filename)
                destination_path = os.path.join(parent_dir, base_name)

                if os.path.exists(destination_path):
                    print(f"Skipping {base_name} (already exists).")
                    continue

                # Write to a temporary name and rename on success, so that an
                # interrupted or failed extraction never leaves a truncated
                # .jpg that a later run would skip as "already exists".
                partial_path = destination_path + ".part"
                try:
                    with myzip.open(zipinfo) as source, open(partial_path, "wb") as target:
                        target.write(source.read())
                    os.replace(partial_path, destination_path)
                except Exception as e:
                    if os.path.exists(partial_path):
                        os.remove(partial_path)
                    print(f" Could not extract {base_name}: {e}")
                    continue

                extracted_files += 1
                if extracted_files % 100 == 0:
                    print(f"Extracted {extracted_files} images...")
    except zipfile.BadZipFile:
        print(f"❌ Bad ZIP file: {zip_path}")
    except Exception as e:
        print(f"❌ Failed to process {zip_path}: {e}")

print(f"\nTotal files seen in zips: {num_of_files}")
print(f"Total extracted image files: {extracted_files}")

# Final check: count what is actually on disk (includes files from earlier runs).
extracted_jpg_files = [f for f in os.listdir(parent_dir) if f.endswith(".jpg")]
print(f"Number of .jpg files in {parent_dir}: {len(extracted_jpg_files)}")


In [None]:
!file HAM10000_images_part_1_zip.zip
!file HAM10000_images_part_2_zip.zip

In [None]:
# Show the raw contents of parent_dir, then report how many entries are JPEGs.
print("Files and folders in parent_dir:")
print(os.listdir(parent_dir))

jpg_entries = [entry for entry in os.listdir(parent_dir) if entry.endswith(".jpg")]
img_count = len(jpg_entries)

print("Images in Parent Directory:", img_count)


In [None]:
# Map each image file name to its diagnosis label (`dx`), keeping only the
# metadata rows whose image file is actually present in parent_dir.
img_dict = {}
for image_id, diagnosis in zip(df['image_id'], df['dx']):
    file_name = image_id + ".jpg"
    if os.path.isfile(os.path.join(parent_dir, file_name)):
        img_dict[file_name] = diagnosis

num_of_entries = len(img_dict)

print("Number of Dictionary Entries: ", num_of_entries)


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import shutil, os, time
import tqdm

def copy_with_retry(src_path, dest_path, img_name, max_retries=5, retry_delay=2):
    """Copy ``src_path`` to ``dest_path``, retrying when the copy raises OSError.

    Args:
        src_path: Path of the file to copy.
        dest_path: Destination file (or directory) handed to ``shutil.copy``.
        img_name: Label used only in the returned status string.
        max_retries: Maximum number of copy attempts.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        ``"Copied: <img_name>"`` on success. Otherwise a string that starts
        with ``"FAILED: <img_name>"``; when an error was caught, the last one
        is appended in parentheses so the cause is visible to the caller.
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            shutil.copy(src_path, dest_path)
            return f"Copied: {img_name}"
        except OSError as e:
            last_error = e
            # Only wait if another attempt follows; sleeping after the final
            # failure would just delay the failure report.
            if attempt < max_retries:
                time.sleep(retry_delay)

    if last_error is None:
        # No attempt was made (max_retries <= 0).
        return f"FAILED: {img_name}"
    return f"FAILED: {img_name} ({last_error})"

# Define the directory where images will be organized by label.
# Each image is copied into <organized_dir>/<dx label>/<image_id>.jpg.
organized_dir = parent_dir

with ThreadPoolExecutor(max_workers=8) as executor:
    futures = []
    for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        img_name = row["image_id"] + ".jpg"
        img_label = row["dx"]

        src_path = os.path.join(parent_dir, img_name)
        dest_dir = os.path.join(organized_dir, img_label)
        dest_path = os.path.join(dest_dir, img_name)

        os.makedirs(dest_dir, exist_ok=True)
        if os.path.exists(src_path):
            # The second argument must be the destination path. Passing the
            # bare file name (as before) copied every image into the current
            # working directory and left the label folders empty.
            futures.append(
                executor.submit(copy_with_retry, src_path, dest_path, img_name)
            )

    for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
        result = future.result()
        # Only failures are reported; successful copies stay silent.
        if result.startswith("FAILED"):
            print(result)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# ===========================
# Settings for loading data
# ===========================

# Directory scanned for images; must match the folder layout on disk.
data_dir = parent_dir

# Images per batch and the size every image is resized to.
batch_size = 16
img_size = (224, 224)

# =============================
# Load Training & Validation
# =============================

# Options shared by both subsets. Using the same seed and split fraction for
# both calls keeps the 80/20 partition consistent between them.
split_options = dict(
    validation_split=0.20,
    seed=42,
    image_size=img_size,
    batch_size=batch_size,
    label_mode="categorical",      # one-hot labels
)

# 80% training portion.
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="training", **split_options
)

# Remaining 20% validation portion.
val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="validation", **split_options
)

# =====================================
# Data Augmentation (only for train)
# =====================================

# Random transformations applied to training images only.
data_augmentation = tf.keras.Sequential([
    layers.RandomFlip("horizontal_and_vertical"),  # Randomly flip images
    layers.RandomRotation(0.15),                   # Randomly rotate
    layers.RandomZoom(0.15),                       # Randomly zoom
    layers.RandomContrast(0.1),                    # Slightly vary contrast
])

# =====================================
# Performance Boost via Prefetching
# =====================================

AUTOTUNE = tf.data.AUTOTUNE

# Order matters: cache the un-augmented batches first, then shuffle, then apply
# the random augmentation. Caching AFTER the augmentation map (as before) stores
# the first epoch's augmented images and replays those exact images on every
# later epoch, which defeats the augmentation.
train_ds = (
    train_ds
    .cache()
    .shuffle(1000)
    .map(
        lambda x, y: (data_augmentation(x, training=True), y),
        num_parallel_calls=AUTOTUNE,
    )
    .prefetch(buffer_size=AUTOTUNE)
)

# Validation data is deterministic, so it is simply cached and prefetched.
val_ds   = val_ds.cache().prefetch(buffer_size=AUTOTUNE)



In [None]:
# =========================================================
#  PHASE‑0  |  Imports, mixed‑precision, and parameters
# =========================================================
import os, numpy as np, tensorflow as tf
from tensorflow.keras import layers, models, mixed_precision
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt

# Switch Keras to the "mixed_float16" global dtype policy.
mixed_precision.set_global_policy("mixed_float16")

# ----------------------------------------
# Run-wide settings
# ----------------------------------------
seed = 42                  # shared by both dataset splits
num_classes = 7            # width of the softmax output layer
batch_size = 16            # images per batch
img_size = (224, 224)      # (height, width) used by the loader and the model input
data_dir = parent_dir      # folder containing one sub-directory per class


# =========================================================
#  PHASE‑1  |  Build tf.data pipelines  (train / val)
# =========================================================
# Both subsets are read from the same directory with the same seed and split
# fraction; only `subset` differs.
split_options = dict(
    validation_split=0.20,
    seed=seed,
    image_size=img_size,
    batch_size=batch_size,
    label_mode="categorical",
)

train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="training", **split_options
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="validation", **split_options
)

# Training: cache, shuffle (buffer of 1000 batches), prefetch.
# Validation: cache and prefetch only.
AUTOTUNE = tf.data.AUTOTUNE
train_ds = (
    train_ds
    .cache()
    .shuffle(1000)
    .prefetch(AUTOTUNE)
)
val_ds = (
    val_ds
    .cache()
    .prefetch(AUTOTUNE)
)


# =========================================================
#  PHASE‑2  |  Class Weights (handle imbalance)
# =========================================================
# Iterate the training split once and stack its one-hot label batches.
all_labels = np.concatenate([y for _, y in train_ds], axis=0)
y_int = np.argmax(all_labels, axis=1)

# "balanced" weights can only be computed for classes that actually occur.
present_classes = np.unique(y_int)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=present_classes,
    y=y_int
)

# Key the weights by the real class index. Keying by enumerate() position (as
# before) silently shifts weights onto the wrong classes whenever a class is
# absent from the training split. Classes with no training samples get a
# neutral weight of 1.0 so every index of the one-hot vector has an entry.
class_weight_dict = {cls: 1.0 for cls in range(all_labels.shape[1])}
class_weight_dict.update(
    (int(cls), float(weight)) for cls, weight in zip(present_classes, class_weights)
)
print("Class Weights:", class_weight_dict)


# =========================================================
#  PHASE‑3  |  Data Augmentation
# =========================================================
# Random image transformations, applied in the listed order inside the model.
augmentation_layers = [
    layers.RandomFlip("horizontal_and_vertical"),
    layers.RandomRotation(0.15),
    layers.RandomZoom(0.15),
    layers.RandomContrast(0.15),
    layers.RandomTranslation(0.1, 0.1),
]
data_augmentation = tf.keras.Sequential(augmentation_layers, name="augmentation")


# =========================================================
#  PHASE‑4  |  Build Model (EfficientNetB3)
# =========================================================
# ImageNet-pretrained backbone without its classification head.
base_model = EfficientNetB3(
    include_top=False,
    weights="imagenet",
    input_shape=img_size + (3,)
)
base_model.trainable = True
for layer in base_model.layers[:-20]:
    layer.trainable = False  # Freeze all but last 20

inputs = layers.Input(shape=img_size + (3,))
x = data_augmentation(inputs)
# No manual rescaling here: the Keras EfficientNet models contain their own
# input preprocessing and expect raw pixel values in [0, 255]. Rescaling to
# [-1, 1] first (as before) feeds the backbone inputs outside the range its
# pretrained weights were built for.
x = base_model(x, training=False)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.4)(x)
x = layers.Dense(256, activation="relu")(x)

x = layers.Dropout(0.3)(x)
# The output layer is pinned to float32 even though the global policy is
# mixed_float16.
outputs = layers.Dense(num_classes, activation="softmax", dtype="float32")(x)

model = models.Model(inputs, outputs)
# summary() prints by itself and returns None, so wrapping it in print() only
# adds a stray "None" line.
model.summary()


# =========================================================
#  PHASE‑5  |  Compile + Callbacks
# =========================================================
optimizer = Adam(learning_rate=1e-3)
model.compile(
    loss="categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=optimizer,
)

# Multiply the learning rate by 0.3 after 3 epochs without val_loss improvement.
lr_scheduler = ReduceLROnPlateau(
    monitor="val_loss", factor=0.3, patience=3, verbose=1
)
# Stop after 7 epochs without val_loss improvement and restore the best weights.
early_stopper = EarlyStopping(
    monitor="val_loss", patience=7, restore_best_weights=True, verbose=1
)
# Save the model to disk whenever val_accuracy reaches a new best.
checkpointer = ModelCheckpoint(
    "best_effb0.h5", monitor="val_accuracy", save_best_only=True, verbose=1
)

callbacks = [lr_scheduler, early_stopper, checkpointer]


# =========================================================
#  OPTIONAL  |  Sanity Check with Small Subset
# =========================================================
# NOTE: while this flag is True, execution stops after the sanity test and none
# of the phases below (warm-up, fine-tune, plots, evaluate, export) are run.
do_small_test = True

if do_small_test:
    print(" Running small sanity test (5 batches only)...")
    small_ds = train_ds.take(5)
    model.fit(small_ds, epochs=10)
    print("Sanity test complete.")
    # Stop here without calling exit(): inside a notebook exit() shuts the
    # kernel down and throws away all state, whereas SystemExit only halts the
    # current cell (and still terminates a plain-script run).
    raise SystemExit(
        "Sanity test finished; set do_small_test = False to run the full training."
    )


# =========================================================
#  PHASE‑6  |  Warm-Up Training
# =========================================================
# First training stage, using the model as compiled in the previous phase.
warmup_epochs = 5

warmup_fit_options = dict(
    epochs=warmup_epochs,
    validation_data=val_ds,
    callbacks=callbacks,
    class_weight=class_weight_dict,
)
history_warm = model.fit(train_ds, **warmup_fit_options)


# =========================================================
#  PHASE‑7  |  Fine‑Tune (unfreeze more layers)
# =========================================================
# Mark the backbone layers from index `unfreeze_from` onward as trainable.
unfreeze_from = len(base_model.layers) - 60
tail_layers = base_model.layers[unfreeze_from:]
for tail_layer in tail_layers:
    tail_layer.trainable = True

# Recompile with a fresh Adam optimizer at a lower learning rate.
fine_tune_optimizer = Adam(learning_rate=1e-4)
model.compile(
    loss="categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=fine_tune_optimizer,
)

fine_tune_epochs = 17
total_epochs = warmup_epochs + fine_tune_epochs

# Continue the epoch numbering right after the last warm-up epoch that ran.
resume_epoch = history_warm.epoch[-1] + 1
history_fine = model.fit(
    train_ds,
    validation_data=val_ds,
    initial_epoch=resume_epoch,
    epochs=total_epochs,
    class_weight=class_weight_dict,
    callbacks=callbacks,
)


# =========================================================
#  PHASE‑8  |  Training Plots
# =========================================================
def plot_training(history, title="Accuracy"):
    """Plot training and validation accuracy for one ``fit`` run.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain the "accuracy" and "val_accuracy" series.
        title: Title shown above the plot.

    The x-axis uses the real epoch numbers from ``history.epoch`` when they are
    available, so a run started with ``initial_epoch > 0`` is not drawn as if
    it began at epoch 0. Otherwise it falls back to positional indices.
    """
    train_acc = history.history["accuracy"]
    val_acc = history.history["val_accuracy"]

    epochs = getattr(history, "epoch", None)
    if not epochs or len(epochs) != len(train_acc):
        epochs = list(range(len(train_acc)))

    plt.plot(epochs, train_acc, label="train")
    plt.plot(epochs, val_acc, label="val")
    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

# One accuracy plot per training stage.
stage_histories = (
    (history_warm, "Warm-up Phase"),
    (history_fine, "Fine-tuning Phase"),
)
for stage_history, stage_title in stage_histories:
    plot_training(stage_history, stage_title)


# =========================================================
#  PHASE‑9  |  Evaluate
# =========================================================
# Loss and accuracy of the current model on the validation split.
val_loss, val_acc = model.evaluate(val_ds, verbose=2)
print(f"Validation Accuracy: {val_acc:.4f}")


# =========================================================
#  PHASE‑10  |  Export to TFLite
# =========================================================
converter = tf.lite.TFLiteConverter.from_keras_model(model)
# float16 quantization is only applied when an optimization is requested;
# setting supported_types alone (as before) has no effect on the output.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]
tflite_model = converter.convert()

# convert() returns the serialized model as bytes.
with open("model.tflite", "wb") as f:
    f.write(tflite_model)

print("Exported to TFLite.")
