# Create labelled datasets from images

In [4]:
# imports
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image
from tqdm import tqdm

In [None]:
# Paths to data
# The data root defaults to the original local location, but can be redirected
# without editing the notebook by setting the ISIC_DATA_DIR environment variable.
DATA_DIR = os.environ.get("ISIC_DATA_DIR") or "C:/isic/"
if not DATA_DIR.endswith(("/", "\\")):
    # Sub-paths below are built by plain string concatenation, so the root
    # must end with a path separator.
    DATA_DIR += "/"

DATASET_DIR = DATA_DIR + "datasets/"
IMAGE_DIR = DATA_DIR + "images/"
METADATA_DIR = DATA_DIR + "metadata/"
MODELS_DIR = DATA_DIR + "models/"

In [12]:
# Metadata CSV locations
isic_trimmed_csv = f"{METADATA_DIR}isic_trimmed.csv"
isic_trimmed_balanced_csv = f"{METADATA_DIR}isic_trimmed_balanced.csv"

# Read each CSV into its own DataFrame (same order as the paths above)
isic_df, isic_bal_df = map(pd.read_csv, (isic_trimmed_csv, isic_trimmed_balanced_csv))

In [10]:
# Function to preprocess the images
def preprocess_image(image_path, target_size=(224, 224)):
    """Load an image from disk and return it as a normalized RGB array.

    Args:
        image_path: Path to an image file that PIL can open.
        target_size: ``(width, height)`` to resize the image to. Defaults to
            224x224 pixels, a common CNN input size.

    Returns:
        A float numpy array of shape ``(height, width, 3)`` with pixel values
        scaled to the range [0, 1].
    """
    # Image.open is lazy and keeps the file open; the context manager makes
    # sure the handle is released once the pixel data has been read.
    with Image.open(image_path) as source:
        # Force three channels so grayscale / RGBA / CMYK / palette images
        # produce the same array shape as ordinary RGB JPEGs.
        resized = source.convert("RGB").resize(target_size)

    return np.asarray(resized) / 255.0  # Normalize pixel values to [0, 1]

In [18]:
# Function to write TFRecords from a dataframe
# Assumes the dataframe has columns "image_name" and "target"
# Assumes the images are in the image_folder with .jpg extension
def write_tfrecord_from_dataframe(df, image_folder, tfrecord_filename):
    """Serialize the images listed in ``df`` and their labels into a TFRecord file.

    Each row becomes one ``tf.train.Example`` with two features:
    ``"image"`` (the preprocessed image flattened into a float list) and
    ``"label"`` (a single int64 value).

    Args:
        df: DataFrame with columns ``"image_name"`` (file name without the
            ``.jpg`` extension) and ``"target"`` (value convertible to ``int``).
        image_folder: Directory containing the ``<image_name>.jpg`` files; a
            trailing path separator is optional.
        tfrecord_filename: Path of the TFRecord file to write.
    """
    # Only two columns are needed, so iterate over them directly instead of
    # using iterrows(), which builds a Series object for every row.
    rows = zip(df["image_name"], df["target"])

    with tf.io.TFRecordWriter(tfrecord_filename) as writer:
        for image_name, target in tqdm(rows, total=len(df)):
            # os.path.join works whether or not image_folder ends with a separator.
            image_path = os.path.join(image_folder, image_name + ".jpg")
            img = preprocess_image(image_path)

            feature = {
                # ravel() yields the same 1-D sequence as flatten() without
                # forcing an extra copy of the pixel data.
                "image": tf.train.Feature(
                    float_list=tf.train.FloatList(value=img.ravel())
                ),
                "label": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[int(target)])
                ),
            }

            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

In [13]:
# Output locations for the generated TFRecord files
isic_tf = f"{DATASET_DIR}isic_no_duplicates.tfrecord"
isic_bal_tf = f"{DATASET_DIR}isic_no_duplicates_balanced.tfrecord"

In [19]:
# Create TFRecords from the balanced dataframe
write_tfrecord_from_dataframe(
    df=isic_bal_df,
    image_folder=IMAGE_DIR,
    tfrecord_filename=isic_bal_tf,
)

100%|██████████| 1162/1162 [02:16<00:00,  8.53it/s]


In [None]:
# Create TFRecords from full dataframe
# Very slow (1+ hours) and produces a 19 GB file, so the call below is left commented out
# write_tfrecord_from_dataframe(isic_df, IMAGE_DIR, isic_tf)