# Hair Type Classification using VGG16

In [116]:
import numpy as np
import pandas as pd
import tensorflow as tf
import cv2
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing import image_dataset_from_directory

## Load Cleaned Dataset

In [117]:
# Read the table of cleaned image paths and labels, then preview the first rows.
CLEANED_DATASET_CSV = "cleaned_dataset_paths.csv"
df = pd.read_csv(CLEANED_DATASET_CSV)
df.head()

Unnamed: 0,image_path,label,height,width,channels,aspect_ratio,brightness
0,C:\Users\nisa2\.cache\kagglehub\datasets\kavya...,straight,591,473,3,0.800338,114.276347
1,C:\Users\nisa2\.cache\kagglehub\datasets\kavya...,straight,177,284,3,1.60452,200.348731
2,C:\Users\nisa2\.cache\kagglehub\datasets\kavya...,straight,279,181,3,0.648746,145.681063
3,C:\Users\nisa2\.cache\kagglehub\datasets\kavya...,straight,202,249,3,1.232673,157.705575
4,C:\Users\nisa2\.cache\kagglehub\datasets\kavya...,straight,172,151,3,0.877907,148.798552


## Prepare Dataset

In [118]:
# Encode labels as integers: a label's position in class_order is its class id.
class_order = ["straight", "wavy", "curly", "kinky"]
label_to_int = {label: idx for idx, label in enumerate(class_order)}

# Normalise case and surrounding whitespace so the lookup below matches the
# canonical names in class_order.
df["label"] = df["label"].str.lower().str.strip()

# Series.map() yields NaN for every value missing from the mapping (which also
# turns the column into floats). Fail loudly here instead of letting NaN
# targets reach the split and training steps.
encoded_labels = df["label"].map(label_to_int)
unmapped = encoded_labels.isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have labels outside class_order: "
        f"{df.loc[unmapped, 'label'].unique().tolist()}"
    )

# map encoded labels
df["label_int"] = encoded_labels.astype("int64")
df.head()

Unnamed: 0,image_path,label,height,width,channels,aspect_ratio,brightness,label_int
0,C:\Users\nisa2\.cache\kagglehub\datasets\kavya...,straight,591,473,3,0.800338,114.276347,0
1,C:\Users\nisa2\.cache\kagglehub\datasets\kavya...,straight,177,284,3,1.60452,200.348731,0
2,C:\Users\nisa2\.cache\kagglehub\datasets\kavya...,straight,279,181,3,0.648746,145.681063,0
3,C:\Users\nisa2\.cache\kagglehub\datasets\kavya...,straight,202,249,3,1.232673,157.705575,0
4,C:\Users\nisa2\.cache\kagglehub\datasets\kavya...,straight,172,151,3,0.877907,148.798552,0


In [119]:
# Target image geometry (pixels, colour channels) and the mini-batch size.
IMG_WIDTH, IMG_HEIGHT = 224, 224
IMG_CHANNELS = 3
BATCH_SIZE = 32

In [120]:
# Stratified train / validation / test split (70% / 15% / 15% of the rows).
# Step 1: hold out 30% of the rows, keeping the class proportions.
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df["label_int"],
    random_state=17,
)

# Step 2: cut the held-out rows in half to obtain validation and test sets.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label_int"],
    random_state=17,
)

print(len(train_df), len(val_df), len(test_df))

980 210 211


In [121]:
# Preprocessing and pipelines
def load_and_preprocess(path, label):
    """Load one image file and convert it into a VGG16-ready tensor.

    Args:
        path: Path of the image file to read.
        label: Label that belongs to the image; returned unchanged.

    Returns:
        Tuple ``(img, label)`` where ``img`` was decoded with 3 channels,
        resized to ``(IMG_HEIGHT, IMG_WIDTH)`` and passed through the VGG16
        ``preprocess_input``.
    """
    # load img
    raw_bytes = tf.io.read_file(path)
    img = tf.image.decode_jpeg(raw_bytes, channels=3)
    # resize image -- tf.image.resize expects size as (new_height, new_width)
    img = tf.image.resize(img, (IMG_HEIGHT, IMG_WIDTH))
    # preprocess
    img = preprocess_input(img)

    return img, label

In [122]:
# Light augmentation

# The augmentation layers run on images that already went through the VGG16
# `preprocess_input` (build_dataset maps load_and_preprocess first), so the
# pixel values are zero-centred rather than in [0, 255].
# RandomBrightness clips its output to `value_range`, whose default is
# [0, 255]; with that default every negative pixel would be clipped to 0.
# The bounds below are 0 - 123.68 and 255 - 103.939, i.e. the extremes left
# after subtracting the ImageNet channel means used by `preprocess_input`.
PREPROCESSED_VALUE_RANGE = (-123.68, 151.061)

data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.05),
    tf.keras.layers.RandomZoom(0.1),
    tf.keras.layers.RandomBrightness(factor=0.05, value_range=PREPROCESSED_VALUE_RANGE),
])

In [123]:
# tf.data 
def build_dataset(df, shuffle=False, augment=False):
    """Build a batched, prefetched ``tf.data`` pipeline from a dataframe.

    Args:
        df: DataFrame providing an ``image_path`` and a ``label_int`` column.
        shuffle: If True, shuffle the samples, reshuffling on every pass
            over the dataset.
        augment: If True, pass every image through ``data_augmentation``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches of at
        most ``BATCH_SIZE`` elements.
    """
    paths = df["image_path"].values
    labels = df["label_int"].values

    dataset = tf.data.Dataset.from_tensor_slices((paths, labels))

    if shuffle:
        # Shuffle the (path, label) pairs *before* decoding: the buffer then
        # holds short strings instead of decoded images, so it can span the
        # whole dataset and give a uniform shuffle at negligible memory cost.
        dataset = dataset.shuffle(
            buffer_size=max(len(paths), 1),
            reshuffle_each_iteration=True,
        )

    dataset = dataset.map(load_and_preprocess, num_parallel_calls=tf.data.AUTOTUNE)

    if augment:
        # training=True makes it explicit that the random layers must be active.
        dataset = dataset.map(
            lambda x, y: (data_augmentation(x, training=True), y),
            num_parallel_calls=tf.data.AUTOTUNE,
        )

    dataset = dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    return dataset

In [124]:
# Only the training split is shuffled and augmented, so training batches are
# not drawn in one fixed order; validation and test data stay deterministic.
train_data = build_dataset(train_df, shuffle=True, augment=True)
val_data = build_dataset(val_df, augment=False)
test_data = build_dataset(test_df, augment=False)

## Build VGG16 Model

In [125]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# ImageNet-pretrained VGG16 without its original classifier (include_top=False).
# input_shape is ordered (height, width, channels).
base_model = VGG16(
    weights="imagenet",
    include_top=False,
    input_shape=(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS),
)

# Freeze the pretrained layers so that only the new head is trained.
for layer in base_model.layers:
    layer.trainable = False

# classifier head
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.4)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
# One softmax unit per entry of class_order, so the head follows the label
# encoding instead of a hard-coded class count.
output = Dense(len(class_order), activation='softmax')(x)



In [126]:
# create model
model = Model(inputs=base_model.input, outputs=output)

# compile model
# The learning rate has to be positive: a negative value reverses the update
# direction, so the optimizer would move the weights towards a higher loss.
# The targets are integer class ids (label_int), hence the sparse loss.
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# show summary
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

## Train Model

In [127]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import matplotlib.pyplot as plt

In [128]:
# Training callbacks; each one monitors the validation loss.
# Stop after 5 epochs without improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)
# Multiply the learning rate by 0.3 after 2 epochs without improvement.
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.3,
    patience=2,
    verbose=1,
)
# Save the model to disk only when the monitored value improves.
checkpoint = ModelCheckpoint(
    filepath="best_vgg16_model.h5",
    monitor="val_loss",
    save_best_only=True,
    verbose=1,
)

callbacks = [early_stopping, reduce_lr, checkpoint]

In [129]:
# train model
# Fit on the training pipeline for up to 35 epochs, validating on val_data.
fit_options = {
    "validation_data": val_data,
    "epochs": 35,
    "callbacks": callbacks,
}
history = model.fit(train_data, **fit_options)

Epoch 1/35

KeyboardInterrupt: 

In [None]:
# plot training curves

# figsize is a keyword argument of plt.figure; `figsize(12, 5)` called an
# undefined name and raised NameError.
plt.figure(figsize=(12, 5))

# Accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history["accuracy"], label="Train Accuracy")
# Each curve gets a plain string as its legend label (not a list).
plt.plot(history.history["val_accuracy"], label="Validation Accuracy")
plt.title("Accuracy over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()

# Loss
plt.subplot(1, 2, 2)
plt.plot(history.history["loss"], label="Train Loss")
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.title("Loss over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.show()

## Results

## Extract Embeddings