In [21]:
import fiftyone as fo
import fiftyone.zoo as foz

# COCO categories kept when loading the zoo dataset.
classes_to_keep = [
    "person",
    "bicycle",
    "car",
    "motorcycle",
    "airplane",
]

# NOTE(review): "trai" looks like a typo for "train"; the name is kept unchanged
# because it is the identifier under which the dataset is stored.
train_dataset_name = "coco_trai_1000"
val_dataset_name = "coco_valid_100"


def _load_coco_subset(split, max_samples, dataset_name):
    """Load a filtered, shuffled COCO-2017 subset under ``dataset_name``.

    Any existing dataset with the same name is deleted first so the cell can
    be re-run.

    Args:
        split: COCO split to load (``"train"`` or ``"validation"``).
        max_samples: maximum number of samples to import.
        dataset_name: name given to the resulting FiftyOne dataset.

    Returns:
        The dataset returned by ``foz.load_zoo_dataset``.
    """
    if dataset_name in fo.list_datasets():
        # Delete by name: ``fo.Dataset(name)`` would try to *create* a dataset
        # with that name and fail because it already exists.
        fo.delete_dataset(dataset_name)

    return foz.load_zoo_dataset(
        "coco-2017",
        label_types=["detections"],
        split=split,
        classes=classes_to_keep,
        max_samples=max_samples,
        shuffle=True,
        seed=123,  # fixed seed so the shuffled subset is reproducible
        only_matching=True,  # keep only labels belonging to classes_to_keep
        num_workers=10,
        dataset_name=dataset_name,
    )


train_dataset = _load_coco_subset("train", 1000, train_dataset_name)
val_dataset = _load_coco_subset("validation", 100, val_dataset_name)


Downloading split 'train' to '/home/yipeng/fiftyone/coco-2017/train' if necessary
Found annotations at '/home/yipeng/fiftyone/coco-2017/raw/instances_train2017.json'
Sufficient images already downloaded
Existing download of split 'train' is sufficient
Loading 'coco-2017' split 'train'
 100% |███████████████| 1000/1000 [2.0s elapsed, 0s remaining, 510.3 samples/s]      
Dataset 'coco_trai_1000' created
Downloading split 'validation' to '/home/yipeng/fiftyone/coco-2017/validation' if necessary
Found annotations at '/home/yipeng/fiftyone/coco-2017/raw/instances_val2017.json'
Sufficient images already downloaded
Existing download of split 'validation' is sufficient
Loading 'coco-2017' split 'validation'
 100% |█████████████████| 100/100 [351.8ms elapsed, 0s remaining, 284.3 samples/s]     
Dataset 'coco_valid_100' created


In [22]:
import os
import cv2 as cv

# Target (width, height) applied to every image by the resize step.
resized_image_size = (224, 224)

# Folder that receives the resized copies; created up front if it is missing.
output_dir = "resized_dataset"
os.makedirs(output_dir, exist_ok=True)

def resize_image_and_annotations(sample):
    """Write a resized copy of the sample's image and point the sample at it.

    The image at ``sample.filepath`` is read, resized to ``resized_image_size``
    and saved into ``output_dir`` under the same base name. The sample's
    ``filepath`` is then updated and the sample saved.

    The detections are intentionally left untouched: FiftyOne stores
    ``bounding_box`` as ``[top-left-x, top-left-y, width, height]`` in
    coordinates relative to the image size (range [0, 1]), so they remain
    valid for any resized version of the same image. Multiplying them by
    pixel scale factors would corrupt them.

    Args:
        sample: a FiftyOne sample with a readable image at ``sample.filepath``.

    Raises:
        ValueError: if the image cannot be read.
        IOError: if the resized image cannot be written.
    """
    image_path = sample.filepath
    image = cv.imread(image_path)
    if image is None:
        # cv.imread signals failure by returning None rather than raising.
        raise ValueError(f"Could not read image: {image_path}")

    resized_image = cv.resize(image, resized_image_size)

    resized_image_path = os.path.join(output_dir, os.path.basename(image_path))
    if not cv.imwrite(resized_image_path, resized_image):
        raise IOError(f"Could not write resized image: {resized_image_path}")

    # Only repoint the sample once the resized file actually exists.
    # NOTE(review): if sample.metadata is populated it still describes the
    # original image dimensions; recompute it if downstream code relies on it.
    sample.filepath = resized_image_path
    sample.save()

# Resize every sample of both splits: training first, then validation.
for split_dataset in (train_dataset, val_dataset):
    for sample in split_dataset:
        resize_image_and_annotations(sample)



In [29]:
import tensorflow as tf

def data_generator(dataset, image_size=(224, 224), class_name_to_id=None):
    """Yield ``(image, labels)`` pairs, one per sample of ``dataset``.

    Args:
        dataset: iterable of FiftyOne samples exposing ``filepath`` and a
            ``ground_truth`` detections field.
        image_size: ``(height, width)`` passed to ``tf.image.resize``.
        class_name_to_id: mapping from detection label to integer id. Defaults
            to the five classes used in this notebook.

    Yields:
        image: float32 tensor of shape ``(*image_size, 3)``, not batched.
        labels: list with one ``[b0, b1, b2, b3, class_id]`` entry per
            detection, where ``b0..b3`` are the detection's ``bounding_box``
            values copied unchanged. The list is empty if the sample has no
            ``ground_truth``.

    Note:
        Samples are yielded one at a time without a batch axis, and the number
        of labels varies per image, so the output must be batched/encoded
        before being handed to ``model.fit``.
    """
    if class_name_to_id is None:
        # NOTE(review): ids start at 0 here, while the model reserves one extra
        # class for background - confirm how ids map onto the classifier slots.
        class_name_to_id = {'person': 0, 'bicycle': 1, 'car': 2, 'motorcycle': 3, 'airplane': 4}

    for sample in dataset:
        raw_bytes = tf.io.read_file(sample.filepath)
        # Force 3 channels and a single frame so every image is (H, W, 3);
        # the defaults may return grayscale/alpha images or a 4-D tensor for GIFs.
        image = tf.image.decode_image(raw_bytes, channels=3, expand_animations=False)
        image = tf.image.resize(image, image_size)

        ground_truth = sample.ground_truth
        detections = ground_truth.detections if ground_truth is not None else []

        labels = [
            list(detection.bounding_box[:4]) + [class_name_to_id[detection.label]]
            for detection in detections
        ]

        yield image, labels


# Build one lazy sample generator per split (nothing is read until iteration).
train_data_gen, val_data_gen = (
    data_generator(split_dataset) for split_dataset in (train_dataset, val_dataset)
)


In [37]:
# Peek at the first training sample through a fresh generator so that
# train_data_gen itself is not advanced before it is used for training.
next(data_generator(train_dataset))

(<tf.Tensor: shape=(224, 224, 3), dtype=float32, numpy=
 array([[[162., 159., 152.],
         [132., 129., 122.],
         [119., 116., 107.],
         ...,
         [ 74.,  80.,  78.],
         [195., 200., 204.],
         [200., 207., 215.]],
 
        [[149., 146., 139.],
         [198., 195., 188.],
         [140., 137., 128.],
         ...,
         [ 44.,  50.,  46.],
         [160., 165., 168.],
         [208., 213., 217.]],
 
        [[121., 118., 111.],
         [113., 110., 103.],
         [119., 116., 109.],
         ...,
         [ 55.,  62.,  55.],
         [ 45.,  50.,  46.],
         [210., 214., 213.]],
 
        ...,
 
        [[235., 169., 173.],
         [157.,  75.,  79.],
         [180.,  65.,  70.],
         ...,
         [140.,  51.,  47.],
         [141.,  52.,  48.],
         [142.,  53.,  49.]],
 
        [[167.,  62.,  67.],
         [176.,  66.,  69.],
         [189.,  63.,  66.],
         ...,
         [139.,  50.,  46.],
         [140.,  51.,  47.],
      

In [38]:
import tensorflow as tf
from tensorflow.keras import layers, Model


def create_mobilenetv2_backbone(input_shape=(224, 224, 3)):
    """Build a MobileNetV2 feature extractor with ImageNet weights and no classifier head.

    Args:
        input_shape: shape of a single input image, ``(height, width, channels)``.

    Returns:
        The ``tf.keras.applications.MobileNetV2`` model.
    """
    return tf.keras.applications.MobileNetV2(
        weights='imagenet',
        include_top=False,
        input_shape=input_shape,
    )


def create_ssd_layers(num_classes, backbone, anchors_per_location=3):
    """Attach SSD classification and localization heads to a MobileNetV2 backbone.

    Four intermediate feature maps of the backbone each get one 3x3 convolution
    for class scores and one for box offsets. The per-map predictions are
    flattened and concatenated along the prediction axis.

    Args:
        num_classes: number of class scores predicted per anchor.
        backbone: Keras model exposing the layers ``block_4_expand_relu``,
            ``block_6_expand_relu``, ``block_13_expand_relu`` and ``out_relu``.
        anchors_per_location: number of predictions emitted at every spatial
            position of each feature map.

    Returns:
        Tuple ``(classification_predictions, localization_predictions)`` with
        last-axis sizes ``num_classes`` and ``4`` respectively.
    """
    # Feature maps taken from the backbone, from higher to lower resolution.
    feature_layer_names = (
        'block_4_expand_relu',
        'block_6_expand_relu',
        'block_13_expand_relu',
        'out_relu',
    )
    feature_layers = [backbone.get_layer(name).output for name in feature_layer_names]

    classification_layers = []
    localization_layers = []

    for feature_layer in feature_layers:
        # Class scores: num_classes values for every anchor at every location.
        clf = layers.Conv2D(
            filters=num_classes * anchors_per_location, kernel_size=3, padding='same'
        )(feature_layer)
        classification_layers.append(layers.Reshape((-1, num_classes))(clf))

        # Box regression: 4 values for every anchor at every location.
        loc = layers.Conv2D(
            filters=4 * anchors_per_location, kernel_size=3, padding='same'
        )(feature_layer)
        localization_layers.append(layers.Reshape((-1, 4))(loc))

    classification_predictions = layers.Concatenate(axis=1, name='classification')(classification_layers)
    localization_predictions = layers.Concatenate(axis=1, name='localization')(localization_layers)

    return classification_predictions, localization_predictions


def create_mobilenetv2_ssd(num_classes, input_shape=(224, 224, 3)):
    """Assemble the MobileNetV2 backbone and the SSD heads into one Keras model.

    Args:
        num_classes: number of class scores predicted per anchor.
        input_shape: shape of a single input image.

    Returns:
        A ``Model`` mapping the backbone input to the two-element output list
        ``[classification_predictions, localization_predictions]``.
    """
    feature_extractor = create_mobilenetv2_backbone(input_shape)
    class_output, box_output = create_ssd_layers(num_classes, feature_extractor)
    return Model(
        inputs=feature_extractor.input,
        outputs=[class_output, box_output],
    )


In [39]:
# One slot per foreground class in classes_to_keep plus one for the background class.
num_classes = len(classes_to_keep) + 1
model = create_mobilenetv2_ssd(num_classes)


In [40]:
# Print the per-layer table (output shape, parameter count, connections) of the assembled model.
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 Conv1 (Conv2D)                 (None, 112, 112, 32  864         ['input_4[0][0]']                
                                )                                                                 
                                                                                                  
 bn_Conv1 (BatchNormalization)  (None, 112, 112, 32  128         ['Conv1[0][0]']                  
                                )                                                           

In [41]:
class SSDLoss(tf.keras.losses.Loss):
    """SSD multibox loss: softmax classification with hard negative mining plus smooth-L1 box regression.

    ``y_true`` and ``y_pred`` are both sliced along the last axis: the first
    ``num_classes`` entries are the class part (one-hot targets / logits, with
    index 0 treated as background) and the remaining entries are the box part.
    The expected layout is therefore ``(batch, num_anchors, num_classes + box_dims)``.

    NOTE(review): ``create_mobilenetv2_ssd`` returns classification and
    localization as two separate outputs; they need to be combined along the
    last axis for this slicing to line up - confirm how the loss is wired.

    Args:
        num_classes: number of classes including background.
        alpha: weight of the localization term.
        neg_pos_ratio: maximum number of negatives kept per positive anchor.
    """

    def __init__(self, num_classes, alpha=1.0, neg_pos_ratio=3.0):
        super().__init__()
        self.num_classes = num_classes
        self.alpha = alpha
        self.neg_pos_ratio = neg_pos_ratio

    def _smooth_l1_loss(self, y_true, y_pred):
        """Element-wise smooth-L1: quadratic below an absolute error of 1, linear above."""
        abs_diff = tf.abs(y_true - y_pred)
        return tf.where(tf.less(abs_diff, 1.0), 0.5 * abs_diff ** 2, abs_diff - 0.5)

    def call(self, y_true, y_pred):
        """Return one loss value per batch item, normalized by its number of positive anchors.

        Items without any positive anchor contribute a loss of 0.
        """
        y_pred = tf.convert_to_tensor(y_pred)
        y_true = tf.cast(y_true, y_pred.dtype)

        y_true_cls = y_true[..., :self.num_classes]
        y_true_loc = y_true[..., self.num_classes:]
        y_pred_cls = y_pred[..., :self.num_classes]
        y_pred_loc = y_pred[..., self.num_classes:]

        # Per-anchor masks, shape (batch, anchors).
        positive_mask = tf.reduce_sum(y_true_cls[..., 1:], axis=-1)
        negative_mask = y_true_cls[..., 0]
        num_positives = tf.reduce_sum(positive_mask, axis=-1)  # (batch,)

        # Localization loss: only anchors matched to an object contribute.
        loc_loss = tf.reduce_sum(self._smooth_l1_loss(y_true_loc, y_pred_loc), axis=-1)
        loc_loss = tf.reduce_sum(loc_loss * positive_mask, axis=-1)

        # Per-anchor classification loss, shape (batch, anchors).
        clf_loss = tf.keras.losses.categorical_crossentropy(y_true_cls, y_pred_cls, from_logits=True)
        positive_clf_loss = tf.reduce_sum(clf_loss * positive_mask, axis=-1)

        # Hard negative mining: keep the highest-loss background anchors, at
        # most neg_pos_ratio per positive and never more than actually exist.
        num_negatives = tf.minimum(
            self.neg_pos_ratio * num_positives,
            tf.reduce_sum(negative_mask, axis=-1),
        )
        ranked_negative_loss = tf.sort(clf_loss * negative_mask, axis=-1, direction='DESCENDING')
        rank = tf.range(tf.shape(clf_loss)[1])[tf.newaxis, :]
        keep = tf.cast(rank < tf.cast(num_negatives, tf.int32)[:, tf.newaxis], clf_loss.dtype)
        negative_clf_loss = tf.reduce_sum(ranked_negative_loss * keep, axis=-1)

        # Total loss; clamping the divisor avoids dividing by ~0 when an item
        # has no positives (every numerator term is 0 in that case).
        total_loss = positive_clf_loss + negative_clf_loss + self.alpha * loc_loss
        return total_loss / tf.maximum(num_positives, 1.0)


In [42]:
# Instantiate the training objective and the optimizer, then attach both to the model.
loss = SSDLoss(num_classes)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

model.compile(loss=loss, optimizer=optimizer)


In [43]:
# NOTE(review): as written this call fails on the first step with
# "expected min_ndim=4, found ndim=3" (see the traceback that follows). The
# generators yield a single (224, 224, 3) image per step, so the model receives
# no leading batch axis. They are also plain Python generators, which can only
# be iterated once, while 10 epochs are requested here.
# TODO: batch the samples and, presumably, encode the per-image box lists into
# the target layout the loss slices before fitting.
model.fit(train_data_gen, epochs=10, validation_data=val_data_gen)

Epoch 1/10


ValueError: in user code:

    File "/home/yipeng/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/home/yipeng/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/yipeng/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/home/yipeng/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "/home/yipeng/miniconda3/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/yipeng/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/input_spec.py", line 250, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'model_3' (type Functional).
    
    Input 0 of layer "Conv1" is incompatible with the layer: expected min_ndim=4, found ndim=3. Full shape received: (None, None, None)
    
    Call arguments received by layer 'model_3' (type Functional):
      • inputs=tf.Tensor(shape=(None, None, None), dtype=float32)
      • training=True
      • mask=None
