In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Dense, Dropout,Conv2D,BatchNormalization,LeakyReLU,Concatenate,UpSampling2D,MaxPool2D,Add
from keras import backend as K
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Ask TensorFlow to grow GPU memory on demand for every GPU it can see.
# NOTE(review): the TensorFlow docs state that memory growth has to be set
# before the GPUs are initialized — keep this cell ahead of any model code.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
# Number of object classes predicted by each detection head.
NUM_CLASSES = 26
# Number of anchor boxes per grid cell and per head.
NUM_ANCHORS = 3
# Anchor (width, height) pairs keyed by the head's grid size; this is the table
# normalize_yolo_output() looks up via `constant_anchors[grid_size]`.
constant_anchors = {
    #"52":[[10,13], [16,30], [33,23]],
    26:[[7.90196078,5.824],[ 17.46078431,16.19298246],[ 43.03448105,39.25490196]],
    13:[[ 95.50221204,73.70000839],[145.99999905,161.99999809],[255.31603053,238.99232006]],
}

# The same six anchors as one flat array (26-grid anchors first, then the
# 13-grid anchors); used by best_anchor_index() to pick an anchor for a box.
# NOTE(review): duplicates the values in `constant_anchors` — keep them in sync.
ANCHORS = np.array([[7.90196078,5.824],[ 17.46078431,16.19298246],[ 43.03448105,39.25490196],[ 95.50221204,73.70000839],[145.99999905,161.99999809],[255.31603053,238.99232006]])

# For each grid size, the row indices of ANCHORS that belong to that head.
ANCHOR_INDECES = {
    26:[0,1,2],
    13:[3,4,5],
}

In [None]:
import keras
import keras.layers as layers
import keras.backend as K
import tensorflow as tf


class Mish(layers.Layer):
    """Mish activation layer: ``x * tanh(softplus(x))`` applied to the inputs."""

    def __init__(self, **kwargs):
        # The layer has no state of its own; forward every option to Layer.
        super().__init__(**kwargs)

    def call(self, inputs):
        gate = K.tanh(K.softplus(inputs))
        return inputs * gate


# This GroupFeature layer does not work; using it fails with the error:
# 'number of input channels does not match corresponding dimension of filter, 32!=64'
# class GroupFeature(layers.Layer):
#
#     def __init__(self, num_splits, **kwargs):
#         super(GroupFeature, self).__init__(**kwargs)
#         self.num_splits = num_splits
#
#     def call(self, inputs, **kwargs):
#         return tf.split(inputs, num_or_size_splits=self.num_splits, axis=-1)[0]


def conv_bn_activation(inputs,
                       filters,
                       filter_size,
                       downsample=False,
                       activation='leaky'):
    """Apply Conv2D -> BatchNormalization -> activation to `inputs`.

    Args:
        inputs: tensor fed to the convolution.
        filters: number of convolution filters.
        filter_size: convolution kernel size.
        downsample: when True, pad one row/column on the top/left and use a
            stride-2 'valid' convolution; otherwise a stride-1 'same' one.
        activation: 'leaky' (LeakyReLU, slope 0.1) or 'mish'.

    Returns:
        The activated output tensor.
    """

    assert activation in ['mish', 'leaky'], 'activation must be leaky or mish'

    strides, padding = (2, 'valid') if downsample else (1, 'same')
    if downsample:
        inputs = layers.ZeroPadding2D(padding=((1, 0), (1, 0)))(inputs)

    # Bias is disabled because BatchNormalization follows the convolution.
    convolution = layers.Conv2D(
        filters=filters,
        kernel_size=filter_size,
        strides=strides,
        padding=padding,
        use_bias=False,
        kernel_regularizer=keras.regularizers.l2(0.0005),
        kernel_initializer=keras.initializers.RandomNormal(stddev=0.01),
        bias_initializer=keras.initializers.constant(0.0),
    )
    features = convolution(inputs)
    features = layers.BatchNormalization()(features)

    if activation != 'mish':
        return layers.LeakyReLU(negative_slope=0.1)(features)
    return Mish()(features)


def csp_darknet_tiny(inputs):
    """Build the CSP-Darknet-tiny backbone on top of `inputs`.

    Two stride-2 convolutions are followed by three CSP stages. Each stage
    runs a small branch on the second half of the trunk's channels,
    concatenates the branch back onto the trunk and max-pools.

    Returns:
        (C4, C5): C4 is the output of the third stage's branch (taken before
        it is concatenated with the trunk and pooled); C5 is the final
        512-filter convolution.
    """

    def csp_branch(trunk, half_filters):
        """Partial branch of one CSP stage, computed from half of `trunk`."""
        # TODO: the index is 1 in source code so i change 0 to 1
        part = layers.Lambda(lambda y: tf.split(y, num_or_size_splits=2, axis=-1)[1])(trunk)
        part = conv_bn_activation(part, half_filters, 3)
        deeper = conv_bn_activation(part, half_filters, 3)
        fused = layers.Concatenate()([deeper, part])
        return conv_bn_activation(fused, half_filters * 2, 1)

    stem = conv_bn_activation(inputs, 32, 3, downsample=True)
    stem = conv_bn_activation(stem, 64, 3, downsample=True)

    # Stage 1
    trunk = conv_bn_activation(stem, 64, 3)
    branch = csp_branch(trunk, 32)
    stage = layers.Concatenate()([trunk, branch])
    stage = layers.MaxPool2D(pool_size=2, padding='same')(stage)

    # Stage 2
    trunk = conv_bn_activation(stage, 128, 3)
    branch = csp_branch(trunk, 64)
    stage = layers.Concatenate()([trunk, branch])
    stage = layers.MaxPool2D(pool_size=2, strides=2, padding='same')(stage)

    # Stage 3 — its branch output doubles as the C4 feature map.
    trunk = conv_bn_activation(stage, 256, 3)
    C4 = csp_branch(trunk, 128)
    stage = layers.Concatenate()([trunk, C4])
    stage = layers.MaxPool2D(pool_size=2, strides=2)(stage)

    C5 = conv_bn_activation(stage, 512, 3)

    return C4, C5


def yolo4_tiny(inputs, num_anchors, num_classes):
    """Assemble the two-head YOLOv4-tiny model.

    Args:
        inputs: Keras input tensor of the model.
        num_anchors: anchors predicted per grid cell.
        num_classes: number of object classes.

    Returns:
        keras.Model whose outputs are [head on C5, head on C4]; each head has
        num_anchors * (5 + num_classes) channels.
    """

    def detection_head(features):
        """1x1 convolution producing the raw predictions of one head."""
        head = layers.Conv2D(num_anchors * (num_classes + 5),
                             1,
                             kernel_regularizer=keras.regularizers.l2(5e-4),
                             kernel_initializer=keras.initializers.RandomNormal(stddev=0.01),
                             bias_initializer=keras.initializers.constant(0.0))
        return head(features)

    C4, C5 = csp_darknet_tiny(inputs)

    # Coarse head, computed from C5.
    neck = conv_bn_activation(C5, 256, 1)
    output_C5 = conv_bn_activation(neck, 512, 3)
    output_C5 = detection_head(output_C5)

    # Fine head: upsample the neck and fuse it with C4.
    upsampled = conv_bn_activation(neck, 128, 1)
    upsampled = layers.UpSampling2D()(upsampled)
    fused = layers.Concatenate()([upsampled, C4])

    output_C4 = conv_bn_activation(fused, 256, 3)
    output_C4 = detection_head(output_C4)

    print(output_C4.shape, output_C5.shape)
    return keras.Model(inputs, [output_C5, output_C4])

# Build the detector for 416x416 RGB inputs and print its layer summary.
inputs = keras.Input(shape=(416, 416, 3))
backbone = yolo4_tiny(inputs, num_anchors=NUM_ANCHORS, num_classes=NUM_CLASSES)
backbone.summary()


In [None]:
import json

# Load the two annotation files; getData() below reads their "images" and
# "annotations" entries.
with open("annotations.json", "r", encoding="utf-8") as f:
    official_data = json.load(f)
with open("annotations_unofficial.json", "r", encoding="utf-8") as f:
    unofficial_data = json.load(f)
# Show the top-level keys of the official annotation file.
print([key for key,content in official_data.items()])
def getData(selected_json):
    """Group the annotations of a COCO-style dict by image.

    Args:
        selected_json: dict with an "images" list (entries providing 'id',
            'flickr_url', 'height', 'width') and an "annotations" list
            (entries providing 'image_id', 'category_id', 'bbox' = [x, y, w, h]).

    Returns:
        A list indexed by image id. Slot i is either
        {"image_data": {'link', 'height', 'width'}, "bboxes": [{'category',
        'x', 'y', 'w', 'h'}, ...]} or the placeholder 0 when image i has no
        annotation. The list holds one slot per id up to the largest image id
        (previously a fixed 1500 slots, which overflowed on larger datasets).
    """
    images = selected_json["images"]
    slot_count = max((image['id'] for image in images), default=-1) + 1

    list_of_links = slot_count * [0]
    for image in images:
        list_of_links[image['id']] = {
            'link': image['flickr_url'],
            'height': image['height'],
            'width': image['width'],
        }

    official_bbox = slot_count * [0]
    for annotation in selected_json["annotations"]:
        image_id = annotation["image_id"]
        image_info = list_of_links[image_id]
        x, y, w, h = annotation["bbox"][:4]

        # Clip boxes that start left of / above the image: move the origin to
        # 0 and shrink the extent by the part that was outside.
        if x < 0:
            w += x
            x = 0
        if y < 0:
            h += y
            y = 0
        # Clip boxes that run past the right / bottom edge.
        w = min(w, image_info['width'] - x)
        h = min(h, image_info['height'] - y)

        box = {'category': annotation["category_id"], 'x': x, 'y': y, 'w': w, 'h': h}
        if official_bbox[image_id] == 0:
            official_bbox[image_id] = {"image_data": image_info, "bboxes": [box]}
        else:
            official_bbox[image_id]["bboxes"].append(box)
    return official_bbox

official_data_formated = getData(official_data)
# Fixed: this used to be built from `official_data` again.
unofficial_data_formated = getData(unofficial_data)

print(official_data_formated[0])
print(official_data_formated[1])
print(official_data_formated[2])
print(official_data_formated[2]["bboxes"])
print(official_data["categories"])

print(official_data["annotations"][2]["bbox"])

In [None]:
# Side length, in pixels, of the square network input; the loss helpers below
# derive the size of a grid cell from it.
PHOTO_SIZE = 416

# Run tf.function-decorated code eagerly (the loss below is decorated with
# @tf.function and uses Python loops and accumulators).
tf.config.run_functions_eagerly(True)
# NOTE(review): `bce` is not referenced in the visible part of the notebook
# (focal_bce defines its own local of the same name) — confirm before removing.
bce = tf.keras.losses.BinaryCrossentropy()
# Categorical cross-entropy used for the classification term of the loss.
cce = tf.keras.losses.CategoricalCrossentropy()

def iou_vectorized(boxes1, boxes2):
    """
    Pairwise IoU between two sets of center-format boxes.

    boxes1: (N, 4) [x_center, y_center, w, h]
    boxes2: (M, 4) [x_center, y_center, w, h]
    Returns:
      (N, M) tensor; entry (i, j) is the IoU of boxes1[i] and boxes2[j],
      or 0 where the union area is not positive.
    """
    corners_a = box_centers_to_corners(boxes1)  # (N,4)
    corners_b = box_centers_to_corners(boxes2)  # (M,4)

    def box_area(corners):
        width = corners[..., 2] - corners[..., 0]
        height = corners[..., 3] - corners[..., 1]
        return width * height

    # Broadcast (N,1,4) against (1,M,4) to visit every pair.
    a = tf.expand_dims(corners_a, 1)
    b = tf.expand_dims(corners_b, 0)

    overlap_w = tf.maximum(tf.minimum(a[..., 2], b[..., 2]) - tf.maximum(a[..., 0], b[..., 0]), 0)
    overlap_h = tf.maximum(tf.minimum(a[..., 3], b[..., 3]) - tf.maximum(a[..., 1], b[..., 1]), 0)
    inter_area = overlap_w * overlap_h  # (N,M)

    area_a = tf.expand_dims(box_area(corners_a), 1)  # (N,1)
    area_b = tf.expand_dims(box_area(corners_b), 0)  # (1,M)
    union_area = area_a + area_b - inter_area  # (N,M)

    return tf.where(union_area > 0, inter_area / union_area, tf.zeros_like(inter_area))

def box_centers_to_corners(boxes):
    """
    Turn [x_center, y_center, w, h] boxes into [x_min, y_min, x_max, y_max].

    boxes: (N, 4) tensor.
    Returns: (N, 4) tensor.
    """
    # Handle x and y together: the first half of the last axis holds the
    # centers, the second half the sizes.
    centers, sizes = tf.split(boxes, 2, axis=-1)  # each (N,2)
    half_sizes = sizes / 2
    return tf.concat([centers - half_sizes, centers + half_sizes], axis=-1)  # (N,4)

def ciou(boxes1, boxes2, eps=1e-7):
    """
    Complete-IoU between element-wise paired boxes.

    boxes1, boxes2: [..., 4] in [x_center, y_center, width, height] format.
    eps: small constant added to denominators.
    returns: tensor [...] with IoU minus the normalized center-distance
             penalty minus the aspect-ratio penalty.
    """
    boxes1 = tf.convert_to_tensor(boxes1)
    boxes2 = tf.convert_to_tensor(boxes2)

    cx1, cy1, w1, h1 = (boxes1[..., k] for k in range(4))
    cx2, cy2, w2, h2 = (boxes2[..., k] for k in range(4))

    # Corner coordinates of both boxes.
    left1, top1, right1, bottom1 = cx1 - w1 / 2, cy1 - h1 / 2, cx1 + w1 / 2, cy1 + h1 / 2
    left2, top2, right2, bottom2 = cx2 - w2 / 2, cy2 - h2 / 2, cx2 + w2 / 2, cy2 + h2 / 2

    # Intersection and union.
    inter_w = tf.maximum(tf.minimum(right1, right2) - tf.maximum(left1, left2), 0)
    inter_h = tf.maximum(tf.minimum(bottom1, bottom2) - tf.maximum(top1, top2), 0)
    inter_area = inter_w * inter_h

    area1 = (right1 - left1) * (bottom1 - top1)
    area2 = (right2 - left2) * (bottom2 - top2)
    union_area = area1 + area2 - inter_area + eps

    iou = tf.where(
        tf.not_equal(union_area, 0.0),
        inter_area / union_area,
        tf.constant(0.0, dtype=tf.float32)
    )

    # Squared distance between the two centers.
    center_dist = tf.square(cx1 - cx2) + tf.square(cy1 - cy2)

    # Squared diagonal of the smallest box enclosing both.
    enclose_w = tf.maximum(right1, right2) - tf.minimum(left1, left2)
    enclose_h = tf.maximum(bottom1, bottom2) - tf.minimum(top1, top2)
    c2 = tf.square(enclose_w) + tf.square(enclose_h) + eps

    # Aspect-ratio consistency term and its weight.
    v = (4 / (3.14159265 ** 2)) * tf.square(tf.math.atan(w2 / (h2 + eps)) - tf.math.atan(w1 / (h1 + eps)))
    with tf.device('/CPU:0'):  # to avoid potential GPU precision errors
        alpha = v / (1 - iou + v + eps)

    return iou - (center_dist / c2) - alpha * v

def focal_bce(y_true, y_pred, gamma=2.0, alpha=0.25):
    """Element-wise focal binary cross-entropy.

    The cross-entropy of each element is scaled by (1 - p_t) ** gamma, where
    p_t blends y_pred and 1 - y_pred by y_true, and by a weight that blends
    alpha and 1 - alpha the same way.
    """
    cross_entropy = tf.keras.backend.binary_crossentropy(y_true, y_pred)
    y_false = 1 - y_true
    p_t = y_true * y_pred + y_false * (1 - y_pred)
    focus = tf.pow(1.0 - p_t, gamma)
    balance = y_true * alpha + y_false * (1 - alpha)
    return balance * focus * cross_entropy

def vectorized_what_grid_cell_it_resides(true_BBs, resolution):
    """
    Locate each box center on a `resolution` x `resolution` grid.

    true_BBs: tf.Tensor of shape (N, 4), each row = [x, y, w, h]
    resolution: int, e.g., 13 or 26

    Returns:
        x_cell: tf.Tensor, shape (N,), int32 — grid column index of x
        y_cell: tf.Tensor, shape (N,), int32 — grid index of y
        normalized_BB: tf.Tensor, shape (N, 4)
            Each row: [x_offset_in_cell, y_offset_in_cell, w, h]
    """
    cell_size = PHOTO_SIZE // resolution  # pixels per grid cell (scalar int)

    def locate(coordinate):
        """Split a coordinate into (cell index, offset inside that cell)."""
        return tf.cast(coordinate // cell_size, tf.int32), coordinate % cell_size

    x_cell, x_offset = locate(true_BBs[:, 0])
    y_cell, y_offset = locate(true_BBs[:, 1])

    widths, heights = true_BBs[:, 2], true_BBs[:, 3]
    normalized_BB = tf.stack([x_offset, y_offset, widths, heights], axis=1)  # (N,4)

    return x_cell, y_cell, normalized_BB

def normalize_yolo_output(y_pred):
    """
    Activate the raw output of one detection head.

    y_pred: (batch, S, S, NUM_ANCHORS, 5 + C) raw predictions; S selects the
            anchor set from `constant_anchors`.

    Returns:
        Tensor of shape (batch, S, S, NUM_ANCHORS, 5 + C) with:
        - x, y: sigmoid scaled by the cell size in pixels (offset inside the cell)
        - w, h: exp scaled by the anchor width / height
        - objectness: sigmoid
        - class scores: softmax
    """
    grid_size = y_pred.shape[1]
    cell_size = PHOTO_SIZE // grid_size

    # (1, 1, 1, NUM_ANCHORS, 2) so the anchors broadcast over batch and grid.
    anchors = tf.reshape(
        tf.convert_to_tensor(constant_anchors[grid_size], dtype=tf.float32),
        (1, 1, 1, NUM_ANCHORS, 2),
    )

    xy = tf.sigmoid(y_pred[..., 0:2]) * cell_size
    wh = tf.exp(y_pred[..., 2:4]) * anchors
    objectness = tf.sigmoid(y_pred[..., 4:5])
    class_scores = tf.nn.softmax(y_pred[..., 5:], axis=-1)

    return tf.concat([xy, wh, objectness, class_scores], axis=-1)

@tf.function
def yolo_loss_single_head(y_true, y_pred,
                            lambda_coord = 20.0,
                            lambda_obj = 10.0,
                            lambda_noobj = 0.5,
                            lambda_class = 3.0):
    """
    Loss of one detection head.

    y_true: (batch, MAX_BOUNDING_BOXES, 5 + C); each row is
            [x_center, y_center, w, h, objectness, one-hot classes] in input
            pixels. Rows whose objectness is not > 0 are padding and ignored.
    y_pred: (batch, S, S, B, 5 + C) raw head output.
    lambda_*: weights of the position, objectness, no-object and
              classification terms.

    Returns the weighted sum of the four terms divided by the batch size.

    Fixes relative to the previous version:
      * the objectness target is the IoU of the selected anchor
        (`ious[0, best]`); `ious[best]` indexed the size-1 leading axis and
        was out of bounds for anchors 1 and 2;
      * the ignore mask compares ground truth with predicted boxes in
        absolute pixels; the cell-relative predictions were used before.

    NOTE(review): grid cells are addressed as [x_cell, y_cell], matching
    decode_predictions(); confirm this axis order is intended for the
    (row, column) layout of the convolutional output.
    """

    resolution = y_pred.shape[1]
    batch_size = tf.shape(y_true)[0]
    pixels_per_grid = PHOTO_SIZE // resolution

    total_loss = 0
    position_loss = 0
    obj_confidence_loss = 0
    noobj_confidence_loss = 0
    clasification_loss = 0

    # Cell-relative x/y, anchor-scaled w/h, sigmoid objectness, softmax classes.
    normalized_pred = normalize_yolo_output(y_pred)

    # Same boxes with the origin of their grid cell added, so they live in the
    # coordinate frame of y_true. The first grid axis is treated as x.
    cell_origin = tf.cast(tf.range(resolution), tf.float32) * pixels_per_grid
    origin_x = tf.reshape(cell_origin, (1, resolution, 1, 1))
    origin_y = tf.reshape(cell_origin, (1, 1, resolution, 1))
    absolute_pred_boxes = tf.stack([
        normalized_pred[..., 0] + origin_x,
        normalized_pred[..., 1] + origin_y,
        normalized_pred[..., 2],
        normalized_pred[..., 3],
    ], axis=-1)  # (batch, S, S, B, 4)

    for b in tf.range(batch_size):
        # 1 where a prediction is penalised as "no object", 0 where it is exempt.
        mask = tf.Variable(tf.ones((resolution, resolution, NUM_ANCHORS), dtype=tf.float32))
        true_boxes = y_true[b]
        confidence_mask = tf.greater(true_boxes[:,4], 0)
        true_boxes = tf.boolean_mask(true_boxes, confidence_mask)
        dimensions = tf.cast(true_boxes[:, :4], tf.float32)
        classifications = tf.cast(true_boxes[:, 5:], tf.float32)

        x_cell, y_cell, normalized_true_BB = vectorized_what_grid_cell_it_resides(dimensions, resolution)

        pred_boxes_flat = tf.reshape(absolute_pred_boxes[b], [-1, 4])  # (S*S*B, 4)

        for i in tf.range(tf.shape(true_boxes)[0]):
            # Anchors of the cell containing the box center, cell-relative.
            preds = normalized_pred[b, x_cell[i], y_cell[i], :, :4]

            anchor_ious = iou_vectorized(tf.expand_dims(normalized_true_BB[i], 0), preds)  # (1, B)

            best_ind = tf.cast(tf.argmax(anchor_ious, axis=-1), tf.int32)  # (1,)
            best = best_ind[0]

            best_BB = normalized_pred[b, x_cell[i], y_cell[i], best, :4]
            best_confidence = normalized_pred[b, x_cell[i], y_cell[i], best, 4]
            best_clasification = normalized_pred[b, x_cell[i], y_cell[i], best, 5:]

            # Calculate losses
            position_loss += tf.constant(1.0, dtype=tf.float32) - ciou(normalized_true_BB[i], best_BB)
            iou_target = tf.reshape(anchor_ious[0, best], (1,))
            obj_confidence_loss += focal_bce(iou_target, tf.reshape(best_confidence, (1,)))
            clasification_loss += cce(tf.expand_dims(classifications[i], 0), tf.expand_dims(best_clasification, 0))

            # Exempt every prediction that overlaps this ground-truth box well,
            # plus the responsible anchor itself, from the no-object penalty.
            true_box = tf.expand_dims(dimensions[i], axis=0)
            overlap = iou_vectorized(true_box, pred_boxes_flat)  # (1, S*S*B)
            overlap = tf.squeeze(overlap, axis=0)  # (S*S*B,)

            iou_mask = overlap > 0.7

            mask_flat = tf.reshape(mask, [-1])
            mask_flat = tf.where(iou_mask, 0.0, mask_flat)
            mask.assign(tf.reshape(mask_flat, mask.shape))
            mask[x_cell[i], y_cell[i], best].assign(0.0)

        #TO DO harsher penalty when the model is predicting ious under a threshold prob 0.4
        pred_confidences = normalized_pred[b, :, :, :, 4]  # shape (res, res, NUM_ANCHORS)
        zeros = tf.zeros_like(pred_confidences)
        bce_losses = focal_bce(zeros, pred_confidences)
        weighted_losses = mask * bce_losses

        noobj_confidence_loss += tf.reduce_sum(weighted_losses)

    total_loss = (position_loss * lambda_coord +
              obj_confidence_loss * lambda_obj +
              noobj_confidence_loss * lambda_noobj +
              clasification_loss * lambda_class)

    return total_loss / tf.cast(batch_size, tf.float32)

def yolo_multihead_loss(y_trues, y_preds):
    """
    Sum of the single-head losses of the 13x13 and 26x26 heads.

    y_trues: (batch, 2, MAX_BOUNDING_BOXES, 5 + C); index 0 along axis 1 holds
             the labels of the 13-grid head, index 1 those of the 26-grid head.
    y_preds: sequence of two raw head outputs, 13-grid first; each is reshaped
             to (batch, S, S, NUM_ANCHORS, 5 + C).
    """
    batch_size = tf.shape(y_trues)[0]
    channels = 5 + NUM_CLASSES

    def as_grid(raw_head, grid):
        """Cast a raw head output to float32 and split anchors from channels."""
        return tf.reshape(tf.cast(raw_head, tf.float32),
                          (batch_size, grid, grid, NUM_ANCHORS, channels))

    preds_13 = as_grid(y_preds[0], 13)
    preds_26 = as_grid(y_preds[1], 26)

    loss_13 = yolo_loss_single_head(y_trues[:, 0, :, :], preds_13)
    loss_26 = yolo_loss_single_head(y_trues[:, 1, :, :], preds_26)

    return tf.cast(loss_13 + loss_26, tf.float32)



In [None]:
class YOLOMultiHeadModel(tf.keras.Model):
    """Keras model wrapper that trains `backbone_model` with a custom loss.

    The loss function is called as loss_fn(labels, predictions) inside custom
    train/test steps, and its running mean is reported as "loss".
    """

    def __init__(self, backbone_model, loss_fn):
        super().__init__()
        self.backbone = backbone_model
        self.loss_fn = loss_fn
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")

    def call(self, inputs, training=False):
        """Forward pass: delegate to the wrapped backbone."""
        return self.backbone(inputs, training=training)

    def train_step(self, data):
        """Run one optimisation step on an (images, labels) batch."""
        images, labels = data
        with tf.GradientTape() as tape:
            predictions = self(images, training=True)
            loss = self.loss_fn(labels, predictions)

        variables = self.trainable_variables
        gradients = tape.gradient(loss, variables)

        # Only variables that actually received a gradient are updated; the
        # others are reported.
        grads_and_vars = []
        for grad, var in zip(gradients, variables):
            if grad is not None:
                grads_and_vars.append((grad, var))
                continue
            tf.print(f"No gradients for variable: {var.name}")

        self.optimizer.apply_gradients(grads_and_vars)
        self.loss_tracker.update_state(loss)
        tf.print("\n")
        return {"loss": self.loss_tracker.result()}

    def test_step(self, data):
        """Evaluate the loss on an (images, labels) batch without updating weights."""
        images, labels = data
        loss = self.loss_fn(labels, self(images, training=False))
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    @property
    def metrics(self):
        # The loss tracker is the only metric exposed by this model.
        return [self.loss_tracker]
    
# Wrap the backbone built above so it is trained with the two-head YOLO loss.
model = YOLOMultiHeadModel(backbone, loss_fn=yolo_multihead_loss)

In [None]:
#metrics
class MultiHeadMeanIoU(tf.keras.metrics.Metric):
    """Mean IoU over all (true, predicted) box pairs above `iou_threshold`.

    The metric accumulates the sum of the IoUs that exceed the threshold and
    the number of such pairs; result() is their ratio.
    """

    def __init__(self, iou_threshold=0.5, name='mean_iou', **kwargs):
        super().__init__(name=name, **kwargs)
        self.iou_threshold = iou_threshold
        self.total_iou = self.add_weight(name='total_iou', initializer='zeros')
        self.count = self.add_weight(name='count', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate IoUs for one batch; `sample_weight` is ignored.

        NOTE(review): the boxes of both heads are concatenated along axis 1
        and passed to iou_vectorized(), which documents (N, 4) inputs — the
        original comment calls this a crude example; verify shapes before use.
        """
        pred_head1, pred_head2 = y_pred
        # Combine boxes from both heads
        boxes_pred = tf.concat([pred_head1[..., :4], pred_head2[..., :4]], axis=1)  # crude example
        boxes_true = tf.concat([y_true[0][..., :4], y_true[1][..., :4]], axis=1)

        iou = iou_vectorized(boxes_true, boxes_pred)
        mask = tf.cast(iou > self.iou_threshold, tf.float32)

        self.total_iou.assign_add(tf.reduce_sum(iou * mask))
        self.count.assign_add(tf.reduce_sum(mask))

    def result(self):
        # The small constant avoids a division by zero before any match.
        return self.total_iou / (self.count + 1e-6)

    def reset_state(self):
        """Zero both accumulators (the hook name used by current Keras)."""
        self.total_iou.assign(0.0)
        self.count.assign(0.0)

    def reset_states(self):
        """Backward-compatible alias of reset_state()."""
        self.reset_state()

In [None]:
# Restore the weights saved in the epoch-2 checkpoint file.
model.load_weights("./checkpoints/epoch_02.h5")

In [None]:
# Only the optimizer is configured here; the loss is computed inside
# YOLOMultiHeadModel.train_step / test_step via the loss_fn given at construction.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2


def sigmoid(x):
    """Logistic function 1 / (1 + e^-x) for NumPy scalars or arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def decode_predictions(pred, conf_thresh=0.55):
    """Collect the confident boxes of the first image in an activated head output.

    Args:
        pred: (batch, S, S, anchors, 5 + C) output of normalize_yolo_output():
            cell-relative x/y, w/h, objectness and class probabilities that
            are already activated. A TensorFlow tensor or a NumPy array.
        conf_thresh: minimum objectness for a prediction to be kept.

    Returns:
        List of [x, y, w, h, score, class_id], ordered by grid index then
        anchor. x and y have the origin of their grid cell added (first grid
        axis is x, as in the loss); score is objectness times the best class
        probability.

    The values are used as-is. Earlier versions applied a second sigmoid to
    the objectness and class scores although the caller passes activated
    output, which squeezed every objectness into (0.5, 0.73).
    """
    grid = pred.shape[1]
    first_image = pred[0]
    first_image = first_image.numpy() if hasattr(first_image, "numpy") else np.asarray(first_image)
    pixels_per_grid = 416 // grid

    boxes = []
    # np.argwhere yields indices in row-major order, i.e. the same order as
    # nested loops over the two grid axes and the anchor axis.
    for i, j, k in np.argwhere(first_image[..., 4] > conf_thresh):
        cell = first_image[i, j, k]
        bx, by, bw, bh = cell[0:4]
        class_probs = cell[5:]
        class_id = int(np.argmax(class_probs))
        score = cell[4] * class_probs[class_id]
        boxes.append([bx + pixels_per_grid * i, by + pixels_per_grid * j, bw, bh, score, class_id])
    return boxes

def preprocess_image(image_path, img_size):
    """Load an image file as a normalized, batched RGB tensor.

    Args:
        image_path: path of the image file.
        img_size: side length of the square output.

    Returns:
        Tensor of shape (1, img_size, img_size, 3) with values in [0, 1].

    Raises:
        FileNotFoundError: if OpenCV cannot read `image_path`.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals a missing or unreadable file by returning None;
        # fail here rather than with an opaque error inside cvtColor.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR
    image = cv2.resize(image, (img_size, img_size))
    image = image / 255.0  # normalize
    return tf.expand_dims(image, axis=0)
# Run the model on one downloaded image and decode both heads.
img_size = 416  # or whatever your model uses
image_tensor = preprocess_image("./images/official/0AuZ8iMUdcKOVqLFwI4467e4smSjGllWkACcq7jV.jpeg", img_size)

preds = model(image_tensor, training=False)
# Split the channel axis of each head into (anchor, 5 + classes) and activate
# it; head 0 is the 13x13 grid, head 1 the 26x26 grid.
preds = [
    normalize_yolo_output(tf.reshape(preds[0],(1, 13, 13, NUM_ANCHORS, 5 + NUM_CLASSES))),
    normalize_yolo_output(tf.reshape(preds[1],(1, 26, 26, NUM_ANCHORS, 5 + NUM_CLASSES)))
    ]
grid_size_13 = preds[0]
grid_size_26 = preds[1]

# Keep only the boxes above decode_predictions' default confidence threshold.
filtered_grid_size_13 = decode_predictions(grid_size_13)
filtered_grid_size_26 = decode_predictions(grid_size_26)

def convert_box(cx, cy, w, h, img_w, img_h):
    """Convert a center-format box to integer (x, y, w, h) with a top-left origin.

    `img_w` and `img_h` are accepted but not used.
    """
    half_w, half_h = w / 2, h / 2
    return int(cx - half_w), int(cy - half_h), int(w), int(h)

def plot_boxes(image, boxes, class_names=None):
    """Show `image` and outline every box in lime green.

    Args:
        image: image accepted by `Axes.imshow`, exposing `.shape`.
        boxes: iterable of [cx, cy, w, h, score, class_id].
        class_names: currently unused (label drawing is disabled).

    The top-left corner and size of each box are also printed.
    """
    fig, ax = plt.subplots(1)
    ax.imshow(image)
    img_h, img_w = image.shape[0], image.shape[1]

    for cx, cy, w, h, score, class_id in boxes:
        x, y, bw, bh = convert_box(cx, cy, w, h, img_w, img_h)
        print(x, y, bw, bh)
        outline = patches.Rectangle((x, y), bw, bh, linewidth=2, edgecolor='lime', facecolor='none')
        ax.add_patch(outline)
        # Label drawing (class name and score) is intentionally disabled.

    plt.axis('off')

# Draw the detections of the 26x26 head on the (un-batched) input image.
print(image_tensor.shape)
plot_boxes(tf.squeeze(image_tensor, axis=0),filtered_grid_size_26)

In [None]:
def extract_id_from_url(url):
    """Return the file name of `url` without its extension.

    Example: '.../12345.jpg' -> '12345'. When the last path segment has no
    '.', the whole segment is returned ('.../12345' -> '12345'); the previous
    implementation returned '' in that case because the last '.' it found
    belonged to the host name.
    """
    file_name = url[url.rfind('/') + 1:]  # everything after the last '/'
    stem, dot, _extension = file_name.rpartition('.')
    return stem if dot else file_name

def extract_id_from_url_with_Extension(url):
    """Return the file name of `url`, extension included.

    Example: '.../12345.jpg' -> '12345.jpg'. A string without '/' is
    returned unchanged.
    """
    _head, _slash, file_name = url.rpartition('/')
    return file_name

In [None]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# Local folders for the downloaded images.
save_path_official = "./images/official"
save_path_unofficial = "./images/unofficial"

# Create the directory if it doesn't exist
# NOTE(review): only the official folder is created here.
os.makedirs(save_path_official, exist_ok=True)

def download_and_save_image(url, save_dir, timeout=30):
    """Download `url` into `save_dir` unless the file is already there.

    The file is named after the last URL segment; '.jpg' is used when the URL
    has no extension. Failures are printed, not raised, so that one bad URL
    does not stop a batch download.

    Args:
        url: address of the image.
        save_dir: destination directory; created if missing.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker forever.
    """
    image_id = extract_id_from_url(url)
    extension = os.path.splitext(url)[1] or ".jpg"
    file_path = os.path.join(save_dir, f"{image_id}{extension}")

    # Check if file already exists
    if os.path.exists(file_path):
        print(f"File already exists, skipping: {file_path}")
        return  # Skip download

    try:
        os.makedirs(save_dir, exist_ok=True)
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise error for bad status codes

        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {url} → {file_path}")
    except Exception as e:
        # Deliberately best-effort: report and carry on with the next image.
        print(f"Failed to download {url}: {e}")

# getData() leaves the placeholder 0 for image ids without annotations; skip
# those slots instead of failing on `0["image_data"]`.
urls = [item["image_data"]["link"] for item in official_data_formated if item]

MAX_WORKERS = 24
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(download_and_save_image, url, save_path_official) for url in urls]
    for future in as_completed(futures):
        # download_and_save_image reports network errors itself; surface
        # anything unexpected too, without aborting the remaining downloads.
        error = future.exception()
        if error is not None:
            print(f"Download task failed: {error}")
#print(extract_id_from_url("https://farm66.staticflickr.com/65535/33978196618_e30a59e0a8_o.png"))

In [None]:
import json

# Open and load the JSON file
# Load two more annotation lists from disk; later cells concatenate them with
# official_data_formated and read each entry's "bboxes".
# NOTE(review): presumably produced by an augmentation step not shown here —
# confirm they use the same per-image layout as getData()'s output.
with open("generated_official_data.json", "r") as f:
    generated_official_formated = json.load(f)
with open("rotated_official_data.json", "r") as f:
    rotated_official_formated = json.load(f)

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image, ImageOps

# Sanity check: draw the annotations of one sample on its resized image.

# Input data
data = official_data_formated[1000]
print(data)

# Target size
TARGET_SIZE = 416

# Load the already-downloaded image from the local folder with PIL
image_path = f'./images/official/{extract_id_from_url_with_Extension(data["image_data"]["link"])}'
image_pil = Image.open(image_path)
image_pil = ImageOps.exif_transpose(image_pil)  # Correct orientation

# Convert to NumPy array, then to TensorFlow tensor
image_np = np.array(image_pil)
image = tf.convert_to_tensor(image_np)

# Original size, as recorded in the annotation file
print(data['image_data']['height'])
print(data['image_data']['width'])
orig_height = data['image_data']['height']
orig_width = data['image_data']['width']

# Resize image to 416x416 (the first assignment is immediately overwritten)
image_resized = image
image_resized = tf.image.resize(image, [TARGET_SIZE, TARGET_SIZE])
image_resized = tf.cast(image_resized, tf.uint8).numpy()

# Load bounding box info into pandas
bboxes_df = pd.DataFrame(data['bboxes'])

# Per-axis scale factors from the original size to the target size
scale_x = TARGET_SIZE / orig_width
scale_y = TARGET_SIZE / orig_height

# Resize bounding boxes accordingly
bboxes_df['x'] = bboxes_df['x'] * scale_x
bboxes_df['y'] = bboxes_df['y'] * scale_y
bboxes_df['w'] = bboxes_df['w'] * scale_x
bboxes_df['h'] = bboxes_df['h'] * scale_y

# Plot the resized image with bounding box
fig, ax = plt.subplots(1)
ax.imshow(image_resized)

# (x, y) is the top-left corner of each rectangle.
for idx, row in bboxes_df.iterrows():
    x, y, w, h = row['x'], row['y'], row['w'], row['h']
    rect = plt.Rectangle((x, y), w, h, fill=False, edgecolor='lime', linewidth=2)
    ax.add_patch(rect)
    ax.text(x, y - 10, f"Cat: {int(row['category'])}", color='lime', fontsize=10, weight='bold')

plt.axis('off')
plt.show()
print(bboxes_df)

In [None]:
import copy as cp

# Keep only frequent categories, renumber them densely, and split the samples
# into a small validation set and a training set.

morethanlimit = []  # original category ids with at least `limit` boxes
limit = 500  # minimum number of boxes a category needs to be kept
frequency = 60 * [0]  # box count per original category id (ids 0..59)
filtered_official_formated = []  # training samples

# Count the boxes of every category over all three annotation lists.
for buc in official_data_formated + generated_official_formated + rotated_official_formated:
    for bbox in buc["bboxes"]:
        frequency[bbox["category"]]+=1

for i in range(60):
    if limit <= frequency[i]:
        morethanlimit.append(i)

# Map each kept original category id to a dense index 0..len(morethanlimit)-1.
Reassigned_classes = {}
for i in range(len(morethanlimit)):
    Reassigned_classes[morethanlimit[i]] = i

filtered_official_formated_val = []  # validation samples
# Box count per remapped class in the validation set.
# NOTE(review): the length 26 is hard-coded (same value as NUM_CLASSES); it
# must be at least len(morethanlimit) or adaug_val raises IndexError.
frequency_val = 26 * [0]

def adaug_val(bboxes):
    """Try to add a sample with these (remapped) boxes to the validation set.

    Returns False, leaving frequency_val untouched, if adding the boxes would
    push any class above 20 validation boxes; otherwise updates frequency_val
    and returns True.
    """
    tmp = 26*[0]
    for bbox in bboxes:
        tmp[bbox["category"]]+=1
    
    for i in range(len(tmp)):
        if frequency_val[i] + tmp[i] > 20:
            return False
        
    for i in range(len(tmp)):
        frequency_val[i] += tmp[i]
    return True


# Greedy split: a sample goes to validation while its classes still have room
# there, otherwise to training. Samples are visited in the order
# generated, official, rotated.
for data in generated_official_formated + official_data_formated + rotated_official_formated:
    copy = cp.deepcopy(data)
    # Drop boxes of rare categories and renumber the remaining ones.
    copy["bboxes"] = [
        {**bbox, "category": Reassigned_classes[bbox["category"]]}
        for bbox in copy["bboxes"]
        if bbox["category"] in Reassigned_classes
    ]
    if len(copy["bboxes"]) == 0:
        continue
    if adaug_val(copy["bboxes"]):
        filtered_official_formated_val.append(copy)
    else:     
        filtered_official_formated.append(copy)

print(len(filtered_official_formated))

In [None]:
# Number of validation boxes per remapped class.
print(frequency_val)

In [None]:
import tensorflow as tf
from PIL import Image, ImageOps
TARGET_SIZE = 416
def best_anchor_index(box_wh, anchors):
    """Index of the anchor whose shape overlaps the box best.

    Box and anchors are compared as if they shared a corner, so only width
    and height matter.

    Args:
        box_wh: (width, height) of the box.
        anchors: array of shape (K, 2) holding anchor widths and heights.

    Returns:
        Index (0..K-1) of the anchor with the largest IoU.
    """
    box_w, box_h = box_wh[0], box_wh[1]
    anchor_w, anchor_h = anchors[:, 0], anchors[:, 1]

    overlap = np.minimum(anchor_w, box_w) * np.minimum(anchor_h, box_h)
    union = box_w * box_h + anchor_w * anchor_h - overlap
    return np.argmax(overlap / union)

def parser_flat_output(link, height, width, category, x, y, w, h):
    """Load one sample's image and build its padded label rows for both heads.

    Called through tf.py_function, so every argument is an eager tensor.

    Args:
        link: image URL; only its file name is used, to locate the image
            under ./images/official/.
        height, width: original image size used to rescale the boxes.
        category: class index of each box.
        x, y, w, h: top-left corner and size of each box in original pixels.

    Returns:
        (image, (label_13, label_26)): the image resized to
        TARGET_SIZE x TARGET_SIZE with values in [0, 1], and one float32
        tensor per head. Each label row is
        [x_center, y_center, w, h, 1.0, one-hot class] in resized pixels.
        A box is given to the head that owns its best-matching anchor, and
        the shorter list is padded with all-zero rows to equal length.
    """
    # Eager tensors -> plain Python / NumPy values.
    link = link.numpy().decode("utf-8")
    height, width = height.numpy(), width.numpy()
    x, y, w, h = x.numpy(), y.numpy(), w.numpy(), h.numpy()
    category = category.numpy()

    image_path = './images/official/{}'.format(extract_id_from_url_with_Extension(link))

    # Honour the EXIF orientation and force three channels.
    image_pil = ImageOps.exif_transpose(Image.open(image_path))
    image_pil = image_pil.convert("RGB")

    image = tf.convert_to_tensor(np.array(image_pil))
    image = tf.image.resize(image, size = [TARGET_SIZE, TARGET_SIZE])
    image = tf.cast(image, tf.float32) / 255.0

    scale_x = TARGET_SIZE / width
    scale_y = TARGET_SIZE / height
    row_length = 5 + NUM_CLASSES

    label_13 = []
    label_26 = []
    for box_index in range(len(category)):
        scaled_w = w[box_index] * scale_x
        scaled_h = h[box_index] * scale_y

        row = row_length * [0]
        row[0] = x[box_index] * scale_x + scaled_w / 2  # center_x
        row[1] = y[box_index] * scale_y + scaled_h / 2  # center_y
        row[2] = scaled_w
        row[3] = scaled_h
        row[4] = 1.0
        row[5 + category[box_index]] = 1

        best_idx = best_anchor_index([row[2], row[3]], ANCHORS)
        # First grid size whose anchor indices contain the best anchor.
        owning_scale = next(
            (scale for scale, anchor_ids in ANCHOR_INDECES.items() if best_idx in anchor_ids),
            None,
        )
        if owning_scale == 13:
            label_13.append(row)
        elif owning_scale is not None:
            label_26.append(row)

    # Pad both lists to the same number of rows.
    max_length = max(len(label_13), len(label_26))
    label_13 += [row_length * [0] for _ in range(max_length - len(label_13))]
    label_26 += [row_length * [0] for _ in range(max_length - len(label_26))]

    label_13 = tf.convert_to_tensor(label_13, dtype=tf.float32)
    label_26 = tf.convert_to_tensor(label_26, dtype=tf.float32)

    return image, (label_13,label_26)

cnt = 0
def parser_wrapper(link, height, width, category, x, y, w, h):
    """Run parser_flat_output through tf.py_function and restore static shapes.

    Returns:
        (image, label): image of shape (TARGET_SIZE, TARGET_SIZE, 3) and a
        label tensor of shape (2, None, 5 + NUM_CLASSES).
    """
    parser_inputs = [link, height, width, category, x, y, w, h]
    image, label = tf.py_function(
        func=parser_flat_output,
        inp=parser_inputs,
        Tout=(tf.float32, tf.float32),
    )
    # py_function discards static shape information; declare it again.
    image.set_shape([TARGET_SIZE, TARGET_SIZE, 3])
    label.set_shape([2,None, 5 + NUM_CLASSES])
    return image, label

# Sanity check: number of records in the training and validation lists.
print(len(filtered_official_formated))
print(len(filtered_official_formated_val))

def generator_train():
    """Yield one flat record per entry of `filtered_official_formated`.

    Each record is the tuple
    `(link, height, width, categories, xs, ys, ws, hs)`, where the last five
    items are parallel lists with one element per bounding box of the image.
    """
    for record in filtered_official_formated:
        meta = record['image_data']
        boxes = record['bboxes']
        per_box_fields = tuple(
            [box[field] for box in boxes]
            for field in ('category', 'x', 'y', 'w', 'h')
        )
        yield (meta['link'], meta['height'], meta['width']) + per_box_fields

def generator_val():
    """Yield one flat record per entry of `filtered_official_formated_val`.

    Each record is the tuple
    `(link, height, width, categories, xs, ys, ws, hs)`, where the last five
    items are parallel lists with one element per bounding box of the image.
    """
    for record in filtered_official_formated_val:
        meta = record['image_data']
        boxes = record['bboxes']
        per_box_fields = tuple(
            [box[field] for box in boxes]
            for field in ('category', 'x', 'y', 'w', 'h')
        )
        yield (meta['link'], meta['height'], meta['width']) + per_box_fields

# Signature of one generator record: three scalars (link, height, width)
# followed by five variable-length 1-D tensors, one element per bounding box.
output_signature = (
    tf.TensorSpec(shape=(), dtype=tf.string),  # link
    tf.TensorSpec(shape=(), dtype=tf.int32),  # height
    tf.TensorSpec(shape=(), dtype=tf.int32),  # width
    tf.TensorSpec(shape=(None,), dtype=tf.int32),  # categories
    tf.TensorSpec(shape=(None,), dtype=tf.float32),  # x
    tf.TensorSpec(shape=(None,), dtype=tf.float32),  # y
    tf.TensorSpec(shape=(None,), dtype=tf.float32),  # w
    tf.TensorSpec(shape=(None,), dtype=tf.float32),  # h
)

# Training pipeline: shuffle raw records (buffer of 1000), repeat without a
# count, decode/label each record with parser_wrapper, then batch 16 examples.
# padded_batch zero-pads the variable box dimension of the labels so all
# examples in a batch share one shape.
dataset = tf.data.Dataset.from_generator(generator_train, output_signature=output_signature)
dataset = dataset.shuffle(1000)
dataset = dataset.repeat()
dataset = dataset.map(parser_wrapper, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.padded_batch(
    16,
    padded_shapes=(
        [TARGET_SIZE, TARGET_SIZE, 3],               # image shape
        [2,None, 5 + NUM_CLASSES]      # label shape with variable number of boxes
    ),
    padding_values=(
        0.0,   # image padding (float32)
        0.0    # label padding (float32)
    )
).prefetch(tf.data.AUTOTUNE)

# Validation pipeline: same steps with a shuffle buffer of 150 and no repeat().
val = tf.data.Dataset.from_generator(generator_val, output_signature=output_signature)
val = val.shuffle(150)
val = val.map(parser_wrapper, num_parallel_calls=tf.data.AUTOTUNE)
val = val.padded_batch(
    16,
    padded_shapes=(
        [TARGET_SIZE, TARGET_SIZE, 3],               # image shape
        [2,None, 5 + NUM_CLASSES]      # label shape with variable number of boxes
    ),
    padding_values=(
        0.0,   # image padding (float32)
        0.0    # label padding (float32)
    )
).prefetch(tf.data.AUTOTUNE)


In [None]:

# Preview one example of the first training batch: show the image and print
# its label tensors. The labels have shape [2, None, 5 + NUM_CLASSES]; index 0
# and 1 along the first axis are the two stacked label sets of the example.
# NOTE(review): index 0 is presumably the 13x13 head and index 1 the 26x26
# head (the parser returns `(label_13, label_26)`) — confirm.
fig, ax = plt.subplots(1)
ind = 0
for image_batch, label_batch in dataset.take(1):
    ax.imshow(image_batch[ind])
    print("Image shape:", image_batch[ind].shape)
    # The original printed every line under the heading "Image shape:",
    # which mislabelled the label contents and the label shape.
    print("Labels [0]:", label_batch[ind][0])
    print("Labels [1]:", label_batch[ind][1])
    print("Label shape:", label_batch[ind].shape)

plt.axis('off')
plt.show()

In [None]:
TARGET_SIZE = 416
def iou(box, clusters):
    """IoU between box sizes and cluster sizes, with all boxes sharing a corner.

    Only widths and heights are compared (the usual anchor-clustering
    metric), so the intersection is `min(w) * min(h)`.

    Args:
        box: `[w, h]`, or an array whose last axis is `(w, h)`. Leading axes
            broadcast against `clusters`, e.g. shape `(n, 1, 2)` yields an
            `(n, k)` result.
        clusters: array of shape `(k, 2)` holding `[w, h]` rows.

    Returns:
        Array of IoU values; shape `(k,)` for a single `[w, h]` box.
    """
    box = np.asarray(box, dtype=float)
    clusters = np.asarray(clusters, dtype=float)

    inter_w = np.minimum(clusters[..., 0], box[..., 0])
    inter_h = np.minimum(clusters[..., 1], box[..., 1])
    intersection = inter_w * inter_h

    box_area = box[..., 0] * box[..., 1]
    cluster_area = clusters[..., 0] * clusters[..., 1]
    return intersection / (box_area + cluster_area - intersection)

def kmeans(boxes, k, dist=np.median, seed=42, max_iter=1000):
    """Cluster `[w, h]` boxes into `k` anchors using `1 - IoU` as distance.

    Args:
        boxes: array of shape `(n, 2)` with one `[w, h]` row per box.
        k: number of clusters; must not exceed `n`.
        dist: reducer called as `dist(cluster_boxes, axis=0)` to compute the
            new centre of a cluster (median by default).
        seed: passed to `np.random.seed` before the initial centres are
            drawn, so a given seed keeps producing the same anchors.
        max_iter: upper bound on the number of update steps. The loop stops
            earlier as soon as the centres no longer change; the bound only
            guards against an assignment that keeps oscillating.

    Returns:
        Array of shape `(k, 2)` with the cluster centres.
    """
    boxes = np.asarray(boxes, dtype=float)

    # Seeds the global NumPy RNG on purpose: the initial centres (and hence
    # the resulting anchors) stay identical to earlier runs with this seed.
    np.random.seed(seed)
    clusters = boxes[np.random.choice(boxes.shape[0], k, replace=False)]

    for _ in range(max_iter):
        # (n, 1, 2) against (k, 2) broadcasts to an (n, k) distance matrix.
        distances = 1 - iou(boxes[:, np.newaxis, :], clusters)
        nearest_clusters = np.argmin(distances, axis=1)

        new_clusters = []
        for cluster_idx in range(k):
            cluster_boxes = boxes[nearest_clusters == cluster_idx]
            if len(cluster_boxes) == 0:
                # Empty cluster: keep its previous centre unchanged.
                new_clusters.append(clusters[cluster_idx])
            else:
                new_clusters.append(dist(cluster_boxes, axis=0))
        new_clusters = np.array(new_clusters)

        if np.all(clusters == new_clusters):
            break
        clusters = new_clusters
    return clusters

# Compute anchors from the training set: every box is reduced to [w, h],
# rescaled from its source image size to the TARGET_SIZE x TARGET_SIZE input.
# (The loop variable is named `entry`; the original called it `dict`, which
# shadows the builtin type.)
boxes = np.array([
    [
        bbox["w"] * (TARGET_SIZE / entry["image_data"]["width"]),
        bbox["h"] * (TARGET_SIZE / entry["image_data"]["height"]),
    ]
    for entry in filtered_official_formated
    for bbox in entry["bboxes"]
])

print(len(boxes))

anchors = kmeans(boxes, k=6)
print("Anchors:", anchors)
# Mean rescaled box width and height, as a rough reference for the anchors.
print(np.mean([boxess[0] for boxess in boxes]),np.mean([boxess[1] for boxess in boxes]))

In [None]:
# Smoke test: run the model on one training batch and print the loss.
# The tape records the forward pass, but no gradient is requested from it here.
with tf.GradientTape() as tape:
    for image_batch,label_batch in dataset.take(1):
        y_pred = model(image_batch, training=True)
        #print(np.array(y_pred).shape)
        loss_value = yolo_multihead_loss(label_batch, y_pred)
        tf.print("Loss:", loss_value)

In [None]:
# Save the full model after every epoch; the epoch number is formatted into
# the file name.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="checkpoints/epoch_{epoch:02d}.h5",  # Save every epoch
    save_weights_only=False,  # Set True if you only want to save weights
    save_best_only=False,     # Save every epoch, not just the best
    verbose=1                 # Print when saving
)

# steps_per_epoch is given explicitly because the training dataset repeats
# without a count.
# NOTE(review): initial_epoch=3 presumably resumes an earlier run — confirm
# that `model` was restored from the matching checkpoint before running this.
history = model.fit(dataset,validation_data=val,steps_per_epoch=500, epochs=40, callbacks=[checkpoint],initial_epoch=3)

In [None]:
# Printing the History object itself only shows its repr; the per-epoch
# metrics recorded by `model.fit` are in its `.history` dict.
print(history.history)

In [None]:
# Persist the trained model to a single HDF5 file in the working directory.
model.save("./best_without_metrics.h5")

In [None]:
# Count bounding boxes per category id over the original, zoom-cropped and
# flipped records, then list the ids that reach `limit` boxes.
# NOTE(review): `generated_official_formated` and `rotated_official_formated`
# are created by cells further down the notebook, so this cell does not run
# on a fresh kernel in top-to-bottom order — confirm the intended order.
frequency = 60 * [0]  # one slot per category id; 60 slots are allocated
categories = {}
morethanlimit = []
limit = 500

# Loop variables are deliberately not called `dict`: at module level that
# would rebind the builtin for every later cell.
for entry in official_data_formated + generated_official_formated + rotated_official_formated:
    for bbox in entry["bboxes"]:
        frequency[bbox["category"]]+=1

for i in range(len(frequency)):
    if limit <= frequency[i]:
        morethanlimit.append(i)
print(frequency)
print(morethanlimit)

# Category name -> number of boxes, used by the bar plot below.
for category_info in official_data["categories"]:
    categories[category_info["name"]] = frequency[category_info["id"]]


# Process data:
# - Replace spaces with \n to make each word appear on a new line
# - Sort by frequency (ascending)
sorted_items = sorted(categories.items(), key=lambda item: item[1])
labels = [label.replace(' ', '\n') for label, _ in sorted_items]
frequencies = [freq for _, freq in sorted_items]

# Create spaced x locations
x = np.arange(len(labels)) * 2  # Multiply to increase spacing between bars

# Bar plot
plt.figure(figsize=(60, 10))
plt.bar(x, frequencies, width=1.0, color='skyblue', label='Frequency')

# Trend line (metaplot): none is drawn in this cell; only the bars are plotted.

# Ticks & labels
plt.xticks(x, labels)
plt.xlabel('Instance')
plt.ylabel('Frequency')
plt.title('Frequency of Instances with Metaplot')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Display the category-name -> box-count mapping.
categories

In [None]:
# Augmentation plan:
#   - rotate 90
#   - rotate 180
#   - rotate 270
#   - zoom in on the object (keeping some background to the left/right/top/bottom, without leaving the photo)
#   - at least 70% visible on the right: if possible without leaving the photo, randomly expand/shrink the view around that point; if under 70% is visible, move the view towards the object without leaving the photo
#   - at least 70% visible on the left: if possible without leaving the photo, randomly expand/shrink the view around that point; if under 70% is visible, move the view towards the object without leaving the photo
#   - at least 70% visible at the top: if possible without leaving the photo, randomly expand/shrink the view around that point; if under 70% is visible, move the view towards the object without leaving the photo
#   - at least 70% visible at the bottom: if possible without leaving the photo, randomly expand/shrink the view around that point; if under 70% is visible, move the view towards the object without leaving the photo
#   - mixup, preferably applied after the zoom

In [None]:
# Build a lookup from category id to category name (used to label drawn boxes).
print(official_data["categories"])
name_categories = {
}

for category in official_data["categories"]:
    name_categories[category["id"]] = category["name"]

# Bare expression: displays the mapping as the cell output.
name_categories

In [None]:
# Map every category id listed in `morethanlimit` to a dense index
# 0..len(morethanlimit)-1, following the order of that list.
print(len(morethanlimit))
Reassigned_classes = {}
for i, original_id in enumerate(morethanlimit):
    Reassigned_classes[original_id] = i

print(Reassigned_classes)

In [None]:
import albumentations as A
import cv2

BOX_COLOR = (255, 0, 0)  # Red
TEXT_COLOR = (255, 255, 255)  # White


def visualize_bbox(img, bbox, class_name, color=BOX_COLOR, thickness=2):
    """Draw one bounding box and its class label onto `img`.

    Args:
        img: image array passed to the OpenCV drawing calls.
        bbox: `(x_min, y_min, w, h)`; values are truncated to int for drawing.
        class_name: text written in a filled label just above the box.
        color: colour of the box outline and of the label background.
        thickness: outline thickness passed to `cv2.rectangle`.

    Returns:
        The `img` object that was passed in.
    """
    x_min, y_min, w, h = bbox
    x_min, x_max, y_min, y_max = int(x_min), int(x_min + w), int(y_min), int(y_min + h)

    cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color=color, thickness=thickness)

    ((text_width, text_height), _) = cv2.getTextSize(class_name, cv2.FONT_HERSHEY_SIMPLEX, 0.35, 1)
    # The label background follows `color` so that a caller-supplied colour
    # applies to the whole annotation (it was hard-wired to BOX_COLOR before).
    cv2.rectangle(img, (x_min, y_min - int(1.3 * text_height)), (x_min + text_width, y_min), color, -1)
    cv2.putText(
        img,
        text=class_name,
        org=(x_min, y_min - int(0.3 * text_height)),
        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
        fontScale=0.35,
        color=TEXT_COLOR,
        lineType=cv2.LINE_AA,
    )
    return img


def visualize(image, bboxes, category_ids, category_id_to_name):
    """Show a copy of `image` with every box and its class name drawn on it.

    Args:
        image: image object providing `.copy()`; the input is not drawn on.
        bboxes: boxes in the `(x_min, y_min, w, h)` layout of `visualize_bbox`.
        category_ids: one id per box, paired with `bboxes` by position.
        category_id_to_name: mapping from category id to display name.
    """
    canvas = image.copy()
    for bbox, category_id in zip(bboxes, category_ids):
        canvas = visualize_bbox(canvas, bbox, category_id_to_name[category_id])

    plt.figure(figsize=(12, 12))
    plt.axis("off")
    plt.imshow(canvas)


def flip_all_3_directions(dict):
    """Show an image with its boxes, then its three flipped variants.

    The variants are a vertical flip, a horizontal flip, and both flips
    combined; each is produced by albumentations with COCO-format boxes and
    displayed with `visualize`.

    Args:
        dict: one formatted record with `image_data["link"]` and a `bboxes`
            list of `{"x", "y", "w", "h", "category"}` entries. The parameter
            name is kept for existing callers although it shadows the builtin.

    Raises:
        FileNotFoundError: if OpenCV cannot read the image file.
    """
    record = dict
    name_image = extract_id_from_url_with_Extension(record["image_data"]["link"])

    image_path = f"./images/official/{name_image}"
    image = cv2.imread(image_path, cv2.IMREAD_COLOR_RGB)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    bboxes = [[bbox["x"], bbox["y"], bbox["w"], bbox["h"]] for bbox in record["bboxes"]]
    categories = [bbox["category"] for bbox in record["bboxes"]]

    flip_variants = (
        [A.VerticalFlip(p=1)],
        [A.HorizontalFlip(p=1)],
        [A.HorizontalFlip(p=1), A.VerticalFlip(p=1)],
    )

    # Original first, then the variants in the order listed above.
    visualize(image, bboxes, categories, name_categories)
    for flip_ops in flip_variants:
        transform = A.Compose(
            flip_ops,
            bbox_params=A.BboxParams(format="coco", label_fields=["category_ids"]),
            strict=True,
            seed=137,
        )
        transformed = transform(image=image, bboxes=bboxes, category_ids=categories)
        visualize(
            transformed["image"],
            transformed["bboxes"],
            transformed["category_ids"],
            name_categories,
        )

flip_all_3_directions(official_data_formated[3])
        

In [None]:
# Inspect the bounding boxes of one specific record (index 362).
print(official_data_formated[362]["bboxes"])

In [None]:
#partial element show
from PIL import Image
import albumentations as A

# Per-category box count that the crop generation below works towards.
TargetOfEveryCategory = 800
# Running number of generated crops; also used to name the saved files.
numNewPhotos = 0
# Working copy of the per-category counts; generate_element_pairs increments
# it in place as crops are accepted.
copyfreq = frequency.copy()

def overlap_of_box_in_sorrounding(sorrounding, box):
    """Return the fraction of `box`'s area that lies inside `sorrounding`.

    Both arguments are `[x, y, w, h]` (COCO layout). The result is
    intersection area divided by the area of `box`, or 0.0 when `box` has no
    positive area.
    """
    bx, by, bw, bh = box
    sx, sy, sw, sh = sorrounding

    box_area = bw * bh
    if not box_area > 0:
        return 0.0

    # Width/height of the intersection rectangle, floored at zero when the
    # two boxes do not overlap along that axis.
    overlap_w = max(0, min(bx + bw, sx + sw) - max(bx, sx))
    overlap_h = max(0, min(by + bh, sy + sh) - max(by, sy))

    return (overlap_w * overlap_h) / box_area

def Shift_in_Multiple_Directions(SorroundingBox,data,chosenPair,LeastBBox = 0.71,MaxExcludedBBox = 0.3):
    """Slide a crop window diagonally until a selected box is partly cut off.

    For each of four diagonal directions, a copy of `SorroundingBox` is moved
    one pixel at a time (only its x/y origin changes; width and height are
    never modified). A step is taken only if the moved window stays inside
    the image and no unselected box is covered by more than
    `MaxExcludedBBox`. The walk for a direction stops when no step is possible
    or when some selected box is covered by less than `LeastBBox`.

    Args:
        SorroundingBox: starting window as a mutable `[x, y, w, h]` list.
        data: record with `image_data["width"/"height"]` and a `bboxes` list
            of `{"x", "y", "w", "h", ...}` entries.
        chosenPair: sequence parallel to `data["bboxes"]`; 1 marks a selected
            box, 0 an unselected one.
        LeastBBox: coverage threshold for selected boxes.
        MaxExcludedBBox: maximum allowed coverage of unselected boxes.

    Returns:
        List of `[x, y, w, h]` windows, one per direction whose final window
        leaves at least one selected box covered by less than `LeastBBox`;
        directions that got stuck before reaching that point contribute
        nothing.
    """

    def CanExpandInthatDirection(ChangedBox):
        # The window must lie fully inside the image ...
        if data["image_data"]["width"] < ChangedBox[0] + ChangedBox[2] or ChangedBox[0] < 0:
            return False
        if data["image_data"]["height"] < ChangedBox[1] + ChangedBox[3] or ChangedBox[1] < 0:
            return False
        
        # ... and must not cover too much of any unselected box.
        for i in range(len(chosenPair)):
            if chosenPair[i] == 1:
                continue
            if overlap_of_box_in_sorrounding(ChangedBox,[data["bboxes"][i]["x"],data["bboxes"][i]["y"],data["bboxes"][i]["w"],data["bboxes"][i]["h"]]) > MaxExcludedBBox:
                return False
            
        return True
    
    def GotBelowLeastBBox(ChangedBox):
        # True as soon as one selected box is covered by less than LeastBBox.
        for i in range(len(chosenPair)):
            if chosenPair[i] == 0:
                continue
            if overlap_of_box_in_sorrounding(ChangedBox,[data["bboxes"][i]["x"],data["bboxes"][i]["y"],data["bboxes"][i]["w"],data["bboxes"][i]["h"]]) < LeastBBox:
                return True
            
        return False

    # Only the four diagonal directions are active; the axis-aligned ones are
    # disabled.
    combinations = [
        [-1, -1],
        #[-1,  0],
        [-1,  1],
        #[ 0, -1],
        #[ 0,  1],
        [ 1, -1],
        #[ 1,  0],
        [ 1,  1],
    ]  # [dx, dy] step applied to the window origin

    ValidSorroundingBBoxes = []
    for combination in combinations:
        stillExpanding = True
        CopyOfSorroundingBox = SorroundingBox.copy()
        while stillExpanding and not GotBelowLeastBBox(CopyOfSorroundingBox):
            stillExpanding = False
            # Try the x step first; commit it only if the moved window is valid.
            copy = CopyOfSorroundingBox.copy()
            copy[0]+=combination[0]
            if CanExpandInthatDirection(copy) and combination[0] != 0:
                CopyOfSorroundingBox[0]+=combination[0]
                stillExpanding = True
            # Then the y step, evaluated on top of any committed x step.
            copy = CopyOfSorroundingBox.copy()
            copy[1]+=combination[1]
            if CanExpandInthatDirection(copy) and combination[1] != 0:
                CopyOfSorroundingBox[1]+=combination[1]
                stillExpanding = True

        # The walk ended because no step was possible, not because a selected
        # box dropped below LeastBBox: discard this direction.
        if not GotBelowLeastBBox(CopyOfSorroundingBox):
            continue

        ValidSorroundingBBoxes.append(CopyOfSorroundingBox)

    return ValidSorroundingBBoxes



def crop_and_normalize(image, surrounding_bbox, bboxes):
    """Crop `image` to `surrounding_bbox` and re-express boxes in crop coordinates.

    Each box is translated by the crop origin and clipped to the crop: a box
    starting left of / above the crop is shrunk and moved to 0, and its
    width/height are limited to the remaining space. The input box dicts are
    not modified; copies are returned.

    Args:
        image: PIL image, or NumPy array (converted with `Image.fromarray`).
        surrounding_bbox: crop window `[x, y, w, h]` in COCO layout.
        bboxes: list of dicts with keys `"x"`, `"y"`, `"w"`, `"h"` (any other
            keys, e.g. `"category"`, are carried over unchanged).

    Returns:
        (cropped_image, instance) where `instance` is
        `{"image_data": {"link", "width", "height"}, "bboxes": [...]}`.
        `"link"` is `None`; the caller is expected to fill it in once the crop
        has been saved. (It previously held the PIL image object, which is
        not JSON-serialisable.)
    """
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    sx, sy, sw, sh = surrounding_bbox
    cropped_image = image.crop((sx, sy, sx + sw, sy + sh))
    width, height = cropped_image.size

    normalized_bboxes = []
    for source_bbox in bboxes:
        bbox = source_bbox.copy()
        bbox["x"] -= sx
        bbox["y"] -= sy
        if bbox["x"] < 0:
            bbox["w"] += bbox["x"]  # shrink width (x is negative)
            bbox["x"] = 0   # clamp x
        if bbox["y"] < 0:
            bbox["h"] += bbox["y"]  # shrink height (y is negative)
            bbox["y"] = 0   # clamp y
        # Clip the far edges to the crop as well.
        bbox["w"] = min(bbox["w"], width - bbox["x"])
        bbox["h"] = min(bbox["h"], height - bbox["y"])
        normalized_bboxes.append(bbox)

    normalizedInstance = {
        "image_data":{
            "link":None,
            "width":width,
            "height":height,
        },
        "bboxes":normalized_bboxes
    }

    return cropped_image, normalizedInstance

def valid_pair(data,chosenPair):
    """Build candidate crop windows for the boxes selected by `chosenPair`.

    The selected boxes are enclosed in one box, which is padded and grown
    towards 416 px per side inside the image, and then handed to
    `Shift_in_Multiple_Directions`.

    Args:
        data: record with `image_data["width"/"height"]` and a `bboxes` list
            of `{"x", "y", "w", "h", "category"}` entries.
        chosenPair: sequence parallel to `data["bboxes"]`; 1 selects a box.

    Returns:
        `False` when the selection is rejected (nothing selected, a selected
        category already exceeds `TargetOfEveryCategory`, or the window covers
        more than 30% of an unselected box); otherwise the list returned by
        `Shift_in_Multiple_Directions`.
    """

    def grow_interval(low, high, limit, min_size):
        """Widen [low, high] one unit at a time until it spans `min_size`
        or fills [0, limit]; bounds are clamped so fractional coordinates
        can neither leave the image nor miss the stop condition."""
        while (high - low) < min_size:
            if low <= 0 and high >= limit:
                break  # already spans the whole image along this axis
            if low > 0:
                low = max(0, low - 1)
            if (high - low) < min_size and high < limit:
                high = min(limit, high + 1)
        return low, high

    def expand_bbox(bbox, img_width, img_height, initial_pad=100, min_size=416):
        x, y, w, h = bbox

        # Initial corners with padding, clipped to the image
        x1 = max(0, x - initial_pad)
        y1 = max(0, y - initial_pad)
        x2 = min(img_width, x + w + initial_pad)
        y2 = min(img_height, y + h + initial_pad)

        x1, x2 = grow_interval(x1, x2, img_width, min_size)
        y1, y2 = grow_interval(y1, y2, img_height, min_size)

        # Final COCO format [x, y, w, h]
        return [x1, y1, x2 - x1, y2 - y1]

    def getSorroundingBBox(boxes):
        # Smallest [x, y, w, h] box enclosing all given [x, y, w, h] boxes.
        x_min_enclosing = min(box[0] for box in boxes)
        y_min_enclosing = min(box[1] for box in boxes)
        x_max_enclosing = max(box[0] + box[2] for box in boxes)
        y_max_enclosing = max(box[1] + box[3] for box in boxes)

        return [
            x_min_enclosing,
            y_min_enclosing,
            x_max_enclosing - x_min_enclosing,
            y_max_enclosing - y_min_enclosing
        ]

    all_selected__bboxes = []

    for i in range(len(chosenPair)):
        if chosenPair[i] == 0:
            continue
        if copyfreq[data["bboxes"][i]["category"]] > TargetOfEveryCategory:
            return False
        all_selected__bboxes.append([data["bboxes"][i]["x"],data["bboxes"][i]["y"],data["bboxes"][i]["w"],data["bboxes"][i]["h"]])

    if not all_selected__bboxes:
        # Nothing selected: there is no enclosing box to build.
        return False

    SorroundingBox = getSorroundingBBox(all_selected__bboxes)
    SorroundingBox = expand_bbox(SorroundingBox,data["image_data"]["width"],data["image_data"]["height"])

    for i in range(len(chosenPair)):
        if chosenPair[i] == 1:
            continue
        if overlap_of_box_in_sorrounding(SorroundingBox,[data["bboxes"][i]["x"],data["bboxes"][i]["y"],data["bboxes"][i]["w"],data["bboxes"][i]["h"]]) > 0.3:
            return False

    # The window can still be smaller than 416x416 when the image itself is.
    return Shift_in_Multiple_Directions(SorroundingBox,data,chosenPair)

# Counter of consecutive rejected box subsets; reset and updated inside
# generate_element_pairs.
tries = 0

# Records for every generated crop; written to JSON at the end of this cell.
generated_official_formated = []

def generate_element_pairs(data):
    """Generate zoomed crops of one image for subsets of its bounding boxes.

    The subsets are enumerated with a binary counter `sol`; the boxes selected
    for a crop are the zeros of `sol` (`flipped = 1 - sol`). The counter
    starts at `...01`, so the subset containing every box is never tried and
    an image with a single box yields nothing. Boxes whose category already
    reached `TargetOfEveryCategory` are forced out of every selection.

    For each accepted subset, every window returned by `valid_pair` is
    cropped, saved as `./images/official/zommed_<n>.png`, and appended to
    `generated_official_formated`; `copyfreq` is incremented in place.

    Args:
        data: record with `image_data["link"]` and a `bboxes` list.
    """
    # Declared up front: a `global` statement that follows a use of the name
    # in the same function is a SyntaxError. `copyfreq` is only mutated in
    # place, so it needs no declaration.
    global tries, numNewPhotos

    if not data["bboxes"]:
        return  # nothing to select from

    # Built separately: reusing double quotes inside a double-quoted f-string
    # only parses on Python 3.12+.
    image_name = extract_id_from_url_with_Extension(data["image_data"]["link"])
    image = Image.open(f"images/official/{image_name}")
    image = ImageOps.exif_transpose(image)  # Correct orientation

    sol = np.array(len(data["bboxes"]) * [0])
    sol[len(sol)-1] = 1
    tries = 0
    # `tries` counts consecutive rejected subsets and caps the search.
    while not np.all(sol == 1) and tries < 1e6:
        tries +=1

        # Exclude boxes whose category is already saturated.
        for i in range(len(sol)):
            if copyfreq[data["bboxes"][i]["category"]] >= TargetOfEveryCategory:
                sol[i] = 1

        flipped = 1 - sol

        if np.all(flipped == 0):
            break

        # valid_pair returns False on rejection, otherwise a list of windows.
        result = valid_pair(data, flipped)
        if not isinstance(result, bool):
            tries = 0
            chosenBBoxes = []
            for i in range(len(flipped)):
                if flipped[i] == 1:
                    chosenBBoxes.append(data["bboxes"][i])
                    # One new instance of this category per generated window.
                    copyfreq[data["bboxes"][i]["category"]] += len(result)

            for Sourround in result:
                numNewPhotos+=1
                cropped_image,normalizedInstance = crop_and_normalize(image,Sourround,chosenBBoxes)
                cropped_image = cropped_image.convert("RGB")

                normalizedInstance["image_data"]["link"] = f"generated/zommed_{numNewPhotos}.png"
                cropped_image.save(f"./images/official/zommed_{numNewPhotos}.png")
                generated_official_formated.append(normalizedInstance)

        # Advance the binary counter: clear trailing ones, set the next bit.
        ind = len(sol)-1
        while sol[ind] == 1:
            sol[ind] = 0
            ind-=1
        sol[ind] = 1

#print(len(official_data_formated[326]["bboxes"]))
# Generate crops for every original record, printing a running record count
# as progress.
current = 0
for data in official_data_formated:
    current+=1
    print(current)
    generate_element_pairs(data)
#print(numNewPhotos)
#print(copyfreq)

# Persist the generated records so the crops can be reused without
# regenerating them.
import json
with open("generated_official_data.json", "w") as f:
    json.dump(generated_official_formated, f, indent=4)

In [None]:
# Display the number of generated crops.
numNewPhotos

In [None]:
# Snapshot the per-category counts after crop generation; the flip stage
# below works on this copy.
print(copyfreq)
a_second_freqcopy = copyfreq.copy()
print(a_second_freqcopy)

In [None]:
from PIL import Image
import albumentations as A

# Running number of flipped images; also used to name the saved files.
numNewRotatedPhotos = 0
# Per-category counts that the flip stage increments in place.
a_second_freqcopy = copyfreq.copy()
# A record is flipped only if none of its categories would exceed this count.
MaximumTargetToAllCategories = 1000

# Vertical flip, with COCO-format boxes and labels in `category_ids`.
transform1 = A.Compose(
[A.VerticalFlip(p=1)],
bbox_params=A.BboxParams(format="coco", label_fields=["category_ids"]),
strict=True,
seed=137,
)

# Horizontal flip.
transform2 = A.Compose(
[A.HorizontalFlip(p=1)],
bbox_params=A.BboxParams(format="coco", label_fields=["category_ids"]),
strict=True,
seed=137,
)
# Horizontal followed by vertical flip.
transform3 = A.Compose(
[A.HorizontalFlip(p=1),A.VerticalFlip(p=1)],
bbox_params=A.BboxParams(format="coco", label_fields=["category_ids"]),
strict=True,
seed=137,
)

# Records for every flipped image; written to JSON at the end of this cell.
rotated_official_formated = []

def verify_frequency(data):
    """Check whether adding three more copies of `data` stays within budget.

    Returns False if, for any category present in `data["bboxes"]`, the
    current count in `a_second_freqcopy` plus three times the number of its
    boxes in this record would exceed `MaximumTargetToAllCategories`;
    otherwise True.
    """
    boxes_per_category = {}
    for bbox in data["bboxes"]:
        category_id = bbox["category"]
        if category_id in boxes_per_category:
            boxes_per_category[category_id] += 1
        else:
            boxes_per_category[category_id] = 1

    # Each accepted record produces three flipped variants.
    return not any(
        a_second_freqcopy[category_id] + (n_boxes * 3) > MaximumTargetToAllCategories
        for category_id, n_boxes in boxes_per_category.items()
    )

def make_formated(image,bboxes,catgories):
    """Save one transformed image and return its record in the dataset format.

    Side effects: increments `a_second_freqcopy` once per category entry,
    prints three debug lines, and writes the image to
    `./images/official/rotated_<numNewRotatedPhotos>.png`.

    Args:
        image: PIL image, or NumPy array (converted with `Image.fromarray`).
        bboxes: sequence of boxes indexed as `[x, y, w, h]`.
        catgories: per-box category values; each is converted with `int`.

    Returns:
        `{"image_data": {"link", "width", "height"}, "bboxes": [...]}`.
    """
    global numNewRotatedPhotos
    global a_second_freqcopy
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    # Update the per-category totals and remember the value after each update.
    running_totals = []
    for raw_category in catgories:
        category_id = int(raw_category)
        a_second_freqcopy[category_id] += 1
        running_totals.append(a_second_freqcopy[category_id])
    print(running_totals)
    print(catgories)
    print(len(bboxes))

    width, height = image.size
    image.save(f"./images/official/rotated_{numNewRotatedPhotos}.png")

    box_records = []
    for index in range(len(bboxes)):
        current = bboxes[index]
        box_records.append({
            "x":current[0],
            "y":current[1],
            "w":current[2],
            "h":current[3],
            "category":int(catgories[index]),
        })

    image_info = {
        "link":f"generated/rotated_{numNewRotatedPhotos}.png",
        "width":width,
        "height":height,
    }
    return {"image_data":image_info, "bboxes":box_records}

def rotate_in_all_directions(data):#formated
    """Create the three flipped variants of one record and store them.

    Despite the name, the module-level transforms applied here are flips
    (`transform1`, `transform2`, `transform3`). Each variant is saved and
    turned into a record by `make_formated`, then appended to
    `rotated_official_formated`; `numNewRotatedPhotos` is incremented once
    per variant, before it is saved.

    Args:
        data: formatted record with `image_data["link"]` and a `bboxes` list
            of `{"x", "y", "w", "h", "category"}` entries.
    """
    global numNewRotatedPhotos

    # Built separately: reusing double quotes inside a double-quoted f-string
    # only parses on Python 3.12+.
    image_name = extract_id_from_url_with_Extension(data["image_data"]["link"])
    image = Image.open(f"images/official/{image_name}")
    image = ImageOps.exif_transpose(image)  # Correct orientation
    image = image.convert("RGB")
    np_cropped_image = np.array(image).astype(np.uint8)

    bboxes = [[bbox["x"],bbox["y"],bbox["w"],bbox["h"]] for bbox in data["bboxes"]]
    categories = [bbox["category"] for bbox in data["bboxes"]]

    for transform in (transform1, transform2, transform3):
        numNewRotatedPhotos +=1
        transformed = transform(image = np_cropped_image, bboxes = bboxes, category_ids = categories)
        rotated_official_formated.append(
            make_formated(transformed["image"],transformed["bboxes"],transformed["category_ids"])
        )

# Flip every original and generated record whose categories still have room
# (see verify_frequency); print a per-record accept/skip line.
current = 0
print(a_second_freqcopy)
for data in official_data_formated + generated_official_formated:
    current+=1
    if verify_frequency(data):
        rotate_in_all_directions(data)
        print(current,"accept")
    else:
        print(current,"skipped")

# Persist the flipped records.
import json
with open("rotated_official_data.json", "w") as f:
    json.dump(rotated_official_formated, f, indent=4)

In [None]:
# Display the number of flipped images that were generated.
numNewRotatedPhotos

In [None]:
# Category name -> box count after the crop and flip stages.
categories = {}
print(numNewPhotos)

# The loop variable is deliberately not called `dict`: at module level that
# would rebind the builtin for every later cell.
for category_info in official_data["categories"]:
    categories[category_info["name"]] = a_second_freqcopy[category_info["id"]]


# Process data:
# - Replace spaces with \n to make each word appear on a new line
# - Sort by frequency (ascending)
sorted_items = sorted(categories.items(), key=lambda item: item[1])
labels = [label.replace(' ', '\n') for label, _ in sorted_items]
frequencies = [freq for _, freq in sorted_items]

# Create spaced x locations
x = np.arange(len(labels)) * 2  # Multiply to increase spacing between bars

# Bar plot
plt.figure(figsize=(60, 10))
plt.bar(x, frequencies, width=1.0, color='skyblue', label='Frequency')

# Trend line (metaplot): none is drawn in this cell; only the bars are plotted.

# Ticks & labels
plt.xticks(x, labels)
plt.xlabel('Instance')
plt.ylabel('Frequency')
plt.title('Frequency of Instances with Metaplot')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Display the number of training records.
len(filtered_official_formated)