In [24]:
import torchvision.models as models
import torch.nn as nn
from keras.applications import VGG16
from keras.layers import Conv2D
from keras.layers import Flatten, Dense, Reshape
from keras.models import Model
import opendatasets as od
from keras.optimizers import Adam
import ast
import cv2
import pandas as pd
import numpy as np
import os

In [None]:
# Diagnostic cell: print every compute device TensorFlow reports for this
# machine, to check what hardware the kernel can see.
# NOTE(review): `tensorflow.python.client` is an internal module path; the
# public equivalent is presumably `tf.config.list_physical_devices()` - confirm
# before relying on this across TensorFlow versions.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14720793218906469071
xla_global_id: -1
]


**1. Loading the pre-trained backbone: VGG16**
**2. Freezing layers, except the top ones**


In [25]:


def object_detection_cnn():
    """Build the frozen VGG16 feature extractor plus one detection convolution.

    VGG16 is loaded with ImageNet weights, without its classifier head and
    without global pooling, for 480x640x3 inputs. The whole backbone is marked
    non-trainable, and a single 30-filter, 9x14, ReLU convolution is stacked on
    the output of the backbone's last layer.

    Returns:
        A keras ``Model`` mapping the VGG16 input tensor to the output of the
        added convolution.
    """
    backbone = VGG16(
        weights="imagenet",
        include_top=False,
        pooling=None,
        input_shape=(480, 640, 3),
    )
    # Freeze every backbone layer; only layers added on top remain trainable.
    backbone.trainable = False

    # NOTE(review): the 9x14 kernel presumably reduces the backbone feature map
    # to a 7x7 grid for this input size - confirm with model.summary().
    detection_conv = Conv2D(filters=30, kernel_size=(9, 14), activation="relu")
    features = detection_conv(backbone.layers[-1].output)

    return Model(inputs=backbone.input, outputs=features)

**3. Adding heads (flatten ---> dense ---> reshape)**

In [26]:
def build_yolo_model(S=7, B=2, C=20):
    """Attach the YOLO-style prediction head to the convolutional base.

    The base model's output is flattened, passed through a 4096-unit ReLU dense
    layer, then through a sigmoid dense layer with ``S * S * (B * 5 + C)``
    units, and finally reshaped to ``(S, S, B * 5 + C)``.

    Args:
        S: number of grid cells per image side.
        B: number of box slots predicted per cell (5 values each).
        C: number of class scores predicted per cell.

    Returns:
        A keras ``Model`` from the base model's input to the reshaped grid
        output. The model is returned uncompiled.

    Raises:
        ValueError: if ``S``, ``B`` or ``C`` is smaller than 1.
    """
    # Validate first so a bad configuration fails before the (slow) backbone
    # download/build is triggered.
    if min(S, B, C) < 1:
        raise ValueError(f"S, B and C must all be >= 1, got S={S}, B={B}, C={C}")

    base_model = object_detection_cnn()
    cell_depth = B * 5 + C  # per-cell vector: B boxes of (x, y, w, h, conf) + C classes

    x = base_model.output
    x = Flatten()(x)
    x = Dense(4096, activation='relu')(x)

    # Sigmoid keeps every output in [0, 1], matching the normalised targets.
    x = Dense(S * S * cell_depth, activation='sigmoid')(x)
    output = Reshape((S, S, cell_depth))(x)

    return Model(inputs=base_model.input, outputs=output)

# Build the detector once at notebook level so later cells can refer to it as
# `yolo_cnn`, then print its layer-by-layer summary.
yolo_cnn = build_yolo_model()
yolo_cnn.summary()

**4. Implementing MSE loss**
 

In [27]:
from keras.optimizers import Adam

# Compile the notebook-level model: Adam optimizer constructed with 1e-4 and
# plain mean-squared error applied to the whole output tensor.
yolo_cnn.compile(optimizer=Adam(1e-4), loss='mse')


In [28]:
import ast

def safe_eval(value):
    """Parse ``value`` as a Python literal if it is a string.

    Strings are handed to ``ast.literal_eval``; anything else (for example a
    list that was already parsed) is returned unchanged.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


In [29]:
def parse_row(row, img_height=480, img_width=640, S=7, B=2, C=20):
    """Load one annotated image and encode its boxes as a grid target.

    Args:
        row: mapping (e.g. a DataFrame row) with keys ``'img_path'``,
            ``'img_gt_bbox_coords'`` (list of ``[xmin, ymin, xmax, ymax]``) and
            ``'img_gt_class_labels'`` (list of integer class indices). The two
            list fields may also be given as their string representation.
        img_height: height the image is resized to.
        img_width: width the image is resized to.
        S: number of grid cells per image side.
        B: number of box slots per cell; only slot 0 is filled here.
        C: number of classes.

    Returns:
        ``(img, target)`` where ``img`` is the resized image as float32 scaled
        by 1/255 and ``target`` is a float32 array of shape
        ``(S, S, B * 5 + C)``. Returns ``(None, None)`` when the image path
        does not exist or the image cannot be read.

    Raises:
        ValueError: if a class label is outside ``[0, C)``.
    """
    img_path = row['img_path']
    bboxes = safe_eval(row['img_gt_bbox_coords'])
    labels = safe_eval(row['img_gt_class_labels'])
    if not os.path.exists(img_path):
        print(f"Image path does not exist: {img_path}")
        return None, None
    img = cv2.imread(img_path)
    if img is None:
        print(f"Failed to load image: {img_path}")
        return None, None

    # Boxes are normalised by the size of the image on disk, not by the resize
    # target; otherwise every image that is not already img_width x img_height
    # gets shifted and mis-scaled targets.
    # NOTE(review): assumes the annotations are pixel coordinates in the
    # original image, as the VOC XML reader in this notebook produces - confirm.
    src_height, src_width = img.shape[:2]
    img = cv2.resize(img, (img_width, img_height)).astype(np.float32) / 255.0

    # Cell layout: B box slots of (x, y, w, h, conf) followed by C class scores.
    class_offset = B * 5
    target = np.zeros((S, S, class_offset + C), dtype=np.float32)

    for bbox, label in zip(bboxes, labels):
        xmin, ymin, xmax, ymax = bbox

        x_center = (xmin + xmax) / 2 / src_width
        y_center = (ymin + ymax) / 2 / src_height
        w = (xmax - xmin) / src_width
        h = (ymax - ymin) / src_height

        # Ignore boxes whose centre lies outside the image.
        if not (0 <= x_center <= 1 and 0 <= y_center <= 1):
            continue
        if not 0 <= label < C:
            raise ValueError(f"Class label {label} is outside [0, {C}) for image: {img_path}")

        # A centre exactly on the right/bottom edge belongs to the last cell.
        grid_x = min(int(x_center * S), S - 1)
        grid_y = min(int(y_center * S), S - 1)

        target[grid_y, grid_x, 0:5] = [x_center, y_center, w, h, 1]
        # Class scores start after all B box slots, which is where
        # decode_predictions reads them from.
        target[grid_y, grid_x, class_offset + label] = 1

    return img, target



In [30]:
def data_generator(df, batch_size=8, img_height=480, img_width=640, S=7):
    """Yield ``(images, targets)`` batches forever, sampling rows at random.

    NOTE: a later cell in this notebook defines ``data_generator`` again; when
    the notebook is run top to bottom that later definition replaces this one.

    Args:
        df: DataFrame whose rows are accepted by ``parse_row``.
        batch_size: number of samples per yielded batch.
        img_height: forwarded to ``parse_row``.
        img_width: forwarded to ``parse_row``.
        S: forwarded to ``parse_row``.

    Yields:
        Tuple of two numpy arrays stacking ``batch_size`` images and targets.

    Raises:
        ValueError: if no row at all could be parsed while assembling a batch.
    """
    max_attempts = 10  # sampling rounds allowed per batch
    while True:
        batch_imgs = []
        batch_targets = []
        attempts = 0
        while len(batch_imgs) < batch_size and attempts < max_attempts:
            # Count every sampling round, successful or not, so unreadable
            # data cannot keep this loop alive forever.
            attempts += 1
            # Never request more rows than the frame holds (sample would raise).
            sample_df = df.sample(n=min(len(df), batch_size))
            for _, row in sample_df.iterrows():
                img, target = parse_row(row, img_height, img_width, S)
                if img is None or target is None:
                    continue
                batch_imgs.append(img)
                batch_targets.append(target)
                if len(batch_imgs) == batch_size:
                    break

        # Check the empty case first: nothing parsed in any round means the
        # data itself is unusable, so retrying would never end.
        if len(batch_imgs) == 0:
            raise ValueError("No valid data found for this batch. Check your image paths or annotations.")
        if len(batch_imgs) < batch_size:
            print(f"❌ Could not collect enough valid samples. Got {len(batch_imgs)}. Skipping batch.")
            continue

        yield np.array(batch_imgs), np.array(batch_targets)



In [31]:
def data_generator(df, batch_size=8, img_height=480, img_width=640, S=7):
    """Yield ``(images, targets)`` batches forever, sampling rows at random.

    Each batch is assembled from up to ``2 * batch_size`` randomly sampled rows
    so that a few unreadable rows do not force a retry. Incomplete batches are
    skipped.

    Args:
        df: DataFrame whose rows are accepted by ``parse_row``.
        batch_size: number of samples per yielded batch.
        img_height: forwarded to ``parse_row``.
        img_width: forwarded to ``parse_row``.
        S: forwarded to ``parse_row``.

    Yields:
        Tuple of two numpy arrays stacking ``batch_size`` images and targets.

    Raises:
        ValueError: if a full batch could not be assembled in several
            consecutive tries (for example the frame has fewer usable rows
            than ``batch_size``).
    """
    max_failed_batches = 10
    failed_batches = 0  # consecutive incomplete batches

    while True:
        batch_imgs = []
        batch_targets = []

        # Sample more than needed to reduce failed attempts
        candidate_df = df.sample(n=min(len(df), batch_size * 2))

        for _, row in candidate_df.iterrows():
            img, target = parse_row(row, img_height, img_width, S)
            if img is not None and target is not None:
                batch_imgs.append(img)
                batch_targets.append(target)
            else:
                print("[data_generator] ⚠️ Skipped invalid sample.")

            if len(batch_imgs) == batch_size:
                break

        if len(batch_imgs) < batch_size:
            failed_batches += 1
            print(f"[data_generator] ❌ Only collected {len(batch_imgs)} samples. Skipping batch.")
            # Without this bound a frame that can never fill a batch would
            # make the consumer (e.g. model.fit) hang forever.
            if failed_batches >= max_failed_batches:
                raise ValueError(
                    f"Could not assemble a batch of {batch_size} valid samples in "
                    f"{max_failed_batches} consecutive attempts. "
                    "Check your image paths or annotations."
                )
            continue  # Skip incomplete batch

        failed_batches = 0
        yield np.array(batch_imgs), np.array(batch_targets)


In [32]:
'''import pathlib
def train_test_df(imgs_base_path,annotations_base_path):

    img_complete_paths = list()
    img_class_labels = list()
    img_gt_bbox_coords = list()

    for single_img_complete_path in pathlib.Path(imgs_base_path).glob("*"):

        img_path = str(single_img_complete_path)
        img_label_path = os.path.join(annotations_base_path,str(single_img_complete_path).split("/")[-1].split(".")[0]+".xml")

        class_gt_labels_list = list()
        gt_bbox_coords_list = list()

        tree = ET.parse(img_label_path)
        root = tree.getroot()

        for member in root.findall("object"):
            
            class_gt_labels_list.append(member.find("name").text)
            xmin = float(member.find("bndbox/xmin").text)
            ymin = float(member.find("bndbox/ymin").text)
            xmax = float(member.find("bndbox/xmax").text)
            ymax = float(member.find("bndbox/ymax").text)
            
            #bbox_width = xmax - xmin
            #bbox_height = ymax - ymin
            
            gt_bbox_coords_list.append([xmin,ymin,xmax,ymax])

        img_complete_paths.append(str(single_img_complete_path))
        img_class_labels.append(class_gt_labels_list)
        img_gt_bbox_coords.append(gt_bbox_coords_list)

    return pd.DataFrame(data={"img_path":img_complete_paths,
                              "img_gt_class_labels":img_class_labels,
                              "img_gt_bbox_coords":img_gt_bbox_coords})'''

'import pathlib\ndef train_test_df(imgs_base_path,annotations_base_path):\n\n    img_complete_paths = list()\n    img_class_labels = list()\n    img_gt_bbox_coords = list()\n\n    for single_img_complete_path in pathlib.Path(imgs_base_path).glob("*"):\n\n        img_path = str(single_img_complete_path)\n        img_label_path = os.path.join(annotations_base_path,str(single_img_complete_path).split("/")[-1].split(".")[0]+".xml")\n\n        class_gt_labels_list = list()\n        gt_bbox_coords_list = list()\n\n        tree = ET.parse(img_label_path)\n        root = tree.getroot()\n\n        for member in root.findall("object"):\n            \n            class_gt_labels_list.append(member.find("name").text)\n            xmin = float(member.find("bndbox/xmin").text)\n            ymin = float(member.find("bndbox/ymin").text)\n            xmax = float(member.find("bndbox/xmax").text)\n            ymax = float(member.find("bndbox/ymax").text)\n            \n            #bbox_width = xmax - x

In [33]:
#data_df = train_test_df("VOCdevkit/VOC2012/JPEGImages","VOCdevkit/VOC2012/Annotations")

In [34]:
'''training_data = pd.read_csv("training_data.csv")
training_data["img_gt_class_labels"] = training_data["img_gt_class_labels"].apply(ast.literal_eval)
training_data["img_gt_bbox_coords"] = training_data["img_gt_bbox_coords"].apply(ast.literal_eval)'''

'training_data = pd.read_csv("training_data.csv")\ntraining_data["img_gt_class_labels"] = training_data["img_gt_class_labels"].apply(ast.literal_eval)\ntraining_data["img_gt_bbox_coords"] = training_data["img_gt_bbox_coords"].apply(ast.literal_eval)'

In [35]:
'''import os

file_path = training_data.iloc[0, 0]
print(f"Checking: {file_path}")
print("Exists:", os.path.exists(file_path))'''

'import os\n\nfile_path = training_data.iloc[0, 0]\nprint(f"Checking: {file_path}")\nprint("Exists:", os.path.exists(file_path))'

In [36]:
#plt.imread(training_data.iloc[0,0]).shape

In [37]:
#data_df.head()

In [38]:
# %% [markdown]
# **6. Training Pipeline**

'''def train_yolo_model():
    # Read training and validation CSV files
    train_csv = '/Users/twinkle/object_detection_model/training_data.csv'  # path to your training data CSV
    cv_csv = '/Users/twinkle/object_detection_model/cv_data.csv'           # path to your cross-validation CSV
    BASE_IMG_DIR = '/Users/twinkle/object_detection_model/VOCdevkit/VOC2012/JPEGImages'

    train_df = pd.read_csv(train_csv)
    train_df['img_path'] = train_df['img_path'].apply(lambda x: os.path.join(BASE_IMG_DIR, os.path.basename(x)))
    cv_df = pd.read_csv(cv_csv)

    # Create data generators
    train_gen = data_generator(train_df)
    val_gen = data_generator(cv_df)

    # Train the model
    yolo_cnn.fit(
        train_gen,
        steps_per_epoch=len(train_df) // 8,  # BATCH_SIZE
        validation_data=val_gen,
        validation_steps=len(cv_df) // 8,
        epochs=20
    )

    # Save the trained model
    yolo_cnn.save("yolo_model.h5")
    print("Model saved as yolo_model.h5")

# Run the training function
if __name__ == "__main__":
    train_yolo_model()'''

'def train_yolo_model():\n    # Read training and validation CSV files\n    train_csv = \'/Users/twinkle/object_detection_model/training_data.csv\'  # path to your training data CSV\n    cv_csv = \'/Users/twinkle/object_detection_model/cv_data.csv\'           # path to your cross-validation CSV\n    BASE_IMG_DIR = \'/Users/twinkle/object_detection_model/VOCdevkit/VOC2012/JPEGImages\'\n\n    train_df = pd.read_csv(train_csv)\n    train_df[\'img_path\'] = train_df[\'img_path\'].apply(lambda x: os.path.join(BASE_IMG_DIR, os.path.basename(x)))\n    cv_df = pd.read_csv(cv_csv)\n\n    # Create data generators\n    train_gen = data_generator(train_df)\n    val_gen = data_generator(cv_df)\n\n    # Train the model\n    yolo_cnn.fit(\n        train_gen,\n        steps_per_epoch=len(train_df) // 8,  # BATCH_SIZE\n        validation_data=val_gen,\n        validation_steps=len(cv_df) // 8,\n        epochs=20\n    )\n\n    # Save the trained model\n    yolo_cnn.save("yolo_model.h5")\n    print("Mode

In [39]:
'''import cv2
import matplotlib.pyplot as plt
im=train_df = pd.read_csv('/Users/twinkle/object_detection_model/training_data.csv')
plt.imread(im.iloc[0,0]).shape'''


"import cv2\nimport matplotlib.pyplot as plt\nim=train_df = pd.read_csv('/Users/twinkle/object_detection_model/training_data.csv')\nplt.imread(im.iloc[0,0]).shape"

In [None]:
def train_yolo_model():
    """Build, train and save the detector.

    Reads ``training_data.csv`` and ``cv_data.csv`` from the working directory,
    rewrites each ``img_path`` to point into ``VOCdevkit/VOC2012/JPEGImages``
    (keeping only the file name), trains a freshly built model for 20 epochs
    from the batch generators, and saves it as ``yolo_model.h5``.
    """
    batch_size = 8
    train_csv = 'training_data.csv'
    val_csv = 'cv_data.csv'
    BASE_IMG_DIR = 'VOCdevkit/VOC2012/JPEGImages'

    def _load_split(csv_path):
        """Read one annotation CSV and normalise its paths and list columns."""
        split_df = pd.read_csv(csv_path)
        split_df['img_path'] = split_df['img_path'].apply(
            lambda x: os.path.join(BASE_IMG_DIR, os.path.basename(x))
        )
        # Convert string representations of lists back to actual lists
        for column in ("img_gt_class_labels", "img_gt_bbox_coords"):
            split_df[column] = split_df[column].apply(ast.literal_eval)
        return split_df

    # Load and prepare training/validation data
    train_df = _load_split(train_csv)
    val_df = _load_split(val_csv)

    # Train the model built here. Previously this function built and compiled
    # a local model but then fitted and saved the notebook-level `yolo_cnn`,
    # so the fresh model was never used.
    model = build_yolo_model()
    model.compile(optimizer=Adam(1e-4), loss='mse')

    # Create data generators
    train_gen = data_generator(train_df, batch_size=batch_size)
    val_gen = data_generator(val_df, batch_size=batch_size)

    # Fit the model; max(1, ...) keeps at least one step for tiny datasets.
    model.fit(
        train_gen,
        steps_per_epoch=max(1, len(train_df) // batch_size),
        validation_data=val_gen,
        validation_steps=max(1, len(val_df) // batch_size),
        epochs=20,
    )

    # Save the trained model
    model.save("yolo_model.h5")
    print("Model saved as yolo_model.h5")


In [41]:
from keras.models import load_model
import matplotlib.pyplot as plt

def decode_predictions(pred, S=7, B=2, C=20, conf_thresh=0.2, img_width=640, img_height=480):
    """Turn one grid prediction into a list of pixel-space boxes.

    Args:
        pred: array indexed as ``pred[row, col]`` giving, per cell, ``B`` box
            slots of ``(x, y, w, h, conf)`` followed by ``C`` class scores.
            Box values are fractions of the full image.
        S: number of grid cells per side.
        B: number of box slots per cell.
        C: number of class scores per cell.
        conf_thresh: boxes with confidence not above this value are dropped.
        img_width: width of the pixel frame the boxes are scaled to.
        img_height: height of the pixel frame the boxes are scaled to.

    Returns:
        List of ``((xmin, ymin, xmax, ymax), class_id, score)`` tuples with
        integer pixel corners; ``score`` is the box confidence multiplied by
        the best class score of the cell.
    """
    boxes = []
    for i in range(S):
        for j in range(S):
            cell = pred[i, j]
            # The class scores are shared by all box slots of a cell, so look
            # them up once per cell rather than once per box.
            class_probs = cell[B * 5:B * 5 + C]
            class_id = np.argmax(class_probs)
            best_class_prob = class_probs[class_id]

            for b in range(B):
                x, y, w, h, conf = cell[b * 5:(b + 1) * 5]
                if conf <= conf_thresh:
                    continue

                # Scale the normalised centre/size to pixels.
                x_px = x * img_width
                y_px = y * img_height
                w_px = w * img_width
                h_px = h * img_height

                xmin = int(x_px - w_px / 2)
                ymin = int(y_px - h_px / 2)
                xmax = int(x_px + w_px / 2)
                ymax = int(y_px + h_px / 2)
                boxes.append(((xmin, ymin, xmax, ymax), class_id, conf * best_class_prob))
    return boxes

def visualize_predictions(model_path, test_df_path, num_images=5):
    """Draw the model's detections on the first few images of a CSV.

    Args:
        model_path: path of a saved keras model, loaded with ``load_model``.
        test_df_path: CSV with an ``img_path`` column plus the
            ``img_gt_class_labels`` / ``img_gt_bbox_coords`` list columns.
        num_images: how many leading rows to visualise (capped at the number
            of rows in the CSV).
    """
    model = load_model(model_path)
    df = pd.read_csv(test_df_path)
    # Ground truth is parsed but not drawn yet; kept so a malformed CSV still
    # fails here rather than silently.
    df["img_gt_class_labels"] = df["img_gt_class_labels"].apply(ast.literal_eval)
    df["img_gt_bbox_coords"] = df["img_gt_bbox_coords"].apply(ast.literal_eval)

    # Frame the network sees; decode_predictions returns boxes in this frame.
    net_width, net_height = 640, 480

    for idx in range(min(num_images, len(df))):
        row = df.iloc[idx]
        img = cv2.imread(row["img_path"])
        if img is None:
            # cv2.imread signals failure with None; skip instead of crashing
            # inside cv2.resize.
            print(f"Failed to load image: {row['img_path']}")
            continue

        img_input = cv2.resize(img, (net_width, net_height)).astype(np.float32) / 255.0
        pred = model.predict(np.expand_dims(img_input, 0))[0]
        detections = decode_predictions(pred)

        # Boxes are drawn on the original (un-resized) image, so map them from
        # the network frame back to the image's own pixel frame.
        src_height, src_width = img.shape[:2]
        scale_x = src_width / net_width
        scale_y = src_height / net_height

        for (xmin, ymin, xmax, ymax), class_id, score in detections:
            left = int(round(xmin * scale_x))
            top = int(round(ymin * scale_y))
            right = int(round(xmax * scale_x))
            bottom = int(round(ymax * scale_y))
            cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0), 2)
            cv2.putText(img, f"Class {class_id}: {score:.2f}", (left, top - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

        plt.figure(figsize=(10, 6))
        # OpenCV loads images as BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.axis("off")
        plt.title("Prediction")
        plt.show()


In [42]:
# Entry point: train the detector, then show predictions from the saved model
# on the cross-validation CSV. The guard keeps this from running if the code
# is imported as a module instead of executed directly.
if __name__ == "__main__":
    train_yolo_model()
    visualize_predictions("yolo_model.h5", "cv_data.csv")



Epoch 1/20
[1m1781/1875[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m11:06[0m 7s/step - loss: 0.0083

KeyboardInterrupt: 

In [None]:
# Smoke-test the generator on a single batch.
# `train_df` is only a local variable inside train_yolo_model(), so it is
# loaded here explicitly; otherwise this cell raises NameError on a fresh
# kernel. The paths match the ones used by train_yolo_model().
train_df = pd.read_csv('training_data.csv')
train_df['img_path'] = train_df['img_path'].apply(
    lambda p: os.path.join('VOCdevkit/VOC2012/JPEGImages', os.path.basename(p))
)

gen = data_generator(train_df, batch_size=8)
x, y = next(gen)
print(f"x.shape = {x.shape}, y.shape = {y.shape}")
