## Mount the Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


🚀 Step 9: Resize Images to 640x640 for YOLOv8

📌 Why do we do this?
	•	YOLOv8 expects all images to be 640x640 pixels.
	•	Resizing ensures consistency across the dataset.
	•	Faster and more efficient training.


In [3]:
from PIL import Image
import os
import multiprocessing

# Paths
processed_images_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_Subset/images"
resized_images_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_Subset/resized_images"
os.makedirs(resized_images_dir, exist_ok=True)

# Resize function
def resize_image(filename):
    try:
        img_path = os.path.join(processed_images_dir, filename)
        resized_img_path = os.path.join(resized_images_dir, filename)

        if not os.path.exists(resized_img_path):  # Skip already resized images
            img = Image.open(img_path).convert("RGB")
            img = img.resize((640, 640))
            img.save(resized_img_path)
            return f"✅ Resized: {filename}"
    except Exception as e:
        return f"❌ Error resizing {filename}: {e}"

# Get all image filenames
image_filenames = os.listdir(processed_images_dir)

# Use multiprocessing for faster execution
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
    results = pool.map(resize_image, image_filenames)

# Print results
for result in results:
    if result:
        print(result)

print(f"✅ All images resized and saved in {resized_images_dir}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
✅ Resized: 000000027789.jpg
✅ Resized: 000000140490.jpg
✅ Resized: 000000094666.jpg
✅ Resized: 000000318785.jpg
✅ Resized: 000000500816.jpg
✅ Resized: 000000539189.jpg
✅ Resized: 000000381433.jpg
✅ Resized: 000000119404.jpg
✅ Resized: 000000369446.jpg
✅ Resized: 000000287774.jpg
✅ Resized: 000000487796.jpg
✅ Resized: 000000532625.jpg
✅ Resized: 000000016525.jpg
✅ Resized: 000000031745.jpg
✅ Resized: 000000567268.jpg
✅ Resized: 000000218026.jpg
✅ Resized: 000000255443.jpg
✅ Resized: 000000504054.jpg
✅ Resized: 000000347467.jpg
✅ Resized: 000000490878.jpg
✅ Resized: 000000573513.jpg
✅ Resized: 000000015816.jpg
✅ Resized: 000000073801.jpg
✅ Resized: 000000142726.jpg
✅ Resized: 000000209989.jpg
✅ Resized: 000000539719.jpg
✅ Resized: 000000203345.jpg
✅ Resized: 000000328740.jpg
✅ Resized: 000000154008.jpg
✅ Resized: 000000535113.jpg
✅ Resized: 000000503844.jpg
✅ Resized: 000000498082.jpg
✅ Resized: 000000074462.jpg
✅ Resized: 

In [4]:
import os

# Define the path to the images
image_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_Subset/resized_images"

# Count the number of images
num_images = len([f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))])

# Print the result
print(f"There are {num_images} images in the directory.")

There are 81370 images in the directory.


### 🚀 Step 11: Convert COCO Annotations to YOLO Format

📌 What this script does:
✔ Reads the filtered annotation file (filtered_instances_train2017.json).
✔ Converts each COCO annotation into YOLO format (normalizing bounding boxes).
✔ Creates .txt label files in the correct folder (train/labels/).

In [2]:
import json
import os

# Paths
coco_annotations_path = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_Subset/filtered_instances_train2017.json"
yolo_labels_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO/train/labels"
os.makedirs(yolo_labels_dir, exist_ok=True)

# Load COCO annotations
with open(coco_annotations_path, "r") as f:
    coco_data = json.load(f)

# Create category mapping (COCO class IDs → YOLO class IDs)
category_mapping = {cat["id"]: i for i, cat in enumerate(coco_data["categories"])}

# Convert COCO to YOLO format
for ann in coco_data["annotations"]:
    img_id = ann["image_id"]
    category_id = category_mapping[ann["category_id"]]

    # Get bounding box
    x, y, width, height = ann["bbox"]
    x_center = x + (width / 2)
    y_center = y + (height / 2)

    # Normalize by image size (assuming all images are 640x640)
    x_center /= 640
    y_center /= 640
    width /= 640
    height /= 640

    # Save to YOLO format
    label_path = os.path.join(yolo_labels_dir, f"{img_id}.txt")
    with open(label_path, "a") as f:
        f.write(f"{category_id} {x_center} {y_center} {width} {height}\n")

print("✅ COCO annotations converted to YOLO format!")

✅ COCO annotations converted to YOLO format!


In [3]:
import os

# Define the directory path
directory_path = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO/train/labels"

# Count the number of .txt files
txt_file_count = len([f for f in os.listdir(directory_path) if f.endswith('.txt')])

# Print the result
print(f"There are {txt_file_count} .txt files in the directory.")

There are 81372 .txt files in the directory.


In [4]:
import os

def check_image_types(directory):
  """Checks the types of images in a directory.

  Args:
    directory: The path to the directory containing the images.

  Returns:
    A dictionary mapping image types to the number of images of that type.
  """

  image_types = {}
  for filename in os.listdir(directory):
    if filename.lower().endswith(('.jpg', '.jpeg')):
      image_type = os.path.splitext(filename)[1].lower()
      image_types[image_type] = image_types.get(image_type, 0) + 1
  return image_types

# Call the function with the directory path
image_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_Subset/resized_images"
image_type_counts = check_image_types(image_dir)

# Print the results
for image_type, count in image_type_counts.items():
  print(f"There are {count} images of type {image_type} in the directory.")

There are 81370 images of type .jpg in the directory.


🚀 Final Step 13: Copy Images + Labels into Train/Val Structure

This script will:
	•	✅ Create a new folder COCO_YOLO_FINAL
	•	✅ Randomly split the dataset (80% train / 20% val)
	•	✅ Copy images from resized_images/
	•	✅ Copy matching labels from train/labels/

In [5]:
import os
import random
import shutil

# ✅ Source paths
resized_images_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_Subset/resized_images"
labels_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO/train/labels"

# ✅ Target base path
yolo_final_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL"

# ✅ Train / Val target folders
train_images_dir = os.path.join(yolo_final_dir, "train/images")
train_labels_dir = os.path.join(yolo_final_dir, "train/labels")
val_images_dir = os.path.join(yolo_final_dir, "val/images")
val_labels_dir = os.path.join(yolo_final_dir, "val/labels")

# ✅ Create necessary folders
for d in [train_images_dir, train_labels_dir, val_images_dir, val_labels_dir]:
    os.makedirs(d, exist_ok=True)

# ✅ Get all image filenames
all_images = [f for f in os.listdir(resized_images_dir) if f.endswith(".jpg")]

# ✅ Shuffle and split (80% train / 20% val)
random.seed(42)
random.shuffle(all_images)
split_idx = int(len(all_images) * 0.8)
train_files = all_images[:split_idx]
val_files = all_images[split_idx:]

# ✅ Function to copy image + label
def copy_files(file_list, src_img_dir, src_lbl_dir, dst_img_dir, dst_lbl_dir):
    for file in file_list:
        # Copy image
        src_img = os.path.join(src_img_dir, file)
        dst_img = os.path.join(dst_img_dir, file)
        shutil.copy(src_img, dst_img)

        # Copy label
        label_file = file.replace(".jpg", ".txt")
        src_lbl = os.path.join(src_lbl_dir, label_file)
        dst_lbl = os.path.join(dst_lbl_dir, label_file)
        if os.path.exists(src_lbl):
            shutil.copy(src_lbl, dst_lbl)

# ✅ Copy training files
copy_files(train_files, resized_images_dir, labels_dir, train_images_dir, train_labels_dir)

# ✅ Copy validation files
copy_files(val_files, resized_images_dir, labels_dir, val_images_dir, val_labels_dir)

print(f"✅ Copy completed! {len(train_files)} train images, {len(val_files)} val images")
print(f"📂 Dataset is ready at: {yolo_final_dir}")

✅ Copy completed! 65096 train images, 16274 val images
📂 Dataset is ready at: /content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL


In [4]:
import os

def count_files(directory):
  """Counts the number of files in a directory.

  Args:
    directory: The path to the directory.

  Returns:
    The number of files in the directory.
  """

  file_count = len([f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))])
  return file_count

def check_file_types(directory):
  """Checks the types of files in a directory.

  Args:
    directory: The path to the directory.

  Returns:
    A dictionary mapping file types to the number of files of that type.
  """

  file_types = {}
  for filename in os.listdir(directory):
    file_type = os.path.splitext(filename)[1].lower()
    file_types[file_type] = file_types.get(file_type, 0) + 1
  return file_types

# Specify the paths
train_images_path = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL/train/images"
val_images_path = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL/val/images"

# Count files and check types
train_file_count = count_files(train_images_path)
print(f"There are {train_file_count} files in the train/images directory.")

val_file_count = count_files(val_images_path)
print(f"There are {val_file_count} files in the val/images directory.")

train_file_types = check_file_types(train_images_path)
print("File types in train/images:")
for file_type, count in train_file_types.items():
  print(f"- {file_type}: {count}")

val_file_types = check_file_types(val_images_path)
print("File types in val/images:")
for file_type, count in val_file_types.items():
  print(f"- {file_type}: {count}")

There are 65096 files in the train/images directory.
There are 16274 files in the val/images directory.
File types in train/images:
- .jpg: 65096
File types in val/images:
- .jpg: 16274


In [5]:
import os

def analyze_directory(directory_path):
  """Lists files, counts them, and identifies file types in a directory.

  Args:
    directory_path: The path to the directory.

  Returns:
    A tuple containing:
      - A list of file names in the directory.
      - The total number of files in the directory.
      - A dictionary mapping file types to their counts.
  """

  file_list = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
  file_count = len(file_list)
  file_types = {}
  for filename in file_list:
    file_type = os.path.splitext(filename)[1].lower()
    file_types[file_type] = file_types.get(file_type, 0) + 1
  return file_list, file_count, file_types

# Specify the directory paths
train_labels_path = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL/train/labels"
val_labels_path = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL/val/labels"

# Analyze the train/labels directory
train_files, train_count, train_types = analyze_directory(train_labels_path)
print(f"Train/labels directory analysis:")
# print(f"  Files: {train_files}")  # Uncomment to print the file list
print(f"  Total files: {train_count}")
print(f"  File types: {train_types}")

# Analyze the val/labels directory
val_files, val_count, val_types = analyze_directory(val_labels_path)
print(f"\nVal/labels directory analysis:")
# print(f"  Files: {val_files}")  # Uncomment to print the file list
print(f"  Total files: {val_count}")
print(f"  File types: {val_types}")

Train/labels directory analysis:
  Total files: 0
  File types: {}

Val/labels directory analysis:
  Total files: 0
  File types: {}


### 🔁 Corrected Copy Function (With Zero Stripping)

In [6]:
import os
import shutil

# Paths
labels_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO/train/labels"
train_images_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL/train/images"
train_labels_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL/train/labels"
val_images_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL/val/images"
val_labels_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL/val/labels"

# ✅ Strip leading zeros from image names to match .txt file names
def copy_labels_strip_zeros(image_dir, src_lbl_dir, dest_lbl_dir):
    copied = 0
    missing = []

    os.makedirs(dest_lbl_dir, exist_ok=True)

    for img_file in os.listdir(image_dir):
        if not img_file.endswith(".jpg"):
            continue
        img_id = os.path.splitext(img_file)[0]             # '000000000073'
        stripped_id = str(int(img_id))                     # '73'
        label_file = stripped_id + ".txt"

        src_lbl = os.path.join(src_lbl_dir, label_file)
        dst_lbl = os.path.join(dest_lbl_dir, label_file)

        if os.path.exists(src_lbl):
            shutil.copy(src_lbl, dst_lbl)
            copied += 1
        else:
            missing.append(label_file)

    return copied, missing

# 🔁 Copy labels (only)
train_copied, train_missing = copy_labels_strip_zeros(train_images_dir, labels_dir, train_labels_dir)
val_copied, val_missing = copy_labels_strip_zeros(val_images_dir, labels_dir, val_labels_dir)

# ✅ Summary
print(f"✅ Train labels copied: {train_copied}")
print(f"✅ Val labels copied: {val_copied}")

if train_missing or val_missing:
    print(f"⚠️ Missing train: {len(train_missing)} | val: {len(val_missing)}")
    print("Example missing train labels:", train_missing[:3])
    print("Example missing val labels:", val_missing[:3])
else:
    print("🎉 All labels copied successfully and matched with images!")

✅ Train labels copied: 65094
✅ Val labels copied: 16274
⚠️ Missing train: 2 | val: 0
Example missing train labels: ['220226.txt', '388693.txt']
Example missing val labels: []


### 🗑️ Delete Unlabeled Images (Missing Labels)

In [7]:
import os

# Paths
train_images_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL/train/images"
missing_ids = ['220226', '388693']  # Without padding

deleted = 0
for img_id in missing_ids:
    filename = str(img_id).zfill(12) + ".jpg"
    filepath = os.path.join(train_images_dir, filename)
    if os.path.exists(filepath):
        os.remove(filepath)
        print(f"🗑️ Deleted: {filename}")
        deleted += 1
    else:
        print(f"⚠️ File not found: {filename}")

print(f"✅ {deleted} images deleted successfully.")

🗑️ Deleted: 000000220226.jpg
🗑️ Deleted: 000000388693.jpg
✅ 2 images deleted successfully.


🔁 Let’s Rename the .txt Labels to Match the Zero-Padded Filenames

This ensures YOLOv8 finds the label for each image during training.

In [8]:
import os

# Directories
train_labels_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL/train/labels"
val_labels_dir = "/content/drive/MyDrive/Colab Notebooks/FYP_Model_NEW/datasets/processed/COCO_YOLO_FINAL/val/labels"

# Padding width (COCO-style)
PAD_WIDTH = 12

def pad_txt_filenames(label_dir):
    renamed = 0
    for fname in os.listdir(label_dir):
        if fname.endswith(".txt"):
            name, ext = os.path.splitext(fname)
            if len(name) < PAD_WIDTH:
                padded_name = name.zfill(PAD_WIDTH) + ext
                src = os.path.join(label_dir, fname)
                dst = os.path.join(label_dir, padded_name)
                if not os.path.exists(dst):
                    os.rename(src, dst)
                    renamed += 1
    return renamed

# Apply renaming
train_renamed = pad_txt_filenames(train_labels_dir)
val_renamed = pad_txt_filenames(val_labels_dir)

print(f"✅ Renamed {train_renamed} train labels and {val_renamed} val labels to match padded image filenames.")

✅ Renamed 65094 train labels and 16274 val labels to match padded image filenames.


In [9]:
# Check how many image-label pairs exist
images = os.listdir(train_images_dir)
labels = os.listdir(train_labels_dir)
print(len(images), "images")
print(len(labels), "labels")

65094 images
65094 labels
