# Organisation notebook
This notebook is for all of the things involved in the image classification that are not the actual model training/validation/inference:
- classification into day/night images
- sorting datasets into balanced test/train/val sets 
- checking how many instances of classes are in a labels folder
- relabelling of images by replacing values in the label files
- Deleting duplicate images from training/val datasets to prevent data leakage

In [None]:
import shutil
import cv2
from ultralytics import YOLO
import os
import pandas as pd
import time
from glob import glob
import re
import numpy as np
from tqdm import tqdm
from skmultilearn.model_selection import iterative_train_test_split

## Sort images into day/night
- This classifies images in any directory into day/night images based on hue
- It misses some images (<10 per 1000) so some manual checking is needed

Specifically for labelled images

In [None]:
# Source folders: the labelled validation images and their YOLO label files
image_source_directory = "D:\\Wild deserts photos\\model_val\\all_images\\val\\images"
label_source_directory = "D:\\Wild deserts photos\\model_val\\all_images\\val\\labels"

# Target folders for images classified as day, and for their label files
day_directory_images = "D:\\Wild deserts photos\\model_val\\day_images\\val\\images"
day_directory_labels = "D:\\Wild deserts photos\\model_val\\day_images\\val\\labels"

# Target folders for images classified as night, and for their label files.
# The labels folder must be distinct from the images folder, otherwise the .txt
# files end up mixed in with the night images.
night_directory_images = "D:\\Wild deserts photos\\model_val\\night_images\\val\\images"
night_directory_labels = "D:\\Wild deserts photos\\model_val\\night_images\\val\\labels"

# Create the target directories if they do not exist
for target_directory in (
    day_directory_images,
    day_directory_labels,
    night_directory_images,
    night_directory_labels,
):
    os.makedirs(target_directory, exist_ok=True)

def classify_image(image_path, hue_threshold=10):
    """Classify an image as 'day' or 'night' from its mean hue.

    The image is read with OpenCV, converted from BGR to HSV, and the mean of
    the H channel is compared against ``hue_threshold``.

    Parameters
    ----------
    image_path : str
        Path of the image file to classify.
    hue_threshold : float, optional
        Mean hue above which the image is called 'day'. Defaults to 10;
        adjust as needed for your cameras.

    Returns
    -------
    str
        'day' if the mean hue is greater than ``hue_threshold``, else 'night'.

    Raises
    ------
    ValueError
        If the file cannot be read as an image.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None
        # instead of raising, so fail here with the offending path.
        raise ValueError(f"Could not read image: {image_path}")
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hue = hsv_image[:, :, 0].mean()  # H channel represents hue
    return 'day' if hue > hue_threshold else 'night'

# Classify every JPEG in the source folder and copy it, together with its
# label file, into the matching day/night target folders.
for filename in os.listdir(image_source_directory):
    if not filename.endswith((".JPG", ".jpg")):
        continue

    image_path = os.path.join(image_source_directory, filename)
    # Swap only the extension: str.replace would also rewrite ".jpg"/".JPG"
    # occurring earlier in the file name.
    label_name = os.path.splitext(filename)[0] + ".txt"
    label_path = os.path.join(label_source_directory, label_name)

    print(f"Classifying {image_path}")
    if classify_image(image_path) == 'day':
        target_images, target_labels = day_directory_images, day_directory_labels
    else:
        target_images, target_labels = night_directory_images, night_directory_labels

    shutil.copy(image_path, os.path.join(target_images, filename))
    if os.path.exists(label_path):
        shutil.copy(label_path, os.path.join(target_labels, label_name))
    else:
        # Report and carry on rather than aborting the whole run part-way through
        print(f"No label file found for {filename}; copied image only")

print("Classification and copying completed!")

For unlabelled images/just a directory of images

In [None]:
# Folder of (unlabelled) images to sort
image_source_directory = "D:\\Wild deserts photos\\Reconyx\\BeyondTheFence\\PCAM14"

# Output folders: one for images classified as day, one for night
day_directory_images = "D:\\Wild deserts photos\\Reconyx\\BeyondTheFence\\PCAM14\\day"
night_directory_images = "D:\\Wild deserts photos\\Reconyx\\BeyondTheFence\\PCAM14\\night"

# Make sure both output folders exist before any copying starts
for output_directory in (day_directory_images, night_directory_images):
    os.makedirs(output_directory, exist_ok=True)

def classify_image(image_path, hue_threshold=10):
    """Classify an image as 'day' or 'night' from its mean hue.

    The image is read with OpenCV, converted from BGR to HSV, and the mean of
    the H channel is compared against ``hue_threshold``.

    Parameters
    ----------
    image_path : str
        Path of the image file to classify.
    hue_threshold : float, optional
        Mean hue above which the image is called 'day'. Defaults to 10;
        adjust as needed for your cameras.

    Returns
    -------
    str
        'day' if the mean hue is greater than ``hue_threshold``, else 'night'.

    Raises
    ------
    ValueError
        If the file cannot be read as an image.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None
        # instead of raising, so fail here with the offending path.
        raise ValueError(f"Could not read image: {image_path}")
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hue = hsv_image[:, :, 0].mean()  # H channel represents hue
    return 'day' if hue > hue_threshold else 'night'

# Walk the source folder, classify each JPEG, and copy it into the day or
# night folder. Originals are left in place.
for image_name in os.listdir(image_source_directory):
    if not image_name.endswith((".JPG", ".jpg")):
        continue

    source_path = os.path.join(image_source_directory, image_name)
    print(source_path)

    is_day = classify_image(source_path) == 'day'
    destination = day_directory_images if is_day else night_directory_images
    shutil.copy(source_path, os.path.join(destination, image_name))

print("Classification and copying completed!")

## Reclassify labels and split into train/val sets
- This uses some basic text editing to replace values that are being reclassified
- Faster than reclassifying using X-Anylabel
- I also split images into new test/train/val sets after reclassifying.
    - I use iterative_train_test_split from scikit-multilearn to perform a stratified split, because the dataset is imbalanced
    


### Reclassification
- remember to change the to_remove and the species

In [None]:
# Class ids (first token of a YOLO label line) whose annotations are deleted.
CLASSES_TO_REMOVE = {"0", "2", "3", "4", "6", "7", "9", "10", "13", "15"}

# Old class id -> new class id. Ids in neither table are left unchanged.
CLASS_REMAP = {
    "14": "0", "12": "0", "8": "0", "5": "0",  # kangaroo
    "11": "2",                                  # rabbit
    # "2": "3",  # dingo (also take "2" out of CLASSES_TO_REMOVE)
    # "6": "4",  # fox   (also take "6" out of CLASSES_TO_REMOVE)
}


def relabel_line(line):
    """Apply the reclassification rules to one YOLO label line.

    Parameters
    ----------
    line : str
        One line of a label file, with or without its trailing newline.

    Returns
    -------
    str or None
        None if the line's class id is in CLASSES_TO_REMOVE; the line with its
        class id replaced if the id is in CLASS_REMAP; otherwise the line
        unchanged. Lines with no space after the first token are returned
        unchanged.
    """
    class_id, separator, remainder = line.partition(" ")
    if not separator:
        return line
    # Decide on the class id alone, so the last line of a file is handled
    # correctly even when it has no trailing newline.
    if class_id in CLASSES_TO_REMOVE:
        return None
    if class_id in CLASS_REMAP:
        return CLASS_REMAP[class_id] + separator + remainder
    return line


# Rewrite every label file in place
for file_path in glob('D:\\Wild deserts photos\\model_val\\three_class_final_21_12_2024\\val\\labels\\*.txt'):
    print(f'{file_path}')
    with open(file_path, 'r', encoding='utf-8') as file:
        relabelled = (relabel_line(line) for line in file)
        modified_lines = [line for line in relabelled if line is not None]
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(modified_lines)
    print(f'File written to: {file_path}')

print("Done!")

### Split the reclassified images
- remember to change the length of the zero label vector so that it matches the number of classes you have

In [None]:

# Path to your YOLO .txt annotations
annotations_path = "C:\\Users\\willo\\OneDrive - UNSW\\Documents\\Work\\CES\\Wild Deserts\\Image classification\\feedback_loop\\v1_day_160125\\labels\\*.txt"

# Number of classes in the label files !!! MAKE SURE TO CHANGE IF YOU HAVE MORE CLASSES !!!
NUM_CLASSES = 5

# List all .txt files
txt_files = glob(annotations_path)

X = []  # Image paths, one per annotation file (the "features" handed to the splitter)
y = []  # Multi-hot label vectors of shape (NUM_CLASSES,), one per annotation file

for txt_file in txt_files:
    # Construct the corresponding image path
    # (assumes .JPG files in a sibling "images" folder; adjust if yours differ)
    img_file = txt_file.replace(".txt", ".JPG")
    img_file = img_file.replace("labels", "images")
    labels = np.zeros(NUM_CLASSES, dtype=int)
    print(txt_file)
    # Read the YOLO annotation file
    with open(txt_file, "r") as f:
        for line in f:
            items = line.strip().split()
            if len(items) >= 5:
                class_id = int(items[0])
                if not 0 <= class_id < NUM_CLASSES:
                    # Fail with the file name instead of a bare IndexError
                    # (or a silent wrap-around for negative ids)
                    raise ValueError(
                        f"Class id {class_id} in {txt_file} is outside 0..{NUM_CLASSES - 1}; "
                        "update NUM_CLASSES or fix the label file"
                    )
                # Mark that class_id as present
                labels[class_id] = 1

    # Append to X and y
    X.append(img_file)
    y.append(labels)

# iterative_train_test_split expects 2-D inputs, so X becomes a single column
X = np.array(X).reshape(-1, 1)
y = np.array(y)
print("Data extracted into numpy arrays")

val_size = 0.2  # 20% of the data will be used for validation

# Perform the iterative train/val split.
# The function returns (X_train, y_train, X_test, y_test) in that order.
X_train, y_train, X_val, y_val = iterative_train_test_split(X, y, test_size=val_size)
# Flatten the single-column arrays back to 1-D arrays of paths
X_val = X_val.ravel()
X_train = X_train.ravel()
print("Proportions:")
print("  train:", X_train.shape, y_train.shape)
print("  val:     ", X_val.shape,    y_val.shape)

# Summarize the TRAIN data
train_counts = y_train.sum(axis=0)
print(f"TRAIN - counts per class: {train_counts}")
print(f"TRAIN - total samples: {len(y_train)}\n")

# Summarize the validation data (printed under the "TEST" heading)
val_counts = y_val.sum(axis=0)
print(f"TEST - counts per class: {val_counts}")
print(f"TEST - total samples: {len(y_val)}\n")



Move files into the correct folders

In [None]:
# Change these to your directories
labels_dir = "C:\\Users\\willo\\OneDrive - UNSW\\Documents\\Work\\CES\\Wild Deserts\\Image classification\\feedback_loop\\v1_day_160125\\labels"
images_dir = "C:\\Users\\willo\\OneDrive - UNSW\\Documents\\Work\\CES\\Wild Deserts\\Image classification\\feedback_loop\\v1_day_160125\\images"

# Give both the labels and the images directory a "val" and a "train" sub-folder
for directory in [labels_dir, images_dir]:
    for split_name in ("val", "train"):
        os.makedirs(os.path.join(directory, split_name), exist_ok=True)

# Copy each split's images and label files into its sub-folder (val first,
# then train). Files are copied, so the originals stay where they were.
for split_name, split_images, progress_label in (
    ("val", X_val, "Copying val data"),
    ("train", X_train, "Copying training data"),
):
    for img_path in tqdm(split_images, desc=progress_label):
        # Derive the label path from the image path
        txt_path = img_path.replace(".JPG", ".txt").replace("images", "labels")
        shutil.copy(img_path, os.path.join(images_dir, split_name))
        shutil.copy(txt_path, os.path.join(labels_dir, split_name))


## Delete duplicate images from two directories

In [None]:
import os

# Files whose names appear in dir2 are deleted from every folder under
# search_root. Matching is by file name only; file contents are not compared.
dir2 = "C:\\Users\\willo\\OneDrive - UNSW\\Documents\\Work\\CES\\Wild Deserts\\Image classification\\feedback_loop\\v1_night_16012025\\images\\val"
search_root = "D:\\Wild deserts photos\\Reconyx"

# dir2 is not modified by the walk, so list it once up front
images_dir2 = set(os.listdir(dir2))
dir2_resolved = os.path.normcase(os.path.abspath(dir2))

# Walk through all subdirectories in the root directory
for root, dirs, files in os.walk(search_root):
    # Safety net: never delete from dir2 itself if it lies under search_root
    if os.path.normcase(os.path.abspath(root)) == dir2_resolved:
        continue

    # Compare regular files only (os.walk's `files`), so a sub-directory that
    # shares a name with an image is never passed to os.remove
    shared_images = images_dir2.intersection(files)

    # Delete the shared images from the current subdirectory
    for image in shared_images:
        image_path = os.path.join(root, image)
        if os.path.exists(image_path):
            os.remove(image_path)
            print(f"Deleted {image_path}")


## Check distribution of labels in directory 
- Remember to update the `CLASSES` list and the folder path to match your dataset

In [None]:
import os

# YOLO class names; the position in this list is the class index used in the label files
CLASSES = [
    "Bird",
    "Cat",
    "Dingo",
    "Echidna",
    "Emu",
    "Euro",
    "Fox",
    "Goat",
    "Kangaroo",
    "Other",
    "Pig",
    "Rabbit",
    "Red Kangaroo",
    "Small mammal",
    "Western Grey Kangaroo",
    "empty",
]

# Folder holding the .txt annotation files to tally
folder_path = "C:\\Users\\willo\\OneDrive - UNSW\\Documents\\Work\\CES\\Wild Deserts\\Image classification\\training\\model_training_no_split\\14_classes_b_plus_empty\\labels"

# Start every class at zero so classes with no labels still show up in the output
label_counts = dict.fromkeys(CLASSES, 0)

# Tally one count per non-blank annotation line, keyed by class name
for annotation_name in os.listdir(folder_path):
    if not annotation_name.endswith(".txt"):
        continue

    with open(os.path.join(folder_path, annotation_name), "r") as annotation_file:
        for raw_line in annotation_file.readlines():
            stripped = raw_line.strip()
            if not stripped:
                continue
            # The first token on a YOLO label line is the class index
            class_name = CLASSES[int(stripped.split()[0])]
            label_counts[class_name] += 1

# Report the totals, one class per line
for class_name, total in label_counts.items():
    print(f"{class_name}: {total}")


Plot distribution

In [None]:
import matplotlib.pyplot as plt

# Create the figure before drawing so the requested size applies to the bar
# chart. Calling plt.figure() after plotting opens a second, empty figure.
fig, ax = plt.subplots(figsize=(10, 6))

positions = range(len(label_counts))
ax.bar(positions, list(label_counts.values()), align='center')

# One tick per class, labelled with the class name and rotated to avoid overlap
ax.set_xticks(positions)
ax.set_xticklabels(list(label_counts.keys()))
ax.tick_params(axis='x', labelrotation=90)

ax.set_xlabel("Class")
ax.set_ylabel("Number of labels")
ax.set_title("Label distribution")

fig.tight_layout()
plt.show()