Since edge detection catches too many side objects, we're trying image segmentation.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from tensorflow import keras
import tensorflow as tf
import tensorflow_datasets as tfds
import cv2
import PIL
from IPython.display import clear_output
import shutil 

### SPLITTING INTO TRAINING AND TEST SETS

In [4]:
# split images into training and testing 

directory = '../data/u-net-data/full'

print(os.listdir(directory))


['20250527212620.jpg', '20250527212624.jpg', '20250527212629.jpg', '20250527212634.jpg', '20250527212639.jpg', '20250527212643.jpg', '20250527212647.jpg', '20250527212651.jpg', '20250527212654.jpg', '20250527212658.jpg', '20250527212726.jpg', '20250527212731.jpg', '20250527212735.jpg', '20250527212739.jpg', '20250527212745.jpg', '20250527212749.jpg', '20250527212753.jpg', '20250527212756.jpg', '20250527212800.jpg', '20250527212803.jpg', '20250527212807.jpg', '20250527212814.jpg', '20250527212915.jpg', '20250527212918.jpg', '20250527212943.jpg', '20250527212947.jpg', '20250527212950.jpg', '20250527212953.jpg', '20250527212957.jpg', '20250527213001.jpg', '20250527213004.jpg', '20250527213008.jpg', '20250527213012.jpg', '20250527213017.jpg', '20250527213020.jpg', '20250527213024.jpg', '20250527213027.jpg', '20250527213048.jpg', '20250527213056.jpg', '20250527213100.jpg', '20250527213105.jpg', '20250527213108.jpg', '20250527213112.jpg', '20250527213116.jpg', '20250527213119.jpg', '20250527

In [None]:
source_folder = "C:\\Users\\tania\\PythonProjects\\keylife-ng-pv\\ml_project_2\\data\\u-net-data\\full"
train_folder = "C:\\Users\\tania\\PythonProjects\\keylife-ng-pv\\ml_project_2\\data\\u-net-data\\training"
test_folder = "C:\\Users\\tania\\PythonProjects\\keylife-ng-pv\\ml_project_2\\data\\u-net-data\\testing"


In [13]:
os.path.join(train_folder, '20250527213718.jpg')

'C:\\Users\\tania\\PythonProjects\\keylife-ng-pv\\ml_project_2\\data\\u-net-data\\training\\20250527213718.jpg'

In [None]:
import random

images = os.listdir(source_folder)
random.shuffle(images)


split_idx = int(len(images) * 0.8)  # 80% for training, 20% for testing

In [27]:
train_images = images[:split_idx]
test_images = images[split_idx:]

# Move images to respective folders
for img in train_images:
    # print(os.path.join(source_folder, img))
    shutil.move(os.path.join(source_folder, img), os.path.join(train_folder, img))


In [28]:

for img in test_images:
        # print(os.path.join(train_folder, img.name))
        shutil.move(os.path.join(source_folder, img), os.path.join(test_folder, img))

### LABELLING THE DATA FOR U-NET

In [None]:
import glob
from PIL import Image
from tqdm import tqdm
import sys

def convert_images_to_tif(training_dir, training_labels_dir, output_training_dir, output_labels_dir):
    """
    Convert PNG images to TIF format, but only process pairs where both the image and mask exist.
    
    Args:
        training_dir: Directory containing original training images
        training_labels_dir: Directory containing mask images
        output_training_dir: Directory to save converted training images
        output_labels_dir: Directory to save converted mask images
    """
    # Create output directories if they don't exist
    os.makedirs(output_training_dir, exist_ok=True)
    os.makedirs(output_labels_dir, exist_ok=True)
    
    # Get list of all mask files
    mask_files = glob.glob(os.path.join(training_labels_dir, '*_mask.png'))
    
    # Extract base filenames from masks (without _mask.png)
    mask_base_names = [os.path.basename(f).replace('_mask.png', '') for f in mask_files]
    
    print(f"Found {len(mask_files)} mask files in {training_labels_dir}")
    
    # Initialize counters
    processed_count = 0
    skipped_count = 0
    error_count = 0
    
    # Process each image that has a corresponding mask
    for base_name in tqdm(mask_base_names, desc="Converting image pairs"):
        # Define source paths
        source_image_path = os.path.join(training_dir, f"{base_name}.jpg")
        source_mask_path = os.path.join(training_labels_dir, f"{base_name}_mask.png")
        
        # Define destination paths
        dest_image_path = os.path.join(output_training_dir, f"{base_name}.tif")
        dest_mask_path = os.path.join(output_labels_dir, f"{base_name}_mask.tif")
        
        # Check if source image exists
        if not os.path.exists(source_image_path):
            print(f"Warning: Training image {source_image_path} not found, skipping pair")
            skipped_count += 1
            continue
        
        try:
            # Open and convert training image
            with Image.open(source_image_path) as img:
                # Save as TIFF with high quality
                img.save(dest_image_path, format='TIFF', compression='tiff_lzw')
            
            # Open and convert mask image
            with Image.open(source_mask_path) as mask:
                # Ensure mask is in binary/grayscale mode
                if mask.mode != 'L':
                    mask = mask.convert('L')
                # Save as TIFF with appropriate compression for binary images
                mask.save(dest_mask_path, format='TIFF', compression='tiff_lzw')
            
            processed_count += 1
            
        except Exception as e:
            print(f"Error processing {base_name}: {str(e)}")
            error_count += 1
    
    # Print summary
    total_training_images = len(glob.glob(os.path.join(training_dir, '*.png')))
    print("\nConversion Summary:")
    print(f"Total training images: {total_training_images}")
    print(f"Total mask images: {len(mask_files)}")
    print(f"Successfully converted pairs: {processed_count}")
    print(f"Skipped (no matching pair): {skipped_count}")
    print(f"Errors during conversion: {error_count}")
    print(f"Training images without masks: {total_training_images - len(mask_files)}")


def verify_conversions(output_training_dir, output_labels_dir):
    """
    Verify that the conversions were successful by checking file counts and opening a sample.
    """
    training_tifs = glob.glob(os.path.join(output_training_dir, '*.tif'))
    mask_tifs = glob.glob(os.path.join(output_labels_dir, '*.tif'))
    
    print("\nVerification:")
    print(f"TIF files in output training directory: {len(training_tifs)}")
    print(f"TIF files in output labels directory: {len(mask_tifs)}")
    
    if training_tifs and mask_tifs:
        # Try to open a sample image to verify
        try:
            sample_img = Image.open(training_tifs[0])
            sample_mask = Image.open(mask_tifs[0])
            print(f"Sample image size: {sample_img.size}, mode: {sample_img.mode}")
            print(f"Sample mask size: {sample_mask.size}, mode: {sample_mask.mode}")
            print("Successfully verified sample files can be opened")
        except Exception as e:
            print(f"Error verifying sample files: {str(e)}")


In [6]:

training_dir = "C:\\Users\\tania\\PythonProjects\\keylife-ng-pv\\ml_project_2\\data\\u-net-data\\training"
training_labels_dir = "C:\\Users\\tania\\PythonProjects\\keylife-ng-pv\\ml_project_2\\data\\u-net-data\\training-labels"
output_training_dir = "training-tif"
output_labels_dir = "training-labels-tif"

# Allow command-line overrides of directories
if len(sys.argv) > 4:
    training_dir = sys.argv[1]
    training_labels_dir = sys.argv[2]
    output_training_dir = sys.argv[3]
    output_labels_dir = sys.argv[4]

print(f"Source training directory: {training_dir}")
print(f"Source labels directory: {training_labels_dir}")
print(f"Output training directory: {output_training_dir}")
print(f"Output labels directory: {output_labels_dir}")

# Run the conversion
convert_images_to_tif(training_dir, training_labels_dir, output_training_dir, output_labels_dir)

# Verify the results
verify_conversions(output_training_dir, output_labels_dir)

Source training directory: C:\Users\tania\PythonProjects\keylife-ng-pv\ml_project_2\data\u-net-data\training
Source labels directory: C:\Users\tania\PythonProjects\keylife-ng-pv\ml_project_2\data\u-net-data\training-labels
Output training directory: training-tif
Output labels directory: training-labels-tif
Found 107 mask files in C:\Users\tania\PythonProjects\keylife-ng-pv\ml_project_2\data\u-net-data\training-labels


Converting image pairs: 100%|██████████| 107/107 [00:11<00:00,  9.35it/s]


Conversion Summary:
Total training images: 0
Total mask images: 107
Successfully converted pairs: 107
Skipped (no matching pair): 0
Errors during conversion: 0
Training images without masks: -107

Verification:
TIF files in output training directory: 107
TIF files in output labels directory: 107
Sample image size: (1440, 1616), mode: RGB
Sample mask size: (1440, 1616), mode: L
Successfully verified sample files can be opened





In [13]:
test_dir = "C:\\Users\\tania\\PythonProjects\\keylife-ng-pv\\ml_project_2\\data\\u-net-data\\testing"
output_test_dir = "C:\\Users\\tania\\PythonProjects\\keylife-ng-pv\\ml_project_2\\data\\u-net-data\\testing-tif"

base_names = [os.path.basename(f).replace('.jpg', '') for f in os.listdir(test_dir)]

for image in tqdm(base_names, 'Converting to TIFF: '):

    source_image_path = os.path.join(test_dir, f"{image}.jpg")
    dest_image_path = os.path.join(output_test_dir, f"{image}.tif")

    try:
        # Open and convert training image
        with Image.open(source_image_path) as img:
            # Save as TIFF with high quality
            img.save(dest_image_path, format='TIFF', compression='tiff_lzw')
        
        
    except Exception as e:
        print(f"Error processing {image}: {str(e)}")

# Print summary
total_training_images = len(glob.glob(os.path.join(test_dir, '*.jpg')))
print("Processed : ", total_training_images, "images")


Converting to TIFF: 100%|██████████| 45/45 [00:04<00:00, 11.21it/s]

Processed :  45 images



