In [1]:
import cv2 
from PIL import Image 
import numpy as np 
import pandas as pd 
import os
from skimage.feature import graycomatrix, graycoprops
from skimage.color import rgb2gray

img_dir = 'train'
mask_dir = 'train/masks'
excel_file = 'train/classif.xlsx'

# Load images
def load_images(img_dir, count):
    images = []
    for i in range(1, count + 1):
        img_path = os.path.join(img_dir, f"{i}.jpg")
        if os.path.exists(img_path):
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB format
            images.append(img)
        else:
            print(f"Image {img_path} not found.")
    return images

# Load masks
def load_masks(mask_dir, count):
    masks = []
    for i in range(1, count + 1):
        mask_path = os.path.join(mask_dir, f"binary_{i}.tif")
        if os.path.exists(mask_path):
            mask = Image.open(mask_path)
            mask = np.array(mask)
            masks.append(mask)
        else:
            print(f"Mask {mask_path} not found.")
    return masks

# Load classification file
def load_classification(excel_file):
    if os.path.exists(excel_file):
        return pd.read_excel(excel_file)
    else:
        print(f"Excel file {excel_file} not found.")
        return None
    
# Print        
images = load_images(img_dir, 250)
masks = load_masks(mask_dir, 250)
classif_df = load_classification(excel_file)    

print(f"Loaded {len(images)} images and {len(masks)} masks.")
print(classif_df.head())



Loaded 250 images and 250 masks.
   ID bug_type         species
0   1      Bee  Apis mellifera
1   2      Bee  Apis mellifera
2   3      Bee  Apis mellifera
3   4      Bee  Apis mellifera
4   5      Bee  Apis mellifera


In [2]:
""" Symmetry index """
def calculate_symmetry_index(mask):

    # Flip the mask horizontally
    flipped_mask = np.fliplr(mask)

    # Calculate the symmetry as the inverse of the normalized sum of absolute differences
    symmetry = 1.0 - (np.sum(np.abs(mask - flipped_mask)) / (2 * np.sum(mask)))
    return symmetry

# Apply the function to all
# IMPORTANT the putout of this function, a list for symmetry_indices and certain value for symmetry_index
symmetry_index = calculate_symmetry_index(masks[0])
symmetry_indices = [calculate_symmetry_index(mask) for mask in masks]
symmetry_df = pd.DataFrame(symmetry_indices, columns=['SymmetryIndex'])
print(symmetry_df.head)

# Print first 10 symmetry indices
#for i, si in enumerate(symmetry_indices[:10]):  # Print first 10 symmetry indices
#    print(f"Symmetry index for mask {i+1}: {si}")

# put the new feature in a csv
data_features_df = pd.DataFrame(symmetry_indices, columns=['SymmetryIndex'])
# data_features_df.to_csv('data_features.csv', index=False)
# print(data_features.head())

<bound method NDFrame.head of      SymmetryIndex
0         0.804721
1         0.772273
2         0.593619
3         0.487819
4         0.666718
..             ...
245       0.621142
246       0.774722
247       0.634358
248       0.506549
249       0.498039

[250 rows x 1 columns]>


In [3]:
""" Ratio of longest orthogonal lines """
def longest_orthogonal_ratio(mask):
    if len(mask.shape) == 3:
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
    _, binary_mask = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY)
    
    contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours and len(contours) > 0: # findContours: Find the outline of the bug in the image
        rect = cv2.minAreaRect(contours[0]) # minAreaRect: find Minimum Bounding Rectangle
        width, height = rect[1] 
        if width == 0 or height == 0:
            return 0
        else:
            return min(width, height) / max(width, height) # calcule ratio by height/width
    return 0

# Calculate the ratio for all masks
longest_orthogonal_ratios = [longest_orthogonal_ratio(mask) for mask in masks]
longest_orthogonal_ratio_df = pd.DataFrame(longest_orthogonal_ratios, columns=['LongestOrthogonalRatio'])
print(longest_orthogonal_ratio_df.head)

# Print first 10 ratios
#for i, ratio in enumerate(longest_orthogonal_ratios[:10]):
#    print(f"Longest orthogonal ratio for mask {i+1}: {ratio}")

# Merge the new feature 
data_features_df = pd.concat([data_features_df,longest_orthogonal_ratio_df], axis=1)
#data_features_df.to_csv('data_features.csv', index=False)
#print(data_features_df.head())

<bound method NDFrame.head of      LongestOrthogonalRatio
0                  0.821019
1                  0.675619
2                  0.805196
3                  0.709335
4                  0.744057
..                      ...
245                0.985775
246                0.714783
247                0.773522
248                0.944525
249                0.706377

[250 rows x 1 columns]>


In [4]:
""" Ratio of bug pixels to total pixels """
def bug_to_total_ratio(mask):
    bug_pixels = np.sum(mask > 0)  # Count the number of pixels for bugs
    total_pixels = mask.shape[0] * mask.shape[1]  # Calculate the total number of pixels
    return bug_pixels / total_pixels  # devid

# Calculate the ratio for all and creat a df
bug_to_total_ratios = [bug_to_total_ratio(mask) for mask in masks]
bug_to_total_ratios_df = pd.DataFrame(bug_to_total_ratios, columns=['ratio'])
print (bug_to_total_ratios_df.head)

# Print first 10 ratios
#for i, ratio in enumerate(bug_to_total_ratios[:10]):
#    print(f"Bug to total pixel ratio for mask {i+1}: {ratio}")

# Merge the new feature 
data_features_df = pd.concat([data_features_df,bug_to_total_ratios_df], axis=1)
#classif_df.to_csv('data_features.csv', index=False)
#print(data_features_df.head())


<bound method NDFrame.head of         ratio
0    0.022284
1    0.025659
2    0.066279
3    0.039562
4    0.027494
..        ...
245  0.006250
246  0.020050
247  0.007032
248  0.011959
249  0.023367

[250 rows x 1 columns]>


In [5]:
"""calculate the min, max, and mean values the median and standard deviation for RGB channels within the bug mask"""
def color_stats(image, mask):
    # Apply the mask to the image
    masked_image = image[mask > 0]

    if  masked_image.ndim == 1  or masked_image.shape[1] != 3: #not with a RGB tableau one dimo or 3 shape
        # If masked_image is grey
        min_val = masked_image.min()
        max_val = masked_image.max()
        mean_val = masked_image.mean()
        median_val = np.median(masked_image)
        std_val = np.std(masked_image)
        stats = {
            'min_red': min_val, 'min_green': min_val, 'min_blue': min_val,
            'max_red': max_val, 'max_green': max_val, 'max_blue': max_val,
            'mean_red': mean_val, 'mean_green': mean_val, 'mean_blue': mean_val,
            'median_red': median_val, 'median_green': median_val, 'median_blue': median_val,
            'std_red': std_val, 'std_green': std_val, 'std_blue': std_val,
        }
    
    else:
    # for RGB
        min_vals = masked_image.min(axis=0)
        max_vals = masked_image.max(axis=0)
        mean_vals = masked_image.mean(axis=0)
        median_vals = np.median(masked_image, axis=0)
        std_vals = np.std(masked_image, axis=0)
    
    # Create a dictionary to store the results
        stats = {
            'min_red': min_vals[0], 'min_green': min_vals[1], 'min_blue': min_vals[2],
            'max_red': max_vals[0], 'max_green': max_vals[1], 'max_blue': max_vals[2],
            'mean_red': mean_vals[0], 'mean_green': mean_vals[1], 'mean_blue': mean_vals[2],
            'median_red': median_vals[0], 'median_green': median_vals[1], 'median_blue': median_vals[2],
            'std_red': std_vals[0], 'std_green': std_vals[1], 'std_blue': std_vals[2],
        }
    
    return stats


# Calculate the color statistics for all images and masks and create a df
color_stats_list = [color_stats(img, mask) for img, mask in zip(images, masks)]
color_stats_df = pd.DataFrame(color_stats_list)
print(color_stats_df.head())

# Print the first 10 color statistics
#for i, stats in enumerate(color_stats_list[:10]):
#    print(f"Color stats for image {i+1}: {stats}")

# Merge the new features with the classification DataFrame
data_features_df = pd.concat([data_features_df,color_stats_df], axis=1)
#data_features_df.to_csv('data_features.csv', index=False)
# print(data_features_df.head())


   min_red  min_green  min_blue  max_red  max_green  max_blue    mean_red  \
0        0          0         0      208        208       208   54.286820   
1        0          0         0      251        251       251   50.533765   
2        0          0         0      255        255       255   86.118029   
3        0          0         0      219        219       219   69.684210   
4        0          0         0      255        255       255  101.673712   

   mean_green   mean_blue  median_red  median_green  median_blue    std_red  \
0   54.286820   54.286820        37.0          37.0         37.0  44.962646   
1   50.533765   50.533765        35.0          35.0         35.0  41.672498   
2   86.118029   86.118029        82.0          82.0         82.0  60.634858   
3   69.684210   69.684210        61.0          61.0         61.0  46.061015   
4  101.673712  101.673712        99.0          99.0         99.0  64.366072   

   std_green   std_blue  
0  44.962646  44.962646  
1  41.6724

In [6]:
"""calculate texture features using GLCM"""

def texture_features(image, mask):

    gray_image = rgb2gray(image)
    glcm = graycomatrix((gray_image* 255).astype('uint8'), [1], [0, np.pi/2], symmetric=True, normed=True)
    
    # Extract texture features
    contrast = graycoprops(glcm, 'contrast').mean()
    homogeneity = graycoprops(glcm, 'homogeneity').mean()
    energy = graycoprops(glcm, 'energy').mean()
    correlation = graycoprops(glcm, 'correlation').mean()

    stats = {
        'contrast': contrast,
        'homogeneity': homogeneity,
        'energy': energy,
        'correlation': correlation
    }  
     
    return stats

# Calculate the texture features for all images and masks and create df
texture_features_list = [texture_features(img, mask) for img, mask in zip(images, masks)]
texture_features_df = pd.DataFrame(texture_features_list)
print(texture_features_df.head())

# Merge the new features with the classification DataFrame
data_features_df = pd.concat([data_features_df,texture_features_df], axis=1)
data_features_df.to_csv('data_features.csv', index=False)
print(data_features_df.head())



   contrast  homogeneity    energy  correlation
0  1.478668     0.725038  0.052065     0.999783
1  1.677912     0.724125  0.053770     0.999756
2  2.528066     0.679071  0.043417     0.999685
3  2.274407     0.638564  0.045116     0.999476
4  2.466985     0.682676  0.043565     0.999690
   SymmetryIndex  LongestOrthogonalRatio     ratio  min_red  min_green  \
0       0.804721                0.821019  0.022284        0          0   
1       0.772273                0.675619  0.025659        0          0   
2       0.593619                0.805196  0.066279        0          0   
3       0.487819                0.709335  0.039562        0          0   
4       0.666718                0.744057  0.027494        0          0   

   min_blue  max_red  max_green  max_blue    mean_red  ...  median_red  \
0         0      208        208       208   54.286820  ...        37.0   
1         0      251        251       251   50.533765  ...        35.0   
2         0      255        255       255   8

In [7]:
new_data_features_df = data_features_df
new_data_features_df = new_data_features_df.drop(columns=['SymmetryIndex','LongestOrthogonalRatio','std_blue','std_green','std_red','correlation','contrast','min_red','min_green','min_blue','max_red','max_green','max_blue'])
new_data_features_df.to_csv('data_features.csv', index=False)