In [1]:
import cv2 
from PIL import Image 
import numpy as np 
import pandas as pd 
import os

img_dir = 'train'
mask_dir = 'train/masks'
excel_file = 'train/classif.xlsx'

# Load images
def load_images(img_dir, count):
    images = []
    for i in range(1, count + 1):
        img_path = os.path.join(img_dir, f"{i}.jpg")
        if os.path.exists(img_path):
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert from BGR to RGB format
            images.append(img)
        else:
            print(f"Image {img_path} not found.")
    return images

# Load masks
def load_masks(mask_dir, count):
    masks = []
    for i in range(1, count + 1):
        mask_path = os.path.join(mask_dir, f"binary_{i}.tif")
        if os.path.exists(mask_path):
            mask = Image.open(mask_path)
            mask = np.array(mask)
            masks.append(mask)
        else:
            print(f"Mask {mask_path} not found.")
    return masks

# Load classification file
def load_classification(excel_file):
    if os.path.exists(excel_file):
        return pd.read_excel(excel_file)
    else:
        print(f"Excel file {excel_file} not found.")
        return None
    
# Print        
images = load_images(img_dir, 250)
masks = load_masks(mask_dir, 250)
classif_df = load_classification(excel_file)    

print(f"Loaded {len(images)} images and {len(masks)} masks.")
print(classif_df.head())



Loaded 250 images and 250 masks.
   ID bug type         species
0   1      Bee  Apis mellifera
1   2      Bee  Apis mellifera
2   3      Bee  Apis mellifera
3   4      Bee  Apis mellifera
4   5      Bee  Apis mellifera


In [2]:
""" Symmetry index """
def calculate_symmetry_index(mask):

    # Flip the mask horizontally
    flipped_mask = np.fliplr(mask)

    # Calculate the symmetry as the inverse of the normalized sum of absolute differences
    symmetry = 1.0 - (np.sum(np.abs(mask - flipped_mask)) / (2 * np.sum(mask)))
    return symmetry

# Apply the function to all
# IMPORTANT the putout of this function, a list for symmetry_indices and certain value for symmetry_index
symmetry_index = calculate_symmetry_index(masks[0])
symmetry_indices = [calculate_symmetry_index(mask) for mask in masks]

# Print first 10 symmetry indices
for i, si in enumerate(symmetry_indices[:10]):  # Print first 10 symmetry indices
    print(f"Symmetry index for mask {i+1}: {si}")

# 创建一个DataFrame用于存储对称性指数
# symmetry_df = pd.DataFrame(symmetry_indices, columns=['SymmetryIndex'])
# 合并对称性指数到您的特征集
# 假设其他特征集data_features已经存在
# data_features = pd.DataFrame(...) # 这里填充您已有的特征集
# data_features = pd.DataFrame({'ID': range(1, 251)})  # 示例特征集
# data_features = pd.concat([data_features, symmetry_df], axis=1)
# Save to CSV for verification
#classif_df.to_csv('symmetry_index.csv', index=False)
# 打印合并后的特征集
# print(data_features.head())

Symmetry index for mask 1: 0.8047209080745074
Symmetry index for mask 2: 0.7722734849598822
Symmetry index for mask 3: 0.5936187782533617
Symmetry index for mask 4: 0.48781922170811365
Symmetry index for mask 5: 0.6667178811660399
Symmetry index for mask 6: 0.48815425944645374
Symmetry index for mask 7: 0.48343662413795563
Symmetry index for mask 8: 0.4853059999185443
Symmetry index for mask 9: 0.6095427397800142
Symmetry index for mask 10: 0.6493464145627762


In [3]:
""" Ratio of longest orthogonal lines """
def longest_orthogonal_ratio(mask):
    if len(mask.shape) == 3:
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
    _, binary_mask = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY)
    
    contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours and len(contours) > 0: # findContours: Find the outline of the bug in the image
        rect = cv2.minAreaRect(contours[0]) # minAreaRect: find Minimum Bounding Rectangle
        width, height = rect[1] 
        if width == 0 or height == 0:
            return 0
        else:
            return min(width, height) / max(width, height) # calcule ratio by height/width
    return 0

# Calculate the ratio for all masks
longest_orthogonal_ratios = [longest_orthogonal_ratio(mask) for mask in masks]

# Print first 10 ratios
for i, ratio in enumerate(longest_orthogonal_ratios[:10]):
    print(f"Longest orthogonal ratio for mask {i+1}: {ratio}")

# Create a DataFrame to store the longest orthogonal ratios
#orthogonal_ratio_df = pd.DataFrame(longest_orthogonal_ratios, columns=['LongestOrthogonalRatio'])
# Merge the new feature with the classification DataFrame
#classif_df = pd.concat([classif_df, orthogonal_ratio_df], axis=1)
# Save to CSV for verification
#classif_df.to_csv('features_with_orthogonal_ratio.csv', index=False)
# Print the head of the updated DataFrame
#print(classif_df.head())

Longest orthogonal ratio for mask 1: 0.8210185892043161
Longest orthogonal ratio for mask 2: 0.6756187527305683
Longest orthogonal ratio for mask 3: 0.8051959270203646
Longest orthogonal ratio for mask 4: 0.7093345658298247
Longest orthogonal ratio for mask 5: 0.7440569563316096
Longest orthogonal ratio for mask 6: 0.9661990308475512
Longest orthogonal ratio for mask 7: 0.4512559361622812
Longest orthogonal ratio for mask 8: 0.33333332645824193
Longest orthogonal ratio for mask 9: 0.9446961528654619
Longest orthogonal ratio for mask 10: 0.7962038272225026


In [4]:
""" Ratio of bug pixels to total pixels """
def bug_to_total_ratio(mask):
    bug_pixels = np.sum(mask > 0)  # Count the number of pixels for bugs
    total_pixels = mask.shape[0] * mask.shape[1]  # Calculate the total number of pixels
    return bug_pixels / total_pixels  # devid

# Calculate the ratio for all
bug_to_total_ratios = [bug_to_total_ratio(mask) for mask in masks]

# Print first 10 ratios
for i, ratio in enumerate(bug_to_total_ratios[:10]):
    print(f"Bug to total pixel ratio for mask {i+1}: {ratio}")

# Create a DataFrame to store the bug to total pixel ratios
#bug_to_total_ratio_df = pd.DataFrame(bug_to_total_ratios, columns=['BugToTotalPixelRatio'])
# Assuming classif_df is already loaded and contains previous features
# Merge the new feature with the classification DataFrame
#classif_df = pd.concat([classif_df, bug_to_total_ratio_df], axis=1)
# Save the updated DataFrame to a CSV file for verification
#classif_df.to_csv('features_with_bug_to_total_ratio.csv', index=False)
# Print the head of the updated DataFrame to check the new feature
#print(classif_df.head())


Bug to total pixel ratio for mask 1: 0.022284125
Bug to total pixel ratio for mask 2: 0.025659
Bug to total pixel ratio for mask 3: 0.066279375
Bug to total pixel ratio for mask 4: 0.039562125
Bug to total pixel ratio for mask 5: 0.027493875
Bug to total pixel ratio for mask 6: 0.0177505
Bug to total pixel ratio for mask 7: 0.01697625
Bug to total pixel ratio for mask 8: 0.02236225
Bug to total pixel ratio for mask 9: 0.02159825
Bug to total pixel ratio for mask 10: 0.047350375


In [5]:
import numpy as np
import pandas as pd

# 计算在虫子掩码区域内的RGB统计值的函数
def calculate_color_stats(image, mask):
    masked_image = image[mask > 0]  # 将掩码应用到图像上
    if masked_image.size == 0:  # 如果掩码区域内没有有效的像素
        return np.zeros(9)  # 返回全零的数组，防止出现零维数组错误
    min_vals = masked_image.min(axis=0)
    max_vals = masked_image.max(axis=0)
    mean_vals = masked_image.mean(axis=0)
    return np.concatenate([min_vals, max_vals, mean_vals])

# 计算所有图像的RGB统计值
color_stats = []
for i in range(len(images)):
    stats = calculate_color_stats(images[i], masks[i])
    color_stats.append(stats)
    print(f"Image {i+1} stats: {stats}")  # 调试输出每个图像的统计值

# 将结果转为DataFrame
color_stats_df = pd.DataFrame(color_stats, columns=['Red_min', 'Green_min', 'Blue_min', 'Red_max', 'Green_max', 'Blue_max', 'Red_mean', 'Green_mean', 'Blue_mean'])

# 假设classif_df已经加载并包含之前的特征
# 将新的RGB统计特征合并到分类DataFrame中
classif_df = pd.concat([classif_df, color_stats_df], axis=1)

# 保存更新后的DataFrame到CSV文件以便验证
classif_df.to_csv('features_with_rgb_statistics.csv', index=False)

# 打印更新后的DataFrame的前几行以检查新特征
print(classif_df.head())


ValueError: zero-dimensional arrays cannot be concatenated

In [None]:
def calculate_color_median_std(image, mask):
    masked_image = image[mask > 0]  # Apply mask to image
    median_vals = np.median(masked_image, axis=0)
    std_vals = masked_image.std(axis=0)
    return median_vals, std_vals

color_medians_stds = [calculate_color_median_std(images[i], masks[i]) for i in range(len(images))]


In [None]:
from skimage.feature import graycomatrix, graycoprops
from skimage.color import rgb2gray
from skimage.measure import shannon_entropy

# Example custom feature: texture contrast and entropy
def calculate_texture_features(image, mask):
    # Convert image to grayscale
    gray_image = rgb2gray(image)
    # Calculate Gray-Level Co-occurrence Matrix (GLCM) and derive texture properties
    glcm = graycomatrix((gray_image * 255).astype('uint8'), distances=[5], angles=[0], symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast')[0, 0]
    entropy = shannon_entropy(mask)
    return contrast, entropy

texture_features = [calculate_texture_features(images[i], masks[i]) for i in range(len(images))]


In [None]:
# Symmetry index
symmetry_index = calculate_symmetry_index(mask)
print(f"Symmetry index: {symmetry_index}")

# Ratio of longest orthogonal lines
#longest_line_ratio = ratio_of_longest_lines(mask)
#print(f"Ratio of longest orthogonal lines: {longest_line_ratio}")

# Pixel ratio
pixel_ratio = calculate_pixel_ratio(img, mask)
print(f"Ratio of bug pixels to total pixels: {pixel_ratio}")

# Color statistics
min_vals, max_vals, mean_vals = calculate_color_stats(img, mask)
print(f"Min RGB values within the bug mask: {min_vals}")
print(f"Max RGB values within the bug mask: {max_vals}")
print(f"Mean RGB values within the bug mask: {mean_vals}")

# Median and standard deviation for RGB
median_vals, std_vals = calculate_color_median_std(img, mask)
print(f"Median RGB values within the bug mask: {median_vals}")
print(f"Standard deviation of RGB values within the bug mask: {std_vals}")

# Custom features (texture contrast and entropy)
contrast, entropy = calculate_texture_features(img, mask)
print(f"Texture contrast: {contrast}")
print(f"Texture entropy: {entropy}")

Symmetry index: 0.4980392156862745
Ratio of bug pixels to total pixels: 5.9585
Min RGB values within the bug mask: [0 0 0]
Max RGB values within the bug mask: [255 255 255]
Mean RGB values within the bug mask: [127.06082382 101.62600571  77.42946683]
Median RGB values within the bug mask: [125.  96.  63.]
Standard deviation of RGB values within the bug mask: [62.59178458 60.36123553 59.83946465]
Texture contrast: 121.04977894078397
Texture entropy: 0.15994742607236198
