In [None]:
import cv2
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import os
import tqdm as tqdm

dataset_path = r"D:\yinghuaxia\race dataset\FairFace Race\train_inRange\White"

# Load the Haar cascade frontal-face detector that ships with OpenCV.
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')


# Per-face mean (L, A, B) values and, index-aligned with them, the path of the
# image each face was found in (an image with several faces appears several times).
lab_values = []
img_path = []

# Every file under the dataset root counts towards the progress-bar total,
# regardless of extension.
total_images = sum(len(files) for _, _, files in os.walk(dataset_path))

# Walk every image in the dataset.
# The file's import (`import tqdm as tqdm`) binds the name `tqdm` to the
# *module*, so the progress-bar class must be reached as `tqdm.tqdm`;
# calling the module object directly raises TypeError.
with tqdm.tqdm(total=total_images, desc="Processing Image", unit="image") as pbar:
    for root, dirs, files in os.walk(dataset_path):
        for file in files:
            # Advance once per file up front so that skipped files (wrong
            # extension, unreadable image) are still counted and the bar
            # reaches its total.
            pbar.update(1)

            if not file.endswith(".jpg"):
                continue

            image_path = os.path.join(root, file)
            image = cv2.imread(image_path)
            if image is None:
                print(f"Failed to load image: {image_path}")
                continue

            # Face detection runs on the grayscale version of the image.
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

            for (x, y, w, h) in faces:
                # Crop the detected face rectangle from the colour image.
                face_region = image[y:y+h, x:x+w]

                # Convert the crop to the CIELAB colour space.
                # NOTE: for 8-bit images OpenCV stores L rescaled to 0..255
                # and a/b offset by +128, so these are not raw CIELAB units.
                lab_face = cv2.cvtColor(face_region, cv2.COLOR_BGR2LAB)

                # Mean of each channel over all pixels of the face crop.
                l_mean, a_mean, b_mean = lab_face.reshape(-1, 3).mean(axis=0)

                # Record the sample together with its source image.
                lab_values.append((l_mean, a_mean, b_mean))
                img_path.append(image_path)

# Convert to a NumPy array. The reshape keeps the result two-dimensional,
# shape (n_faces, 3), even when no face was detected, so downstream column
# indexing such as lab_values[:, 0] keeps working.
lab_values = np.array(lab_values, dtype=float).reshape(-1, 3)

In [None]:
# Z-score cutoff: a sample is kept only when every one of its L/A/B channel
# means lies strictly within this many standard deviations of the column mean.
Z_THRESHOLD = 3

# View the samples as an (n, 3) float matrix so the column-wise logic below
# also works when no samples were collected.
lab_matrix = np.asarray(lab_values, dtype=float).reshape(-1, 3)

if len(lab_matrix) == 0:
    # Nothing to score; keep the shapes consistent for the code that follows.
    z_scores = np.zeros((0, 3))
else:
    z_scores = np.abs(stats.zscore(lab_matrix, axis=0))
    # A channel with zero variance (e.g. a single sample) yields NaN z-scores.
    # NaN < threshold is False, which would flag every sample as an outlier,
    # so treat "no spread" as zero deviation instead.
    z_scores = np.where(np.isnan(z_scores), 0.0, z_scores)

filtered_indices = np.all(z_scores < Z_THRESHOLD, axis=1)  # True for inliers
filtered_lab_values = lab_matrix[filtered_indices]
outlier_lab_values = lab_matrix[~filtered_indices]

# Report the sample counts before and after filtering.
print(f"#original: {len(lab_values)}")
print(f"#filtered: {len(filtered_lab_values)}")
print(f"outlier: {len(outlier_lab_values)}")


In [None]:
# 3-D scatter of the per-face mean L, A, B values, with inliers and outliers
# drawn as separate series so they can be told apart by colour.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# (samples, colour, legend label): inliers in blue first, then outliers in red.
scatter_series = (
    (filtered_lab_values, 'b', 'Filtered Values'),
    (outlier_lab_values, 'r', 'Outliers'),
)
for samples, colour, series_label in scatter_series:
    # Columns 0, 1, 2 of each sample array map to the x, y, z axes.
    ax.scatter(samples[:, 0], samples[:, 1], samples[:, 2], c=colour, label=series_label)

# Axis captions, one per colour channel.
ax.set_xlabel('L*')
ax.set_ylabel('A*')
ax.set_zlabel('B*')

ax.legend()
ax.set_title('CIELAB Color Space Distribution of Faces (Filtered and Outliers)')
plt.show()