In [1]:
import os
import cv2
import shutil
from tqdm import tqdm


In [7]:
WORKSHOP_DIR = os.getcwd()  
ori_dir = os.path.join(WORKSHOP_DIR, "original")
bad_base = os.path.join(WORKSHOP_DIR,  "bad_samples")

os.makedirs(bad_base, exist_ok=True)
print("当前工作目录:", WORKSHOP_DIR)
print("原始图片目录:", ori_dir)
print("坏样本保存目录:", bad_base)


当前工作目录: e:\1.code\Jupyter-notebook\MUST-DataScience\1-groupwork\dataprocess
原始图片目录: e:\1.code\Jupyter-notebook\MUST-DataScience\1-groupwork\dataprocess\original
坏样本保存目录: e:\1.code\Jupyter-notebook\MUST-DataScience\1-groupwork\dataprocess\bad_samples


In [3]:
def move_with_rename(src_path, dst_dir):
    """
    移动文件到目标目录，若存在同名文件则自动重命名，如:
    image.jpg → image_1.jpg → image_2.jpg
    """
    filename = os.path.basename(src_path)
    name, ext = os.path.splitext(filename)
    dst_path = os.path.join(dst_dir, filename)

    counter = 1
    while os.path.exists(dst_path):
        dst_path = os.path.join(dst_dir, f"{name}_{counter}{ext}")
        counter += 1

    shutil.move(src_path, dst_path)
    return dst_path


In [4]:
def is_blurry(img, threshold=120.0):
    """ 使用拉普拉斯算子方差判断图像是否模糊 """
    lap = cv2.Laplacian(img, cv2.CV_64F).var()
    return lap < threshold

def check_image_quality(fpath, blur_th=120):
    """
    检查图像质量:
    1）能否读取
    2）是否模糊
    3）是否过曝 / 欠曝
    """
    try:
        img = cv2.imread(fpath)

        # 无法加载
        if img is None:
            return False, "损坏图片"

        # 模糊检测
        if is_blurry(img, threshold=blur_th):
            return False, "模糊图片"

        # 亮度检测（判断过曝或欠曝）
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        mean_val = gray.mean()

        if mean_val < 15:
            return False, "欠曝图片"
        if mean_val > 240:
            return False, "过曝图片"

        return True, "正常图片"

    except:
        return False, "异常错误"


In [10]:
classes = os.listdir(ori_dir)
total_bad = 0

for cls in classes:
    cls_path = os.path.join(ori_dir, cls)
    if not os.path.isdir(cls_path):
        continue

    print(f"\n 正在检查类别：{cls}")

    # 为该类别创建专属 bad samples 文件夹
    bad_cls_dir = os.path.join(bad_base, cls)
    os.makedirs(bad_cls_dir, exist_ok=True)

    files = os.listdir(cls_path)

    for fname in tqdm(files):
        fpath = os.path.join(cls_path, fname)

        # 检查图像质量
        ok, reason = check_image_quality(fpath)

        if not ok:
            move_with_rename(fpath, bad_cls_dir)
            total_bad += 1

print("\n 数据清洗完成！")
print(f"共复制了 {total_bad} 张坏样本到 bad_samples/ 下的对应类别文件夹")



 正在检查类别：banana


100%|██████████| 31/31 [00:01<00:00, 25.14it/s]



 正在检查类别：Cherry


100%|██████████| 32/32 [00:00<00:00, 101.43it/s]



 正在检查类别：cotton


100%|██████████| 32/32 [00:01<00:00, 30.44it/s]



 正在检查类别：Cucumber


100%|██████████| 31/31 [00:00<00:00, 32.04it/s]



 正在检查类别：maize


100%|██████████| 31/31 [00:00<00:00, 53.85it/s]



 正在检查类别：Pearl_millet(bajra)


100%|██████████| 39/39 [00:00<00:00, 53.64it/s]



 正在检查类别：Tobacco-plant


100%|██████████| 33/33 [00:00<00:00, 105.73it/s]



 正在检查类别：wheat


100%|██████████| 31/31 [00:00<00:00, 36.46it/s]


 数据清洗完成！
共复制了 14 张坏样本到 bad_samples/ 下的对应类别文件夹



