In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mikhailma/test-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/mikhailma/test-dataset?dataset_version_number=1...


100%|██████████| 393M/393M [00:15<00:00, 27.4MB/s] 

Extracting files...





Path to dataset files: C:\Users\bvb09\.cache\kagglehub\datasets\mikhailma\test-dataset\versions\1


In [34]:
# 기본 연산/데이터 처리
import os
import glob
import numpy as np
import pandas as pd

# 이미지 처리
import cv2
from skimage import color, exposure, filters, morphology, feature
from skimage.util import img_as_ubyte

# 특징 추출
from skimage.feature import local_binary_pattern, hog
from skimage.filters import gaussian, sobel, unsharp_mask
from skimage.restoration import denoise_bilateral
from skimage.morphology import disk, closing, opening
from scipy import ndimage
import mahotas


# 머신러닝
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.metrics.pairwise import euclidean_distances


# 시각화
import matplotlib.pyplot as plt

from joblib import Parallel, delayed




In [2]:
# 데이터 로드
base_path = "C:/Users/bvb09/.cache/kagglehub/datasets/mikhailma/test-dataset/versions/1/Google_Recaptcha_V2_Images_Dataset/images"

data = []

# base_path 안에 있는 모든 하위 폴더 이름을 가져와 라벨로 지정
label_folders = [f.name for f in os.scandir(base_path) if f.is_dir()]

# 각 라벨 폴더를 순회합니다.
for label in label_folders:
    # 현재 라벨에 해당하는 폴더의 전체 경로를 만듭니다.
    label_folder_path = os.path.join(base_path, label)
    
    # 해당 라벨 폴더 안의 모든 .png 이미지 파일 경로를 가져옵니다.
    image_paths = glob.glob(os.path.join(label_folder_path, "*.png")) # 모든 이미지가 .png라고 가정합니다.

    # 찾아낸 각 이미지 경로를 순회하며 이미지를 불러옵니다.
    for fp in image_paths:
        img = cv2.imread(fp)
        if img is not None:  
            img = cv2.resize(img, (120, 120)) 
            data.append((img, label)) # 이미지와 해당 폴더 이름을 라벨로 추가합니다.
        else:
            print(f"경고: 이미지를 로드할 수 없습니다: {fp}")

# 리스트에 담긴 데이터를 Pandas DataFrame으로 변환합니다.
df = pd.DataFrame(data, columns=["image", "label"])

# DataFrame의 처음 몇 줄을 출력하여 데이터를 확인합니다.
print(df.head())
print(f"총 로드된 이미지 수: {len(df)}")

# 라벨 종류: bicycle, bridge, bus, car, chimney, crosswalk, hydrant, motorcycle, palm, stair, traffic light, other



                                               image    label
0  [[[104, 114, 117], [101, 110, 114], [100, 109,...  Bicycle
1  [[[67, 75, 54], [69, 76, 58], [71, 77, 66], [6...  Bicycle
2  [[[27, 33, 38], [28, 34, 37], [29, 34, 37], [3...  Bicycle
3  [[[51, 49, 49], [58, 56, 55], [75, 73, 72], [9...  Bicycle
4  [[[125, 128, 135], [129, 130, 143], [135, 135,...  Bicycle
총 로드된 이미지 수: 11730


In [3]:
# Stratified 5-fold cross validation

# 이미지, 라벨 나누기
X = np.array([img for img, _ in data])
labels = [label for _, label in data]

# 라벨 인코딩
le = LabelEncoder()
y = le.fit_transform(labels)

# Stratified K-Fold 준비
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# fold 저장용 리스트
folds = []

for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    folds.append({
        'fold': fold_idx,
        'X_train': X[train_idx],
        'y_train': y[train_idx],
        'X_test': X[test_idx],
        'y_test': y[test_idx]
    })
    
    print(f"[Fold {fold_idx}] ▶ Train: {len(train_idx)}개, Test: {len(test_idx)}개")



[Fold 0] ▶ Train: 9384개, Test: 2346개
[Fold 1] ▶ Train: 9384개, Test: 2346개
[Fold 2] ▶ Train: 9384개, Test: 2346개
[Fold 3] ▶ Train: 9384개, Test: 2346개
[Fold 4] ▶ Train: 9384개, Test: 2346개


In [4]:
import collections

for f in folds:
    counter = collections.Counter(f['y_test'])
    print(f"Fold {f['fold']} 라벨 분포: {dict(counter)}")


Fold 0 라벨 분포: {np.int64(0): 156, np.int64(1): 107, np.int64(2): 242, np.int64(3): 711, np.int64(4): 25, np.int64(5): 248, np.int64(6): 191, np.int64(7): 16, np.int64(8): 268, np.int64(9): 182, np.int64(10): 42, np.int64(11): 158}
Fold 1 라벨 분포: {np.int64(0): 156, np.int64(1): 107, np.int64(2): 242, np.int64(3): 711, np.int64(4): 25, np.int64(5): 248, np.int64(6): 190, np.int64(7): 17, np.int64(8): 268, np.int64(9): 182, np.int64(10): 42, np.int64(11): 158}
Fold 2 라벨 분포: {np.int64(0): 156, np.int64(1): 107, np.int64(2): 241, np.int64(3): 712, np.int64(4): 25, np.int64(5): 248, np.int64(6): 190, np.int64(7): 16, np.int64(8): 268, np.int64(9): 183, np.int64(10): 42, np.int64(11): 158}
Fold 3 라벨 분포: {np.int64(0): 156, np.int64(1): 106, np.int64(2): 242, np.int64(3): 712, np.int64(4): 25, np.int64(5): 248, np.int64(6): 190, np.int64(7): 16, np.int64(8): 268, np.int64(9): 182, np.int64(10): 43, np.int64(11): 158}
Fold 4 라벨 분포: {np.int64(0): 156, np.int64(1): 106, np.int64(2): 242, np.int64(3)

In [22]:
# Preprocessing (image enhancement)

# Point Processing
def contrast_stretch(image):
    # 2% ~ 98% 범위로 contrast stretching
    p2, p98 = np.percentile(image, (2, 98))
    return exposure.rescale_intensity(image, in_range=(p2, p98))

# Gray scale / HSI 변환
def to_grayscale(image):
    if len(image.shape) == 3:
        return color.rgb2gray(image)
    else:
        print("hsi 변형 불가능")
        return image

def to_hsi(image):
    hsv = color.rgb2hsv(image)
    return hsv[:, :, 0], hsv[:, :, 1], hsv[:, :, 2]  # hue, sat, intensity

# Histogram Equalization (area processing)
def histogram_equalization(image):
    return exposure.equalize_hist(image)

# Noise Filtering 
def remove_noise(image):
    image = gaussian(image, sigma=1)
    return denoise_bilateral(image, sigma_color=0.05, sigma_spatial=15, channel_axis=None)

# Morphological Operators 
def morphological_close(image_binary, selem_size=3):
    selem = disk(selem_size)
    return morphology.closing(image_binary, selem)

def morphological_open(image_binary, selem_size=3):
    selem = disk(selem_size)
    return morphology.opening(image_binary, selem)


In [24]:
# Edge Feature

# Edge Detection 
def sobel_detection(image_gray):
    return sobel(image_gray)

def edge_detection(image_gray, low_threshold=50, high_threshold=150, aperture_size=3):
    # 1) float나 다른 타입이라면 0~255 스케일 후 uint8 로 변환
    if image_gray.dtype != np.uint8:
        # 영상이 0~1 사이의 float라면 255 곱해주고, 아니면 절댓값 후 변환
        image_gray = cv2.convertScaleAbs(image_gray)
    # 2) 캐니 엣지
    return cv2.Canny(image_gray, low_threshold, high_threshold, apertureSize=aperture_size)

# Sharpening: Unsharp Mask 대신 Canny 엣지로 마스크 생성 후 강조
def sharpen_image(image_gray, low_threshold=50, high_threshold=150, amount=1.0):
    # 1) Canny 엣지 맵
    edges = cv2.Canny(image_gray, low_threshold, high_threshold)
    # 2) 엣지를 mask로 사용해 원본에 더함
    #    edges는 0/255 이므로 255로 나눈 뒤 float 연산
    mask = (edges / 255.0).astype(np.float32)
    sharpened = image_gray.astype(np.float32) + amount * (mask * 255.0)
    # 3) uint8로 클리핑 후 반환
    return np.clip(sharpened, 0, 255).astype(np.uint8)

In [25]:
# Color Feature

def extract_color_features(image):
    # RGB 히스토그램
    hist_r = cv2.calcHist([image], [0], None, [32], [0, 256])
    hist_g = cv2.calcHist([image], [1], None, [32], [0, 256])
    hist_b = cv2.calcHist([image], [2], None, [32], [0, 256])
    
    # HSV 변환 및 히스토그램
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist_h = cv2.calcHist([hsv], [0], None, [32], [0, 180])
    hist_s = cv2.calcHist([hsv], [1], None, [32], [0, 256])
    hist_v = cv2.calcHist([hsv], [2], None, [32], [0, 256])
    
    # 정규화
    features = np.concatenate([
        hist_r.flatten(), hist_g.flatten(), hist_b.flatten(),
        hist_h.flatten(), hist_s.flatten(), hist_v.flatten()
    ])
    return features / np.sum(features)  # 정규화

In [26]:
# Texture Feature

# LBP 함수
def extract_lbp_features(image_gray, radius=1, method='uniform'):
    n_points = 8 * radius
    lbp = feature.local_binary_pattern(image_gray, n_points, radius, method)
    n_bins = int(lbp.max() + 1)
    hist, _ = np.histogram(lbp.ravel(), bins=n_bins, range=(0, n_bins), density=True)
    return hist

# Laws' Texture Energy 함수
def extract_laws_energy_features(image_gray, window_size=15):
    # 1D Laws kernels
    L5 = np.array([1, 4, 6, 4, 1], dtype=np.float32)
    E5 = np.array([-1, -2,  0,  2,  1], dtype=np.float32)
    S5 = np.array([-1,  0,  2,  0, -1], dtype=np.float32)
    W5 = np.array([-1,  2,  0, -2,  1], dtype=np.float32)
    R5 = np.array([1, -4,  6, -4,  1], dtype=np.float32)
    kernels = [L5, E5, S5, W5, R5]

    energy_features = []
    for k1 in kernels:
        for k2 in kernels:
            # build 2D Laws kernel
            kernel = np.outer(k1, k2)
            # convolve and take absolute value (energy)
            filtered = ndimage.convolve(image_gray.astype(np.float32),
                                        kernel, mode='reflect')
            energy = np.abs(filtered)
            # sum local energy with a box filter
            summed = cv2.boxFilter(energy, ddepth=-1,
                                   ksize=(window_size, window_size),
                                   normalize=False)
            # global average energy for this filter
            energy_features.append(summed.mean())

    return np.array(energy_features, dtype=np.float32)

In [27]:
# Shape Feature

def extract_sift_descriptors(image):
    sift = cv2.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(image, None)

    return descriptors

def extract_hog_descriptors(image):
    features = hog(
        image,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        block_norm='L2-Hys',
        transform_sqrt=True,
        feature_vector=True
    )
    return features

In [28]:
# 전처리 함수
def preprocess_image(img, sigma=1.0):
    img = contrast_stretch(img)
    img = to_grayscale(img)
    img = histogram_equalization(img)
    img = remove_noise(img)
    img = edge_detection(img)
    img = cv2.convertScaleAbs(img)
    img = sharpen_image(img)
    return img



In [29]:
# feature 함수
def extract_features(img):
    """
    img: 2D np.uint8 grayscale image
    returns: L2-normalized 1D feature vector combining LBP, HOG, SIFT, and Laws energy.
    """
    # 1) 입력이 이미 그레이스케일이라 가정, uint8 보장
    gray = img.astype(np.uint8)

    # 2) 개별 피처 추출
    lbp_vec  = extract_lbp_features(gray, radius=1, method='uniform')
    hog_vec  = extract_hog_descriptors(gray)
    desc     = extract_sift_descriptors(gray)  # 반드시 uint8
    if desc is not None and len(desc) > 0:
        sift_vec = desc.mean(axis=0).astype(np.float32)  # 128D
    else:
        sift_vec = np.zeros(128, dtype=np.float32)
    laws_vec = extract_laws_energy_features(gray, window_size=15)

    # 3) 그룹별 정규화
    norm = np.linalg.norm(lbp_vec, ord=1)
    if norm > 0: lbp_vec /= norm

    norm = np.linalg.norm(hog_vec, ord=2)
    if norm > 0: hog_vec /= norm

    norm = np.linalg.norm(sift_vec, ord=2)
    if norm > 0: sift_vec /= norm

    norm = np.linalg.norm(laws_vec, ord=2)
    if norm > 0: laws_vec /= norm

    # 4) 최종 벡터 합치고 전체 L2 정규화
    feat = np.concatenate([lbp_vec, hog_vec, sift_vec, laws_vec]).astype(np.float32)
    norm = np.linalg.norm(feat, ord=2)
    if norm > 0:
        feat /= norm

    return feat

In [30]:
# k-means clustering
def kmeans_clustering(features, n_clusters=10):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(features)
    return kmeans, cluster_labels

In [31]:
# classifier (knn)

def chi2_distance(x, y):
    """
    Chi-square distance for histogram features.
    Returns 0.5 * sum((x - y)^2 / (x + y + eps)).
    """
    eps = 1e-10
    return 0.5 * np.sum((x - y) ** 2 / (x + y + eps))

def train_knn_classifier(features, labels, n_neighbors=5, weights='distance', metric='chi2'):
    if metric == 'chi2':
        metric_fn = chi2_distance
    else:
        metric_fn = metric  # can be 'euclidean', 'cosine', or a callable

    knn = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights,
        metric=metric_fn
    )
    knn.fit(features, labels)
    return knn

In [32]:
# test

def test(model, X_test, y_test, average='weighted'): # <- average 파라미터 추가
    """
    model: train()이 반환한 KNeighborsClassifier
    X_test: train()이 반환한 테스트 feature 벡터
    y_test: train()이 반환한 테스트 라벨
    Prints:
        - 전체 정확도
        - 각 클래스별 리포트
    Returns:
        y_pred: 예측된 라벨 배열
    """
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # average 파라미터를 precision_score와 recall_score에 전달
    prec = precision_score(y_test, y_pred, average=average, zero_division=0)
    rec  = recall_score(y_test, y_pred, average=average, zero_division=0)
    print(f"[Test Accuracy] {acc:.4f}")
    print(classification_report(y_test, y_pred))
    return y_pred

In [35]:
# 결과 저장용
accuracies = []
precisions = []
recalls = []

for f in folds:
    print(f"\n=== Fold {f['fold']} ===")

    # 1) train set에서 feature 추출 & 학습 (병렬 처리 적용)
    print(f"  ▶ Train feature extraction (Fold {f['fold']})...")
    # n_jobs=-1 은 사용 가능한 모든 CPU 코어를 사용하겠다는 의미입니다.
    X_train_feats = Parallel(n_jobs=-1)(
        delayed(lambda img: extract_features(preprocess_image(img)))(img)
        for img in f['X_train']
    )
    X_train_feats = np.array(X_train_feats) # 리스트를 numpy 배열로 변환
    print(f"  ✔ Train feature extraction complete for Fold {f['fold']}.")

    knn = train_knn_classifier(
        X_train_feats, # 이미 numpy 배열입니다.
        f['y_train'],
        n_neighbors=5
    )

    # 2) test set에서 feature 추출 & 평가 (병렬 처리 적용)
    print(f"  ▶ Test feature extraction (Fold {f['fold']})...")
    X_test_feats = Parallel(n_jobs=-1)(
        delayed(lambda img: extract_features(preprocess_image(img)))(img)
        for img in f['X_test']
    )
    X_test_feats = np.array(X_test_feats) # 리스트를 numpy 배열로 변환
    print(f"  ✔ Test feature extraction complete for Fold {f['fold']}.")


    # test() 함수는 precision/recall/accuracy와 classification report를 출력하고,
    # y_pred를 리턴합니다.
    y_pred = test(
        knn,
        X_test_feats, # 이미 numpy 배열입니다.
        f['y_test'],
        average='weighted'
    )

    # 3) 필요하면 fold별 지표를 저장
    # (test() 내부에서 이미 print 되지만, 나중에 평균 낼 때를 위해)
    from sklearn.metrics import accuracy_score, precision_score, recall_score
    acc  = accuracy_score(f['y_test'], y_pred)
    prec = precision_score(f['y_test'], y_pred, average='weighted', zero_division=0)
    rec  = recall_score(f['y_test'], y_pred, average='weighted', zero_division=0)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)

# 4) 5-fold 평균 지표 출력
print("\n=== 5-Fold CV Average ===")
print(f"Accuracy : {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} ± {np.std(precisions):.4f}")
print(f"Recall   : {np.mean(recalls):.4f} ± {np.std(recalls):.4f}")


=== Fold 0 ===
  ▶ Train feature extraction (Fold 0)...


KeyboardInterrupt: 