In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mikhailma/test-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/mikhailma/test-dataset?dataset_version_number=1...


100%|██████████| 393M/393M [00:15<00:00, 27.4MB/s] 

Extracting files...





Path to dataset files: C:\Users\bvb09\.cache\kagglehub\datasets\mikhailma\test-dataset\versions\1


In [1]:
# 기본 연산/데이터 처리
import os
import glob
import numpy as np
import pandas as pd

# 이미지 처리
import cv2
from skimage import color, exposure, filters, morphology, feature
from skimage.util import img_as_ubyte

# 특징 추출
from skimage.feature import local_binary_pattern, hog
from skimage.filters import gaussian, sobel, unsharp_mask
from skimage.restoration import denoise_bilateral
from skimage.morphology import disk, closing, opening
import mahotas


# 머신러닝
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder


# 시각화
import matplotlib.pyplot as plt

# SIFT (OpenCV contrib 필요)
sift = cv2.SIFT_create()


In [2]:
# 데이터 로드
base_path = "C:/Users/bvb09/.cache/kagglehub/datasets/mikhailma/test-dataset/versions/1/Google_Recaptcha_V2_Images_Dataset/images"

data = []

# base_path 안에 있는 모든 하위 폴더 이름을 가져와 라벨로 지정
label_folders = [f.name for f in os.scandir(base_path) if f.is_dir()]

# 각 라벨 폴더를 순회합니다.
for label in label_folders:
    # 현재 라벨에 해당하는 폴더의 전체 경로를 만듭니다.
    label_folder_path = os.path.join(base_path, label)
    
    # 해당 라벨 폴더 안의 모든 .png 이미지 파일 경로를 가져옵니다.
    image_paths = glob.glob(os.path.join(label_folder_path, "*.png")) # 모든 이미지가 .png라고 가정합니다.

    # 찾아낸 각 이미지 경로를 순회하며 이미지를 불러옵니다.
    for fp in image_paths:
        img = cv2.imread(fp)
        if img is not None:  
            img = cv2.resize(img, (200, 200)) 
            data.append((img, label)) # 이미지와 해당 폴더 이름을 라벨로 추가합니다.
        else:
            print(f"경고: 이미지를 로드할 수 없습니다: {fp}")

# 리스트에 담긴 데이터를 Pandas DataFrame으로 변환합니다.
df = pd.DataFrame(data, columns=["image", "label"])

# DataFrame의 처음 몇 줄을 출력하여 데이터를 확인합니다.
print(df.head())
print(f"총 로드된 이미지 수: {len(df)}")



                                               image    label
0  [[[104, 114, 117], [103, 112, 116], [101, 110,...  Bicycle
1  [[[67, 75, 54], [68, 75, 55], [69, 76, 58], [7...  Bicycle
2  [[[27, 33, 38], [27, 33, 38], [28, 34, 37], [2...  Bicycle
3  [[[51, 49, 49], [54, 52, 51], [58, 56, 55], [6...  Bicycle
4  [[[125, 128, 135], [127, 129, 138], [129, 130,...  Bicycle
총 로드된 이미지 수: 11730


In [3]:
# Stratified 5-fold cross validation

# 이미지, 라벨 나누기
X = np.array([img for img, _ in data])
labels = [label for _, label in data]

# 라벨 인코딩
le = LabelEncoder()
y = le.fit_transform(labels)

# Stratified K-Fold 준비
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# fold 저장용 리스트
folds = []

for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    folds.append({
        'fold': fold_idx,
        'X_train': X[train_idx],
        'y_train': y[train_idx],
        'X_test': X[test_idx],
        'y_test': y[test_idx]
    })
    
    print(f"[Fold {fold_idx}] ▶ Train: {len(train_idx)}개, Test: {len(test_idx)}개")



[Fold 0] ▶ Train: 9384개, Test: 2346개
[Fold 1] ▶ Train: 9384개, Test: 2346개
[Fold 2] ▶ Train: 9384개, Test: 2346개
[Fold 3] ▶ Train: 9384개, Test: 2346개
[Fold 4] ▶ Train: 9384개, Test: 2346개


In [4]:
import collections

for f in folds:
    counter = collections.Counter(f['y_test'])
    print(f"Fold {f['fold']} 라벨 분포: {dict(counter)}")


Fold 0 라벨 분포: {np.int64(0): 156, np.int64(1): 107, np.int64(2): 242, np.int64(3): 711, np.int64(4): 25, np.int64(5): 248, np.int64(6): 191, np.int64(7): 16, np.int64(8): 268, np.int64(9): 182, np.int64(10): 42, np.int64(11): 158}
Fold 1 라벨 분포: {np.int64(0): 156, np.int64(1): 107, np.int64(2): 242, np.int64(3): 711, np.int64(4): 25, np.int64(5): 248, np.int64(6): 190, np.int64(7): 17, np.int64(8): 268, np.int64(9): 182, np.int64(10): 42, np.int64(11): 158}
Fold 2 라벨 분포: {np.int64(0): 156, np.int64(1): 107, np.int64(2): 241, np.int64(3): 712, np.int64(4): 25, np.int64(5): 248, np.int64(6): 190, np.int64(7): 16, np.int64(8): 268, np.int64(9): 183, np.int64(10): 42, np.int64(11): 158}
Fold 3 라벨 분포: {np.int64(0): 156, np.int64(1): 106, np.int64(2): 242, np.int64(3): 712, np.int64(4): 25, np.int64(5): 248, np.int64(6): 190, np.int64(7): 16, np.int64(8): 268, np.int64(9): 182, np.int64(10): 43, np.int64(11): 158}
Fold 4 라벨 분포: {np.int64(0): 156, np.int64(1): 106, np.int64(2): 242, np.int64(3)

In [None]:
# train

In [None]:
# test

In [8]:
# Preprocessing

# Point Processing
def contrast_stretch(image):
    # 2% ~ 98% 범위로 contrast stretching
    p2, p98 = np.percentile(image, (2, 98))
    return exposure.rescale_intensity(image, in_range=(p2, p98))

# Gray scale / HSI 변환
def to_grayscale(image):
    return color.rgb2gray(image)

def to_hsi(image):
    hsv = color.rgb2hsv(image)
    return hsv[:, :, 0], hsv[:, :, 1], hsv[:, :, 2]  # hue, sat, intensity

# Histogram Equalization (area processing)
def histogram_equalization(image):
    return exposure.equalize_hist(image)

# Noise Filtering 
def remove_noise(image):
    image = gaussian(image, sigma=1)
    return denoise_bilateral(image, sigma_color=0.05, sigma_spatial=15, channel_axis=None)

# Edge Detection 
def edge_detection(image_gray):
    return sobel(image_gray)

# Sharpening 
def sharpen_image(image_gray):
    return unsharp_mask(image_gray, radius=1, amount=1)

# Morphological Operators 
def morphological_close(image_binary, selem_size=3):
    selem = disk(selem_size)
    return morphology.closing(image_binary, selem)

def morphological_open(image_binary, selem_size=3):
    selem = disk(selem_size)
    return morphology.opening(image_binary, selem)


In [9]:
# LBP feature 추출 함수
def extract_lbp_features(image_gray, radius=1, method='uniform'):
    n_points = 8 * radius
    lbp = feature.local_binary_pattern(image_gray, n_points, radius, method)
    n_bins = int(lbp.max() + 1)
    hist, _ = np.histogram(lbp.ravel(), bins=n_bins, range=(0, n_bins), density=True)
    return hist

In [10]:
# 전처리 함수
def preprocess_image(img, sigma=1.0):
    img = contrast_stretch(img)
    img = to_grayscale(img)
    img = histogram_equalization(img)
    img = remove_noise(img)
    img = edge_detection(img)
    return img

In [None]:
# texture 함수
def texture_features(img):
    return extract_lbp_features(img)


In [None]:
# shape 함수