## 데이터 분석

In [None]:
import torch
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# 1. Load the train/test splits.
# Each split lives in its own sub-folder under `data_dir`, one folder per class.
data_dir = "data/dress"  # parent folder holding the split directories
class_names = ['dispose', 'recycle', 'reusable']

# Only tensor conversion is applied; no resizing or normalisation at this stage.
transform = transforms.Compose([transforms.ToTensor()])

train_root = f"{data_dir}/train"
test_root = f"{data_dir}/test"
tr = datasets.ImageFolder(root=train_root, transform=transform)
te = datasets.ImageFolder(root=test_root, transform=transform)

print(f"Train samples: {len(tr)}")
print(f"Test samples: {len(te)}")
print(f"Classes: {tr.classes}")

# 2. Show one example image together with its numeric and readable label.
img, label = tr[1100]
real_label = class_names[label]

# The tensor is permuted from channel-first to channel-last for matplotlib.
fig, ax = plt.subplots()
ax.imshow(img.permute(1, 2, 0))
ax.set_title(f"Label: {label}, real_name: {real_label}")
ax.axis("off")
plt.show()


In [None]:
import os, cv2, time
# Quick sanity check: count the image files under the training folder and
# time how long OpenCV takes to decode the first few of them.
data_dir = "data/dress/train"

# Match extensions case-insensitively and include the common variants, so
# files such as "IMG_001.JPG" or "photo.jpeg" are not silently left out of
# the total.
image_exts = (".jpg", ".jpeg", ".png")
files = [
    os.path.join(root, f)
    for root, _, fs in os.walk(data_dir)
    for f in fs
    if f.lower().endswith(image_exts)
]
print("총 이미지:", len(files))

t0 = time.perf_counter()
for p in files[:10]:
    img = cv2.imread(p)
    # cv2.imread returns None instead of raising when a file cannot be decoded.
    if img is None:
        print("로드 실패:", p)
print("10장 처리 시간:", time.perf_counter()-t0, "초")


In [None]:
import os, cv2, numpy as np, torch, time
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# Exploratory analysis of the training split: class balance, image
# dimensions, brightness/contrast, blur score and per-channel mean/std.
# Each stage prints its own elapsed time; the total is printed at the end.

# === Start overall timer ===
t0 = time.perf_counter()

# === 1. Load dataset ===
start = time.perf_counter()
data_dir = "data/dress"
dataset = datasets.ImageFolder(root=f"{data_dir}/train", transform=transforms.ToTensor())
print(f"Total images: {len(dataset)} | Classes: {dataset.classes}")
print(f"⏱ Load dataset: {time.perf_counter() - start:.2f}s")

# === 2. Class distribution ===
start = time.perf_counter()
# ImageFolder keeps one class index per sample in `targets`; reading it avoids
# decoding and transforming every image just to throw the pixels away.
labels = list(dataset.targets)
sns.countplot(x=labels)
plt.title('Class Distribution'); plt.show()
print(f"⏱ Class distribution: {time.perf_counter() - start:.2f}s")

# === 3. Image size distribution ===
start = time.perf_counter()
img_sizes = []
for path, _ in dataset.samples:
    # PIL reports (width, height) for the opened image.
    with Image.open(path) as img:
        img_sizes.append(img.size)
widths, heights = zip(*img_sizes)
plt.hist(widths, bins=30, alpha=0.7, label='width')
plt.hist(heights, bins=30, alpha=0.7, label='height')
plt.legend(); plt.title('Image Dimension Distribution'); plt.show()
print(f"⏱ Image size distribution: {time.perf_counter() - start:.2f}s")

# === 4. Brightness / Contrast ===
start = time.perf_counter()
brightness, contrast = [], []
for path, _ in dataset.samples:
    # Open inside a context manager so the file handle is released per image
    # instead of accumulating across the whole dataset.
    with Image.open(path) as pil_img:
        img = np.array(pil_img.convert('L'))
    # Mean grayscale intensity is used as brightness, its std as contrast.
    brightness.append(np.mean(img))
    contrast.append(np.std(img))
plt.scatter(brightness, contrast, alpha=0.5)
plt.xlabel('Brightness'); plt.ylabel('Contrast'); plt.title('Brightness vs Contrast'); plt.show()
print(f"⏱ Brightness/Contrast: {time.perf_counter() - start:.2f}s")

# === 5. Blur score ===
start = time.perf_counter()
blur_scores = []
for path, _ in dataset.samples:
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    # Files OpenCV cannot decode come back as None and are skipped.
    if img is not None:
        # Variance of the Laplacian response is the blur score.
        blur_scores.append(cv2.Laplacian(img, cv2.CV_64F).var())
plt.hist(blur_scores, bins=50)
plt.title('Blur Score Distribution'); plt.xlabel('Variance'); plt.show()
print(f"⏱ Blur analysis: {time.perf_counter() - start:.2f}s")

# === 6. Mean / Std ===
start = time.perf_counter()
# Stream one image at a time and accumulate per-channel sums. Collating the
# whole dataset into a single batch needs memory for every image at once and
# fails when images differ in size, since no resize transform is applied.
loader = DataLoader(dataset, batch_size=1, shuffle=False)
channel_sum = 0.0
channel_sq_sum = 0.0
pixel_count = 0
for imgs, _ in loader:
    # Accumulate in float64 to limit rounding error over many pixels.
    pixels = imgs.double()
    channel_sum = channel_sum + pixels.sum(dim=[0, 2, 3])
    channel_sq_sum = channel_sq_sum + pixels.pow(2).sum(dim=[0, 2, 3])
    pixel_count += pixels.shape[0] * pixels.shape[2] * pixels.shape[3]
mean = channel_sum / pixel_count
# Unbiased (n - 1) variance, matching the default of torch.Tensor.std.
# The clamp guards against tiny negative values caused by rounding.
variance = (channel_sq_sum - pixel_count * mean ** 2) / (pixel_count - 1)
std = variance.clamp(min=0).sqrt().float()
mean = mean.float()
print(f"Mean per channel: {mean} | Std per channel: {std}")
print(f"⏱ Mean/Std calc: {time.perf_counter() - start:.2f}s")

# === Overall time summary ===
print(f"\n✅ Total elapsed time: {time.perf_counter() - t0:.2f} seconds")


## 데이터 전처리