In [4]:
#데이터를 불러오기

import torch
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# 1) 전처리(transform) 정의 (예: 크기 조정 + 텐서 변환)
transform = transforms.Compose([
    transforms.Resize((224, 224)),      # 모델 입력 크기에 맞춰주세요
    transforms.RandomHorizontalFlip(),  # 학습 시 간단한 증강
    transforms.ToTensor(),              # PIL → Tensor
    transforms.Normalize(               # ImageNet 통계 기준 정규화 (필요시)
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

# 2) 데이터셋 로드
train_dataset = ImageFolder(
    root= '/kaggle/input/car-zip-data/train',  # 상위 폴더 경로
    transform=transform                   # 위에서 정의한 전처리
)

# 3) 클래스명 ↔ 인덱스 매핑 확인
print(train_dataset.class_to_idx)
# 예: {'avante': 0, 'bongo': 1, …}

# 4) DataLoader 생성
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,   # I/O 병렬 처리 개수 (CPU 코어 수에 맞게 조절)
    pin_memory=True  # GPU 사용 시 메모리 고정하면 속도 향상
)

# 5) 배치 확인
images, labels = next(iter(train_loader))
print(images.shape, labels.shape)
# 예: torch.Size([32, 3, 224, 224]) torch.Size([32])

{'1시리즈_F20_2013_2015': 0, '1시리즈_F20_2016_2019': 1, '1시리즈_F40_2020_2024': 2, '2008_2015_2017': 3, '2시리즈_그란쿠페_F44_2020_2024': 4, '2시리즈_액티브_투어러_F45_2019_2021': 5, '2시리즈_액티브_투어러_U06_2022_2024': 6, '3008_2세대_2018_2023': 7, '3시리즈_E90_2005_2012': 8, '3시리즈_F30_2013_2018': 9, '3시리즈_G20_2019_2022': 10, '3시리즈_G20_2023_2025': 11, '3시리즈_GT_F34_2014_2021': 12, '4시리즈_F32_2014_2020': 13, '4시리즈_G22_2021_2023': 14, '4시리즈_G22_2024_2025': 15, '5008_2세대_2018_2019': 16, '5008_2세대_2021_2024': 17, '5시리즈_F10_2010_2016': 18, '5시리즈_G30_2017_2023': 19, '5시리즈_G60_2024_2025': 20, '5시리즈_GT_F07_2010_2017': 21, '6시리즈_F12_2011_2018': 22, '6시리즈_GT_G32_2018_2020': 23, '6시리즈_GT_G32_2021_2024': 24, '718_박스터_2017_2024': 25, '718_카이맨_2017_2024': 26, '7시리즈_F01_2009_2015': 27, '7시리즈_G11_2016_2018': 28, '7시리즈_G11_2019_2022': 29, '7시리즈_G70_2023_2025': 30, '8시리즈_G15_2020_2024': 31, '911_2003_2019': 32, '911_992_2020_2024': 33, 'A4_B9_2016_2019': 34, 'A4_B9_2020_2024': 35, 'A5_F5_2019_2024': 36, 'A6_C8_2019_2025': 37, 'A7_2012_201

In [5]:
from collections import Counter
import pandas as pd

# 1) 클래스 목록과 인덱스 확인
classes = train_dataset.classes
num_classes = len(classes)

# 2) 전체 이미지 수
total_images = len(train_dataset)

# 3) 클래스별 이미지 개수 계산
#    train_dataset.targets 에는 각 샘플의 레이블 인덱스가 리스트로 들어 있습니다.
class_counts = Counter(train_dataset.targets)

# 4) 보기 좋게 DataFrame 으로 정리
df = pd.DataFrame({
    'class_name': classes,
    'count': [class_counts[i] for i in range(num_classes)]
})

# 5) 출력
print(f'클래스 수: {num_classes}')
print(f'전체 이미지 수: {total_images}\n')
print('클래스별 이미지 개수:')
print(df)

클래스 수: 396
전체 이미지 수: 33137

클래스별 이미지 개수:
                  class_name  count
0         1시리즈_F20_2013_2015     86
1         1시리즈_F20_2016_2019     83
2         1시리즈_F40_2020_2024     79
3             2008_2015_2017     86
4    2시리즈_그란쿠페_F44_2020_2024     87
..                       ...    ...
391          팰리세이드_2019_2022     83
392           팰리세이드_LX3_2025     75
393       프리우스_4세대_2016_2018     80
394       프리우스_4세대_2019_2022     90
395         프리우스_C_2018_2020     86

[396 rows x 2 columns]


In [7]:
import os
import torch
import numpy as np
import pandas as pd
from torchvision import models, transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from sklearn.covariance import EmpiricalCovariance
from tqdm.auto import tqdm  # tqdm 불러오기

# 1) 설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data_dir = '/kaggle/input/car-zip-data/train'
batch_size = 32
num_workers = 4

# 2) 전처리
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406],
                         std=[0.229,0.224,0.225])
])

# 3) Dataset & DataLoader
dataset = ImageFolder(root=data_dir, transform=transform)
loader  = DataLoader(dataset, batch_size=batch_size,
                     shuffle=False, num_workers=num_workers)

# 4) ResNet50 특징 추출기 준비
resnet = models.resnet50(pretrained=True)
feature_extractor = torch.nn.Sequential(*list(resnet.children())[:-1]).to(device)
feature_extractor.eval()

# 5) 특징 벡터 추출
print("▶ 5) Feature Extraction 시작")
features, labels, paths = [], [], []
with torch.no_grad():
    for imgs, labs in tqdm(loader, desc="Extracting features"):
        imgs = imgs.to(device)
        out  = feature_extractor(imgs)                   # [B,2048,1,1]
        out  = out.view(out.size(0), -1).cpu().numpy()    # [B,2048]
        features.append(out)
        labels.append(labs.numpy())
        start = len(paths)
        for i in range(out.shape[0]):
            paths.append(dataset.samples[start + i][0])
features = np.vstack(features)
labels   = np.hstack(labels)
print("✔ Feature Extraction 완료\n")

# 6) Mahalanobis 거리 계산
print("▶ 6) Mahalanobis 거리 계산 시작")
df_results = []
for cls_idx, cls_name in tqdm(enumerate(dataset.classes),
                              total=len(dataset.classes),
                              desc="Computing Mahalanobis"):
    cls_feats = features[labels == cls_idx]
    if len(cls_feats) < 2:
        continue
    cov = EmpiricalCovariance().fit(cls_feats)
    dists = cov.mahalanobis(cls_feats)
    df_cls = pd.DataFrame({
        'path': paths,
        'label': cls_name,
        'dist': np.nan
    })
    df_cls.loc[labels == cls_idx, 'dist'] = dists
    df_results.append(df_cls[df_cls['label'] == cls_name])
df_all = pd.concat(df_results, ignore_index=True)
print("✔ Mahalanobis 계산 완료\n")

# 7) 이상치 필터링
print("▶ 7) 이상치 필터링 시작")
outliers = []
for cls in tqdm(dataset.classes, desc="Filtering outliers"):
    d = df_all.loc[df_all.label == cls, 'dist']
    thr = d.mean() + 3 * d.std()
    outliers.append(df_all[(df_all.label == cls) & (df_all.dist > thr)])
df_outliers = pd.concat(outliers, ignore_index=True)
print(f"✔ 이상치 필터링 완료  (총 {len(df_outliers)}개 샘플)\n")

# 8) 결과 출력 & 저장
print(f"전체 샘플 수: {len(df_all)}")
print(f"탐지된 이상치 수: {len(df_outliers)}\n")
print("이상치 예시:")
print(df_outliers.head(), "\n")
df_outliers.to_csv('detected_outliers.csv', index=False)
print("→ detected_outliers.csv에 저장되었습니다.")

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 235MB/s]


▶ 5) Feature Extraction 시작


Extracting features:   0%|          | 0/1036 [00:00<?, ?it/s]

✔ Feature Extraction 완료

▶ 6) Mahalanobis 거리 계산 시작


Computing Mahalanobis:   0%|          | 0/396 [00:00<?, ?it/s]

✔ Mahalanobis 계산 완료

▶ 7) 이상치 필터링 시작


Filtering outliers:   0%|          | 0/396 [00:00<?, ?it/s]

✔ 이상치 필터링 완료  (총 15개 샘플)

전체 샘플 수: 13122252
탐지된 이상치 수: 15

이상치 예시:
                                                path                 label  \
0  /kaggle/input/car-zip-data/train/E_클래스_W213_20...  E_클래스_W213_2017_2020   
1  /kaggle/input/car-zip-data/train/G90_2019_2022...         G90_2019_2022   
2  /kaggle/input/car-zip-data/train/New_XF_2012_2...      New_XF_2012_2015   
3  /kaggle/input/car-zip-data/train/뉴_QM6_2021_20...       뉴_QM6_2021_2023   
4  /kaggle/input/car-zip-data/train/더_뉴_K5_2세대_20...  더_뉴_K5_2세대_2019_2020   

        dist  
0  86.000015  
1  76.000013  
2  84.000015  
3  58.000011  
4  83.000014   

→ detected_outliers.csv에 저장되었습니다.


In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- 1. 설정 (Configuration) ---
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

data_dir = '/kaggle/input/car-zip-data/train'
outliers_csv_path = 'detected_outliers.csv' # 이전 스크립트에서 저장된 이상치 파일
batch_size = 32
num_workers = 4
num_epochs = 30 # 충분한 epoch 수 (Early Stopping으로 조절)
learning_rate = 0.001
model_save_path = 'best_car_classifier_model.pth'

# --- 2. 데이터 전처리 (Data Preprocessing) ---
input_size = 224 # ViT-B/16 모델의 표준 입력 크기 (대부분의 ViT는 224 또는 384)

# 훈련 시 적용할 증강 및 정규화
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(input_size), # 224 대신 input_size 변수 사용
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 검증 시 적용할 전처리 (증강 없음)
val_transform = transforms.Compose([
    # ViT는 224x224가 표준이므로, 256으로 리사이즈 후 224로 중앙 크롭하는 기존 방식이 적합합니다.
    transforms.Resize(256),
    transforms.CenterCrop(input_size), # 224 대신 input_size 변수 사용
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# --- 3. 데이터셋 로드 및 이상치 필터링 ---
print("▶ 3) 데이터셋 로드 및 이상치 필터링 시작")
full_dataset = ImageFolder(root=data_dir, transform=None) # 초기에는 transform 적용 안함

# 이상치 파일 로드
if os.path.exists(outliers_csv_path):
    df_outliers = pd.read_csv(outliers_csv_path)
    outlier_paths = set(df_outliers['path'].tolist())
    print(f"Loaded {len(outlier_paths)} outlier paths from {outliers_csv_path}")
else:
    outlier_paths = set()
    print(f"Warning: {outliers_csv_path} not found. Proceeding without outlier filtering.")

# 이상치를 제외한 데이터 인덱스 필터링
non_outlier_indices = []
for i, (path, _) in enumerate(full_dataset.samples):
    if path not in outlier_paths:
        non_outlier_indices.append(i)

# 필터링된 데이터셋 생성
filtered_dataset = Subset(full_dataset, non_outlier_indices)
print(f"Original dataset size: {len(full_dataset)}")
print(f"Filtered dataset size (after removing outliers): {len(filtered_dataset)}")

# 데이터셋을 훈련/검증 세트로 분할
train_indices, val_indices = train_test_split(
    list(range(len(filtered_dataset))),
    test_size=0.2, # 20%를 검증 세트로 사용
    random_state=42, # 재현성을 위해 시드 고정
    stratify=[filtered_dataset.dataset.samples[i][1] for i in non_outlier_indices] # 클래스 비율 유지
)

# 분할된 인덱스를 사용하여 실제 훈련/검증 데이터셋 생성 (전처리 적용)
train_dataset = Subset(full_dataset, [non_outlier_indices[i] for i in train_indices])
val_dataset = Subset(full_dataset, [non_outlier_indices[i] for i in val_indices])

# 각 서브셋에 맞는 transform 적용
train_dataset.dataset.transform = train_transform
val_dataset.dataset.transform = val_transform

# DataLoader 생성
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False, # 검증 시에는 섞을 필요 없음
    num_workers=num_workers,
    pin_memory=True
)
print("✔ 데이터셋 로드 및 분할 완료\n")

# 클래스명 ↔ 인덱스 매핑 확인 (원본 dataset에서 가져옴)
class_to_idx = full_dataset.class_to_idx
idx_to_class = {v: k for k, v in class_to_idx.items()}
num_classes = len(class_to_idx)
print(f"Number of classes: {num_classes}")
print(f"Class mapping: {class_to_idx}\n")

# 4. 모델 정의 (ViT-B/16 Fine-tuning)
model = models.vit_b_16(pretrained=True) # <-- 이 라인을 수정합니다.

# ViT 모델의 마지막 분류 레이어는 'heads' 아래에 'head'로 있습니다.
num_ftrs = model.heads.head.in_features # <-- 이 라인을 수정합니다.
model.heads.head = nn.Linear(num_ftrs, num_classes) # <-- 이 라인을 수정합니다.
model = model.to(device)
print("✔ 모델 정의 완료 (ViT-B/16 Fine-tuning)\n")

# --- 5. 손실 함수, 옵티마이저, 스케줄러 정의 ---
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5) # AdamW는 L2 정규화 포함
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=5) # 5 epoch 동안 val_acc 개선 없으면 LR 0.1배 감소

# --- 6. 모델 훈련 (Model Training) ---
print("▶ 6) 모델 훈련 시작")
best_val_accuracy = 0.0
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

for epoch in range(num_epochs):
    # 훈련 단계
    model.train() # 훈련 모드
    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    # 여기에 tqdm이 적용되어 있습니다.
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} Train"):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs.data, 1)
        total_samples += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

    epoch_train_loss = running_loss / len(train_dataset)
    epoch_train_accuracy = correct_predictions / total_samples
    train_losses.append(epoch_train_loss)
    train_accuracies.append(epoch_train_accuracy)

    # 검증 단계
    model.eval() # 평가 모드
    val_running_loss = 0.0
    val_correct_predictions = 0
    val_total_samples = 0

    with torch.no_grad():
        # 그리고 여기에도 tqdm이 적용되어 있습니다.
        for inputs, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} Val"):
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            val_running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            val_total_samples += labels.size(0)
            val_correct_predictions += (predicted == labels).sum().item()

    epoch_val_loss = val_running_loss / len(val_dataset)
    epoch_val_accuracy = val_correct_predictions / val_total_samples
    val_losses.append(epoch_val_loss)
    val_accuracies.append(epoch_val_accuracy)

    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_accuracy:.4f} | "
          f"Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_accuracy:.4f}")

    scheduler.step(epoch_val_accuracy)

    if epoch_val_accuracy > best_val_accuracy:
        best_val_accuracy = epoch_val_accuracy
        torch.save(model.state_dict(), model_save_path)
        print(f"⭐ Saved best model with Val Acc: {best_val_accuracy:.4f} ⭐")

print("\n✔ 모델 훈련 완료")
print(f"Best validation accuracy: {best_val_accuracy:.4f}")
print(f"Trained model saved to: {model_save_path}\n")

# --- 7. 훈련 과정 시각화 (Optional) ---
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title('Loss over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label='Train Accuracy')
plt.plot(val_accuracies, label='Validation Accuracy')
plt.title('Accuracy over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

# --- 8. 저장된 모델 로드 및 테스트 (선택 사항) ---
print("▶ 8) 저장된 모델 로드 및 간단 테스트 시작")
loaded_model = models.resnet50(pretrained=False) # 가중치 없이 모델 구조만 로드
loaded_model.fc = nn.Linear(num_ftrs, num_classes)
loaded_model.load_state_dict(torch.load(model_save_path))
loaded_model = loaded_model.to(device)
loaded_model.eval() # 평가 모드로 설정

# 검증 세트에서 최종 정확도 확인
correct = 0
total = 0
with torch.no_grad():
    for images, labels in tqdm(val_loader, desc="Testing Loaded Model"):
        images = images.to(device)
        labels = labels.to(device)
        outputs = loaded_model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the best model on the validation set: {100 * correct / total:.2f}%')
print("✔ 테스트 완료")

Using device: cuda
▶ 3) 데이터셋 로드 및 이상치 필터링 시작
Loaded 15 outlier paths from detected_outliers.csv
Original dataset size: 33137
Filtered dataset size (after removing outliers): 33122
✔ 데이터셋 로드 및 분할 완료

Number of classes: 396
Class mapping: {'1시리즈_F20_2013_2015': 0, '1시리즈_F20_2016_2019': 1, '1시리즈_F40_2020_2024': 2, '2008_2015_2017': 3, '2시리즈_그란쿠페_F44_2020_2024': 4, '2시리즈_액티브_투어러_F45_2019_2021': 5, '2시리즈_액티브_투어러_U06_2022_2024': 6, '3008_2세대_2018_2023': 7, '3시리즈_E90_2005_2012': 8, '3시리즈_F30_2013_2018': 9, '3시리즈_G20_2019_2022': 10, '3시리즈_G20_2023_2025': 11, '3시리즈_GT_F34_2014_2021': 12, '4시리즈_F32_2014_2020': 13, '4시리즈_G22_2021_2023': 14, '4시리즈_G22_2024_2025': 15, '5008_2세대_2018_2019': 16, '5008_2세대_2021_2024': 17, '5시리즈_F10_2010_2016': 18, '5시리즈_G30_2017_2023': 19, '5시리즈_G60_2024_2025': 20, '5시리즈_GT_F07_2010_2017': 21, '6시리즈_F12_2011_2018': 22, '6시리즈_GT_G32_2018_2020': 23, '6시리즈_GT_G32_2021_2024': 24, '718_박스터_2017_2024': 25, '718_카이맨_2017_2024': 26, '7시리즈_F01_2009_2015': 27, '7시리즈_G11_2016_20

Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|██████████| 330M/330M [00:01<00:00, 197MB/s] 


✔ 모델 정의 완료 (ViT-B/16 Fine-tuning)

▶ 6) 모델 훈련 시작


Epoch 1/30 Train:   0%|          | 0/829 [00:00<?, ?it/s]

Epoch 1/30 Val:   0%|          | 0/208 [00:00<?, ?it/s]

Epoch 1/30 | Train Loss: 6.0028, Train Acc: 0.0035 | Val Loss: 5.8857, Val Acc: 0.0050
⭐ Saved best model with Val Acc: 0.0050 ⭐


Epoch 2/30 Train:   0%|          | 0/829 [00:00<?, ?it/s]

Epoch 2/30 Val:   0%|          | 0/208 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7c0bf23a9620>
Exception ignored in: Traceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__del__ at 0x7c0bf23a9620>  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__

    Traceback (most recent call last):
self._shutdown_workers()  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__

  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    self._shutdown_workers()Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7c0bf23a9620>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1564, in _shutdown_workers
    