# 🚀 2강 (CPU 실습 + GPU 옵션): **DeiT-Tiny 전이학습** — 개념 → 데이터 준비 → 구조 탐색 → 학습 모니터링 → Attention → 토론/챌린지

> **촬영용 스크립트 가이드** 포함. CPU에서는 **Feature Extraction**까지만 실행, **Fine-tuning은 GPU 옵션**으로 결과만 설명하세요.


## 0. 환경 준비

In [None]:
# !pip install -q transformers datasets accelerate evaluate
import os, numpy as np, random, matplotlib.pyplot as plt, torch, torch.nn.functional as F
from torchvision import datasets as tvdatasets
from transformers import AutoImageProcessor, AutoModelForImageClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print('Device:', device)


def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator (default 42).

    When CUDA is available, all CUDA devices are seeded as well.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

## 1. 전이학습 개념 도입 (≈5분)

**말하기 포인트:**  
- **ImageNet**: 1천 개 클래스, 수백만 장 이미지 → 일반적 시각 패턴을 미리 학습  
- **비유**: 영어를 잘하면 독일어도 빨리 배운다(문자/문법/표현 공유)  
- **전략 비교**: Feature Extraction(백본 freeze) vs Fine-tuning(일부/전체 unfreeze)


## 2. 데이터 준비 심화 (≈5분)

**질문:** “작은 데이터로도 학습이 가능할까요?”  
- CIFAR-10 이미지를 224×224로 resize (모델 입력 규격)  
- 원본 vs resize 비교 시각화


In [None]:
# Open (downloading if necessary) the CIFAR-10 train and test splits under
# `root`; no transform is attached, so samples come back exactly as stored.
root = './data'
train_full, test_set = (
    tvdatasets.CIFAR10(root=root, train=is_train, download=True)
    for is_train in (True, False)
)
class_names = train_full.classes

def to_numpy_list(tv_dataset):
    """Split an iterable of ``(image, label)`` pairs into two parallel lists.

    Args:
        tv_dataset: Iterable yielding ``(image, label)`` pairs.

    Returns:
        A tuple ``(images, labels)`` where ``images`` is a list with each
        image converted through ``np.array`` and ``labels`` is the list of
        labels in the same order.
    """
    # Single pass over the dataset so one-shot iterables are also supported.
    converted = [(np.array(picture), target) for picture, target in tv_dataset]
    images = [pair[0] for pair in converted]
    targets = [pair[1] for pair in converted]
    return images, targets

# Materialise both splits as parallel lists of image arrays and labels.
images_train, labels_train = to_numpy_list(train_full)
images_test, labels_test = to_numpy_list(test_set)

# Keep a fixed-seed random fraction of the training data so the CPU run
# stays short.
SUBSET = 0.2
n_keep = int(len(images_train) * SUBSET)
idx = np.random.RandomState(42).permutation(len(images_train))
sel = idx[:n_keep]
picked = [(images_train[i], labels_train[i]) for i in sel]
images_train = [picture for picture, _ in picked]
labels_train = [target for _, target in picked]

# Stratified 80/20 train/validation split of the retained subset.
tr_imgs, va_imgs, tr_lbls, va_lbls = train_test_split(
    images_train,
    labels_train,
    test_size=0.2,
    stratify=labels_train,
    random_state=42,
)

# Preview: original image next to its 224x224 resized version
from PIL import Image

sample = Image.fromarray(tr_imgs[0])
resized = sample.resize((224, 224))

plt.figure(figsize=(6, 3))
panels = [(sample, 'Original'), (resized, 'Resized 224')]
for position, (picture, caption) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(picture)
    plt.title(caption)
    plt.axis('off')
plt.tight_layout()
plt.show()

# Wrap each split in a Hugging Face Dataset with "image" and "label" columns.
split_sources = {
    "train": (tr_imgs, tr_lbls),
    "validation": (va_imgs, va_lbls),
    "test": (images_test, labels_test),
}
ds = DatasetDict({
    split_name: Dataset.from_dict({"image": split_images, "label": split_labels})
    for split_name, (split_images, split_labels) in split_sources.items()
})

## 3. 사전학습 모델 구조 탐색 (≈5분)

**말하기 포인트:**  
- `model.config`로 **hidden_size, num_hidden_layers, num_attention_heads** 확인  
- 파라미터 수 계산 → 1강 CNN과 비교  
- **패치 분할**: 224×224 이미지를 16×16 패치 → 14×14=196 토큰 + CLS


In [None]:
model_name = "facebook/deit-tiny-patch16-224"
image_processor = AutoImageProcessor.from_pretrained(model_name)

# The checkpoint ships with a 1000-class ImageNet classification head.
# Asking for a different number of labels makes the classifier weights
# mismatch, so ignore_mismatched_sizes=True is required: the old head is
# discarded and a freshly initialised one is used instead of raising a
# size-mismatch error at load time.
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=len(class_names),
    id2label={i: c for i, c in enumerate(class_names)},
    label2id={c: i for i, c in enumerate(class_names)},
    ignore_mismatched_sizes=True,
).to(device)

# Inspect the architecture hyper-parameters and count trainable weights.
print(model.config)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Trainable params (initial):', num_params)

In [None]:
def preprocess_examples(examples):
    """Turn a batch of raw examples into model inputs.

    Args:
        examples: Batch dict with an "image" column (one image per row) and
            a "label" column (one integer class id per row).

    Returns:
        The image processor's output (including "pixel_values") with an
        extra "labels" tensor holding the class ids.
    """
    # Images that were stored from numpy arrays may be handed back as nested
    # Python lists, which the image processor does not accept. Coercing each
    # one to a uint8 array works for nested lists, arrays and PIL images.
    images = [np.asarray(img, dtype=np.uint8) for img in examples["image"]]
    inputs = image_processor(images=images, return_tensors="pt")
    inputs["labels"] = torch.tensor(examples["label"])
    return inputs


# Apply the preprocessing lazily, each time rows are accessed.
ds = ds.with_transform(preprocess_examples)

## 4. (CPU) Feature Extraction 학습 & 모니터링 (≈5분)

**말하기 포인트:**  
- 백본 동결로 **빠르게 수렴**하는 모습을 Accuracy 곡선으로 보여줍니다.


In [None]:
# Feature extraction: switch off gradients for every parameter of
# `model.base_model`, so only parameters outside it remain trainable.
for backbone_param in model.base_model.parameters():
    backbone_param.requires_grad = False

def collate_fn(batch):
    """Assemble a list of preprocessed samples into one batch dict.

    Args:
        batch: List of dicts, each with a "pixel_values" tensor and a
            "labels" scalar.

    Returns:
        Dict with "pixel_values" stacked along a new leading dimension and
        "labels" as a 1-D tensor.
    """
    stacked_pixels = torch.stack([sample["pixel_values"] for sample in batch])
    label_tensor = torch.tensor([sample["labels"] for sample in batch])
    return {"pixel_values": stacked_pixels, "labels": label_tensor}

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            row is the argmax of ``logits`` along axis 1.

    Returns:
        Dict with the keys "accuracy" and "f1_macro".
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    accuracy = accuracy_score(labels, predicted)
    macro_f1 = f1_score(labels, predicted, average="macro")
    return {"accuracy": accuracy, "f1_macro": macro_f1}

import inspect

# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy` and `no_cuda` to `use_cpu`. Pick whichever keyword the
# installed version accepts so the cell runs on old and new releases alike.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"
_cpu_key = "use_cpu" if "use_cpu" in _ta_params else "no_cuda"

args = TrainingArguments(
    output_dir="./deit_tiny_fe",
    learning_rate=5e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    # The datasets are preprocessed on the fly from their raw "image" column.
    # By default the Trainer drops columns the model's forward() does not
    # accept, which would remove "image" before the transform can read it.
    remove_unused_columns=False,
    # Evaluate once per epoch; force CPU execution when no CUDA device exists.
    **{_eval_key: "epoch", _cpu_key: not torch.cuda.is_available()},
)

import inspect

# The Trainer keyword for the preprocessing object was renamed from
# `tokenizer` to `processing_class` in newer transformers releases; use the
# name that the installed version's signature exposes.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_processor_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    **{_processor_key: image_processor},
)

# Train, then evaluate on the validation split; the bare expression on the
# last line displays the metrics as the notebook cell output.
trainer.train()
fe_metrics = trainer.evaluate()
fe_metrics

In [None]:
# Validation-accuracy curve, extracted from the Trainer log history.
# (Training accuracy is not part of the default logs, so only the evaluation
# metric is plotted.)
logs = trainer.state.log_history

# Keep the first evaluation entry per epoch. Extra `evaluate()` calls made
# after training log again under the final epoch value and would otherwise
# add a duplicate (possibly different) point at the end of the curve.
acc_by_epoch = {}
for entry in logs:
    if 'epoch' in entry and 'eval_accuracy' in entry:
        acc_by_epoch.setdefault(entry['epoch'], entry['eval_accuracy'])
ep = list(acc_by_epoch.keys())
va_acc = list(acc_by_epoch.values())

plt.figure(figsize=(6,4))
plt.plot(ep, va_acc, marker='o')
plt.title('Validation Accuracy (Feature Extraction)')
plt.xlabel('Epoch'); plt.ylabel('Acc'); plt.tight_layout(); plt.show()

## 5. (GPU 옵션) Fine-tuning — 코드만 소개 (≈5분)

⚠️ CPU에서는 매우 느림. **GPU(Colab 등)** 에서만 실행 권장.  
촬영 시 **코드 설명 + 미리 학습된 결과(스크린샷)** 를 보여주세요.


In [None]:
DO_FINETUNE = False
if DO_FINETUNE:
    import inspect

    # Unfreeze only the top encoder blocks (parameter names containing
    # "layer.10" or "layer.11"); everything else keeps its current state.
    for name, p in model.base_model.named_parameters():
        if any(k in name for k in ["layer.10", "layer.11"]):
            p.requires_grad = True

    # Keyword names changed across transformers releases
    # (`evaluation_strategy` -> `eval_strategy`, `tokenizer` ->
    # `processing_class`); use whichever the installed version accepts.
    _ta_params = inspect.signature(TrainingArguments.__init__).parameters
    _eval_key = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"
    _trainer_params = inspect.signature(Trainer.__init__).parameters
    _processor_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

    args_ft = TrainingArguments(
        output_dir="./deit_tiny_ft",
        learning_rate=1e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        save_strategy="epoch",
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="none",
        # The raw "image" column is consumed by the on-the-fly transform, so
        # the Trainer must not drop columns unknown to the model's forward().
        remove_unused_columns=False,
        **{_eval_key: "epoch"},
    )
    trainer_ft = Trainer(
        model=model,
        args=args_ft,
        train_dataset=ds["train"],
        eval_dataset=ds["validation"],
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
        **{_processor_key: image_processor},
    )
    trainer_ft.train()
    ft_metrics = trainer_ft.evaluate()
    print(ft_metrics)

## 6. 평가 + Confusion Matrix + Attention 시각화 (≈5분)

**말하기 포인트:**  
- Feature Extraction만으로도 꽤 높은 정확도.  
- **Attention overlay**로 모델이 어디에 주목했는지 직관적으로 설명.


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import DataLoader

# Evaluate on the held-out test split and report its accuracy
# (prints None if the metric is missing from the result).
test_metrics = trainer.evaluate(ds["test"])
test_accuracy = test_metrics.get('eval_accuracy', None)
print('[Test] Accuracy:', test_accuracy)

@torch.no_grad()
def predict_logits(dset):
    """Run the model over a dataset and collect logits and labels.

    Args:
        dset: Dataset whose items are dicts that ``collate_fn`` can batch.

    Returns:
        Tuple ``(logits, labels)`` of NumPy arrays concatenated over all
        batches, in dataset order (no shuffling).
    """
    model.eval()
    collected_logits, collected_labels = [], []
    batches = DataLoader(dset, batch_size=64, shuffle=False, collate_fn=collate_fn)
    for cpu_batch in batches:
        # Move every tensor of the batch to the model's device.
        on_device = {key: tensor.to(device) for key, tensor in cpu_batch.items()}
        outputs = model(**on_device)
        collected_logits.append(outputs.logits.cpu().numpy())
        collected_labels.append(on_device["labels"].cpu().numpy())
    return np.concatenate(collected_logits), np.concatenate(collected_labels)

logits, y_true = predict_logits(ds["test"])
y_pred = logits.argmax(1)

# Derive the class ids from the class names instead of hard-coding 10, and
# use the same ids for the matrix, the tick marks and the text report.
class_ids = list(range(len(class_names)))

cm = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize=(6,5))
plt.imshow(cm, interpolation='nearest'); plt.title('Confusion Matrix (DeiT-Tiny)'); plt.colorbar()
plt.xticks(class_ids, class_names, rotation=45); plt.yticks(class_ids, class_names)
plt.xlabel('Pred'); plt.ylabel('True')
# Lay out the figure only after the axis labels exist, so they are included
# in the layout computation and not clipped.
plt.tight_layout(); plt.show()

# Passing `labels` keeps `target_names` aligned with the class ids even if a
# class happens to be absent from both y_true and y_pred.
print('\n[Classification Report]\n', classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

In [None]:
# Last-layer attention map (a simplified stand-in for full attention rollout):
# average the heads of the final layer and read how strongly the first (CLS)
# token attends to each image patch.
_ = model.eval()

# One test sample
from PIL import Image
raw_test = tvdatasets.CIFAR10(root='./data', train=False, download=False)
img_pil, lab = raw_test[0]
img_resized = img_pil.resize((224,224))
inp = image_processor(images=[img_resized], return_tensors="pt")
with torch.no_grad():
    # Request attentions for this call only. Flipping
    # `model.config.output_attentions` would persist and change the outputs
    # of every later forward pass, including Trainer evaluations.
    out = model(pixel_values=inp['pixel_values'].to(device), output_attentions=True)
if out.attentions is None or out.attentions[-1] is None:
    raise RuntimeError(
        "Model returned no attention weights; reload it with attn_implementation='eager'."
    )
attn = out.attentions[-1][0]  # (heads, tokens, tokens)
attn_mean = attn.mean(0)      # (tokens, tokens)
cls_to_patches = attn_mean[0, 1:]  # (196,)
attn_map = cls_to_patches.reshape(14,14).cpu().numpy()
# Min-max normalise to [0, 1]; the epsilon guards against a constant map.
attn_map = (attn_map - attn_map.min()) / (attn_map.max() - attn_map.min() + 1e-9)

# Upsample & overlay
import cv2
attn_up = cv2.resize(attn_map, (224,224), interpolation=cv2.INTER_CUBIC)
# Bicubic interpolation can overshoot outside [0, 1]; clip first so the
# uint8 cast below cannot wrap around.
attn_up = np.clip(attn_up, 0.0, 1.0)
img_np = np.array(img_resized)
heat = (attn_up*255).astype(np.uint8)
# applyColorMap returns BGR while img_np is RGB: convert the heatmap to RGB
# before blending so the photo keeps its true colours in the overlay.
heat_color = cv2.cvtColor(cv2.applyColorMap(heat, cv2.COLORMAP_JET), cv2.COLOR_BGR2RGB)
overlay = cv2.addWeighted(img_np, 0.6, heat_color, 0.4, 0)

plt.figure(figsize=(9,3))
plt.subplot(1,3,1); plt.imshow(img_np); plt.title('Original'); plt.axis('off')
plt.subplot(1,3,2); plt.imshow(heat, cmap='jet'); plt.title('Attention'); plt.axis('off')
plt.subplot(1,3,3); plt.imshow(overlay); plt.title('Overlay'); plt.axis('off')
plt.tight_layout(); plt.show()

## 7. 실전 적용 토론 (≈3분)

- **의료영상**: 병변 후보영역 분류/탐지(윤리·규제 고려)  
- **리테일**: 제품 이미지 분류/검색, 상품 속성 태깅  
- **안전**: CCTV 행동 인식, 화재/연기 감지(데이터 품질/라벨 정확도 중요)


## 8. 실습 챌린지 (≈2분)

- **백본 교체**: `google/vit-base-patch16-224`, `facebook/deit-small-patch16-224`, `facebook/deit-base-patch16-224`(Hub의 전체 체크포인트 ID 사용), 또는 MobileNet/EfficientNet(HF/timm)로 비교  
- **데이터 크기 변화**: `SUBSET=0.5`로 늘려 성능/시간 트레이드오프 관찰  
- **정밀 시각화**: Grad-CAM(ConvNet), Attention Rollout(Transformer) 심화
