# 폐렴 진단

In [12]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

import torch
import torch.nn as nn
import torchvision.models as models

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

from tqdm import tqdm
from sklearn.metrics import f1_score
from tensorboardX import SummaryWriter

In [13]:
# 1. 데이터 전처리
transform_train = transforms.Compose([
    transforms.Resize((224,224)), # CNN 계열 모델(ResNet, DenseNet, VGG 등)은 ImageNet 데이터셋(이미지 크기 224x224)으로 사전 학습됨
    transforms.RandomHorizontalFlip(), # 과적합 방지(50% 반전)
    transforms.RandomRotation(10), # 과적합 방지(+-10도 회전)
    transforms.ToTensor(), # ToTensor가 Normalize보다 먼저와야 tensor로 변환 후 정규화가 적용됨
    transforms.Normalize(mean=[0.485, 0.456, 0.406], # ImageNet 기준값
                         std=[0.229, 0.224, 0.225])
])

transform_test = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

base_dir = "../chest_xray"
splits = ["train", "val", "test"]

train_data = ImageFolder(os.path.join(base_dir, "train"), transform=transform_train)
val_data   = ImageFolder(os.path.join(base_dir, "val"), transform=transform_test)
test_data  = ImageFolder(os.path.join(base_dir, "test"), transform=transform_test)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True) # 일반적으로 32가 무난하다고 연구됨
val_loader   = DataLoader(val_data, batch_size=32, shuffle=False)
test_loader  = DataLoader(test_data, batch_size=32, shuffle=False)

In [14]:
# 2. 모델 선택
model = models.densenet121(weights="DEFAULT") # CNN 계열은 Data가 수백만장이 아니면 전이학습이 유리
model.classifier = nn.Linear(model.classifier.in_features, 2) # model.classifier.in_features : 이미지의 최종처리 값보다는 AdaptiveAvgPool의 처리에 따라 model 값 사용

In [15]:
# 3. 학습

custom_lr = 1e-4
epochs = 10
step = 0

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=custom_lr) # Adam에 비해 일반화 성능이 더 좋아지고 학습 안정성도 증가

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

writer = SummaryWriter()

model.train()

for epoch in range(epochs):
    for imgs, labels in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(imgs.to(device))
        loss = criterion(outputs, labels.to(device))
        writer.add_scalar("train/loss", loss, step)
        step += 1

        loss.backward()
        optimizer.step()

    print(f'epoch: {epoch + 1}, loss: {loss.item()}')
    

  0%|          | 0/163 [00:00<?, ?it/s]


UnidentifiedImageError: cannot identify image file <_io.BufferedReader name='../chest_xray/train/PNEUMONIA/person5_bacteria_16.jpeg'>

In [None]:
# 4. 평가

model.eval()

with torch.no_grad():
    for data, label in val_loader:
        pred = model(data.to(device)) 
        
        # 1) 소프트맥스 또는 argmax로 클래스 선택
        pred_class = torch.argmax(pred, dim=1)
        
        # 2) CPU로 옮기고 f1 계산
        f1 = f1_score(label.cpu(), pred_class.cpu(), average='weighted')
        print(f'F1 Score: {f1}')

        images_data = torchvision.utils.make_grid(data)
        imshow(images_data.to('cpu'), label)