In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; later cells read the dataset archive
# from it and write checkpoints / the submission file back to it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# 1. 데이터 불러오기

In [None]:
from google.colab import output
# !cp 파일1 파일2 # 파일1을 파일2로 복사 붙여넣기
!cp "/content/drive/MyDrive/공모전/BOAZ_dacon 컴퓨터비전/data/open/data_2.zip" "data_2.zip"
# data_2.zip을 현재 디렉터리에 압축해제
!unzip "data_2.zip"

Archive:  data_2.zip
  inflating: dirty_mnist_2nd.zip     
  inflating: dirty_mnist_2nd_answer.csv  
  inflating: mnist_data.zip          
  inflating: sample_submission.csv   
  inflating: test_dirty_mnist_2nd.zip  


In [None]:
from google.colab import output
# 현재 디렉터리에 dirty_mnist라는 폴더 생성
!mkdir "./dirty_mnist"
#dirty_mnist.zip라는 zip파일을 dirty_mnist라는 폴더에 압축 풀기
!unzip "dirty_mnist_2nd.zip" -d "./dirty_mnist/"
# 현재 디렉터리에 test_dirty_mnist라는 폴더 생성
!mkdir "./test_dirty_mnist"
#test_dirty_mnist.zip라는 zip파일을 test_dirty_mnist라는 폴더에 압축 풀기
!unzip "test_dirty_mnist_2nd.zip" -d "./test_dirty_mnist/"
# 출력 결과 지우기
output.clear()

# 2. Library Load

In [None]:
!pip install efficientnet_pytorch
!pip install git+https://github.com/cmpark0126/pytorch-polynomial-lr-decay.git

Collecting efficientnet_pytorch
  Downloading https://files.pythonhosted.org/packages/4e/83/f9c5f44060f996279e474185ebcbd8dbd91179593bffb9abe3afa55d085b/efficientnet_pytorch-0.7.0.tar.gz
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.0-cp37-none-any.whl size=16031 sha256=9eb19fda7fb5b788ef7b53ea92c00420290070ed42b94139bf8a1b0bde5b493e
  Stored in directory: /root/.cache/pip/wheels/e9/c6/e1/7a808b26406239712cfce4b5ceeb67d9513ae32aa4b31445c6
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.0
Collecting git+https://github.com/cmpark0126/pytorch-polynomial-lr-decay.git
  Cloning https://github.com/cmpark0126/pytorch-polynomial-lr-decay.git to /tmp/pip-req-build-7bexlhbs
  Running command git clone -q https://github.com/cmpark0126/pytorch-p

In [None]:
import torch
import glob
import os
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import cv2
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import KFold
import time
from efficientnet_pytorch import EfficientNet
import matplotlib.pyplot as plt
from torch_poly_lr_decay import PolynomialLRDecay
import random

import torch.nn as nn
import torch.utils.data as D
import torch.nn.functional as F
import torchvision.transforms as T
import torchvision.models as models


# Restrict PyTorch's intra-op CPU parallelism to a single thread.
torch.set_num_threads(1)
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


# 3. Dataset 구성

In [None]:
# Normalisation constants shared by the training and evaluation pipelines.
_NORM_MEAN = (0.1307,)
_NORM_STD = (0.3081,)

# Training pipeline: random horizontal flip and random rotation (up to 20
# degrees) as augmentation, around tensor conversion and normalisation.
train_transform = T.Compose([
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(_NORM_MEAN, _NORM_STD),
        T.RandomAffine(20)
        ])

# Evaluation pipeline: tensor conversion and normalisation only. The random
# flip / affine steps were removed so that evaluating the same image twice
# yields the same input (and therefore the same prediction).
test_transform = T.Compose([
        T.ToTensor(),
        T.Normalize(_NORM_MEAN, _NORM_STD),
        ])


class Minist_Dataset(D.Dataset):
    """Dataset that loads one image file per sample and pairs it with its label.

    Args:
        dir_path: Directory that contains the image files.
        data: Indexable sequence of image file names inside ``dir_path``.
        label: Positionally indexable labels aligned with ``data``
            (``label[i]`` is the label of ``data[i]``).
        transforms: Callable applied to the loaded PIL image; skipped when falsy.
        augmentations: Accepted for backward compatibility; currently unused.

    Returns (per item):
        ``(image, label)`` where ``image`` is the (optionally transformed)
        image and ``label`` is ``label[idx]`` unchanged.

    Raises:
        FileNotFoundError: If an image file is missing or cannot be decoded.
    """
    def __init__(self,
                 dir_path,
                 data, label,
                 transforms=train_transform,
                 augmentations=None):

        self.dir_path = dir_path      # directory holding the image files
        self.data = data              # image file names
        self.label = label            # labels aligned with `data`
        self.transforms = transforms  # per-image transform pipeline
        # self.augmentations = augmentations

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # os.path.join works whether or not dir_path ends with a separator.
        image_path = os.path.join(self.dir_path, str(self.data[idx]))

        # cv2.imread returns None instead of raising, so fail loudly here.
        bgr_image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if bgr_image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")

        # OpenCV decodes to BGR; convert to an RGB PIL image so that PIL-based
        # transforms placed before ToTensor (e.g. RandomHorizontalFlip) work.
        image = T.ToPILImage()(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB))
        label = self.label[idx]

        # The attribute set in __init__ is `transforms` (plural).
        if self.transforms:
            image = self.transforms(image)

        return image, label



# 재현성(reproducibility)을 위한 seed 설정

In [None]:
# https://dacon.io/competitions/official/235697/codeshare/2363?page=1&dtype=recent&ptype=pub
def seed_everything(seed: int = 42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED``, and puts cuDNN into deterministic mode.

    Args:
        seed: Seed value applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.cuda.manual_seed_all(seed)  # type: ignore  # also covers multi-GPU runs
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark mode lets cuDNN auto-tune and pick algorithms per run, which can
    # differ between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False  # type: ignore

# 4. model 구성

In [None]:
# EfficientNet-b0 (pretrained) backbone with a multi-label output head.

class EfficientNet_MultiLabel(nn.Module):
    """Pretrained EfficientNet-b0 followed by a sigmoid multi-label head.

    Args:
        in_channels: Number of channels of the input images.
        num_classes: Number of independent labels to predict (default 26,
            the value that was previously hard-coded).
    """
    def __init__(self, in_channels, num_classes=26):
        super().__init__()
        self.network = EfficientNet.from_pretrained('efficientnet-b0', in_channels=in_channels)
        # The head consumes the 1000 values produced by `self.network`.
        self.output_layer = nn.Linear(1000, num_classes)

    def forward(self, x):
        """Return one probability in [0, 1] per label for each input image."""
        x = F.relu(self.network(x))
        # Sigmoid (not softmax): each label is predicted independently.
        x = torch.sigmoid(self.output_layer(x))
        return x

# 모델 선언
#model = EfficientNet_MultiLabel(in_channels=3)
#model

In [None]:
# os.listdir() returns entries in arbitrary order. The list is later indexed
# with row positions taken from the answer CSV, so sort it to get a stable
# order. NOTE(review): this assumes sorted file names follow the CSV row
# order (e.g. zero-padded numeric names) — confirm against the data.
namelist = np.array(sorted(os.listdir('./dirty_mnist/')))

# 5. 학습

In [None]:
# K-fold cross-validated training: one model is trained per fold and the
# weights of its best validation epoch are kept.

# ---- Configuration ---------------------------------------------------------
N_SPLITS = 2           # number of CV folds; raise to 5 for the final run
EPOCHS = 1             # training epochs per fold
PRED_THRESHOLD = 0.5   # probability cut-off used for BOTH train and valid accuracy
MODEL = "efficientnet-b0"  # architecture tag written into checkpoint file names
# Google Drive directory where checkpoints are stored
path = "/content/drive/MyDrive/공모전/BOAZ_dacon 컴퓨터비전/notebook/models/"


def _run_epoch(model, loader, criterion, phase, epoch, optimizer=None):
    """Run one pass over `loader` and return (mean_loss, mean_accuracy).

    The model is trained when `optimizer` is given and only evaluated
    (eval mode, no gradients) otherwise. `phase` ("train" / "valid") labels
    the progress bar.
    """
    training = optimizer is not None
    # Set dropout / batch-norm behaviour once per pass instead of per batch.
    model.train(training)

    loss_list, acc_list = [], []
    with tqdm(loader, total=len(loader), unit="batch") as bar:
        bar.set_description(f"{phase.capitalize()} Epoch {epoch}")
        # The loader yields (images, labels) pairs directly.
        for images, labels in bar:
            images = images.type(torch.FloatTensor).to(device)
            labels = labels.type(torch.FloatTensor).to(device)

            with torch.set_grad_enabled(training):
                probs = model(images)
                loss = criterion(probs, labels)
                if training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            # Element-wise accuracy over every (sample, label) pair.
            preds = probs.detach().cpu().numpy() > PRED_THRESHOLD
            acc_list.append((labels.detach().cpu().numpy() == preds).mean())
            loss_list.append(loss.item())

            bar.set_postfix(**{f"{phase}_loss": loss_list[-1],
                               f"{phase}_acc": np.mean(acc_list)})

    if not loss_list:
        raise RuntimeError(f"The {phase} loader produced no batches")
    return float(np.mean(loss_list)), float(np.mean(acc_list))


# Seed all RNGs so the run is repeatable.
seed_everything()

kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
dirty_mnist_answer = pd.read_csv("dirty_mnist_2nd_answer.csv")

# Sort the file names so positional indices refer to a stable order.
# NOTE(review): assumes sorted file names follow the CSV row order — confirm.
image_names = np.sort(namelist)
assert len(image_names) == len(dirty_mnist_answer), \
    "number of image files does not match number of answer rows"
# NOTE(review): assumes the first CSV column is the image index and the
# remaining columns are the 0/1 labels — confirm against the file.
label_matrix = dirty_mnist_answer.iloc[:, 1:].to_numpy(dtype=np.float32)

best_models = []  # best (by validation accuracy) model of each fold
for fold_index, (trn_idx, val_idx) in enumerate(kfold.split(dirty_mnist_answer), 1):

    # Release cached GPU memory left over from the previous fold.
    torch.cuda.empty_cache()
    print(f"Fold {fold_index}: {len(trn_idx)} train / {len(val_idx)} validation samples")

    # Datasets: pass plain arrays so the dataset can index them by position.
    # NOTE(review): the validation dataset uses the dataset's default
    # (augmenting) transform, exactly as before.
    train_dataset = Minist_Dataset("dirty_mnist/", image_names[trn_idx], label_matrix[trn_idx])
    valid_dataset = Minist_Dataset("dirty_mnist/", image_names[val_idx], label_matrix[val_idx])

    # DataLoaders
    train_data_loader = DataLoader(
        train_dataset,
        batch_size = 128,
        shuffle = True,
        num_workers = 3
    )
    valid_data_loader = DataLoader(
        valid_dataset,
        batch_size = 32,
        shuffle = False,
        num_workers = 3
    )

    # Model (fresh pretrained weights for every fold), moved to the device.
    model = EfficientNet_MultiLabel(in_channels=3)
    model = nn.DataParallel(model)
    model.to(device)

    # Optimisation setup
    optimizer = torch.optim.Adam(model.parameters(),
                                lr = 0.001)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size = 5,
                                                gamma = 0.75)
    criterion = torch.nn.BCELoss()

    valid_acc_max = 0
    best_state = None  # CPU snapshot of the best epoch's weights
    for epoch in range(EPOCHS):
        train_loss, train_acc = _run_epoch(model, train_data_loader, criterion,
                                           "train", epoch, optimizer)
        valid_loss, valid_acc = _run_epoch(model, valid_data_loader, criterion,
                                           "valid", epoch)

        # Learning-rate schedule advances once per epoch.
        lr_scheduler.step()

        # Checkpoint whenever validation accuracy improves.
        if valid_acc_max < valid_acc:
            valid_acc_max = valid_acc
            # Snapshot the weights: keeping only a reference to `model` would
            # silently follow later epochs.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}
            torch.save(model, f'{path}{fold_index}_{MODEL}_{valid_loss:2.4f}_epoch_{epoch}.pth')

    # Restore the best epoch's weights before keeping this fold's model.
    if best_state is not None:
        model.load_state_dict(best_state)
    best_model = model
    best_models.append(best_model)




  0%|          | 0/196 [00:00<?, ?batch/s][A[A[A

[    0     2     5 ... 49991 49994 49997]
Loaded pretrained weights for efficientnet-b0


  0%|          | 0/196 [00:00<?, ?batch/s]


UnboundLocalError: ignored

# 데이터 분리
- 해당 코드에서는 1 fold만 실행합니다.
- 모든 fold를 실행하려면 훈련 시 반복 횟수를 5로 바꾸어 주면 됩니다.

In [None]:
# Training loop for a single run.
# NOTE(review): this cell relies on `model`, `optimizer`, `criterion`,
# `lr_scheduler`, `train_dataloader`, `val_dataloader`, `save` and `save_dir`
# already existing in the kernel. `train_dataloader`, `val_dataloader`, `save`
# and `save_dir` are not defined in any visible cell — confirm where they come
# from before relying on Restart & Run All.
total_step = len(train_dataloader)
best_val_acc = 0
EPOCH = 20
THRESHOLD = 0.75  # probability cut-off for turning outputs into 0/1 predictions
for epoch in range(EPOCH):
    train_acc_list = []
    running_loss = 0

    model.train()
    # Hand tqdm the loader and its length; wrapping enumerate() hides the
    # length, so the bar could only show "0it" with no total.
    for images, labels in tqdm(train_dataloader, total=total_step):
        images = images.type(torch.FloatTensor).to(device)
        labels = labels.type(torch.FloatTensor).to(device)

        optimizer.zero_grad()

        probs = model(images)
        loss = criterion(probs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Element-wise accuracy over every (sample, label) pair of the batch.
        probs = probs.cpu().detach().numpy()
        labels = labels.cpu().detach().numpy()
        preds = probs > THRESHOLD
        batch_acc = (labels == preds).mean()
        train_acc_list.append(batch_acc)

    train_acc = np.mean(train_acc_list)
    # len(train_acc_list) is the number of batches processed this epoch.
    print(f'Epoch [{epoch+1}/{EPOCH}], Step [{len(train_acc_list)}/{total_step}], Loss: {running_loss/total_step}, Acc {train_acc}')

    model.eval()
    valid_acc_list = []
    with torch.no_grad():
        for images, labels in val_dataloader:
            images = images.type(torch.FloatTensor).to(device)
            labels = labels.type(torch.FloatTensor).to(device)

            probs = model(images)
            valid_loss = criterion(probs, labels)

            probs = probs.cpu().numpy()
            labels = labels.cpu().numpy()
            preds = probs > THRESHOLD
            batch_acc = (labels == preds).mean()
            valid_acc_list.append(batch_acc)

        val_acc = np.mean(valid_acc_list)
        print(f'Validation acc: {val_acc}')

    lr_scheduler.step()
    # Keep a separate "best" checkpoint plus a rolling "last" checkpoint.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        save('best', save_dir, epoch, model, optimizer)
    save('last', save_dir, epoch, model, optimizer)



0it [00:00, ?it/s][A[A

UnboundLocalError: ignored

# Test

In [None]:
# Sort the file names: os.listdir() order is arbitrary, while predictions are
# later written back to sample_submission.csv row by row.
# NOTE(review): assumes sorted file names follow the submission row order — confirm.
test_namelist = sorted(os.listdir('./test_dirty_mnist/'))
# Placeholder labels: every column after the first one of the sample submission.
test_labels = pd.read_csv("sample_submission.csv").iloc[:, 1:].to_numpy(dtype=np.float32)

# Inference must be deterministic, so only tensor conversion and normalisation
# are applied (no random flip / affine).
test_transforms = T.Compose([
    T.ToTensor(),
    T.Normalize((0.1307,), (0.3081,)),
])

# `CustomDataset` is not defined in this notebook; `Minist_Dataset` takes the
# same positional arguments (directory, file names, labels, transforms).
test_dataset = Minist_Dataset('./test_dirty_mnist/', test_namelist, test_labels, test_transforms)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, shuffle=False)

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:07<00:00, 645.88it/s]


# Test 추론


In [None]:
# Run the model over the test set and collect one 0/1 prediction row per image.
# model.load_state_dict(torch.load('save_file/best.path.tar'))
model.eval()
prediction_list = []
with torch.no_grad():
    # The placeholder labels from the loader are not needed for inference.
    for images, _ in tqdm(test_dataloader):
        images = images.type(torch.FloatTensor).to(device)

        probs = model(images).cpu().numpy()
        preds = probs > 0.75
        # Keep EVERY row of the batch (the old `preds[0]` dropped all but the
        # first sample whenever batch_size > 1). The builtin `int` replaces the
        # `np.int` alias, which has been removed from NumPy.
        prediction_list.extend(preds.astype(int))

Loaded pretrained weights for efficientnet-b0


  del sys.path[0]
100%|████████████████████████████████████████████████████████████████████████████████| 157/157 [00:13<00:00, 12.01it/s]


# 제출물 생성

In [None]:
# Write the predictions into the sample-submission layout and save it to Drive.
file_name = '/content/drive/MyDrive/공모전/BOAZ_dacon 컴퓨터비전/Efficient_epoch20_prediction'

test_labels_DF = pd.read_csv("sample_submission.csv")

# Fail loudly if the number of predictions or labels does not match the
# submission template, instead of writing a malformed file.
predictions = np.asarray(prediction_list)
expected_shape = (len(test_labels_DF), test_labels_DF.shape[1] - 1)
if predictions.shape != expected_shape:
    raise ValueError(
        f"prediction shape {predictions.shape} does not match the submission "
        f"label block {expected_shape}"
    )

# The first column is kept as-is; every other column receives the predictions.
test_labels_DF.iloc[:, 1:] = predictions
test_labels_DF.to_csv(file_name +'.csv', index=False)