# 작물 병해 분류 AI 경진대회 Private 2위, Private Score: 0.99848

## yjune팀, 사용모델: SE-ResNeXt101-32x4d

## 대회 개요
Task: Multi-class image classification\
평가 산식: f1-macro

### 실험과정
* train.csv 데이터를 5 fold로 나누어 5개의 모델을 학습하고 앙상블 적용
* test 데이터에서 5개 fold 모델의 softmax 예측값 평균 중 최댓값이 0.85를 초과할 경우 해당 클래스로 pseudo-labeling 하여 train data에 추가
* pseudo-labeling 데이터가 추가된 최종데이터를 5fold로 나누고 학습하고 5개의 모델을 앙상블
* 최종 모델 완성

## 라이브러리

In [3]:
import os
from glob import glob
from pytorch_lightning import callbacks
from pytorch_lightning.accelerators import accelerator
import torch
import torchvision
from pytorch_lightning.plugins import DDPPlugin
from pytorch_lightning import seed_everything
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
# from config import Config
# from models.model import FDModule
# from dataset import FDDataModule
from torch import nn
import torch.nn.functional as F
# from config import Config
import timm
from pytorch_lightning import LightningModule
from torchmetrics.functional import accuracy
from torchmetrics import F1
import numpy as np
import pandas as pd
from datetime import datetime
# from utils.loss import FocalLoss
# from utils.utils import mixup_data, mixup_criterion
from torchvision import transforms
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import LightningDataModule
from sklearn.model_selection import StratifiedKFold
from torchvision.transforms.transforms import ColorJitter, RandomCrop, RandomHorizontalFlip

from config import Config
import cv2
import albumentations as A
import albumentations.pytorch as Ap

## EDA

In [6]:
# Root directory that holds train.csv / test.csv and the image folders.
DATA_DIR = 'fd_data'

# Load the competition metadata tables.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Quick look at the schema and at the train/test sizes.
print(train_df.head())
print(train_df.shape)
print(test_df.shape)

# Per-class sample counts, to gauge class imbalance.
print(train_df['disease_code'].value_counts())
# Reference copy of the class counts printed above for train.csv
# (bare string, has no effect on the program).
"""
0    106
1     46
2     30
3     29
4     17
5     12
6     10
"""
# Second set of reference class counts (sums to 4750) — presumably the
# predicted distribution over the test images; TODO confirm.
"""
0    1745
1     846
2     640
3     611
4     399
5     303
6     206
"""


     uid              img_path    disease  disease_code
0  10000  train_imgs/10000.jpg    시설포도노균병             1
1  10001  train_imgs/10001.jpg    시설포도노균병             1
2  10002  train_imgs/10002.jpg  시설포도노균병반응             2
3  10003  train_imgs/10003.jpg        축과병             4
4  10004  train_imgs/10004.jpg    시설포도노균병             1
(250, 4)
(4750, 2)
0    106
1     46
2     30
3     29
4     17
5     12
6     10
Name: disease_code, dtype: int64


'\n0    1745\n1     846\n2     640\n3     611\n4     399\n5     303\n6     206\n'

* Train data가 매우 적고, class가 불균형(imbalanced)합니다.
* f1-macro 점수 향상을 위해서는 데이터가 적은 class에서도 좋은 점수를 얻어야합니다.

# Code

### Config & pre-defined function

In [7]:
# mixup augmentation을 위한 코드입니다.

def mixup_data(x, y, alpha=1.0):
    """Blend every sample of a batch with a randomly chosen partner (mixup).

    Args:
        x (torch.Tensor): batch of inputs; dimension 0 is the batch.
        y (torch.Tensor): batch of labels, indexable along dimension 0.
        alpha (float, optional): parameter of the Beta(alpha, alpha)
            distribution the mixing coefficient is drawn from. When it is
            not positive the coefficient is fixed to 1 (no mixing).
            Defaults to 1.0.
    Returns:
        mixed_x, y_a, y_b, lam: the blended inputs, the original labels,
        the labels of the partner samples, and the mixing coefficient.
    """
    # Mixing coefficient: random for alpha > 0, otherwise keep x unchanged.
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    # One random partner per sample, drawn as a permutation of the batch.
    perm = torch.randperm(x.size(0))
    partner_x = x[perm, :]
    mixed_x = lam * x + (1 - lam) * partner_x

    return mixed_x, y, y[perm], lam

def mixup_criterion(criterion, pred, y_a, y_b, lam, weight=None):
    """Combine the losses against both mixup targets.

    Args:
        criterion: callable ``criterion(pred, target[, weight=...])``
            returning a loss value.
        pred: model output for the mixed batch.
        y_a: targets of the original samples.
        y_b: targets of the partner samples.
        lam: mixing coefficient applied to the ``y_a`` loss; ``1 - lam`` is
            applied to the ``y_b`` loss.
        weight: optional value forwarded to ``criterion`` as the ``weight``
            keyword. When None the keyword is omitted, so criteria that do
            not accept it can be used too.
    Returns:
        ``lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)``.
    """
    # Forward the keyword only when a weight was actually supplied.
    extra = {} if weight is None else {'weight': weight}
    loss_a = criterion(pred, y_a, **extra)
    loss_b = criterion(pred, y_b, **extra)
    return lam * loss_a + (1 - lam) * loss_b

In [8]:
class Config:
    """Experiment settings shared by the data module, the model and the drivers.

    The class itself (not an instance) is passed around and mutated between
    notebook cells.
    """
    exp = 'exp_1'  # exp_1: original train data only; exp_2: final train data with pseudo labels added
    phase = 'train'  # 'train' or 'test'
    data_dir = 'fd_data'
    model_name = 'gluon_seresnext101_32x4d'  # timm model name (ImageNet-pretrained weights are loaded)
    fold_num = 5  # number of folds for k-fold splitting
    batch_size = 64
    num_workers = 4
    seed = 555
    tta = True  # use Test Time Augmentation when predicting
    ckpt = None  # checkpoint path to load when phase == 'test'; assigned before testing


## Dataset

In [10]:
# Pytorch lightning Dataset 

class FDDataset(Dataset):
    """Image dataset driven by a DataFrame of image paths.

    Each item is the transformed image plus either its ``uid`` (when
    ``cfg.phase == 'test'``) or its ``disease_code`` label (otherwise).
    """

    def __init__(self, cfg:Config, df:pd.DataFrame, aug:bool = True):
        """
        Args:
            cfg: configuration; ``data_dir`` and ``phase`` are read.
            df: table with an ``img_path`` column (relative to
                ``cfg.data_dir``) and a ``uid`` and/or ``disease_code`` column.
            aug: when True, random flips and a random affine transform are
                applied after resizing and normalising.
        """
        super().__init__()
        self.cfg = cfg
        self.df = df
        self.aug = aug
        if self.aug:
            self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Resize((256,256)),
                transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomVerticalFlip(p=0.5),
                transforms.RandomAffine(
                degrees=(-90,90),
                translate=(0.2, 0.2),
                scale=(0.8, 1.2), shear=15
                ),
            ]
            )
        else:
            self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                transforms.Resize((256,256)),
            ]
            )

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load and transform the image at position ``idx``.

        Returns:
            ``(image, uid)`` in the test phase, ``(image, disease_code)``
            otherwise.
        Raises:
            FileNotFoundError: if the image cannot be read from disk.
        """
        # idx is a position supplied by the sampler, so index positionally;
        # this stays correct even if df carries a non-default index.
        img_path = os.path.join(self.cfg.data_dir, self.df['img_path'].iloc[idx])

        # cv2.imread signals failure by returning None instead of raising.
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        # NOTE(review): cv2.imread yields BGR channel order while the
        # normalisation constants look like the ImageNet RGB statistics.
        # Left unchanged so existing checkpoints stay valid — confirm
        # whether a BGR->RGB conversion is wanted.
        img = self.transform(img)
        if self.cfg.phase == 'test':
            return img, self.df['uid'].iloc[idx]
        return img, self.df['disease_code'].iloc[idx]
        
class FDDataModule(LightningDataModule):
    """Data module that serves stratified k-fold train/val splits and the test set.

    ``train.csv`` and ``test.csv`` are read from ``cfg.data_dir``; every
    training row is tagged with the fold in which it is a validation sample.
    """

    def __init__(self, cfg:Config):
        """
        Args:
            cfg: configuration; ``data_dir``, ``fold_num``, ``seed``,
                ``batch_size`` and ``num_workers`` are read.
        """
        super().__init__()
        self.cfg = cfg

        self.test_df = pd.read_csv(os.path.join(cfg.data_dir, 'test.csv'))
        self.train_df = pd.read_csv(os.path.join(cfg.data_dir, 'train.csv'))
        self.fold_num = 0  # fold currently used for validation
        self._split_kfold()

    def set_fold_num(self, fold_num):
        """Select which fold is held out for validation."""
        self.fold_num = fold_num

    def get_class_weight(self):
        """Return inverse class frequencies, ordered by ``disease_code``.

        Frequencies are counted over the whole training table (all folds).
        """
        return 1 / self.train_df['disease_code'].value_counts().sort_index().values

    def setup(self, stage=None):
        """Build the datasets for the requested stage.

        Any stage other than ``'test'`` builds the train/val datasets for the
        current fold; ``'test'`` builds the test dataset.
        """
        if stage != 'test':
            print(f'FOLD NUM:{self.fold_num}')
            train_df = self.train_df[
                self.train_df["kfold"] != self.fold_num
            ].reset_index(drop=True)
            val_df = self.train_df[
                self.train_df["kfold"] == self.fold_num
            ].reset_index(drop=True)

            self.train = FDDataset(self.cfg, train_df, aug=True)
            self.val = FDDataset(self.cfg, val_df, aug=False)
            # NOTE(review): this stores the full table, not the fold's
            # training subset — confirm which one is intended.
            self.train_fold_df = self.train_df

        if stage == 'test':
            self.test = FDDataset(self.cfg, self.test_df, aug=False)

    def _split_kfold(self):
        """Tag every training row with its validation fold in a ``kfold`` column."""
        # Use the configuration this module was built with, not the global.
        skf = StratifiedKFold(
            n_splits=self.cfg.fold_num, shuffle=True, random_state=self.cfg.seed
        )
        # Integer placeholder keeps the column integer-typed; every row is
        # overwritten below because each row is a validation sample once.
        self.train_df["kfold"] = -1
        # skf.split yields (train_idx, val_idx); only val_idx is needed.
        for n, (_, val_index) in enumerate(
            skf.split(
                X=self.train_df,
                y=self.train_df['disease_code']
            )
        ):
            self.train_df.loc[val_index, "kfold"] = int(n)

    def train_dataloader(self):
        """Shuffled loader over the current fold's training dataset."""
        return DataLoader(
            self.train,
            batch_size=self.cfg.batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=True,
            pin_memory=True,
        )

    def val_dataloader(self):
        """Ordered loader over the current fold's validation dataset."""
        return DataLoader(
            self.val,
            batch_size=self.cfg.batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=False,
            pin_memory=True,
        )

    def test_dataloader(self):
        """Ordered loader over the test dataset."""
        return DataLoader(
            self.test,
            batch_size=self.cfg.batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=False,
        )


In [11]:
# Pytorch lightning Module

class FDModel(nn.Module):
    """Wrapper around a timm backbone configured for 7 output classes."""

    def __init__(self, cfg:Config):
        super().__init__()
        self.cfg = cfg
        # Load the timm model named in the config with pretrained weights,
        # 3 input channels and a 7-way classification head.
        backbone_kwargs = dict(pretrained=True, num_classes=7, in_chans=3)
        self.cnn = timm.create_model(cfg.model_name, **backbone_kwargs)

    def forward(self, x):
        """Return the backbone's output for the batch ``x``."""
        return self.cnn(x)

class FDModule(LightningModule):
    """LightningModule that trains FDModel and exports test-time probabilities.

    Training uses (optionally class-weighted) cross-entropy, with mixup
    applied on every fourth batch. Testing produces softmax probabilities,
    optionally averaged over five test-time augmentations, and writes them
    to ``result_<cfg.exp>.csv`` indexed by uid.
    """

    def __init__(self, cfg:Config, class_weight=None):
        """
        Args:
            cfg: configuration; ``exp`` and ``tta`` are read here, the rest
                is forwarded to FDModel.
            class_weight: optional per-class weights for the cross-entropy
                loss (one value per class); None disables weighting.
        """
        super().__init__()
        self.model = FDModel(cfg)
        # The metrics are attributes of this module and are expected to
        # follow it to the trainer's device, so no hard-coded .cuda() here
        # (which would fail on a CPU-only host).
        self.val_metric = F1(num_classes=7, average="macro")
        self.train_metric = F1(num_classes=7, average="macro")
        self.lr = 1e-3
        self.class_weight = class_weight
        self.cfg = cfg
        self.softmax = torch.nn.Softmax(dim=1)

        # Deterministic transforms used for test-time augmentation:
        # always-on flips and fixed -90 / +90 degree rotations.
        self.horizontalflip = transforms.RandomHorizontalFlip(p=1)
        self.verticalflip = transforms.RandomVerticalFlip(p=1)
        self.rotation_left = transforms.RandomRotation(degrees=(-90,-90))
        self.rotation_right = transforms.RandomRotation(degrees=(90,90))

    def forward(self, x):
        """Return the class logits for the batch ``x``."""
        return self.model(x)

    def _loss_weight(self, device):
        """Return the class weights as a float32 tensor on ``device``, or None."""
        if self.class_weight is None:
            return None
        return torch.as_tensor(self.class_weight, dtype=torch.float32).to(device)

    def training_step(self, batch, batch_idx):
        """Compute the training loss; every fourth batch is trained with mixup."""
        x, y = batch
        if batch_idx % 4 == 0:  # use mixup on a 4-step cycle
            mixed_x, y_a, y_b, lam = mixup_data(x, y)
            logits = self(mixed_x)
            loss = mixup_criterion(
                F.cross_entropy, logits, y_a, y_b, lam, self._loss_weight(logits.device)
            )
            self.log_dict({'mixup_loss':loss})
            return loss
        logits = self(x)
        loss = F.cross_entropy(logits, y.long(), weight=self._loss_weight(logits.device))
        preds = torch.argmax(logits, dim=1)
        micro_acc = accuracy(preds, y)
        f1_score = self.train_metric(preds, y)
        self.log_dict(
            {
                "train_loss": loss,
                "train_acc": micro_acc,
                "train_f1_macro": f1_score
            },
            prog_bar=True,
            sync_dist=True,
            on_step=False,
            on_epoch=True

        )
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss, accuracy and macro F1 for one batch."""
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y.long(), weight=self._loss_weight(logits.device))
        preds = torch.argmax(logits, dim=1)
        micro_acc = accuracy(preds, y)

        f1_score = self.val_metric(preds, y)
        self.log_dict(
            {
                "val_loss": loss,
                "val_acc": micro_acc,
                "val_f1_macro": f1_score
            },
            prog_bar=True,
            sync_dist=True,
            on_step=False,
            on_epoch=True
        )

    def test_step(self, batch, batch_idx):
        """Return ``(probabilities, uid)`` for one test batch."""
        if self.cfg.tta:
            return self.tta(batch,batch_idx)
        x, uid = batch
        logits = self(x)
        preds = self.softmax(logits)

        return preds, uid

    def tta(self, batch, batch_idx):
        """Average the softmax outputs over the original and four augmented views."""
        x, uid = batch
        _normal = self.softmax(self(x))
        _h_flip = self.softmax(self(self.horizontalflip(x)))
        _v_flip = self.softmax(self(self.verticalflip(x)))
        _l_rotate = self.softmax(self(self.rotation_left(x)))
        _r_rotate = self.softmax(self(self.rotation_right(x)))
        preds = (_normal + _h_flip + _v_flip + _l_rotate + _r_rotate) / 5
        return preds, uid

    def test_epoch_end(self, outputs):
        """Collect all test predictions and write them to ``result_<exp>.csv``."""
        results = self.all_gather(outputs)

        # DataFrame holding the per-class confidence of every test uid.
        # NOTE(review): the uid range 20000..24749 is hard-coded — confirm
        # it matches test.csv.
        df = pd.DataFrame(range(20000,24750),columns=['uid'])

        # -100.0 marks rows for which no prediction was written.
        for class_idx in range(7):
            df[f'prob_{class_idx}'] = -100.0

        df = df.set_index('uid')
        for p, u in results:
            prob = p.reshape(-1,7).cpu().numpy()
            u = u.reshape(-1).cpu().numpy()
            for pp, uu in zip(prob,u):
                df.loc[uu] = pp
        df.to_csv(f'result_{self.cfg.exp}.csv')

    def configure_optimizers(self):
        """Return the optimizer and LR scheduler for the current experiment.

        ``exp_2`` uses Adam with cosine warm restarts; every other experiment
        uses AdamW with ReduceLROnPlateau. Both are returned in dictionary
        form with ``val_loss`` as the monitored metric, which
        ReduceLROnPlateau needs in order to step.
        """
        if self.cfg.exp == 'exp_2':
            optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=100, T_mult=1)
        else:
            optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=30, verbose=True)

        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss", "interval": "epoch"},
        }
        
    

In [17]:
import os
from glob import glob
from pytorch_lightning import callbacks
from pytorch_lightning.accelerators import accelerator

from pytorch_lightning.plugins import DDPPlugin
from pytorch_lightning import seed_everything
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping


def train(cfg: Config, fold_num):
    """Train (or, when ``cfg.phase == 'test'``, test) the model for one fold.

    Args:
        cfg: experiment configuration; ``seed``, ``phase``, ``exp`` and, in
            the test phase, ``ckpt`` are read here.
        fold_num: index of the fold held out for validation.
    """
    # Seed from the configuration that was passed in, not the global class.
    seed_everything(cfg.seed)
    fd_data_module = FDDataModule(cfg)

    fd_data_module.set_fold_num(fold_num)
    fd_data_module.setup()
    class_weight = fd_data_module.get_class_weight()

    if cfg.phase=='test':
        # load_from_checkpoint is called on the class directly, so no
        # throwaway pretrained model is built first.
        fd_module = FDModule.load_from_checkpoint(cfg.ckpt, cfg=cfg)
    else:
        fd_module = FDModule(cfg, class_weight=class_weight)

    # Keep the three checkpoints with the lowest validation loss.
    model_checkpoint = ModelCheckpoint(monitor='val_loss', save_top_k=3, dirpath=f'results/{cfg.exp}/{fd_data_module.fold_num}',
    filename="{epoch:02d}-{val_loss:.6f}-{val_acc:.4f}-{val_f1_macro}.pth", mode='min')

    early_stopping = EarlyStopping(monitor='val_loss', patience=200, verbose=True, mode='min') # for pseudo labeling
    # early_stopping = EarlyStopping(monitor='val_loss', patience=100, verbose=True, mode='min') # for full train

    # NOTE(review): accelerator='dp' is combined with a DDPPlugin — confirm
    # this pairing is intended for the installed Lightning version.
    trainer = pl.Trainer(
        gpus="0",
        accelerator='dp',
        num_nodes=1,
        deterministic=True,
        check_val_every_n_epoch=1,
        callbacks = [model_checkpoint, early_stopping],
        precision=16,
        log_every_n_steps=4,
        max_epochs = 500,
        auto_lr_find=True,
        plugins=DDPPlugin(find_unused_parameters=False),
    )

    if cfg.phase == 'train':
        trainer.fit(fd_module, fd_data_module)
    else:
        trainer.test(fd_module, fd_data_module)

Global seed set to 555


In [None]:
# Train folds 0 through 4 in order; each run saves its own checkpoints.
for fold in range(5):
    train(Config, fold)

In [24]:
def test(cfg: Config):
    """Run test-set inference with the checkpoint at ``cfg.ckpt``.

    Args:
        cfg: experiment configuration; ``ckpt`` must point to a saved
            checkpoint of FDModule.
    """
    fd_data_module = FDDataModule(cfg)
    fd_data_module.setup(stage='test')
    # load_from_checkpoint is called on the class directly, so no throwaway
    # pretrained model is built first.
    fd_module = FDModule.load_from_checkpoint(cfg.ckpt, cfg=cfg)

    # No callbacks are passed: the checkpoint / early-stopping callbacks
    # exist only inside train() and are not defined in this function.
    trainer = pl.Trainer(
        gpus="0",
        accelerator='dp',
        num_nodes=1,
        deterministic=True,
        check_val_every_n_epoch=1,
        precision=16,
        log_every_n_steps=4,
        max_epochs = 500,
        auto_lr_find=True,
        plugins=DDPPlugin(find_unused_parameters=False),
    )
    trainer.test(fd_module, fd_data_module)

In [18]:
import re

Config.phase = 'test'


def _best_checkpoint(fold_dir):
    """Return the checkpoint in ``fold_dir`` with the lowest val_loss in its name.

    Assumes file names embed ``val_loss=<value>`` (as the filename template
    in train() suggests — TODO confirm). Names without it rank last, so the
    choice falls back to the first path in sorted order. Sorting makes the
    selection independent of the directory listing order.

    Raises:
        FileNotFoundError: if the directory holds no checkpoint.
    """
    paths = sorted(glob(f'{fold_dir}/*'))
    if not paths:
        raise FileNotFoundError(f'No checkpoint found in {fold_dir}')

    def val_loss_of(path):
        match = re.search(r'val_loss=(\d+(?:\.\d+)?)', os.path.basename(path))
        return float(match.group(1)) if match else float('inf')

    return min(paths, key=val_loss_of)


# Predict the test set with the model of every fold; each run writes
# result_exp_1_fold_<k>.csv.
for fold in range(5):
    Config.exp = f'exp_1_fold_{fold}'
    Config.ckpt = _best_checkpoint(f'results/exp_1/{fold}')
    test(Config)

config.Config

## Pseudo Labeling

In [23]:
# Per-class probabilities predicted on the test set by each exp_1 fold model.
fold_0 = pd.read_csv('result_exp_1_fold_0.csv')
fold_1 = pd.read_csv('result_exp_1_fold_1.csv')
fold_2 = pd.read_csv('result_exp_1_fold_2.csv')
fold_3 = pd.read_csv('result_exp_1_fold_3.csv')
fold_4 = pd.read_csv('result_exp_1_fold_4.csv')

# Ensemble: average the five folds' probabilities per uid.
df = pd.concat([fold_0,fold_1,fold_2,fold_3,fold_4])
df = df.groupby('uid').mean()

# Take the confidence and the predicted class from the probability columns
# only, so the helper '_max' column can never take part in idxmax.
prob_cols = [f'prob_{class_idx}' for class_idx in range(7)]
df['_max'] = df[prob_cols].max(axis=1)
df['disease_code'] = df[prob_cols].idxmax(axis=1).str[-1].astype(int)

# Keep only confident predictions; copy so the new column is added to an
# independent frame rather than to a slice of df.
pseudo_label_df = df.loc[df['_max'] > 0.85, ['disease_code']].copy()
# presumably test images live at test_imgs/<uid>.jpg — verify against test.csv
pseudo_label_df['img_path'] = 'test_imgs/'+pseudo_label_df.index.astype(str)+'.jpg'
pseudo_label_df = pseudo_label_df.reset_index()[['uid','img_path','disease_code']]

org_train = pd.read_csv('fd_data/train.csv')
full_train_df = pd.concat([org_train[['uid','img_path','disease_code']], pseudo_label_df],axis=0)
# Written inside the data directory because the exp_2 data module reads
# full_train.csv from cfg.data_dir ('fd_data').
full_train_df.to_csv(os.path.join('fd_data', 'full_train.csv'),index=False)

## Override Dataset

In [None]:
# Pytorch lightning Dataset 

class FDDataset(Dataset):
    """Image dataset driven by a DataFrame of image paths.

    Each item is the transformed image plus either its ``uid`` (when
    ``cfg.phase == 'test'``) or its ``disease_code`` label (otherwise).
    """

    def __init__(self, cfg:Config, df:pd.DataFrame, aug:bool = True):
        """
        Args:
            cfg: configuration; ``data_dir`` and ``phase`` are read.
            df: table with an ``img_path`` column (relative to
                ``cfg.data_dir``) and a ``uid`` and/or ``disease_code`` column.
            aug: when True, random flips and a random affine transform are
                applied after resizing and normalising.
        """
        super().__init__()
        self.cfg = cfg
        self.df = df
        self.aug = aug
        if self.aug:
            self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Resize((256,256)),
                transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomVerticalFlip(p=0.5),
                transforms.RandomAffine(
                degrees=(-90,90),
                translate=(0.2, 0.2),
                scale=(0.8, 1.2), shear=15
                ),
            ]
            )
        else:
            self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                transforms.Resize((256,256)),
            ]
            )

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load and transform the image at position ``idx``.

        Returns:
            ``(image, uid)`` in the test phase, ``(image, disease_code)``
            otherwise.
        Raises:
            FileNotFoundError: if the image cannot be read from disk.
        """
        # idx is a position supplied by the sampler, so index positionally;
        # this stays correct even if df carries a non-default index.
        img_path = os.path.join(self.cfg.data_dir, self.df['img_path'].iloc[idx])

        # cv2.imread signals failure by returning None instead of raising.
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        # NOTE(review): cv2.imread yields BGR channel order while the
        # normalisation constants look like the ImageNet RGB statistics.
        # Left unchanged so existing checkpoints stay valid — confirm
        # whether a BGR->RGB conversion is wanted.
        img = self.transform(img)
        if self.cfg.phase == 'test':
            return img, self.df['uid'].iloc[idx]
        return img, self.df['disease_code'].iloc[idx]
        
class FDDataModule(LightningDataModule):
    """Data module serving stratified k-fold splits of the pseudo-labelled train set.

    ``full_train.csv`` and ``test.csv`` are read from ``cfg.data_dir``; every
    training row is tagged with the fold in which it is a validation sample.
    """

    def __init__(self, cfg:Config):
        """
        Args:
            cfg: configuration; ``data_dir``, ``fold_num``, ``seed``,
                ``batch_size`` and ``num_workers`` are read.
        """
        super().__init__()
        self.cfg = cfg

        self.test_df = pd.read_csv(os.path.join(cfg.data_dir, 'test.csv'))
        self.train_df = pd.read_csv(os.path.join(cfg.data_dir, 'full_train.csv'))
        self.fold_num = 0  # fold currently used for validation
        self._split_kfold()

    def set_fold_num(self, fold_num):
        """Select which fold is held out for validation."""
        self.fold_num = fold_num

    def get_class_weight(self):
        """Return inverse class frequencies, ordered by ``disease_code``.

        Frequencies are counted over the whole training table (all folds).
        """
        return 1 / self.train_df['disease_code'].value_counts().sort_index().values

    def setup(self, stage=None):
        """Build the datasets for the requested stage.

        Any stage other than ``'test'`` builds the train/val datasets for the
        current fold; ``'test'`` builds the test dataset.
        """
        if stage != 'test':
            print(f'FOLD NUM:{self.fold_num}')
            train_df = self.train_df[
                self.train_df["kfold"] != self.fold_num
            ].reset_index(drop=True)
            val_df = self.train_df[
                self.train_df["kfold"] == self.fold_num
            ].reset_index(drop=True)

            self.train = FDDataset(self.cfg, train_df, aug=True)
            self.val = FDDataset(self.cfg, val_df, aug=False)
            # NOTE(review): this stores the full table, not the fold's
            # training subset — confirm which one is intended.
            self.train_fold_df = self.train_df

        if stage == 'test':
            self.test = FDDataset(self.cfg, self.test_df, aug=False)

    def _split_kfold(self):
        """Tag every training row with its validation fold in a ``kfold`` column."""
        # Use the configuration this module was built with, not the global.
        skf = StratifiedKFold(
            n_splits=self.cfg.fold_num, shuffle=True, random_state=self.cfg.seed
        )
        # Integer placeholder keeps the column integer-typed; every row is
        # overwritten below because each row is a validation sample once.
        self.train_df["kfold"] = -1
        # skf.split yields (train_idx, val_idx); only val_idx is needed.
        for n, (_, val_index) in enumerate(
            skf.split(
                X=self.train_df,
                y=self.train_df['disease_code']
            )
        ):
            self.train_df.loc[val_index, "kfold"] = int(n)

    def train_dataloader(self):
        """Shuffled loader over the current fold's training dataset."""
        return DataLoader(
            self.train,
            batch_size=self.cfg.batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=True,
            pin_memory=True,
        )

    def val_dataloader(self):
        """Ordered loader over the current fold's validation dataset."""
        return DataLoader(
            self.val,
            batch_size=self.cfg.batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=False,
            pin_memory=True,
        )

    def test_dataloader(self):
        """Ordered loader over the test dataset."""
        return DataLoader(
            self.test,
            batch_size=self.cfg.batch_size,
            num_workers=self.cfg.num_workers,
            shuffle=False,
        )


## Training

In [None]:
Config.exp = 'exp_2'
# The test cells above leave Config.phase at 'test'; switch back so that
# train() fits the model instead of loading a checkpoint and testing.
Config.phase = 'train'

# Train folds 0 through 4 in order; each run saves its own checkpoints.
for fold in range(5):
    train(Config, fold)