In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import glob
import math
import os
import random
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image, ImageEnhance, ImageOps
from sklearn.metrics import f1_score
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn.functional as F
import torchvision as vision
import torchvision.models as models
from torch import nn, cuda
from torch.autograd import Variable
from torch.optim import Adam, SGD, Optimizer
from torch.optim.lr_scheduler import _LRScheduler, CosineAnnealingLR, ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader

In [None]:
# Fix every random seed so that results stay comparable when hyperparameters change.
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to `random`, NumPy and PyTorch (CPU and
            every CUDA device). It is also exported as PYTHONHASHSEED.

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    random.seed(seed)
    # NOTE(review): setting this at runtime presumably only affects child
    # processes, not the hashing of the already-running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so switch it off explicitly.
    torch.backends.cudnn.benchmark = False

# Seed used for the whole run; applied immediately so every later cell starts from it.
SEED = 2019
seed_everything(SEED)

In [None]:
# Whether a CUDA device is available; the bare name on the last line shows the value as cell output.
use_cuda = cuda.is_available()
use_cuda

In [None]:
class TrainDataset(Dataset):
    """Labelled image dataset backed by a DataFrame of file names and classes.

    Args:
        df: DataFrame with an 'img_file' column (file name joined onto
            TRAIN_IMAGE_PATH) and a 'class' column (the label).
        mode: Key used to select the transform from `transforms`.
        transforms: Optional mapping from mode name to a callable applied to
            the loaded PIL image. With None, images are returned untransformed.

    Raises:
        KeyError: If `transforms` is given but has no entry for `mode`.
    """

    def __init__(self, df, mode='train', transforms=None):
        self.df = df
        self.mode = mode
        # The default transforms=None used to crash on `transforms[self.mode]`;
        # treat it as "no transform" instead.
        self.transform = transforms[self.mode] if transforms is not None else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional lookup: DataLoader indices are positions, so this keeps
        # working even when the frame's index is not a fresh 0..n-1 range.
        file_name = self.df['img_file'].iloc[idx]
        label = self.df['class'].iloc[idx]

        image = Image.open(TRAIN_IMAGE_PATH / file_name).convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        return image, label


In [None]:
class TestDataset(Dataset):
    """Unlabelled image dataset backed by a sequence of file names.

    Args:
        df: Sequence of image file names joined onto TEST_IMAGE_PATH. The
            notebook passes a pandas Series; a plain list also works.
        mode: Key used to select the transform from `transforms`.
        transforms: Optional mapping from mode name to a callable applied to
            the loaded PIL image. With None, images are returned untransformed.

    Raises:
        KeyError: If `transforms` is given but has no entry for `mode`.
    """

    def __init__(self, df, mode='test', transforms=None):
        self.df = df
        self.mode = mode
        # The default transforms=None used to crash on `transforms[self.mode]`;
        # treat it as "no transform" instead.
        self.transform = transforms[self.mode] if transforms is not None else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Use positional access on pandas objects so a non-default index cannot
        # cause a KeyError or return the wrong row; fall back to plain
        # indexing for lists and tuples.
        names = self.df
        file_name = names.iloc[idx] if hasattr(names, 'iloc') else names[idx]

        image = Image.open(TEST_IMAGE_PATH / file_name).convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        return image

In [None]:
# Spatial size every image is resized / cropped to before entering the network.
target_size = (224, 224)

# One torchvision pipeline per dataset mode; the datasets pick theirs with
# `transforms[mode]`. All three end with ToTensor followed by the same
# per-channel Normalize(mean, std).
# presumably the mean/std values are the ImageNet statistics expected by the
# pretrained backbone - TODO confirm.
# NOTE(review): `CIFAR10Policy` is not defined or imported in the visible cells;
# it has to come from a cell outside this view.
# NOTE(review): the 'valid' and 'test' pipelines contain random operations
# (RandomResizedCrop, and RandomHorizontalFlip for 'valid'), so validation
# scores and test predictions are not deterministic for a fixed model - confirm
# this is intended.
data_transforms = {
    'train': vision.transforms.Compose([
        vision.transforms.Resize(target_size),
        vision.transforms.RandomHorizontalFlip(),
        vision.transforms.RandomRotation(20),
        CIFAR10Policy(),
        vision.transforms.ToTensor(),
        vision.transforms.Normalize(
            [0.485, 0.456, 0.406], 
            [0.229, 0.224, 0.225])
    ]),
    'valid': vision.transforms.Compose([
        vision.transforms.Resize(target_size),
        vision.transforms.RandomResizedCrop(target_size, scale=(0.8,1.0)),
        vision.transforms.RandomHorizontalFlip(),
        vision.transforms.ToTensor(),
        vision.transforms.Normalize(
            [0.485, 0.456, 0.406], 
            [0.229, 0.224, 0.225])
    ]),
        'test': vision.transforms.Compose([
        vision.transforms.Resize((224,224)),
        vision.transforms.RandomResizedCrop(target_size, scale=(0.8,1.0)),
        vision.transforms.ToTensor(),
        vision.transforms.Normalize(
            [0.485, 0.456, 0.406], 
            [0.229, 0.224, 0.225])
    ]),
}

In [None]:
'''
crop된 이미지 사용
reference 허태명님 커널: https://www.kaggle.com/tmheo74/3rd-ml-month-car-image-cropping
'''
# English summary of the note above: the pre-cropped images are used; the
# cropping follows the referenced kernel.

# Directories the datasets join image file names onto.
TRAIN_IMAGE_PATH = Path('../input/kakl-3rd-cropped-dataset/train_crop/')
TEST_IMAGE_PATH = Path('../input/kakl-3rd-cropped-dataset/test_crop/')
# Paths of the original (uncropped) images, kept for reference:
# train_image_path = Path('../input/2019-3rd-ml-month-with-kakr/train/')
# test_image_path = Path('../input/2019-3rd-ml-month-with-kakr/test/')

In [None]:
# The folds were computed ahead of time, saved to CSV and are loaded here.
# Correction to an earlier note that described an 80/20 5-fold split: the data
# was actually split into 4 folds, so 3/4 of the rows are used for training and
# 1/4 for validation.
df = pd.read_csv("../input/car-folds/car_4folds.csv")
test_csv = pd.read_csv('../input/2019-3rd-ml-month-with-kakr/test.csv')
df.head()

In [None]:
# The split was made beforehand with the class distribution taken into account;
# this shows the number of rows in each of the four folds.
len(df[df['fold'] == 0]), len(df[df['fold'] == 1]), len(df[df['fold'] == 2]), len(df[df['fold'] == 3])

In [None]:
# Hold out fold 0 for validation and train on every other fold.
train_df = df.loc[df['fold'] != 0]
valid_df = df.loc[df['fold'] == 0]

In [None]:
# Keep only the columns the datasets read and give both splits a fresh 0..n-1 index.
train_df = train_df[['img_file', 'class']].reset_index(drop=True)
valid_df = valid_df[['img_file', 'class']].reset_index(drop=True)
x_test = test_csv['img_file']

# The competition labels have no class 0, which breaks training, so class 196
# is remapped to 0 (and mapped back when the submission is written).
# The remap must be applied to BOTH splits: previously only train_df was
# remapped, leaving label 196 in valid_df / y_true, which is out of range for a
# model with 196 outputs. It is also restricted to the 'class' column so no
# other column can be touched.
train_df['class'] = train_df['class'].replace(196, 0)
valid_df['class'] = valid_df['class'].replace(196, 0)

num_classes = train_df['class'].nunique()
y_true = valid_df['class'].values # ground truth for the cv score

In [None]:
# Report the size of each split and the number of distinct labels.
print("number of train dataset: {}".format(len(train_df)))
print("number of valid dataset: {}".format(len(valid_df)))
print("number of classes to predict: {}".format(num_classes))

In [None]:
def train_one_epoch(model, criterion, train_loader, optimizer, mixup_loss, accumulation_step=2):
    """Run one training pass over `train_loader` and return the mean batch loss.

    Args:
        model: Network to train; it is switched to train mode.
        criterion: Loss callable invoked as criterion(outputs, targets).
        train_loader: Iterable of (inputs, targets) batches that supports len().
        optimizer: Optimizer that updates `model`'s parameters.
        mixup_loss: When truthy, each batch goes through `mixup_data` and the
            loss is computed with `mixup_criterion` (both defined outside this
            block, as is the `use_cuda` flag passed to `mixup_data`).
        accumulation_step: Number of batches whose gradients are accumulated
            before one optimizer step. A falsy value steps after every batch.

    Returns:
        float: Sum over all batches of `loss.item() / len(train_loader)`.
    """
    model.train()
    train_loss = 0.
    optimizer.zero_grad()

    # Put batches on the device the model lives on, rather than hard-coding
    # .cuda(); identical for a CUDA model, and it also works on CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    num_batches = len(train_loader)

    for i, (inputs, targets) in enumerate(train_loader):

        inputs, targets = inputs.to(device), targets.to(device)

        if mixup_loss:
            # alpha can be chosen from [0.4, 1.0]
            inputs, targets_a, targets_b, lam = mixup_data(inputs, targets, alpha=1.0, use_cuda=use_cuda)
            outputs = model(inputs)
            loss = mixup_criterion(criterion, outputs, targets_a.to(device), targets_b.to(device), lam)
        else:
            outputs = model(inputs)
            loss = criterion(outputs, targets)

        # NOTE(review): the loss is not divided by accumulation_step, so the
        # accumulated gradient is a sum over the window, as in the original.
        loss.backward()

        window_complete = (not accumulation_step) or (i + 1) % accumulation_step == 0
        is_last_batch = (i + 1) == num_batches
        # Also step on the last batch: otherwise the gradients of a trailing
        # partial window are discarded by the zero_grad() at the start of the
        # next epoch.
        if window_complete or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

        train_loss += loss.item() / num_batches

    return train_loss



In [None]:

def validation(model, criterion, valid_loader):
    """Evaluate `model` on `valid_loader` and return (mean loss, micro-F1).

    Predictions and ground-truth labels are both collected from the loader
    itself. The previous version sized and sliced its prediction buffer with
    the notebook globals `valid_dataset`, `batch_size`, `num_classes` and
    compared against the global `y_true`, so it broke as soon as any of those
    stopped matching the loader (a later cell reassigns `batch_size`).

    Args:
        model: Network to evaluate; it is switched to eval mode.
        criterion: Loss callable invoked as criterion(outputs, targets).
        valid_loader: Iterable of (inputs, targets) batches that supports len().

    Returns:
        tuple: (val_loss, val_score), where val_loss is the sum over batches of
        `loss.item() / len(valid_loader)` and val_score is
        `f1_score(..., average='micro')` of the argmax predictions.
    """
    model.eval()

    # Put batches on the device the model lives on, rather than hard-coding .cuda().
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    num_batches = len(valid_loader)
    val_loss = 0.
    batch_logits = []
    batch_targets = []

    with torch.no_grad():
        for inputs, targets in valid_loader:

            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            batch_logits.append(outputs.cpu().numpy())
            batch_targets.append(targets.cpu().numpy())

            val_loss += loss.item() / num_batches

    # Row order follows loader order, so predictions and labels stay aligned
    # batch by batch whatever the batch size.
    y_pred = np.argmax(np.concatenate(batch_logits), axis=1)
    labels = np.concatenate(batch_targets)
    val_score = f1_score(labels, y_pred, average='micro')

    return val_loss, val_score

In [None]:
# Two selection criteria are provided: by score and by loss. The leaderboard
# score was very consistent with the cv score, so the cv score is the one used.
def pick_best_score(result1, result2):
    """Return whichever result dict has the higher 'best_score' (result1 on a tie)."""
    first, second = result1['best_score'], result2['best_score']
    return result2 if first < second else result1
    
def pick_best_loss(result1, result2):
    """Return whichever result dict has the lower 'best_loss' (result2 on a tie)."""
    first_is_lower = result1['best_loss'] < result2['best_loss']
    return result1 if first_is_lower else result2

In [None]:
def train_model(num_epochs=60, accumulation_step=4, mixup_loss=False, cv_checkpoint=False, fine_tune=False, weight_file_name='weight_best.pt', **train_kwargs):
    """Train for `num_epochs` epochs and checkpoint the best weights.

    Args:
        num_epochs: Number of epochs to run.
        accumulation_step: Forwarded to `train_one_epoch`.
        mixup_loss: Forwarded to `train_one_epoch`.
        cv_checkpoint: When True the checkpoint is chosen by the highest
            validation score; otherwise by the lowest validation loss.
        fine_tune: When True use AdamW with ReduceLROnPlateau; otherwise SGD
            with CosineAnnealingWithRestartsLR.
        weight_file_name: Path the best `state_dict` is saved to.
        **train_kwargs: Optional `model`, `criterion`, `train_loader` and
            `valid_loader`. Any key that is missing falls back to the notebook
            global of the same name, which is what the previous version always
            used (it ignored these keyword arguments entirely).

    Returns:
        tuple: (train_result, lrs, score). `train_result` holds
        'weight_file_name' and, once at least one epoch improved on the
        initial best value, 'best_epoch' plus 'best_score' or 'best_loss'.
        `lrs` is the learning rate of every param group recorded each epoch,
        and `score` is the validation score of each epoch.

    NOTE(review): `AdamW` and `CosineAnnealingWithRestartsLR` are not defined
    or imported in the visible cells; they must come from a cell outside this view.
    """
    # Prefer objects passed in explicitly. The conditional form only touches
    # the global when the key is really absent.
    net = train_kwargs['model'] if 'model' in train_kwargs else model
    loss_fn = train_kwargs['criterion'] if 'criterion' in train_kwargs else criterion
    loader_train = train_kwargs['train_loader'] if 'train_loader' in train_kwargs else train_loader
    loader_valid = train_kwargs['valid_loader'] if 'valid_loader' in train_kwargs else valid_loader

    # choose optimizer and scheduler
    if fine_tune:
        optimizer = AdamW(net.parameters(), lr=0.00001, weight_decay=0.000025)
        # The plateau scheduler is fed the score when cv_checkpoint is set, and
        # a score is better when higher, so the mode has to follow the metric.
        plateau_mode = 'max' if cv_checkpoint else 'min'
        scheduler = ReduceLROnPlateau(optimizer, plateau_mode, patience=10, factor=0.1)
    else:
        optimizer = SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.025)
        scheduler = CosineAnnealingWithRestartsLR(optimizer, T_max=10, eta_min=1e-6, T_mult=1, restart_decay=0.97)

    train_result = {}
    train_result['weight_file_name'] = weight_file_name
    best_score = 0.
    # Previously never initialised, so the loss-based branch raised
    # UnboundLocalError on the first epoch.
    best_loss = float('inf')
    lrs = []
    score = []

    for epoch in range(num_epochs):

        start_time = time.time()

        train_loss = train_one_epoch(net, loss_fn, loader_train, optimizer, mixup_loss, accumulation_step)
        val_loss, val_score = validation(net, loss_fn, loader_valid)
        score.append(val_score)

        # save the model whenever the chosen metric improves
        if cv_checkpoint:
            if val_score > best_score:
                best_score = val_score
                train_result['best_epoch'] = epoch + 1
                train_result['best_score'] = round(best_score, 5)
                torch.save(net.state_dict(), weight_file_name)
        else:
            if val_loss < best_loss:
                best_loss = val_loss
                train_result['best_epoch'] = epoch + 1
                train_result['best_loss'] = round(best_loss, 5)
                torch.save(net.state_dict(), weight_file_name)

        elapsed = time.time() - start_time
        # Learning rates in effect during this epoch (read before the scheduler moves on).
        current_lrs = [group['lr'] for group in optimizer.param_groups]

        print("Epoch {} - train_loss: {:.4f}  val_loss: {:.4f}  cv_score: {:.4f}  lr: {:.6f}  time: {:.0f}s".format(
                epoch+1, train_loss, val_loss, val_score, current_lrs[0], elapsed))

        lrs.extend(current_lrs)

        # scheduler update
        if fine_tune:
            scheduler.step(val_score if cv_checkpoint else val_loss)
        else:
            scheduler.step()

    return train_result, lrs, score

In [None]:
# Mini-batch size shared by the train and validation loaders.
batch_size = 128

# Both splits go through TrainDataset (they carry labels); `mode` selects which
# entry of data_transforms is applied.
train_dataset = TrainDataset(train_df, mode='train', transforms=data_transforms)
valid_dataset = TrainDataset(valid_df, mode='valid', transforms=data_transforms)

# Only the training loader shuffles; the validation loader keeps the row order of valid_df.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# This is a baseline, so resnet50 is used. Try swapping in another backbone!
model = models.resnet50(pretrained=True)
# Replace the classification head so it outputs one logit per class.
model.fc = nn.Linear(2048, num_classes)
model.cuda()

In [None]:
# Loss used for both training and validation.
criterion = nn.CrossEntropyLoss()

# Objects handed to train_model as keyword arguments.
train_kwargs = dict(
    train_loader=train_loader,
    valid_loader=valid_loader,
    model=model,
    criterion=criterion,
    )


print("training starts")
num_epochs = 120
# cv_checkpoint=True: the saved checkpoint is selected by validation score.
result, lrs, score = train_model(num_epochs=num_epochs, accumulation_step=2, mixup_loss=False, cv_checkpoint=True, fine_tune=False, weight_file_name='weight_best.pt', **train_kwargs)
print(result)


# For the fine-tuning part, please refer to the previous version of this notebook.

In [None]:
# In competitions such as the recent iMet one, training takes more than 9 hours,
# so saving weights and loading them back matters.
# On Kaggle, deep-learning competitions usually run training and inference in
# separate kernels (essential when, like me, you have no local GPU).

# Rebuild the architecture without pretrained weights and load the best checkpoint.
model = models.resnet50()
model.fc = nn.Linear(2048, num_classes)
model.cuda()
# NOTE(review): `result` must still be the dict returned by train_model here; a
# later cell rebinds `result` to the prediction array, so this cell cannot be
# re-run after that one without retraining.
model.load_state_dict(torch.load(result['weight_file_name']))

# Use a dedicated name instead of overwriting the global `batch_size`, which
# the training/validation code depends on. Output order is guaranteed by
# shuffle=False, not by the batch size, so this value can be raised for speed.
test_batch_size = 1
test_dataset = TestDataset(x_test, mode='test', transforms=data_transforms)
test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)

model.eval()
test_preds = []  # one array of raw model outputs per batch

with torch.no_grad():
    for images in tqdm_notebook(test_loader):
        images = images.cuda()

        preds = model(images)
        test_preds.append(preds.cpu().numpy())
        

In [None]:
# Take the argmax of each batch of model outputs, i.e. the class predicted with
# the highest score for every image. A named loop variable is used instead of
# `_`, which IPython reserves for the previous cell output.
outputs = [np.argmax(batch_preds, axis=1).tolist() for batch_preds in test_preds]

# Flatten the per-batch lists into one array of class indices, in test-set order.
# NOTE(review): this rebinds `result`, which earlier held the dict returned by train_model.
result = np.concatenate(outputs)

In [None]:
submission = pd.read_csv('../input/2019-3rd-ml-month-with-kakr/sample_submission.csv')
submission["class"] = result
# Undo the earlier 196 -> 0 remap so the labels match the competition's numbering.
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not modify `submission`
# under pandas Copy-on-Write.
submission["class"] = submission["class"].replace(0, 196)
submission.to_csv("submission.csv", index=False)
submission.head()