# Unzip Dataset

In [None]:
!unzip /content/drive/MyDrive/23_Dacon_TV_hand_gesture_recognition/open.zip -d /content/

# csv file shape check
train: TRAIN_000.mp4 ~ TRAIN_609.mp4(610)  
test: TEST_000.mp4 ~ TEST_152.mp4(153)

In [None]:
import pandas as pd

# train.csv columns
#   id    : unique sample id
#   path  : path to the video of the user's TV-control gesture
#   label : one of 5 TV-control gestures
#     0 : raise the smart TV volume (right thumb up, hand pushed forward)
#     1 : lower the smart TV volume (right thumb down, hand pushed forward)
#     2 : jump the playing video back 10 seconds (open right hand moved right to left)
#     3 : jump the playing video forward 10 seconds (open right hand moved left to right)
#     4 : stop the playing video (open palm pushed forward)
pd.read_csv('/content/train.csv')

Unnamed: 0,id,path,label
0,TRAIN_000,./train/TRAIN_000.mp4,3
1,TRAIN_001,./train/TRAIN_001.mp4,0
2,TRAIN_002,./train/TRAIN_002.mp4,1
3,TRAIN_003,./train/TRAIN_003.mp4,4
4,TRAIN_004,./train/TRAIN_004.mp4,4
...,...,...,...
605,TRAIN_605,./train/TRAIN_605.mp4,0
606,TRAIN_606,./train/TRAIN_606.mp4,2
607,TRAIN_607,./train/TRAIN_607.mp4,1
608,TRAIN_608,./train/TRAIN_608.mp4,4


In [None]:
# test.csv columns
#   id   : unique sample id
#   path : path to the video of the user's TV-control gesture
pd.read_csv('/content/test.csv')


Unnamed: 0,id,path
0,TEST_000,./test/TEST_000.mp4
1,TEST_001,./test/TEST_001.mp4
2,TEST_002,./test/TEST_002.mp4
3,TEST_003,./test/TEST_003.mp4
4,TEST_004,./test/TEST_004.mp4
...,...,...
148,TEST_148,./test/TEST_148.mp4
149,TEST_149,./test/TEST_149.mp4
150,TEST_150,./test/TEST_150.mp4
151,TEST_151,./test/TEST_151.mp4


In [None]:
# sample_submission.csv columns
#   id    : unique sample id
#   label : predicted TV-control gesture
pd.read_csv('/content/sample_submission.csv')

Unnamed: 0,id,label
0,TEST_000,0
1,TEST_001,0
2,TEST_002,0
3,TEST_003,0
4,TEST_004,0
...,...,...
148,TEST_148,0
149,TEST_149,0
150,TEST_150,0
151,TEST_151,0


# Import Library, hyper-parameter setting

In [3]:
import random
import pandas as pd
import numpy as np
import os
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

'''
Dataset, DataLoader를 이용해 pre-loaded dataset과 가지고 있는 데이터를 모두 사용할 수 있다.

Dataset: 샘플과 label을 저장
DataLoader: Dataset을 샘플에 쉽게 접근할 수 있도록 iterable한 객체로 감싼다.
'''

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings(action='ignore') 

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# Hyperparameter settings shared by the whole notebook.
CFG = {
    'FPS':30,  # number of frames read from the start of each video (used as a frame count, not a frame rate)
    'IMG_SIZE':128,  # every frame is resized to IMG_SIZE x IMG_SIZE
    'EPOCHS':30,  # number of training epochs
    'LEARNING_RATE':3e-4,  # initial learning rate handed to the optimizer
    'BATCH_SIZE':4,  # batch size of every DataLoader
    'SEED':41  # seed for seed_everything() and the train/validation split
}

In [6]:
# Fixed RandomSeed
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and CUDA) and puts
    cuDNN into deterministic mode so that training runs are reproducible.

    Args:
        seed (int): Seed value applied to all generators.
    """
    random.seed(seed)
    # Only affects Python interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Restrict cuDNN to deterministic algorithms.
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm and may pick a
    # different one from run to run, which breaks reproducibility, so it must
    # be disabled when deterministic results are wanted.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # 모델의 학습결과를 reproduction하기 위해 seed를 고정한다.

In [7]:
# Data load
# pd.read_csv() reads the CSV file into a pandas DataFrame.
df = pd.read_csv('./train.csv')
df
type(df)

pandas.core.frame.DataFrame

In [13]:
# Train / validation split
#   train_test_split(*arrays, test_size, train_size, random_state, shuffle, stratify)
#   test_size    : fraction of the dataset held out as the validation set
#   random_state : fixes the shuffle so the validation set does not change
#                  between runs (e.g. during hyper-parameter tuning)
# Only the DataFrame itself is split; the label column travels with its rows.
train, val = train_test_split(df, test_size=0.2, random_state=CFG['SEED'])

print(len(df), len(train), len(val))
type(train)  # DataFrame
type(train['path'])  # Series
type(val['path'].values)  # ndarray (n-dimensional array)

610 488 122


numpy.ndarray

# Custom Dataset

In [16]:
class CustomDataset(Dataset):
    """Dataset that yields a fixed-length clip (and optionally a label) per video.

    Args:
        video_path_list: Indexable collection of video file paths.
        label_list: Indexable collection of labels aligned with
            ``video_path_list``, or ``None`` for unlabeled (test) data.
    """

    def __init__(self, video_path_list, label_list):
        self.video_path_list = video_path_list
        self.label_list = label_list

    def __getitem__(self, index):
        """Return ``(frames, label)`` or only ``frames`` when there are no labels."""
        frames = self.get_video(self.video_path_list[index])

        if self.label_list is None:
            return frames
        return frames, self.label_list[index]

    def __len__(self):
        """Return the number of videos in the dataset."""
        return len(self.video_path_list)

    def get_video(self, path):
        """Read the first ``CFG['FPS']`` frames of the video at ``path``.

        Every frame is resized to ``CFG['IMG_SIZE']`` x ``CFG['IMG_SIZE']`` and
        scaled from 0-255 to 0-1. Channel order is whatever OpenCV returns; no
        colour conversion is applied. A video with fewer readable frames is
        padded by repeating its last frame so every clip has the same length
        and can be batched.

        Returns:
            torch.FloatTensor: Clip laid out as (channels, frames, height, width).

        Raises:
            ValueError: If not a single frame can be read from ``path``.
        """
        num_frames = CFG['FPS']
        target_size = (CFG['IMG_SIZE'], CFG['IMG_SIZE'])

        frames = []
        cap = cv2.VideoCapture(path)
        try:
            for _ in range(num_frames):
                # cap.read() returns (success flag, frame); the frame is None
                # once the video is exhausted or cannot be decoded.
                ok, img = cap.read()
                if not ok:
                    break
                img = cv2.resize(img, target_size)
                frames.append(img / 255.)
        finally:
            # Always free the decoder handle, even when reading fails.
            cap.release()

        if not frames:
            raise ValueError(f'Could not read any frame from video: {path}')

        # Pad short clips with the last frame to keep a fixed tensor shape.
        while len(frames) < num_frames:
            frames.append(frames[-1])

        # (frames, H, W, C) -> (C, frames, H, W), the layout nn.Conv3d expects.
        return torch.FloatTensor(np.array(frames)).permute(3, 0, 1, 2)

In [21]:
# Series.values returns all values of a pandas Series as an ndarray.
train_dataset = CustomDataset(train['path'].values, train['label'].values)
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

# Validation data is not shuffled: batch composition (including the smaller
# last batch) then stays the same every epoch, so the averaged validation loss
# is comparable between epochs.
val_dataset = CustomDataset(val['path'].values, val['label'].values)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

type(train_dataset)  # __main__.CustomDataset
type(train_loader)  # torch.utils.data.dataloader.DataLoader

torch.utils.data.dataloader.DataLoader

# Model Define

In [18]:
class BaseModel(nn.Module):
    """Small 3D-CNN baseline: four Conv3d/ReLU/BatchNorm3d/MaxPool3d stages + linear head.

    The linear head expects 512 flattened features, which is what the feature
    extractor produces for a 3 x 30 x 128 x 128 clip (128 channels x 1 x 2 x 2).

    Args:
        num_classes (int): Number of output logits.
    """

    def __init__(self, num_classes=5):
        super().__init__()
        # (in_channels, out_channels, conv kernel, pool kernel) per stage
        stage_specs = (
            (3, 8, (3, 3, 3), 2),
            (8, 32, (2, 2, 2), 2),
            (32, 64, (2, 2, 2), 2),
            (64, 128, (2, 2, 2), (1, 7, 7)),
        )
        layers = []
        for in_ch, out_ch, conv_kernel, pool_kernel in stage_specs:
            layers.append(nn.Conv3d(in_ch, out_ch, conv_kernel))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm3d(out_ch))
            layers.append(nn.MaxPool3d(pool_kernel))
        self.feature_extract = nn.Sequential(*layers)
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        """Map a batch of clips to class logits of shape (batch, num_classes)."""
        features = self.feature_extract(x)
        flattened = features.view(x.size(0), -1)
        return self.classifier(flattened)

In [19]:
# 3D ResNets for Action Recognition (CVPR 2018)
# https://github.com/kenshohara/3D-ResNets-PyTorch

import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict


class _DenseLayer(nn.Sequential):
    """One bottleneck dense layer: BN-ReLU-Conv(1x1x1) then BN-ReLU-Conv(3x3x3).

    The layer's output is its input concatenated (along the channel dimension)
    with ``growth_rate`` newly computed feature maps.

    Args:
        num_input_features (int): Channels of the incoming tensor.
        growth_rate (int): Channels produced by this layer.
        bn_size (int): Bottleneck width multiplier (``bn_size * growth_rate``).
        drop_rate (float): Dropout probability applied to the new features.
    """

    def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
        super().__init__()
        bottleneck_channels = bn_size * growth_rate
        stages = (
            ('norm1', nn.BatchNorm3d(num_input_features)),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv1', nn.Conv3d(num_input_features,
                                bottleneck_channels,
                                kernel_size=1,
                                stride=1,
                                bias=False)),
            ('norm2', nn.BatchNorm3d(bottleneck_channels)),
            ('relu2', nn.ReLU(inplace=True)),
            ('conv2', nn.Conv3d(bottleneck_channels,
                                growth_rate,
                                kernel_size=3,
                                stride=1,
                                padding=1,
                                bias=False)),
        )
        for stage_name, stage in stages:
            self.add_module(stage_name, stage)
        self.drop_rate = drop_rate

    def forward(self, x):
        """Return ``x`` concatenated with the newly computed feature maps."""
        fresh = super().forward(x)
        if self.drop_rate > 0:
            fresh = F.dropout(fresh, p=self.drop_rate, training=self.training)
        return torch.cat([x, fresh], 1)


class _DenseBlock(nn.Sequential):
    """Stack of ``num_layers`` dense layers named ``denselayer1`` .. ``denselayerN``.

    Each layer receives ``growth_rate`` more input channels than the previous
    one, because every ``_DenseLayer`` appends its new features to its input.
    """

    def __init__(self, num_layers, num_input_features, bn_size, growth_rate,
                 drop_rate):
        super().__init__()
        for position in range(1, num_layers + 1):
            in_features = num_input_features + (position - 1) * growth_rate
            self.add_module(
                'denselayer{}'.format(position),
                _DenseLayer(in_features, growth_rate, bn_size, drop_rate))


class _Transition(nn.Sequential):
    """Transition between dense blocks: BN-ReLU-Conv(1x1x1) followed by 2x average pooling.

    Args:
        num_input_features (int): Channels of the incoming tensor.
        num_output_features (int): Channels after the 1x1x1 convolution.
    """

    def __init__(self, num_input_features, num_output_features):
        super().__init__()
        stages = (
            ('norm', nn.BatchNorm3d(num_input_features)),
            ('relu', nn.ReLU(inplace=True)),
            ('conv', nn.Conv3d(num_input_features,
                               num_output_features,
                               kernel_size=1,
                               stride=1,
                               bias=False)),
            ('pool', nn.AvgPool3d(kernel_size=2, stride=2)),
        )
        for stage_name, stage in stages:
            self.add_module(stage_name, stage)


class DenseNet(nn.Module):
    """3D Densenet-BC model class

    Args:
        n_input_channels (int) - number of channels of the input clip
        conv1_t_size (int) - temporal kernel size of the first convolution
        conv1_t_stride (int) - temporal stride of the first convolution
        no_max_pool (bool) - skip the max-pool that follows the first convolution
        growth_rate (int) - how many filters to add each layer (k in paper)
        block_config (list of 4 ints) - how many layers in each pooling block
        num_init_features (int) - the number of filters to learn in the first convolution layer
        bn_size (int) - multiplicative factor for number of bottle neck layers
          (i.e. bn_size * k features in the bottleneck layer)
        drop_rate (float) - dropout rate after each dense layer
        num_classes (int) - number of classification classes
    """

    def __init__(self,
                 n_input_channels=3,
                 conv1_t_size=7,
                 conv1_t_stride=1,
                 no_max_pool=False,
                 growth_rate=32,
                 block_config=(6, 12, 24, 16),
                 num_init_features=64,
                 bn_size=4,
                 drop_rate=0,
                 num_classes=1000):

        super().__init__()

        # First convolution (stem)
        stem = [('conv1',
                 nn.Conv3d(n_input_channels,
                           num_init_features,
                           kernel_size=(conv1_t_size, 7, 7),
                           stride=(conv1_t_stride, 2, 2),
                           padding=(conv1_t_size // 2, 3, 3),
                           bias=False)),
                ('norm1', nn.BatchNorm3d(num_init_features)),
                ('relu1', nn.ReLU(inplace=True))]
        if not no_max_pool:
            stem.append(
                ('pool1', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)))
        self.features = nn.Sequential(OrderedDict(stem))

        # Each denseblock; every block except the last is followed by a
        # transition that halves the channel count.
        num_features = num_init_features
        last_block = len(block_config) - 1
        for i, num_layers in enumerate(block_config):
            block = _DenseBlock(num_layers=num_layers,
                                num_input_features=num_features,
                                bn_size=bn_size,
                                growth_rate=growth_rate,
                                drop_rate=drop_rate)
            self.features.add_module('denseblock{}'.format(i + 1), block)
            num_features = num_features + num_layers * growth_rate
            if i != last_block:
                trans = _Transition(num_input_features=num_features,
                                    num_output_features=num_features // 2)
                self.features.add_module('transition{}'.format(i + 1), trans)
                num_features = num_features // 2

        # Final batch norm
        self.features.add_module('norm5', nn.BatchNorm3d(num_features))

        # Linear layer
        self.classifier = nn.Linear(num_features, num_classes)

        # Single weight-initialisation pass over all submodules. An earlier
        # duplicate pass that used the deprecated ``nn.init.kaiming_normal``
        # was removed because this pass overwrote its result anyway.
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                nn.init.kaiming_normal_(m.weight,
                                        mode='fan_out',
                                        nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm3d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for a batch of clips."""
        features = self.features(x)
        out = F.relu(features, inplace=True)
        # Global average pool over (time, height, width), then flatten.
        out = F.adaptive_avg_pool3d(out,
                                    output_size=(1, 1,
                                                 1)).view(features.size(0), -1)
        out = self.classifier(out)
        return out


def generate_model(model_depth, **kwargs):
    """Build a 3D DenseNet of the requested depth.

    Args:
        model_depth (int): One of 121, 169, 201 or 264.
        **kwargs: Extra keyword arguments forwarded to ``DenseNet``.

    Returns:
        DenseNet: Model with 64 initial features and growth rate 32.
    """
    # Layers per dense block for each supported depth.
    block_configs = {
        121: (6, 12, 24, 16),
        169: (6, 12, 32, 32),
        201: (6, 12, 48, 32),
        264: (6, 12, 64, 48),
    }
    assert model_depth in list(block_configs)

    return DenseNet(num_init_features=64,
                    growth_rate=32,
                    block_config=block_configs[model_depth],
                    **kwargs)

# Train

In [22]:
# DenseNet depth to train; generate_model() accepts 121, 169, 201 or 264.
model_depth = 264

In [23]:
from datetime import datetime, timezone, timedelta

# Unique run id built from the current time in UTC+9.
kst = timezone(timedelta(hours=9))
train_serial = datetime.now(kst).strftime("%Y%m%d_%H%M%S")

# Output directory for this run: results/<model_depth>/<run id>
RECORDER_DIR = os.path.join('results', str(model_depth), train_serial)

# os.makedirs() creates all missing intermediate directories;
# exist_ok=True means an already existing directory is not an error.
os.makedirs(RECORDER_DIR, exist_ok=True)

In [24]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train ``model`` for ``CFG['EPOCHS']`` epochs and return it with the best weights.

    After every epoch the model is evaluated with ``validation()``. Whenever
    the validation macro-F1 improves, the weights are snapshotted in memory and
    written to ``<RECORDER_DIR>/best-model.pt``.

    Args:
        model: Network to train (moved to ``device`` in place).
        optimizer: Optimizer over ``model``'s parameters.
        train_loader: Iterable of ``(videos, labels)`` training batches.
        val_loader: Iterable of ``(videos, labels)`` validation batches.
        scheduler: Learning-rate scheduler stepped once per epoch, or ``None``.
        device: Device used for training.

    Returns:
        ``model`` with the weights of the best-scoring epoch loaded. If the
        validation score never rose above 0, the final-epoch weights are kept.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)

    best_val_score = 0
    best_epoch = None
    best_state = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()   # training behaviour for Dropout / BatchNorm layers
        train_loss = []
        for videos, labels in tqdm(iter(train_loader)):
            videos = videos.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()   # reset accumulated gradients

            output = model(videos)
            loss = criterion(output, labels)

            loss.backward() # backpropagation
            optimizer.step()

            train_loss.append(loss.item())  # loss.item(): loss as a Python float

        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val F1 : [{_val_score:.5f}]')

        if scheduler is not None:
            # Only ReduceLROnPlateau takes the monitored metric. For every
            # other scheduler the argument of step() is an epoch number, so
            # passing the F1 score there would corrupt the schedule.
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(_val_score)
            else:
                scheduler.step()

        if best_val_score < _val_score:
            best_val_score = _val_score
            best_epoch = epoch
            # Snapshot a copy of the weights: keeping a reference to ``model``
            # would just alias the object that later epochs keep updating.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}
            print('best model found!')
            # Save with torch.save(model.state_dict(), PATH); restore later
            # with model.load_state_dict(torch.load(PATH)).
            torch.save(model.state_dict(), os.path.join(RECORDER_DIR, "best-model.pt"))

    if best_state is None:
        print('validation F1 never improved; returning final-epoch weights')
        return model

    model.load_state_dict(best_state)
    print(f'best F1: {best_val_score:.5f}, best epoch: {best_epoch}')
    return model

In [25]:
def validation(model, criterion, val_loader, device):
    """Evaluate ``model`` on ``val_loader``.

    Args:
        model: Network to evaluate (switched to eval mode).
        criterion: Loss function called as ``criterion(logits, labels)``.
        val_loader: Iterable of ``(videos, labels)`` batches.
        device: Device the batches are moved to.

    Returns:
        tuple: ``(mean of the per-batch losses, macro-averaged F1 score)``.
    """
    model.eval()    # inference behaviour for Dropout / BatchNorm layers
    batch_losses = []
    predicted, expected = [], []

    with torch.no_grad():   # no autograd bookkeeping during evaluation
        for videos, labels in tqdm(iter(val_loader)):
            videos = videos.to(device)
            labels = labels.to(device)

            logit = model(videos)
            batch_losses.append(criterion(logit, labels).item())

            # argmax(1): index of the highest logit per sample.
            # detach/cpu/numpy/tolist: turn the tensor into a plain Python list.
            predicted.extend(logit.argmax(1).detach().cpu().numpy().tolist())
            expected.extend(labels.detach().cpu().numpy().tolist())

    mean_loss = np.mean(batch_losses)
    # sklearn.metrics.f1_score(y_true, y_pred, average='macro')
    macro_f1 = f1_score(expected, predicted, average='macro')
    return mean_loss, macro_f1

# Run

In [None]:
# Constructor arguments forwarded to DenseNet.
kwargs = {
    'n_input_channels': 3,
    'conv1_t_size': 7,
    'conv1_t_stride': 1,
    'no_max_pool': False,
    # The task has 5 gesture classes; without this DenseNet falls back to its
    # default 1000-way classifier head.
    'num_classes': 5,
}

In [None]:
# Build the DenseNet for the selected depth. generate_model() holds the
# per-depth block configuration and rejects unsupported depths instead of
# silently leaving `model` undefined.
model = generate_model(model_depth, **kwargs)

In [None]:
# Alternative model: the small 3D-CNN baseline.
# model = BaseModel()
# train() switches the model to train mode at the start of every epoch,
# so this call does not affect training.
model.eval()

# Alternative optimizer / scheduler setup (plateau-based LR reduction):
# optimizer = torch.optim.AdamW(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)

# AdamW with weight decay, starting from the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG["LEARNING_RATE"] ,weight_decay=.0004)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.8)
# Cosine annealing with warm restarts: first cycle length T_0=10, each
# following cycle T_mult=2 times longer, learning rate floor eta_min.
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, 
                                                                 T_mult=2, eta_min=0.00001)


# Train and keep the model returned by train() for inference.
infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

# Inference

In [None]:
# Load the list of test videos (columns: id, path).
test = pd.read_csv('./test.csv')

In [None]:
# No labels for the test set, so the dataset yields frames only.
test_dataset = CustomDataset(test['path'].values, None)
# shuffle=False keeps predictions in the same order as the rows of test.csv.
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    """Predict a class index for every sample yielded by ``test_loader``.

    Args:
        model: Trained network (moved to ``device`` and set to eval mode).
        test_loader: Iterable of video batches without labels.
        device: Device the batches are moved to.

    Returns:
        list: Predicted class indices, in loader order.
    """
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for videos in tqdm(iter(test_loader)):
            logit = model(videos.to(device))
            # Highest-scoring class per sample, converted to plain Python ints.
            batch_preds = logit.argmax(1).detach().cpu().numpy().tolist()
            predictions.extend(batch_preds)
    return predictions

In [None]:
# Predict a class index for every test video with the model returned by train().
preds = inference(infer_model, test_loader, device)

  0%|          | 0/39 [00:00<?, ?it/s]

In [None]:
# Load the submission template (columns: id, label).
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Replace the placeholder labels with the model's predictions.
submit['label'] = preds
submit.head()

Unnamed: 0,id,label
0,TEST_000,1
1,TEST_001,3
2,TEST_002,0
3,TEST_003,2
4,TEST_004,4


In [None]:
# os.path.join(...): builds a path by joining the given components.
# object.to_csv(path): writes the object to a CSV file at path.
submit.to_csv(os.path.join(RECORDER_DIR, "best-results.csv"), index=False)