In [1]:
import torch
import torchvision
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
from torchtext.data import Field
from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
from torchsummaryX import summary

import warnings
warnings.filterwarnings("ignore")
import os
import time
import copy

import utils
import models
from dataset import PhoenixDataset, ToTensorVideo, RandomResizedCropVideo

## Dataset statistics

In [2]:
# root = '/mnt/data/public/datasets'
# print('video length: maximum / minimum / average / std')
# print(utils.DatasetStatistic(root, 'train'))
# print(utils.DatasetStatistic(root, 'dev'))
# print(utils.DatasetStatistic(root, 'test'))

## Building vocab

In [3]:
# # Target-side field: adds <sos>/<eos> tokens, lowercases, and tokenizes German text with spaCy
# TRG = Field(sequential=True, use_vocab=True,
#             init_token='<sos>', eos_token= '<eos>',
#             lower=True, tokenize='spacy',
#             tokenizer_language='de')

# root = '/mnt/data/public/datasets'
# csv_file = utils.get_csv(root)
# tgt_sents = [csv_file.iloc[i, 0].lower().split('|')[3].split()
#              for i in range(len(csv_file))]

# # hyperparameter: min_freq is the minimum token frequency required for inclusion in the vocab
# TRG.build_vocab(tgt_sents, min_freq=1)

## Process batch

In [4]:
def collate_fn(batch):
    '''
    Merge a list of dataset samples into a single padded batch.

    Each sample is a dict holding a 'video' tensor laid out as [C, T, H, W]
    and an 'annotation' string. Clips of different lengths are padded along
    the time axis (pad_sequence pads with zeros by default) up to the longest
    clip in the batch.

    Relies on a module-level `TRG` object exposing `.process(...)`; it must be
    defined before this function is called.
    NOTE(review): presumably a torchtext Field; the notebook's comments give
    the processed annotations as [L, N] -- confirm.

    Returns:
        dict with
            'videos':      tensor of shape [N, C, T_max, H, W]
            'annotations': whatever TRG.process returns for the token lists
    '''
    # Put time first ([C, T, H, W] -> [T, C, H, W]): pad_sequence pads along
    # the leading dimension of every element.
    clips = []
    for sample in batch:
        clips.append(sample['video'].permute(1, 0, 2, 3))

    # Whitespace-tokenize every annotation string.
    token_lists = [sample['annotation'].split() for sample in batch]

    # Stack into [N, T_max, C, H, W].
    padded = pad_sequence(clips, batch_first=True)

    return {
        # Restore the channel-first layout: [N, C, T_max, H, W].
        'videos': padded.permute(0, 2, 1, 3, 4),
        'annotations': TRG.process(token_lists),
    }

## Loading dataset

In [5]:
# Batch size passed to the (currently commented-out) DataLoaders below; it is
# also embedded in the TensorBoard log directory name in the model cell.
BSZ = 1
# root = '/mnt/data/public/datasets'
# transform = transforms.Compose([ToTensorVideo(),
#                                 RandomResizedCropVideo(112)])

# train_loader = DataLoader(PhoenixDataset(root, 'train', transform=transform),
#                           batch_size=BSZ, shuffle=True, num_workers=4, collate_fn=collate_fn)
# dev_loader = DataLoader(PhoenixDataset(root, 'dev', transform=transform),
#                         batch_size=BSZ, shuffle=True, num_workers=4, collate_fn=collate_fn)
# # TODO(review): test loader uses shuffle=False while the dev loader uses shuffle=True -- confirm this is intended
# test_loader = DataLoader(PhoenixDataset(root, 'test', transform=transform),
#                          batch_size=BSZ, shuffle=False, num_workers=4, collate_fn=collate_fn)


In [6]:
# videos: [N, C, T, H, W]
# annotations: [L, N]

# batch = next(iter(train_loader))
# print(batch['videos'].shape)
# print(batch['annotations'].shape)

# print(utils.itos(batch['annotations'].squeeze(1), TRG))

# print(len(train_loader))
# print(len(dev_loader))
# print(len(test_loader))

## Define model

In [7]:
# ---- Hyperparameters ------------------------------------------------------
# D_MODEL .. ACTIVATION are forwarded positionally to models.Transformer.
D_MODEL = 512
DROPOUT = 0.1
NHEAD = 1
NLAYER = 6
NHID = 64
ACTIVATION = 'relu'
NCLIP = 10      # forwarded to models.Seq2Seq
NEPOCH = 1      # recorded in the log directory name
LR = 1e-4       # Adam learning rate

# Vocabulary size handed to the decoder.
# TODO(review): hard-coded placeholder; should presumably be len(TRG.vocab)
# once the vocab-building cell is enabled -- confirm.
VOCAB_SIZE = 1000

# Preferred CUDA ordinal. The original cell always used 'cuda:2', which makes
# `.to(device)` fail on CUDA machines that expose fewer than three GPUs.
GPU_INDEX = 2

# ---- Device ---------------------------------------------------------------
if torch.cuda.is_available():
    # Use the preferred GPU when it exists, otherwise fall back to the first one.
    gpu_ordinal = GPU_INDEX if GPU_INDEX < torch.cuda.device_count() else 0
    device = torch.device(f'cuda:{gpu_ordinal}')
else:
    device = torch.device('cpu')

# ---- Logging --------------------------------------------------------------
# Run name encodes the hyperparameters so TensorBoard runs are distinguishable.
# Adjacent f-strings yield the same value as the original backslash-continued
# literal, without depending on the exact line break inside the string.
path = (
    f'bsz:{BSZ}-lr:{LR}-epoch:{NEPOCH}-d_model:{D_MODEL}-nhead:{NHEAD}-nlayer:{NLAYER}'
    f'-nhid:{NHID}-activation:{ACTIVATION}'
)
writer = SummaryWriter(os.path.join('./log', path))

# ---- Model ----------------------------------------------------------------
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in
# favour of `weights=...`; kept unchanged to match the installed version.
res3D_pretrained = torchvision.models.video.r3d_18(pretrained=True)

# Wrap the pretrained 3D ResNet as the video encoder.
encoder = models.Res3D(res3D_pretrained)

decoder = models.Transformer(
    device, VOCAB_SIZE, D_MODEL, DROPOUT,
    NHEAD, NLAYER, NHID, ACTIVATION)

model = models.Seq2Seq(NCLIP, encoder, decoder, device).to(device)

# criterion = nn.CrossEntropyLoss(ignore_index=TRG.vocab.stoi['<pad>'])

optimizer = optim.Adam(model.parameters(), lr=LR)

In [8]:
# Dry-run the model on dummy inputs to print a per-layer summary.
# Per the notebook's own shape comments: videos are [N, C, T, H, W] and
# annotations are [L, N].
dummy_video = torch.ones(1, 3, 8, 112, 112)
dummy_tokens = torch.ones(10, 1, dtype=torch.long)
summary(model, dummy_video, dummy_tokens)

                                                           Kernel Shape  \
Layer                                                                     
0_cnn.convs.0.Conv3d_0                                 [3, 64, 3, 7, 7]   
1_cnn.convs.0.BatchNorm3d_1                                        [64]   
2_cnn.convs.0.ReLU_2                                                  -   
3_cnn.convs.1.0.conv1.Conv3DSimple_0                  [64, 64, 3, 3, 3]   
4_cnn.convs.1.0.conv1.BatchNorm3d_1                                [64]   
5_cnn.convs.1.0.conv1.ReLU_2                                          -   
6_cnn.convs.1.0.conv2.Conv3DSimple_0                  [64, 64, 3, 3, 3]   
7_cnn.convs.1.0.conv2.BatchNorm3d_1                                [64]   
8_cnn.convs.1.0.ReLU_relu                                             -   
9_cnn.convs.1.1.conv1.Conv3DSimple_0                  [64, 64, 3, 3, 3]   
10_cnn.convs.1.1.conv1.BatchNorm3d_1                               [64]   
11_cnn.convs.1.1.conv1.Re

Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_cnn.convs.0.Conv3d_0,"[3, 64, 3, 7, 7]","[1, 64, 1, 56, 56]",,
1_cnn.convs.0.BatchNorm3d_1,[64],"[1, 64, 1, 56, 56]",,
2_cnn.convs.0.ReLU_2,-,"[1, 64, 1, 56, 56]",,
3_cnn.convs.1.0.conv1.Conv3DSimple_0,"[64, 64, 3, 3, 3]","[1, 64, 1, 56, 56]",,
4_cnn.convs.1.0.conv1.BatchNorm3d_1,[64],"[1, 64, 1, 56, 56]",,
...,...,...,...,...
571_transformer.transformer.decoder.layers.5.Linear_linear2,"[64, 512]","[10, 1, 512]",33280.0,32768.0
572_transformer.transformer.decoder.layers.5.Dropout_dropout3,-,"[10, 1, 512]",,
573_transformer.transformer.decoder.layers.5.LayerNorm_norm3,[512],"[10, 1, 512]",1024.0,512.0
574_transformer.transformer.decoder.LayerNorm_norm,[512],"[10, 1, 512]",1024.0,512.0


## Train and evaluate

In [9]:
# best_val_bleu = 0.0
# best_val_model = copy.deepcopy(model.state_dict())
# for n_epoch in range(NEPOCH):
#     models.train(model, train_loader, device, criterion, optimizer, TRG, writer, n_epoch)
#     %time val_loss, val_bleu, val_wer = models.evaluate(model, dev_loader, device, criterion, TRG)
#     print(val_loss, val_bleu, val_wer)
    
#     if val_bleu > best_val_bleu:
#         best_val_bleu = val_bleu
#         best_val_model = copy.deepcopy(model.state_dict())
        
# model.load_state_dict(best_val_model)
# test_loss, test_bleu, test_wer = models.evaluate(model, test_loader, device, criterion, TRG)
# print(test_loss, test_bleu, test_wer)

## Save model

In [10]:
# if not os.path.exists('./save'):
#     os.mkdir("save")
# dir_name = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
# torch.save(model.state_dict(), './save/'+dir_name+'.pth')


# # change input shape from [N, C, T, H, W] to [N, T, C, H, W]
# videos = batch['videos'].permute(0, 2, 1, 3, 4)
# texts = batch['annotations'].permute(1, 0)
# texts = [' '.join([TRG.vocab.itos[i] for i in sent]) for sent in texts]
# writer.add_video('input', videos, global_step=0, fps=32)
# writer.add_text('annotations', str(texts), 0)

## Load and test

In [11]:
# %%time
# model.load_state_dict(torch.load('./save/2020-03-01 18:17:57.pth'))
# test_loss, test_bleu, test_wer = models.evaluate(model, test_loader, device, criterion, TRG)