In [1]:
import torch
import pandas as pd
import numpy as np
import os

In [2]:
from torch.utils.data import Dataset
import tqdm

class CSVDataset(Dataset):
    def __init__(self, df, image_path, model_initial, transform=None):
        self.data = df["SENTENCE_NAME"]
        self.labels = df["SENTENCE"]
        self.transform = transform
        self.image_path = image_path
        self.videos = []
        self.labels = []
        self.map_index = []
        for idx in tqdm.tqdm(range(len(self.data))):
            if(idx > 14000):
                break
            if not os.path.exists(image_path+self.data.iloc[idx]+".npy"):
                continue
            with open(self.image_path+self.data.iloc[idx]+".npy", 'rb') as f:
                video = torch.tensor(np.load(f))
                label = torch.tensor(np.load(f))
                self.videos.append(video)
                self.labels.append(label)
            self.map_index.append(idx)

    def __len__(self):
        return len(self.map_index)

    def __getitem__(self, idx):
        return self.videos[idx], self.labels[idx]

def load_dataset(path, image_path, transform=None):
    df = pd.read_csv(path, sep="\t")

    dataset = CSVDataset(df, image_path, transform)
    return dataset

In [3]:
#load datasets
from torchvision import transforms
import torchvision
from torchvision.transforms import Resize

import torch

transform = transforms.Compose([
        Resize((180, 320), antialias=False)
    ])

train_dataset = load_dataset("data/train.csv", "data/train_npy/", transform)
val_dataset = load_dataset("data/val.csv", "data/val_npy/", transform)
test_dataset = load_dataset("data/test.csv", "data/test_npy/", transform)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#print info about datasets
print("Train dataset size: ", len(train_dataset))
print("Val dataset size: ", len(val_dataset))
print("Test dataset size: ", len(test_dataset))
print("video clip size: ", val_dataset[0][0].size(), val_dataset[0][1].size())

 45%|████▍     | 14001/31165 [07:56<09:44, 29.37it/s]
100%|██████████| 1741/1741 [00:59<00:00, 29.25it/s]
100%|██████████| 2357/2357 [01:20<00:00, 29.35it/s]

Train dataset size:  13924
Val dataset size:  1739
Test dataset size:  2343
video clip size:  torch.Size([20, 3, 180, 320]) torch.Size([20])





In [4]:
#load dataloaders
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=56, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=56, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=56, shuffle=True)

In [5]:
from torch import nn
from transformers import BertTokenizer
#from torcheval.metrics.functional import bleu_score
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
import time

class VideoToText(nn.Module):
    def __init__(self, cnn, hidden_size, output_size, num_layers):
        super(VideoToText, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.cnn = cnn
        self.encoder = nn.LSTM(input_size=cnn.fc.in_features, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.decoder = nn.LSTM(input_size=hidden_size, hidden_size=output_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(output_size, len(self.tokenizer.get_vocab()))
        #remove last layer of cnn
        self.cnn.fc = nn.Identity()

    def forward(self, src):
        #print(src.shape)
        batch_size, seq_len, c, h, w = src.shape
        src = src.reshape(batch_size*seq_len, c, h, w)

        src = self.cnn(src)
        src = src.reshape(batch_size, seq_len, -1)
        src, (hidden, cell) = self.encoder(src)
        src, (hidden, cell) = self.decoder(src)
        src = self.fc(src)
        return src

#train model function
def train(model, train_dataloader, val_dataloader, epochs, lr, device):
    tokenizer = model.tokenizer
    model = nn.DataParallel(model)
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()

        start = time.time()
        avg_bleu = 0
        avg_loss = 0
        cnt = 0
        for i, (inputs, labels) in enumerate(train_dataloader):
            #print(inputs, labels)
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)

            
            label_text = tokenizer.decode(labels[0].to(torch.long), skip_special_tokens=True)
            output_text = tokenizer.decode(outputs[0].argmax(dim=1), skip_special_tokens=True)


            outputs = outputs.permute(0, 2, 1)
            #print(labels.shape, outputs.shape)

            if(labels.shape[1] < outputs.shape[2]):
                outputs = outputs[:, :, :labels.shape[1]]
            if(labels.shape[1] > outputs.shape[2]):
                labels = labels[:, :outputs.shape[2]]
            loss = criterion(outputs, labels)
            loss.backward(retain_graph=True)
            optimizer.step()
            curr = time.time()
            score = sentence_bleu([label_text], output_text, smoothing_function=SmoothingFunction().method4)
            avg_bleu += score
            avg_loss += loss.item()
            cnt += 1
            print("\rEpoch: {}/{}, Batch: {}/{}, Loss: {:.4f}, Bleu: {:.5f}, Elapsed: {:.2f} sec".format(epoch+1, epochs, i+1, len(train_dataloader), loss.item(), score, curr-start), end="")
        print("\rEpoch: {}/{}, Loss: {:.4f}, Bleu: {:.5f}, Elapsed: {:.2f} sec".format(epoch+1, epochs, avg_loss/cnt, avg_bleu/cnt, curr-start))
        
        model.eval()
        with torch.no_grad():
            avgloss = 0
            avgbleu = 0
            count = 0
            start = time.time()
            for i, (inputs, labels) in enumerate(val_dataloader):
                if(i > 1):
                    break
                count = count+1
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                outputs = outputs.permute(0, 2, 1)
                if(labels.shape[1] < outputs.shape[2]):
                    outputs = outputs[:, :, :labels.shape[1]]
                if(labels.shape[1] > outputs.shape[2]):
                    labels = labels[:, :outputs.shape[2]]
                loss = criterion(outputs, labels)
                label_text = [[tokenizer.decode(x.to(torch.long), skip_special_tokens=True)] for x in labels]
                output_text = [tokenizer.decode(x.argmax(dim=1), skip_special_tokens=True) for x in outputs]
                score = corpus_bleu(label_text, output_text, smoothing_function=SmoothingFunction().method4)
                avgbleu += score
                avgloss += loss.item()
            print("Epoch: {}/{}, Loss: {:.4f}, Bleu: {:.5f}, Elapsed Time: {:.4f}\n".format(epoch+1, epochs, avgloss/count, avgbleu/count, time.time()-start))
        

In [6]:
#load model
from torchvision import models
from torch import nn
cnn_extractor = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model = VideoToText(cnn_extractor, hidden_size=512, output_size=512, num_layers=1)

In [None]:
train(model, train_dataloader, val_dataloader, epochs=250, lr=0.001, device=device)
# watch with: watch -d -n 0.5 nvidia-smi

Epoch: 1/250, Loss: 5.0487, Bleu: 0.00374, Elapsed: 123.65 secpsed: 123.65 sec
Epoch: 1/250, Loss: 4.7406, Bleu: 0.00000, Elapsed Time: 68.3088

Epoch: 2/250, Loss: 4.7620, Bleu: 0.01093, Elapsed: 114.27 secpsed: 114.27 sec
Epoch: 2/250, Loss: 4.6869, Bleu: 0.00001, Elapsed Time: 41.0334

Epoch: 3/250, Loss: 4.7276, Bleu: 0.01461, Elapsed: 114.43 secpsed: 114.43 sec
Epoch: 3/250, Loss: 4.9421, Bleu: 0.00001, Elapsed Time: 41.1866

Epoch: 4/250, Loss: 4.7048, Bleu: 0.01077, Elapsed: 114.47 secpsed: 114.47 sec
Epoch: 4/250, Loss: 5.0847, Bleu: 0.00001, Elapsed Time: 41.4376

Epoch: 5/250, Loss: 4.6804, Bleu: 0.01223, Elapsed: 114.66 secpsed: 114.66 sec
Epoch: 5/250, Loss: 5.3549, Bleu: 0.00001, Elapsed Time: 41.6857

Epoch: 6/250, Loss: 4.6612, Bleu: 0.00932, Elapsed: 114.30 secpsed: 114.30 sec
Epoch: 6/250, Loss: 4.5372, Bleu: 0.00001, Elapsed Time: 41.1163

Epoch: 7/250, Loss: 4.6435, Bleu: 0.00855, Elapsed: 114.76 secpsed: 114.76 sec
Epoch: 7/250, Loss: 4.8765, Bleu: 0.00001, Elapsed 