In [1]:
!cp -r /kaggle/input/video-caption/test_videos ./

In [2]:
!cp -r /kaggle/input/video-caption/train_videos ./

In [3]:
!cp -r /kaggle/input/video-caption/test.csv ./

In [4]:
!cp -r /kaggle/input/video-caption/train.csv ./

In [5]:
!pip install opencv-python



In [6]:
import shutil
import tqdm
import numpy as np
import cv2
import os
import torchvision
from torch import nn

import json
import random
import pandas as pd
import torch
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors
from tokenizers.normalizers import Sequence, Lowercase, NFD, StripAccents
from PIL import Image
import torch.optim as optim

In [7]:
def video_to_frames(video_path, output_dir):
    if os.path.exists(output_dir):
        for file in os.listdir(output_dir):
            os.remove(os.path.join(output_dir, file))
    else:
        os.makedirs(output_dir)
    count = 0
    image_list = []
    # Path to video file
    cap = cv2.VideoCapture(video_path)
    while cap.isOpened():
        ret, frame = cap.read()
        if ret is False:
            break
        cv2.imwrite(os.path.join(output_dir, 'frame%d.jpg' % count), frame)
        image_list.append(os.path.join(output_dir, 'frame%d.jpg' % count))
        count += 1

    cap.release()
    cv2.destroyAllWindows()
    return image_list

In [8]:
import torch

def model_cnn_load():
# Load pretrained VGG16
    model = torchvision.models.vgg16(weights='IMAGENET1K_V1')
    
    # Remove the last classification layer (keep up to the second-to-last layer)
    model = nn.Sequential(*list(model.children())[:-1])  # Removes final Linear layer
    
    return model

def load_image(path):
    img = cv2.imread(path)
    img = cv2.resize(img, (224, 224))
    return img

def extract_features(video, model):
    video_id = video.split(".")[0]
    print(f'Processing video {video}')

    image_list = video_to_frames(f'train_videos/{video}', 'temporary_images')
    samples = np.round(np.linspace(
        0, len(image_list) - 1, 80)).astype(int)
    image_list = [image_list[sample] for sample in samples]
    images = torch.zeros((len(image_list), 3, 224, 224), dtype=torch.float32)
    mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
    for i in range(len(image_list)):
        img = load_image(image_list[i])
        img_np = np.array(img)
        img_np = img_np.transpose(2, 0, 1)
        img = torch.from_numpy(img_np).float()
        img = img.unsqueeze(0) / 255.0
        img = (img - mean) / std
        images[i] = img.squeeze(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    images = images.to(device)
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        fc_feats = model(images)
        # img_feats = fc_feats.detach().cpu().numpy()
    shutil.rmtree('temporary_images')
    return fc_feats

def extract_feats_pretrained_cnn():
    model = model_cnn_load()
    print('Model loaded')

    if not os.path.isdir(os.path.join('features_dir')):
        os.mkdir('features_dir')

    video_list = os.listdir('train_videos')
    for video in video_list:
        outfile = os.path.join('features_dir', video.split(".")[0] + '.pt')
        img_feats = extract_features(video, model)
        torch.save(img_feats, outfile)

In [9]:
class HuggingFaceTokenizedPreprocessor:
    def __init__(self, validation_split=0.2, max_seq_length=30):
        self.validation_split = validation_split
        self.max_seq_length = max_seq_length
        self.x_data = {}  # Store video features

        self.special_tokens=[
                ("[BOS]", 1),
                ("[EOS]", 2),
                ("[UNK]", 3),
                ("[PAD]", 0),
                (".", 4),
                (",", 5)
            ]
        self.initial_vocab = {token: idx for token, idx in self.special_tokens}
        
        # Initialize HuggingFace tokenizer
        self.tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]", vocab=self.initial_vocab))
        self.tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
        self.tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        self.tokenizer.post_processor = processors.TemplateProcessing(
            single="[BOS] $A [EOS]",
            special_tokens=[
                ("[BOS]", 1),
                ("[EOS]", 2),
                ("[UNK]", 3),
                ("[PAD]", 0),
                (".", 4),
                (",", 5)
            ]
        )
        self.special_tokens_string = [token for token, _ in self.special_tokens]

        for token, idx in self.special_tokens:
            self.tokenizer.add_tokens([token])
        
    def preprocess_text(self, caption):
        """Preprocess text keeping punctuation as separate tokens"""
        # Add space around punctuation for proper splitting
        caption = caption.replace('.', ' . ').replace(',', ' , ')
        return caption.strip()
    
    def build_vocabulary(self, df):
        """Train the tokenizer on the dataset"""
        trainer = trainers.WordLevelTrainer(
            vocab_size=1000,
            special_tokens=self.special_tokens_string,
            min_frequency=2,
            initial_alphabet=list(self.initial_vocab.keys())
        )
        
        # Prepare text iterator
        def get_texts():
            for caption in df['caption']:
                yield self.preprocess_text(caption)
                
        self.tokenizer.train_from_iterator(get_texts(), trainer=trainer)
        
    def preprocessing(self):
        """Load and preprocess training data from CSV"""
        train_df = pd.read_csv('train.csv')


        video_list = (os.listdir('train_videos'))
        train_df = train_df[train_df['file_name'].isin(video_list)]
        
        
        # Build vocabulary
        self.build_vocabulary(train_df)
        
        # Process captions and filter by length
        train_list = []
        for _, row in train_df.iterrows():
            processed_text = self.preprocess_text(row['caption'])
            encoding = self.tokenizer.encode(processed_text)
            
            # Filter by content length (excluding special tokens and punctuation)
            content_tokens = [
                t for t in encoding.tokens 
                if t not in self.special_tokens_string
            ]
            
            train_list.append({
                'caption_ids': encoding.ids,
                'id': row['file_name'].split('.')[0],
                'index': row['index']
            })
        
        # Shuffle and split data
        random.shuffle(train_list)
        split_idx = int(len(train_list) * self.validation_split)
        validation_list = train_list[:split_idx]
        training_list = train_list[split_idx:]
        
        # Load video features
        for filename in os.listdir('features_dir'):
            if filename.endswith('.pt'):
                video_id = filename[:-3]
                features = torch.load(os.path.join('features_dir', filename))
                self.x_data[video_id] = features.float()
                # features = np.load(os.path.join('features_dir', filename), allow_pickle=True)
                # self.x_data[video_id] = torch.from_numpy(features).float()
        
        return training_list, validation_list
    
    def text_to_ids(self, text):
        """Convert text to token IDs"""
        processed_text = self.preprocess_text(text)
        return self.tokenizer.encode(processed_text).ids
    
    def ids_to_text(self, ids):
        """Convert token IDs back to text"""
        return self.tokenizer.decode(ids)

    def save(self):
        """Save the tokenizer and preprocessor config"""
        import json
        
        # Create directory if it doesn't exist
        
        # Save tokenizer
        self.tokenizer.save("tokenizer.json")
        
        config = {
            "special_tokens": self.special_tokens,
            "special_tokens_string": self.special_tokens_string
        }
        
        with open("config.json", "w") as f:
            json.dump(config, f)
            
    @classmethod
    def load(cls):
        """Load a saved tokenizer and preprocessor"""
        import json
        
        # Load config
        with open("config.json", "r") as f:
            config = json.load(f)
        
        preprocessor = cls()
        
        # Load tokenizer
        
        preprocessor.tokenizer = Tokenizer.from_file("tokenizer.json")
        preprocessor.special_tokens = config["special_tokens"]
        preprocessor.special_tokens_string = config["special_tokens_string"]
        
        return preprocessor

In [10]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

class VideoCaptionDataset(Dataset):
    def __init__(self, training_list, x_data, tokenizer, max_length, num_decoder_tokens, device):
        """
        PyTorch Dataset for video captioning
        
        Args:
            training_list: List of training samples (caption, video_id)
            x_data: Dictionary of video features {video_id: features}
            tokenizer: HuggingFace tokenizer
            max_length: Maximum sequence length
            num_decoder_tokens: Vocabulary size
        """
        self.training_list = training_list
        self.x_data = x_data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_decoder_tokens = num_decoder_tokens
        self.device = device
        
        # Preprocess all data
        self.video_ids = []
        self.caption_ids = []

        for train_data in training_list:
            train_values_list = list(train_data.values())
            self.video_ids.append(train_values_list[1])
            self.caption_ids.append(train_values_list[0])
                
    def __len__(self):
        return len(self.training_list)
    
    def __getitem__(self, idx):
        # Get video features
        video_features = self.x_data[self.video_ids[idx]].float().to(self.device)
        
        # Process caption
        caption_ids = self.caption_ids[idx]
        
        # Pad/truncate sequence
        if len(caption_ids) <= self.max_length:
            pad_len = (self.max_length + 1) - len(caption_ids)
            caption_ids = caption_ids[:-1] + [self.tokenizer.token_to_id("[PAD]")] * pad_len + [self.tokenizer.token_to_id("[EOS]")] 
        elif len(caption_ids) > self.max_length:
            caption_ids = caption_ids[:self.max_length] + [self.tokenizer.token_to_id("[EOS]")]
        
        # Convert to tensors
        caption_tensor = torch.tensor(caption_ids, device=self.device)
        one_hot = F.one_hot(caption_tensor, num_classes=1000).float()
        
        # Create decoder input (shifted right) and target (shifted left)
        decoder_input = one_hot[:-1]
        decoder_target = one_hot[1:]

        return {
            'encoder_input': video_features,
            'decoder_input': decoder_input,
            'decoder_target': decoder_target,
            'video_id': self.video_ids[idx]
        }

In [11]:
def collate_fn(batch):
    """
    Custom collate function to pad sequences to same length and create batches
    
    Args:
        batch: List of samples from VideoCaptionDataset
    
    Returns:
        Dictionary of batched tensors with shapes:
        - encoder_input: [batch_size, seq_len, input_size]
        - decoder_input: [batch_size, seq_len, num_decoder_tokens]
        - decoder_target: [batch_size, seq_len, num_decoder_tokens]
        - video_ids: List of video IDs
    """
    # Get all encoder inputs and find max sequence length
    encoder_inputs = [item['encoder_input'] for item in batch]
    decoder_inputs = [item['decoder_input'] for item in batch]
    decoder_targets = [item['decoder_target'] for item in batch]
    video_ids = [item['video_id'] for item in batch]

    encoder_input_batch = torch.stack(encoder_inputs, dim=0)
    
    # Stack decoder inputs and targets (they should already be the same length from dataset)
    decoder_input_batch = torch.stack(decoder_inputs, dim=0)
    decoder_target_batch = torch.stack(decoder_targets, dim=0)
    
    return {
        'encoder_input': encoder_input_batch,
        'decoder_input': decoder_input_batch,
        'decoder_target': decoder_target_batch,
        'video_id': video_ids
    }

In [12]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

class Seq2SeqModel(nn.Module):
    def __init__(self):
        super(Seq2SeqModel, self).__init__()
        
        # Encoder
        self.encoder_lstm = nn.LSTM(
            input_size=512,
            hidden_size=512,
            batch_first=True
        )

        # Decoder
        self.decoder_lstm = nn.LSTM(
            input_size=1000,
            hidden_size=512,
            batch_first=True
        )
        self.decoder_dense = nn.Linear(512, 1000)
        
    def forward(self, encoder_inputs, decoder_inputs, encoder_hidden, encoder_cell):
        # Encoder
        _, (hidden, cell) = self.encoder_lstm(encoder_inputs, (encoder_hidden, encoder_cell))
        
        # Decoder
        decoder_outputs, _ = self.decoder_lstm(
            decoder_inputs, 
            (hidden, cell)
        )
        
        decoder_outputs = self.decoder_dense(decoder_outputs)
        return decoder_outputs

class VideoCaptionTrainer:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = Seq2SeqModel().to(self.device)
        
    def train_model(self):
        # Load datasets
        preprocessor = HuggingFaceTokenizedPreprocessor()
        training_list, validation_list = preprocessor.preprocessing()
        
        train_dataset = VideoCaptionDataset(
                            training_list=training_list,
                            x_data=preprocessor.x_data,
                            tokenizer=preprocessor.tokenizer,
                            max_length=30,
                            num_decoder_tokens=1000,
                            device=self.device
                        )
        val_dataset = VideoCaptionDataset(
                            training_list=validation_list,
                            x_data=preprocessor.x_data,
                            tokenizer=preprocessor.tokenizer,
                            max_length=30,
                            num_decoder_tokens=1000,
                            device=self.device
                        )
        train_loader = DataLoader(
                            train_dataset,
                            batch_size=8,
                            shuffle=False,
                            num_workers=0,
                            collate_fn=collate_fn
                        )
        val_loader = DataLoader(
                            val_dataset,
                            batch_size=8,
                            shuffle=False,
                            num_workers=0,
                            collate_fn=collate_fn
                        )
        
        # Loss and optimizer
        criterion = nn.CrossEntropyLoss()  # Ignore padding index
        optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)
        
        # Early stopping
        best_val_loss = float('inf')
        patience = 5
        patience_counter = 0
        
        for epoch in range(150):
            # Training
            self.model.train()
            train_loss = 0.0
            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
                encoder_input = batch['encoder_input']
                decoder_input = batch['decoder_input']
                decoder_target = batch['decoder_target']
                batch_size = encoder_input.shape[0]

                pool = nn.AdaptiveAvgPool2d((1, 1))
                encoder_input = pool(encoder_input).squeeze(-1).squeeze(-1)

                encoder_input.to(self.device)
                # decoder_input.to(self.device)
                decoder_target.to(self.device)

                h = torch.zeros(1, batch_size, 512).to(self.device)
                c = torch.zeros(1, batch_size, 512).to(self.device)
                
                optimizer.zero_grad()
                outputs = self.model(encoder_input, decoder_input.float().to(self.device), h, c)

                # targets = decoder_target
                
                loss = criterion(outputs, decoder_target)
                loss.backward()
                optimizer.step()
                
                train_loss += loss.item()
            
            # Validation
            self.model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for batch in val_loader:
                    encoder_input = batch['encoder_input']
                    decoder_input = batch['decoder_input']
                    decoder_target = batch['decoder_target']
                    batch_size = encoder_input.shape[0]

                    pool = nn.AdaptiveAvgPool2d((1, 1))
                    encoder_input = pool(encoder_input).squeeze(-1).squeeze(-1)
    
                    encoder_input.to(self.device)
                    decoder_input.to(self.device)
                    decoder_target.to(self.device)
    
                    h = torch.zeros(1, batch_size, 512).to(self.device)
                    c = torch.zeros(1, batch_size, 512).to(self.device)
                    
                    outputs = self.model(encoder_input, decoder_input.float(), h, c)
                    targets = decoder_target
                    
                    val_loss += criterion(outputs, targets).item()
            
            # Calculate average losses
            train_loss /= len(train_loader)
            val_loss /= len(val_loader)
            
            print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
            
            # Learning rate scheduling
            scheduler.step(val_loss)
            
            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                torch.save(self.model.encoder_lstm.state_dict(), 'encoder_model_weights.pth')
                torch.save(self.model.decoder_lstm.state_dict(), 'decoder_model_weights.pth')
                preprocessor.save()
                
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered")
                    break

In [14]:
extract_feats_pretrained_cnn()

Model loaded
Processing video 158.mp4
Processing video 366.mp4
Processing video 550.mp4
Processing video 444.mp4
Processing video 602.mp4
Processing video 506.mp4
Processing video 370.mp4
Processing video 28.mp4
Processing video 84.mp4
Processing video 116.mp4
Processing video 540.mp4
Processing video 291.mp4
Processing video 363.mp4
Processing video 97.mp4
Processing video 340.mp4
Processing video 490.mp4
Processing video 503.mp4
Processing video 43.mp4
Processing video 412.mp4
Processing video 111.mp4
Processing video 18.mp4
Processing video 79.mp4
Processing video 278.mp4
Processing video 449.mp4
Processing video 437.mp4
Processing video 122.mp4
Processing video 509.mp4
Processing video 225.mp4
Processing video 362.mp4
Processing video 522.mp4
Processing video 388.mp4
Processing video 539.mp4
Processing video 151.mp4
Processing video 533.mp4
Processing video 57.mp4
Processing video 379.mp4
Processing video 246.mp4
Processing video 162.mp4
Processing video 504.mp4
Processing video 40

In [15]:
preprocessor = HuggingFaceTokenizedPreprocessor()
training_list, validation_list = preprocessor.preprocessing()

Ignored unknown kwargs option initial_alphabet


In [16]:
trainer = VideoCaptionTrainer()
trainer.train_model()

Ignored unknown kwargs option initial_alphabet


Epoch 1: 100%|██████████| 61/61 [00:01<00:00, 43.44it/s]


Epoch 1: Train Loss: 0.0938, Val Loss: 0.0905
The OrderedVocab you are attempting to save contains holes for indices [4, 5], your vocabulary could be corrupted !


Epoch 2: 100%|██████████| 61/61 [00:01<00:00, 60.77it/s]


Epoch 2: Train Loss: 0.0877, Val Loss: 0.0884
The OrderedVocab you are attempting to save contains holes for indices [4, 5], your vocabulary could be corrupted !


Epoch 3: 100%|██████████| 61/61 [00:01<00:00, 60.82it/s]


Epoch 3: Train Loss: 0.0835, Val Loss: 0.0855
The OrderedVocab you are attempting to save contains holes for indices [4, 5], your vocabulary could be corrupted !


Epoch 4: 100%|██████████| 61/61 [00:01<00:00, 60.51it/s]


Epoch 4: Train Loss: 0.0788, Val Loss: 0.0822
The OrderedVocab you are attempting to save contains holes for indices [4, 5], your vocabulary could be corrupted !


Epoch 5: 100%|██████████| 61/61 [00:01<00:00, 60.41it/s]


Epoch 5: Train Loss: 0.0730, Val Loss: 0.0783
The OrderedVocab you are attempting to save contains holes for indices [4, 5], your vocabulary could be corrupted !


Epoch 6: 100%|██████████| 61/61 [00:01<00:00, 60.02it/s]


Epoch 6: Train Loss: 0.0683, Val Loss: 0.0770
The OrderedVocab you are attempting to save contains holes for indices [4, 5], your vocabulary could be corrupted !


Epoch 7: 100%|██████████| 61/61 [00:01<00:00, 60.05it/s]


Epoch 7: Train Loss: 0.0640, Val Loss: 0.0757
The OrderedVocab you are attempting to save contains holes for indices [4, 5], your vocabulary could be corrupted !


Epoch 8: 100%|██████████| 61/61 [00:01<00:00, 59.99it/s]


Epoch 8: Train Loss: 0.0603, Val Loss: 0.0746
The OrderedVocab you are attempting to save contains holes for indices [4, 5], your vocabulary could be corrupted !


Epoch 9: 100%|██████████| 61/61 [00:01<00:00, 59.66it/s]


Epoch 9: Train Loss: 0.0575, Val Loss: 0.0745
The OrderedVocab you are attempting to save contains holes for indices [4, 5], your vocabulary could be corrupted !


Epoch 10: 100%|██████████| 61/61 [00:01<00:00, 59.72it/s]


Epoch 10: Train Loss: 0.0550, Val Loss: 0.0720
The OrderedVocab you are attempting to save contains holes for indices [4, 5], your vocabulary could be corrupted !


Epoch 11: 100%|██████████| 61/61 [00:01<00:00, 59.65it/s]


Epoch 11: Train Loss: 0.0508, Val Loss: 0.0728


Epoch 12: 100%|██████████| 61/61 [00:01<00:00, 59.07it/s]


Epoch 12: Train Loss: 0.0482, Val Loss: 0.0722


Epoch 13: 100%|██████████| 61/61 [00:01<00:00, 58.83it/s]


Epoch 13: Train Loss: 0.0460, Val Loss: 0.0716
The OrderedVocab you are attempting to save contains holes for indices [4, 5], your vocabulary could be corrupted !


Epoch 14: 100%|██████████| 61/61 [00:01<00:00, 59.05it/s]


Epoch 14: Train Loss: 0.0439, Val Loss: 0.0733


Epoch 15: 100%|██████████| 61/61 [00:01<00:00, 58.64it/s]


Epoch 15: Train Loss: 0.0416, Val Loss: 0.0722


Epoch 16: 100%|██████████| 61/61 [00:01<00:00, 58.60it/s]


Epoch 16: Train Loss: 0.0397, Val Loss: 0.0719


Epoch 17: 100%|██████████| 61/61 [00:01<00:00, 58.62it/s]


Epoch 17: Train Loss: 0.0374, Val Loss: 0.0728


Epoch 18: 100%|██████████| 61/61 [00:01<00:00, 58.84it/s]


Epoch 18: Train Loss: 0.0361, Val Loss: 0.0744
Early stopping triggered


In [27]:
def extract_test_features(video, model):
    video_id = video.split(".")[0]
    print(f'Processing video {video}')

    image_list = video_to_frames(f'test_videos/{video}', 'test_temporary_images')
    samples = np.round(np.linspace(
        0, len(image_list) - 1, 80)).astype(int)
    image_list = [image_list[sample] for sample in samples]
    images = torch.zeros((len(image_list), 3, 224, 224), dtype=torch.float32)
    mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
    for i in range(len(image_list)):
        img = load_image(image_list[i])
        img_np = np.array(img)
        img_np = img_np.transpose(2, 0, 1)
        img = torch.from_numpy(img_np).float()
        img = img.unsqueeze(0) / 255.0
        img = (img - mean) / std
        images[i] = img.squeeze(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    images = images.to(device)
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        fc_feats = model(images)
        # img_feats = fc_feats.cpu().numpy()
    shutil.rmtree('test_temporary_images')
    return fc_feats

def extract_test_feats_pretrained_cnn():
    model = model_cnn_load()
    print('Model loaded')

    if not os.path.isdir(os.path.join('test_features_dir')):
        os.mkdir('test_features_dir')

    video_list = os.listdir('test_videos')
    for video in video_list:
        outfile = os.path.join('test_features_dir', video.split(".")[0] + '.pt')
        img_feats = extract_test_features(video, model)
        torch.save(img_feats, outfile)

In [28]:
def get_test_data():
    # Read test CSV
    test_df = pd.read_csv('test.csv')
    
    test_features = []
    test_ids = []
    test_filenames = []
    
    for _, row in test_df.iterrows():
        video_id = row['index']
        filename = row['file_name']
        feature_path = os.path.join('test_features_dir', filename.split(".")[0] + '.pt')
        
        if os.path.exists(feature_path):
            # Load numpy array and convert to torch tensor
            features = torch.load(feature_path)
            test_features.append(features.float())
            test_ids.append(video_id)
            test_filenames.append(filename)

    return test_features, test_ids, test_filenames

In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
class Inference_Seq2SeqModel(nn.Module):
    def __init__(self):
        super(Inference_Seq2SeqModel, self).__init__()

        # Encoder
        self.encoder_lstm = nn.LSTM(
            input_size=512,
            hidden_size=512,
            batch_first=True
        )
        
        # Decoder
        self.decoder_lstm = nn.LSTM(
            input_size=1000,
            hidden_size=512,
            batch_first=True
        )
        self.decoder_dense = nn.Linear(512, 1000)

        self.encoder_lstm.load_state_dict(torch.load('encoder_model_weights.pth'))
        self.decoder_lstm.load_state_dict(torch.load('decoder_model_weights.pth'))
        
    def forward(self, encoder_inputs, decoder_inputs, encoder_hidden, encoder_cell):
        _, (hidden, cell) = self.encoder_lstm(encoder_inputs, (encoder_hidden, encoder_cell))
        
        # Decoder
        decoder_outputs, _ = self.decoder_lstm(
            decoder_inputs, 
            (hidden, cell)
        )
        
        decoder_outputs = self.decoder_dense(decoder_outputs)
        return decoder_outputs

In [44]:
def greedy_search(loaded_array):
    # Convert numpy array to tensor
    input_tensor = loaded_array.mean(dim=[2, 3])
    input_tensor = input_tensor.unsqueeze(0)
    
    # Initialize
    _, (hidden, cell) = test_model.encoder_lstm(input_tensor.to(device))
    decoder_input = torch.zeros((1, 1, 1000))  # (batch, seq_len, vocab_size)
    decoder_input[0, 0, preprocessor.tokenizer.token_to_id("[BOS]")] = 1
    hidden.to(device)
    cell.to(device)
    
    sentence = []
    max_length = 30

    prev_word_idx = -1
    for _ in range(max_length):
        # Decoder step
        output, (hidden, cell) = test_model.decoder_lstm(decoder_input.to(device), (hidden, cell))

        output = test_model.decoder_dense(output)
        
        # Get most likely word
        word_idx = torch.argmax(output).item()  # (1, 1, vocab_size) -> scalar
        
        # Stop if EOS token
        if word_idx == preprocessor.tokenizer.token_to_id("[EOS]"):
            break
            
        # Skip if padding token
        if word_idx == preprocessor.tokenizer.token_to_id("[PAD]"):
            continue

        if prev_word_idx != word_idx:
            word = preprocessor.tokenizer.id_to_token(word_idx)
            
            if word is not None:
                sentence.append(word)
            
        # Next input is current output
        decoder_input = torch.zeros_like(decoder_input)
        decoder_input[0, 0, word_idx] = 1

        prev_word_idx = word_idx

    caption = ' '.join(sentence)
    caption = caption.replace(' .', '.').replace(' ,', ',')
    
    return caption.strip()

In [31]:
def test():
    # # Load test data
    # test_features, test_ids, test_filenames = self.get_test_data()
    
    predictions = []
    
    for idx, (features, video_id, filename) in enumerate(zip(test_features, test_ids, test_filenames)):
        
        # Generate caption
        # start_time = time.time()
        caption = greedy_search(features)
        
        # inference_time = time.time() - start_time
        
        predictions.append({
            'index': video_id,
            'file_name': filename,
            'caption': caption
        })

    # Save to CSV
    pd.DataFrame(predictions).to_csv(
        'submission.csv',
        columns=['index', 'file_name', 'caption'],
        index=False
    )

In [32]:
extract_test_feats_pretrained_cnn()

Model loaded
Processing video 158.mp4
Processing video 366.mp4
Processing video 444.mp4
Processing video 506.mp4
Processing video 370.mp4
Processing video 28.mp4
Processing video 84.mp4
Processing video 116.mp4
Processing video 291.mp4
Processing video 363.mp4
Processing video 97.mp4
Processing video 340.mp4
Processing video 490.mp4
Processing video 503.mp4
Processing video 43.mp4
Processing video 412.mp4
Processing video 111.mp4
Processing video 18.mp4
Processing video 79.mp4
Processing video 278.mp4
Processing video 449.mp4
Processing video 437.mp4
Processing video 122.mp4
Processing video 509.mp4
Processing video 225.mp4
Processing video 362.mp4
Processing video 388.mp4
Processing video 151.mp4
Processing video 57.mp4
Processing video 379.mp4
Processing video 246.mp4
Processing video 162.mp4
Processing video 504.mp4
Processing video 402.mp4
Processing video 314.mp4
Processing video 61.mp4
Processing video 348.mp4
Processing video 98.mp4
Processing video 252.mp4
Processing video 108.

In [33]:
test_features, test_ids, test_filenames = get_test_data()

In [34]:
test_model = trainer.model

In [45]:
test()