In [36]:
# Number of inputs
NIN = 10
# Size of the learned representation
NOUT = 20
# Number of examples
EXAMPLES = 9430
# Batch size
BATCH_SIZE = 1000

In [37]:
import logging
from collections import OrderedDict

import pytorch_lightning as pl
from pytorch_lightning import LightningModule
from pytorch_lightning import Trainer
from torch import optim
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
# -
from utils import *
import numpy as np
np.random.seed(0)

In [38]:
X1 = np.random.random((EXAMPLES, NIN))
X2 = np.random.random((EXAMPLES, NIN))
DIST = np.random.random((EXAMPLES,))

In [39]:
# class TableDistanceDataset(torch.utils.data.Dataset):
#     def __init__(self, x1, x2, dist):
#         self.dist = torch.Tensor(dist).float()
#         self.X1 = torch.Tensor(X1).float()
#         self.X2 = torch.Tensor(X2).float()

#     def __len__(self):
#         return self.X1.shape[0]

#     def __getitem__(self, index):
#         return self.X1[index,:], self.X2[index,:], self.dist[index]

# tableDistanceDataset = TableDistanceDataset(X1, X2, DIST)

from data import SpeechDataset
import torchvision
from torchvision.datasets import CIFAR10
from torchvision import transforms
from model import VisionTransformer, CNN_RNN_ENCODER, ESResNeXtFBSP

test_transform = transforms.Compose([
                                    transforms.Resize(int(256 * 76 / 64)),
                                    transforms.RandomResizedCrop((256,256),scale=(0.8,1.0),ratio=(0.9,1.1)),
                                    transforms.ToTensor(),
                                    transforms.Normalize([0.49139968, 0.48215841, 0.44653091], [0.24703223, 0.24348513, 0.26158784])
                                    ])

train_transform = transforms.Compose([
                                    transforms.RandomHorizontalFlip(),
                                    transforms.Resize(int(256 * 76 / 64)),
                                    transforms.RandomResizedCrop((256,256),scale=(0.8,1.0),ratio=(0.9,1.1)),
                                    transforms.ToTensor(),
                                    transforms.Normalize([0.49139968, 0.48215841, 0.44653091], [0.24703223, 0.24348513, 0.26158784])
                                    ])

trainset = SpeechDataset(root='./../../data/birds', train=True, transform=train_transform)
testset = SpeechDataset(root='./../../data/birds', train=False, transform=test_transform)

Total filenames:  11788 001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg
Load filenames from: ./../../data/birds/train/filenames.pickle (9430)
Total filenames:  11788 001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg
Load filenames from: ./../../data/birds/test/filenames.pickle (2358)


In [40]:
class Table2Representation(pl.LightningModule):
    def __init__(self):
        super().__init__()

        self.nin = NIN
        self.nhid = 32
        self.nout = NOUT

        # build model
        self.__build_model()
    
    def __build_model(self):
        self.fc1 = nn.Linear(self.nin,
                             self.nhid)
        self.do1 = nn.Dropout(0.2)
        self.out = nn.Linear(self.nhid,
                             self.nout)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.do1(x)
        x = self.out(x)
        return x

In [41]:
# Based upon https://github.com/PyTorchLightning/Siamese-Neural-Networks/blob/master/model.py
class TableDistanceModule(pl.LightningModule):
    def __init__(self, trainset, testset):
        super().__init__()

        self.tableDistanceDataset = trainset
        self.datatrain, self.dataval= \
        torch.utils.data.random_split(self.tableDistanceDataset,
                                      [round(EXAMPLES*0.8),
                                       round(EXAMPLES*0.2)])
        self.datatest = testset

        self.table2Representation = Table2Representation()
        self.img_model = VisionTransformer(**{
                            'embed_dim': 256,
                            'hidden_dim': 512,
                            'num_heads': 8,
                            'num_layers': 6,
                            'patch_size': 32,
                            'num_channels': 3,
                            'num_patches': 512,
                            'num_classes': 200,
                            'dropout': 0.2})
        self.audio_model = CNN_RNN_ENCODER()

        # build model
        self.__build_model()
    
    def __build_model(self):
        pass

    # def forward(self, x1, x2):
    #     z1 = self.table2Representation.forward(x1)
    #     z2 = self.table2Representation.forward(x2)
    #     dis = torch.mean(torch.abs(z1 - z2), axis=1)
    #     return dis

    def forward(self, x1, x2, len):
        img_encode = self.img_model.forward(x1)
        audio_encode = self.audio_model.forward(x2, len)

        return img_encode , audio_encode

    def loss(self, pred_dists, true_dists):
        loss_val = F.mse_loss(pred_dists, true_dists)
        return loss_val
    
    def _step(self, batch, batch_idx, name, training_step=False):
        # X1, X2, dist = batch
        imgs, caps, cls_id, key, input_length, labels = batch
        img_encode, audio_encode  = self(imgs, caps, input_length)
        loss_val = F.cross_entropy(img_encode, labels)  + F.cross_entropy(audio_encode, labels)
        #loss_val = self.loss(pred, dist)
        # in DP mode (default) make sure if result is scalar, there's another dim in the beginning
        # if self.trainer.use_ddp2:
        #     loss_val = loss_val.unsqueeze(0)
        tqdm_dict = OrderedDict({name: loss_val})
        if training_step:
            return OrderedDict({
                'loss': loss_val,
                'progress_bar': tqdm_dict,
                'log': tqdm_dict
            })
        else:
            return tqdm_dict
        
    def training_step(self, batch, batch_idx):
        return self._step(batch, batch_idx, name="train_loss", training_step=True)
    def validation_step(self, batch, batch_idx):
        return self._step(batch, batch_idx, name="val_loss", training_step=False)
    def test_step(self, batch, batch_idx):
        return self._step(batch, batch_idx, name="test_loss", training_step=False)

    def _epoch_end(self, outputs, name):
        # With DP training I think you have to average the things individually? Not sure
        # Look at the pytorch lightning siamese network code
        #if self.trainer.use_dp or self.trainer.use_ddp2:
        #    val_acc = torch.mean(val_acc)
        avg_loss = torch.stack([x[name] for x in outputs]).mean()
        tqdm_dict = {name: avg_loss}
        result = OrderedDict({name: avg_loss, 'progress_bar': tqdm_dict, 'log': tqdm_dict})
        return result
        
    def validation_epoch_end(self, outputs):
        return self._epoch_end(outputs, name="val_loss")
    def test_epoch_end(self, outputs):
        return self._epoch_end(outputs, name="test_loss")
        
    # ---------------------
    # TRAINING SETUP
    # ---------------------
    def configure_optimizers(self):
        """
        return whatever optimizers we want here
        :return: list of optimizers
        """
        optimizer = optim.SGD(self.parameters(),
                             lr=0.01, momentum=0.90)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=10)
        return [optimizer], [scheduler]

    def __dataloader(self, train, dataset):
        # when using multi-node (ddp) we need to add the  datasampler
        train_sampler = None
        batch_size = BATCH_SIZE

        # if self.use_ddp:
        #     train_sampler = DistributedSampler(dataset)

        should_shuffle = train and train_sampler is None
        loader = DataLoader(
            dataset=dataset,
            batch_size=128,
            shuffle=True,
            sampler=train_sampler,
            num_workers=0,
            drop_last=True,
            collate_fn=pad_collate
        )
        
        return loader

    # @pl.data_loader
    def train_dataloader(self):
        logging.info('training data loader called')
        return self.__dataloader(train=True, dataset=self.datatrain)

    # @pl.data_loader
    def val_dataloader(self):
        logging.info('val data loader called')
        return self.__dataloader(train=False, dataset=self.dataval)

    # @pl.data_loader
    def test_dataloader(self):
        logging.info('val data loader called')
        return self.__dataloader(train=False, dataset=self.datatest)

In [42]:
model_cpu = TableDistanceModule(trainset, testset)
trainer_cpu = Trainer(max_epochs=10)
trainer_cpu.fit(model_cpu)

  nn.init.orthogonal(self.hidden.weight.data)
  nn.init.orthogonal(self.hidden.weight.data)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name                 | Type                 | Params
--------------------------------------------------------------
0 | table2Representation | Table2Representation | 1.0 K 
1 | img_model            | VisionTransformer    | 4.1 M 
2 | audio_model          | CNN_RNN_ENCODER      | 7.6 M 
--------------------------------------------------------------
11.7 M    Trainable params
0         Non-trainable params
11.7 M    Total params
46.863    Total estimated model params size (MB)


num_classes: ---------------- 200
Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


RuntimeError: expected scalar type Double but found Float