## Multimodal learning! Toy example...


<div>
<img src="./images/multimodal.png" width="800"/>
</div>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchtext.vocab import GloVe

# Define a simple function to generate text descriptions
def generate_text_description(label):
    descriptions = [
        "This is a photo of an airplane.",
        "This is a photo of an automobile.",
        "This is a photo of a bird.",
        "This is a photo of a cat.",
        "This is a photo of a deer.",
        "This is a photo of a dog.",
        "This is a photo of a frog.",
        "This is a photo of a horse.",
        "This is a photo of a ship.",
        "This is a photo of a truck."
    ]
    return descriptions[label]

# Transform to normalize the image data
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Load the CIFAR-10 dataset
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


OSError: [WinError 1455] The paging file is too small for this operation to complete. Error loading "C:\Users\tyler\anaconda3\Lib\site-packages\torch\lib\shm.dll" or one of its dependencies.

In [2]:
# Load GloVe embeddings
glove = GloVe(name='6B', dim=50)

# Define the neural network
class CombinedNN(nn.Module):
    def __init__(self, pretrained_embeddings, num_classes=10):
        super(CombinedNN, self).__init__()
        
        # Image CNN
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1_img = nn.Linear(16 * 5 * 5, 120)
        self.fc2_img = nn.Linear(120, 84)
        
        # Text embeddings
        vocab_size, embedding_dim = pretrained_embeddings.shape
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        self.fc1_txt = nn.Linear(embedding_dim, 50)
        
        # Combined
        self.fc1_combined = nn.Linear(134, 100)  # 84 (image) + 50 (text) = 134
        self.fc2_combined = nn.Linear(100, num_classes)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, img, text):
        # Image forward pass
        x_img = self.pool(self.relu(self.conv1(img)))
        x_img = self.pool(self.relu(self.conv2(x_img)))
        x_img = x_img.view(-1, 16 * 5 * 5)
        x_img = self.relu(self.fc1_img(x_img))
        x_img = self.relu(self.fc2_img(x_img))
        
        # Text forward pass
        embedded = self.embedding(text)
        embedded_avg = torch.mean(embedded, dim=1)
        x_txt = self.relu(self.fc1_txt(embedded_avg))
        
        # Combined forward pass
        x_combined = torch.cat((x_img, x_txt), dim=1)
        x_combined = self.relu(self.fc1_combined(x_combined))
        x_combined = self.fc2_combined(x_combined)
        x_combined = self.softmax(x_combined)
        
        return x_combined

# Create the model
pretrained_embeddings = glove.vectors
model = CombinedNN(pretrained_embeddings, num_classes=len(classes))


In [3]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # Get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # Generate text descriptions and convert to indices
        descriptions = [generate_text_description(label) for label in labels]
        text_indices = torch.tensor([[glove.stoi[word] for word in desc.split() if word in glove.stoi] for desc in descriptions])

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs, text_indices)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:  
            print(f'[Epoch {epoch + 1}, Mini-batch {i + 1}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')


[Epoch 1, Mini-batch 2000] loss: 2.302
[Epoch 1, Mini-batch 4000] loss: 2.168
[Epoch 1, Mini-batch 6000] loss: 1.900
[Epoch 1, Mini-batch 8000] loss: 1.536
[Epoch 1, Mini-batch 10000] loss: 1.312
[Epoch 1, Mini-batch 12000] loss: 1.227
[Epoch 2, Mini-batch 2000] loss: 1.173
[Epoch 2, Mini-batch 4000] loss: 1.139
[Epoch 2, Mini-batch 6000] loss: 1.110
[Epoch 2, Mini-batch 8000] loss: 1.095
[Epoch 2, Mini-batch 10000] loss: 1.078
[Epoch 2, Mini-batch 12000] loss: 1.050
Finished Training


In [4]:
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        descriptions = [generate_text_description(label) for label in labels]
        text_indices = torch.tensor([[glove.stoi[word] for word in desc.split() if word in glove.stoi] for desc in descriptions])
        outputs = model(images, text_indices)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct / total}%')


Accuracy of the network on the 10000 test images: 59.65%
