#Step 1: Setting up the Environment

In [1]:
!pip install torch torchvision
!pip install transformers
!pip install huggingface_hub
!pip install datasets



In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms, models
from torchvision.datasets import ImageFolder
from torchvision.transforms import Compose, Resize, ToTensor, Normalize


In [12]:
# Define transformations
transform = Compose([
    Resize((224, 224)),  # Resize images to 224x224
    ToTensor(),  # Convert images to tensor
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize images
])

# Setup the ImageNet dataset
imagenet_data = ImageFolder(root='/Users/tashapais/Downloads/imagenet-1k/train/', transform=transform)

# Setup the DataLoader
train_loader = DataLoader(imagenet_data, batch_size=32, shuffle=True, num_workers=4)


In [34]:
class VimBlock(nn.Module):
    def __init__(self, D, M, N, E):
        super(VimBlock, self).__init__()
        self.norm = nn.LayerNorm(D)
        self.linear1 = nn.Linear(D, M)
        self.linear2 = nn.Linear(M, E)
        self.conv1d = nn.Conv1d(E, N, kernel_size=3, padding=1)
        self.silu = nn.SiLU()
        self.linearA = nn.Linear(N, M)
        self.linearB = nn.Linear(N, N)
        self.linearC = nn.Linear(N, E)
        self.softmax = nn.Softmax(dim=-1)
        self.parameterA = nn.Parameter(torch.randn(E, N))

    def forward(self, x):
        x = self.norm(x)
        x = self.linear1(x)
        x = self.linear2(x)

        # Ensure x is [batch, E] at this point
        x = x.unsqueeze(2)  # Change shape to [batch, E, 1]
        
        # Duplicate the last dimension (length) to simulate bidirectional processing
        x = torch.cat([x, x], dim=2)  # Now x is [batch, E, 2]

        # Now x is [batch, E, 2], make sure to permute dimensions for Conv1d
        x = x.permute(0, 2, 1)  # Correctly rearrange to [batch, 2, E]

        x = self.conv1d(x.permute(0, 2, 1))  # Rearrange to [batch, E, 2] for Conv1d
        x = self.silu(x)
        
        A = self.linearA(x)
        B = self.linearB(x)
        C = self.linearC(x)
        
        delta_o = self.softmax(A + self.parameterA.unsqueeze(0))
        B = delta_o * B
        
        return C * (A + B)

# Test the model to confirm the changes
model = VimBlock(D=224, M=1024, N=512, E=256)
x = torch.rand(10, 224)  # Example input tensor
output = model(x)
print("Output shape:", output.shape)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (5120x2 and 512x1024)

In [30]:
# Step 5: Training Setup
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
train_loader = DataLoader(imagenet_data, batch_size=32, shuffle=True)

def train_epoch(model, train_loader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)





In [31]:
# Step 6: Train the Model
for epoch in range(10):  # Adjust the number of epochs as needed
    loss = train_epoch(model, train_loader, criterion, optimizer)
    print(f"Epoch {epoch+1}, Loss: {loss}")

RuntimeError: Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor