In [4]:
pip install torch torchvision einops numpy matplotlib tqdm


Note: you may need to restart the kernel to use updated packages.


In [5]:
import torch
import torch.nn as nn

class ImageTokenizer(nn.Module):
    """Turn an image batch into a sequence of patch embeddings.

    A convolution whose kernel size equals its stride looks at each
    ``patch_size x patch_size`` window exactly once, so every output position
    is the linear projection of one non-overlapping patch.

    Args:
        patch_size: Side length of the square patches.
        in_channels: Number of channels in the input images.
        embed_dim: Size of each patch embedding.
    """

    def __init__(self, patch_size=16, in_channels=3, embed_dim=768):
        super().__init__()
        self.patch_size = patch_size
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Map ``(B, C, H, W)`` images to ``(B, N, D)`` patch tokens."""
        # The unpacking doubles as a rank check: a non 4-D input fails here.
        _batch, _channels, _height, _width = x.shape
        feature_map = self.proj(x)            # (B, D, H / p, W / p)
        token_columns = feature_map.flatten(2)  # (B, D, N)
        return token_columns.transpose(1, 2)    # (B, N, D)


In [6]:
import torch

def apply_rope(x, theta=10000):
    """Apply rotary position embeddings (RoPE) to ``x``.

    The last two axes of ``x`` are read as ``(seq_len, dim)``; any leading axes
    (batch, heads, ...) are broadcast over. Features are grouped into
    consecutive pairs ``(x[2i], x[2i + 1])`` and the pair at position ``p`` is
    rotated by the angle ``p * theta ** (-2i / dim)``. The result keeps the
    shape and feature order of ``x``.

    The previous implementation took sin/cos of the data itself and multiplied
    a ``(seq_len, dim)`` tensor by a ``(seq_len, dim // 2)`` table, which does
    not broadcast for ``dim > 2``.

    Args:
        x: Tensor of shape ``(..., seq_len, dim)`` with even ``dim``.
        theta: Base of the geometric frequency progression.

    Returns:
        Tensor with the same shape as ``x`` (and the same dtype when ``x`` is
        floating point).

    Raises:
        ValueError: If ``dim`` is odd, since features are rotated in pairs.
    """
    seq_len, dim = x.shape[-2:]
    if dim % 2 != 0:
        raise ValueError(f"apply_rope needs an even feature dimension, got {dim}")

    # Build the angle table on the same device as the data.
    positions = torch.arange(seq_len, dtype=torch.float32, device=x.device).unsqueeze(1)
    pair_offsets = torch.arange(0, dim, 2, dtype=torch.float32, device=x.device)
    inv_freq = float(theta) ** (-pair_offsets / dim)   # (dim // 2,)
    angles = positions * inv_freq                      # (seq_len, dim // 2)
    cos, sin = torch.cos(angles), torch.sin(angles)

    x_even, x_odd = x[..., 0::2], x[..., 1::2]
    # Standard 2-D rotation of each (even, odd) pair; stack + flatten
    # re-interleaves the rotated pairs into the original feature order.
    rotated = torch.stack(
        (x_even * cos - x_odd * sin, x_even * sin + x_odd * cos),
        dim=-1,
    ).flatten(-2)
    return rotated.to(x.dtype) if x.is_floating_point() else rotated


In [7]:
class VisionTransformerBlock(nn.Module):
    """Pre-norm transformer encoder block for batch-first token sequences.

    Applies ``x + Attention(LayerNorm(x))`` followed by
    ``x + MLP(LayerNorm(x))``. Inputs and outputs have shape ``(B, N, D)``.

    Args:
        embed_dim: Token embedding size ``D``.
        num_heads: Number of attention heads (must divide ``embed_dim``).
        mlp_ratio: Hidden width of the MLP as a multiple of ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads, mlp_ratio=4):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        # batch_first=True is required: the tokens fed to this block are
        # (B, N, D). With the default (sequence-first) layout the attention
        # would run across the batch axis and mix unrelated samples.
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * mlp_ratio),
            nn.GELU(),
            nn.Linear(embed_dim * mlp_ratio, embed_dim)
        )

    def forward(self, x):
        """Run self-attention and the MLP, each wrapped in a residual connection."""
        # Normalise once and reuse the result as query, key and value.
        normed = self.norm1(x)
        # The attention weights are never used, so skip computing them.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.norm2(x))
        return x


In [8]:
# Classification loss: compares the model's raw class scores against integer labels.
criterion = nn.CrossEntropyLoss()


In [9]:
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

# Preprocessing: resize every image to 128x128 and convert it to a tensor.
# NOTE(review): there is no channel-expansion step here, so FashionMNIST images
# presumably stay single-channel — confirm before feeding a model that expects
# in_channels=3.
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

batch_size = 64  # Define batch size

dataset = datasets.FashionMNIST(root='./Documents/data', train=True, transform=transform, download=True)
# Use the batch_size defined above; the loader previously hard-coded 32, so the
# variable and the batches actually produced disagreed.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

class VisionTransformer(nn.Module):
    """Patch-based image classifier: tokenizer, encoder blocks, mean pooling, linear head.

    Args:
        img_size: Side length of the (square) input images; fixes the number
            of positional embeddings.
        patch_size: Side length of each square patch.
        in_channels: Number of image channels.
        embed_dim: Token embedding size.
        num_heads: Attention heads per block.
        depth: Number of stacked ``VisionTransformerBlock`` layers.
        num_classes: Size of the output logit vector.
    """

    def __init__(self, img_size=128, patch_size=16, in_channels=3, embed_dim=768, num_heads=12, depth=12, num_classes=10):
        super().__init__()
        self.tokenizer = ImageTokenizer(patch_size, in_channels, embed_dim)

        # One learned position vector per patch, drawn from a standard normal.
        patches_per_side = img_size // patch_size
        self.pos_embed = nn.Parameter(torch.randn(1, patches_per_side ** 2, embed_dim))

        encoder_layers = []
        for _ in range(depth):
            encoder_layers.append(VisionTransformerBlock(embed_dim, num_heads))
        self.blocks = nn.ModuleList(encoder_layers)

        self.norm = nn.LayerNorm(embed_dim)
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of images."""
        tokens = self.tokenizer(x) + self.pos_embed
        for layer in self.blocks:
            tokens = layer(tokens)
        # Normalise, then average over the patch axis (global average pooling).
        pooled = self.norm(tokens).mean(dim=1)
        return self.classifier(pooled)

# Define ImageTokenizer and VisionTransformerBlock from earlier code
class ImageTokenizer(nn.Module):
    """Project non-overlapping image patches to embeddings via a strided convolution.

    Args:
        patch_size: Side length of the square patches (used as both kernel
            size and stride, so windows never overlap).
        in_channels: Number of channels in the input images.
        embed_dim: Size of each patch embedding.
    """

    def __init__(self, patch_size=16, in_channels=3, embed_dim=768):
        super().__init__()
        self.patch_size = patch_size
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Map ``(B, C, H, W)`` images to ``(B, N, D)`` tokens, N being the patch count."""
        # Unpacking enforces a 4-D input; anything else raises right here.
        _batch, _channels, _height, _width = x.shape
        patch_grid = self.proj(x)                     # (B, D, H / p, W / p)
        return patch_grid.flatten(2).transpose(1, 2)  # (B, N, D)

class VisionTransformerBlock(nn.Module):
    """Pre-norm transformer encoder block operating on ``(B, N, D)`` tokens.

    Computes ``x + Attention(LayerNorm(x))`` and then
    ``x + MLP(LayerNorm(x))``.

    Args:
        embed_dim: Token embedding size ``D``.
        num_heads: Number of attention heads (must divide ``embed_dim``).
        mlp_ratio: Hidden width of the MLP as a multiple of ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads, mlp_ratio=4):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        # The tokenizer emits batch-first tokens (B, N, D). Without
        # batch_first=True, MultiheadAttention treats axis 0 as the sequence
        # and attends across samples in the batch instead of across patches.
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * mlp_ratio),
            nn.GELU(),
            nn.Linear(embed_dim * mlp_ratio, embed_dim)
        )

    def forward(self, x):
        """Apply attention and MLP sub-layers, each with a residual connection."""
        # One LayerNorm pass serves as query, key and value.
        normed = self.norm1(x)
        # Attention weights are discarded, so do not compute them.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.norm2(x))
        return x


In [8]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Preprocessing pipeline applied to every image:
#   1. resize to 128x128,
#   2. replicate the grayscale channel to 3 channels,
#   3. convert to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize((128, 128)),
        transforms.Grayscale(num_output_channels=3),
        transforms.ToTensor(),
    ]
)

# FashionMNIST train and test splits (downloaded into ./data when missing).
train_dataset, test_dataset = (
    datasets.FashionMNIST(root='./data', train=use_train_split, transform=transform, download=True)
    for use_train_split in (True, False)
)

# Batches of 32; only the training split is shuffled.
train_loader, test_loader = (
    DataLoader(split, batch_size=32, shuffle=should_shuffle)
    for split, should_shuffle in ((train_dataset, True), (test_dataset, False))
)


In [9]:
class ImageTokenizer:
    """Cut image batches into flattened, non-overlapping square patches.

    Args:
        patch_size: Side length of each square patch.
        image_size: Side length of the square images ``num_patches`` is
            computed for. Defaults to 28, the value that was previously
            hard-coded.

    Attributes:
        patch_size: The patch side length.
        image_size: The image side length used for ``num_patches``.
        num_patches: ``(image_size // patch_size) ** 2`` — the patch count for
            an ``image_size x image_size`` image. ``tokenize`` itself derives
            the count from the tensor it is given.
    """

    def __init__(self, patch_size=4, image_size=28):
        self.patch_size = patch_size
        self.image_size = image_size
        self.num_patches = (image_size // patch_size) ** 2

    def tokenize(self, img):
        """
        Convert an image batch into non-overlapping patches.

        Input shape:  (batch, channels, height, width); height and width must
                      be multiples of patch_size.
        Output shape: (batch, num_patches, patch_size^2 * channels), where
                      num_patches = (height // patch_size) * (width // patch_size).
        """
        if len(img.shape) != 4:
            raise ValueError(
                f"expected a (batch, channels, height, width) input, got shape {tuple(img.shape)}"
            )
        return rearrange(
            img,
            "b c (h p1) (w p2) -> b (h w) (p1 p2 c)",
            p1=self.patch_size,
            p2=self.patch_size,
        )

# NOTE(review): the default image_size is 28; pass image_size explicitly if the
# images being tokenized are a different size so num_patches matches.
tokenizer = ImageTokenizer()


In [10]:
class RoPE(nn.Module):
    """Rotary position embedding for sequence-first inputs ``(seq_len, batch, dim)``.

    Features are split into even-indexed and odd-indexed halves; the pair
    ``(x[2i], x[2i + 1])`` at position ``p`` is rotated by the angle
    ``p * theta ** (-2i / dim)``. The output concatenates the rotated even
    components followed by the rotated odd components along the last axis, so
    it has the same shape as the input.

    Args:
        dim: Embedding dimension (stored for reference; the forward pass reads
            the dimension from the input tensor).
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        self.theta = 10000  # Base of the geometric frequency progression

    def forward(self, x):
        """Rotate ``x`` of shape ``(seq_len, batch, dim)`` by position along axis 0."""
        seq_len, batch, dim = x.shape
        assert dim % 2 == 0, "Embedding dimension must be even for RoPE"

        # Build positions and frequencies on the input's device so the module
        # also works when x lives on the GPU.
        pos = torch.arange(seq_len, device=x.device).float()
        theta = self.theta ** (-2 * (torch.arange(dim // 2, device=x.device).float() / dim))

        # Compute angles
        pos_theta = torch.einsum("n,d->nd", pos, theta)  # (seq_len, dim//2)
        # Insert a singleton batch axis: without it the (seq_len, dim//2) table
        # is broadcast against the *batch* axis of x, which either fails or
        # silently rotates by batch index instead of by position.
        sin_pos = torch.sin(pos_theta).unsqueeze(1)  # (seq_len, 1, dim//2)
        cos_pos = torch.cos(pos_theta).unsqueeze(1)  # (seq_len, 1, dim//2)

        # Split embeddings into even and odd indices
        x1, x2 = x[..., 0::2], x[..., 1::2]

        # Apply rotation
        x_rope = torch.cat([x1 * cos_pos - x2 * sin_pos, x1 * sin_pos + x2 * cos_pos], dim=-1)
        return x_rope


In [11]:
class VisionGPT(nn.Module):
    """Patch-token classifier: linear embedding, RoPE, transformer encoder, mean pooling.

    Expects batch-first patch tokens of shape ``(batch, num_patches, patch_dim)``
    and returns logits of shape ``(batch, num_classes)``.

    Args:
        num_patches: Patches per image (kept for interface compatibility; the
            forward pass reads the count from the input).
        embed_dim: Token embedding size.
        num_classes: Number of output logits.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        patch_dim: Features per patch token. Defaults to 16 (a 4x4
            single-channel patch), the value that was previously hard-coded.
    """

    def __init__(self, num_patches=49, embed_dim=128, num_classes=20, num_heads=4, num_layers=4, patch_dim=16):
        super().__init__()
        self.embedding = nn.Linear(patch_dim, embed_dim)  # Patch embedding
        self.rope = RoPE(embed_dim)  # RoPE for positional encoding

        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Classify a batch of patch-token sequences."""
        x = self.embedding(x)    # (batch, patches, embed_dim)
        # RoPE and the encoder (batch_first left at its default) are both
        # sequence-first, so move the patch axis to the front. Previously the
        # batch axis was treated as the sequence while pooling assumed the
        # opposite layout.
        x = x.transpose(0, 1)    # (patches, batch, embed_dim)
        x = self.rope(x)         # Apply RoPE along the patch axis
        x = self.transformer(x)  # Transformer Encoder
        x = x.mean(dim=0)        # Aggregate over patches -> (batch, embed_dim)
        return self.fc(x)


In [12]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build VisionGPT with its default hyperparameters and move it to that device
# (the class must already be defined when this cell runs).
model = VisionGPT()
model = model.to(device)




In [13]:
from einops import rearrange


In [14]:
# Sanity check: tokenize one batch from the training loader and report its shape.
# The last axis of the printed shape is the per-token feature size that the
# model's patch-embedding layer must accept as its input width.
for images, labels in train_loader:
    images = tokenizer.tokenize(images)  # Convert images into tokens
    print("Image tokens shape:", images.shape)  # Debugging step
    break  # Check only the first batch


Image tokens shape: torch.Size([32, 1024, 48])


In [15]:
# Patch grid for a 28x28 image (the native FashionMNIST size) cut into 4x4 patches.
image_size, patch_size = 28, 4

# Patches per side, squared: (28 // 4) * (28 // 4) = 49.
num_patches = (image_size // patch_size) * (image_size // patch_size)
print(f"Number of patches: {num_patches}")  # Should print 49


Number of patches: 49


In [16]:
# Width of each token embedding vector.
embedding_dim = 128  # Define the embedding dimension


In [17]:
# Display the (batch, patches, embedding) sizes implied by the current notebook variables.
(batch_size, num_patches, embedding_dim)


(64, 49, 128)

In [18]:
class VisionGPT(nn.Module):
    """Patch-token classifier: linear embedding, RoPE, transformer encoder, mean pooling.

    Expects batch-first patch tokens of shape ``(batch, num_patches, patch_dim)``
    and returns logits of shape ``(batch, num_classes)``.

    Args:
        num_patches: Patches per image (kept for interface compatibility; the
            forward pass reads the count from the input).
        patch_dim: Features per patch token; must equal the last axis of the
            tokenizer output.
        embed_dim: Token embedding size.
        num_classes: Number of output logits.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
    """

    def __init__(self, num_patches=49, patch_dim=16, embed_dim=128, num_classes=10, num_heads=4, num_layers=4):
        super().__init__()
        self.embedding = nn.Linear(patch_dim, embed_dim)  # Ensure patch_dim matches tokenizer output
        self.rope = RoPE(embed_dim)

        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Classify a batch of patch-token sequences."""
        x = self.embedding(x)    # (batch, patches, embed_dim)
        # RoPE and the encoder (batch_first left at its default) expect the
        # sequence axis first. Transposing makes positions and attention run
        # over patches; before, they ran over the batch axis while pooling
        # assumed a batch-first layout.
        x = x.transpose(0, 1)    # (patches, batch, embed_dim)
        x = self.rope(x)         # Apply RoPE along the patch axis
        x = self.transformer(x)  # Transformer Encoder
        x = x.mean(dim=0)        # Aggregate over patches -> (batch, embed_dim)
        return self.fc(x)


In [19]:
# Classification loss over the model's raw class scores.
criterion = nn.CrossEntropyLoss()
# Adam over the parameters of whichever object `model` is bound to when this cell runs.
# NOTE(review): rebinding `model` afterwards does not update this optimizer — it
# keeps the old parameters, so recreate it after building a new model.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [20]:
import torch
import torch.nn as nn

class VisionGPT(nn.Module):
    """Single-layer patch transformer classifier for square, single-channel images.

    The forward pass cuts each image into ``patch_size x patch_size`` patches,
    embeds every patch linearly, runs one transformer encoder layer over the
    patch sequence, averages over patches and classifies.

    The previous forward pass flattened the whole image to
    ``(batch, image_size ** 2)`` and fed it to a layer expecting ``patch_size ** 2``
    features, which always failed with a shape error.

    Args:
        image_size: Side length of the square input images.
        patch_size: Side length of each square patch; must divide ``image_size``.
        embed_dim: Token embedding size (must be divisible by the 4 attention heads).
        num_classes: Number of output logits.

    Raises:
        ValueError: If ``image_size`` is not a multiple of ``patch_size``.
    """

    def __init__(self, image_size=28, patch_size=4, embed_dim=128, num_classes=10):
        super().__init__()

        if image_size % patch_size != 0:
            raise ValueError(
                f"image_size ({image_size}) must be a multiple of patch_size ({patch_size})"
            )
        self.image_size = image_size
        self.patch_size = patch_size

        # Each patch is patch_size x patch_size pixels of one channel.
        patch_dim = patch_size * patch_size  # 4 * 4 = 16 with the defaults

        # Embedding layer: convert each patch into an embedding of size embed_dim
        self.embedding = nn.Linear(patch_dim, embed_dim)

        # batch_first=True so the layer attends over the patch axis of the
        # (batch, patches, embed_dim) tensor rather than over the batch axis.
        self.transformer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4, batch_first=True)

        # Final classification layer
        self.fc = nn.Linear(embed_dim, num_classes)  # num_classes = 10 for Fashion MNIST

    def forward(self, x):
        """Return logits ``(batch, num_classes)``.

        ``x`` may be ``(batch, 1, H, W)``, ``(batch, H, W)`` or already
        flattened ``(batch, H * W)``, with ``H == W == image_size``.
        """
        batch_size = x.size(0)
        side, patch = self.image_size, self.patch_size
        grid = side // patch  # patches per side

        if x.numel() != batch_size * side * side:
            raise ValueError(
                f"expected {side}x{side} single-channel images, got input of shape {tuple(x.shape)}"
            )

        # (batch, H, W) -> (batch, grid, patch, grid, patch): split rows and
        # columns into (patch index, offset inside the patch).
        x = x.reshape(batch_size, grid, patch, grid, patch)
        # Bring the two patch indices together, then flatten each patch:
        # (batch, grid * grid, patch * patch).
        x = x.permute(0, 1, 3, 2, 4).reshape(batch_size, grid * grid, patch * patch)

        # Pass through the embedding layer
        x = self.embedding(x)  # (batch, patches, embed_dim)

        # Self-attention across the patches of each image
        x = self.transformer(x)

        # Take mean across patches
        x = x.mean(dim=1)

        # Final classification
        x = self.fc(x)
        return x

# Instantiate the model for Fashion MNIST
# NOTE(review): this rebinds `model`; an optimizer built from an earlier
# `model.parameters()` still holds the old parameters and must be recreated.
model = VisionGPT(image_size=28, patch_size=4, embed_dim=128, num_classes=10)


In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Define the SimpleNN model
class SimpleNN(nn.Module):
    """Two-layer fully connected classifier: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of input features per sample (784 by default).
        hidden_size: Width of the hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size=784, hidden_size=128, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)   # input -> hidden
        self.fc2 = nn.Linear(hidden_size, num_classes)  # hidden -> logits

    def forward(self, x):
        """Return unnormalised class scores for inputs of shape ``(batch, input_size)``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

# Preprocessing: convert to a tensor, then normalise with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# FashionMNIST train / test splits (downloaded into ./data when missing).
train_dataset, test_dataset = (
    datasets.FashionMNIST(root='./data', train=use_train_split, transform=transform, download=True)
    for use_train_split in (True, False)
)

# Batches of 64; only the training split is shuffled.
train_loader, test_loader = (
    DataLoader(split, batch_size=64, shuffle=should_shuffle)
    for split, should_shuffle in ((train_dataset, True), (test_dataset, False))
)

# Model, loss and plain SGD optimizer.
model = SimpleNN(input_size=784, hidden_size=128, num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# ---- Training ----
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        # Collapse every image to one feature vector: (batch, 784).
        images = images.flatten(start_dim=1)

        # Standard step: reset gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean batch loss for the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")

# ---- Evaluation on the test split ----
model.eval()
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for inference
    for images, labels in test_loader:
        images = images.flatten(start_dim=1)
        outputs = model(images)
        # Predicted class = index of the largest logit.
        predicted = torch.max(outputs, dim=1).indices
        total += labels.shape[0]
        correct += int(predicted.eq(labels).sum())

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")


Epoch 1/5, Loss: 0.8003367146195125
Epoch 2/5, Loss: 0.5228332801541286
Epoch 3/5, Loss: 0.47346758513626
Epoch 4/5, Loss: 0.4455727555318428
Epoch 5/5, Loss: 0.42688448318858135
Test Accuracy: 83.72%


In [59]:
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Preprocessing: convert to a tensor, then normalise with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# FashionMNIST training split, served in shuffled batches of 64.
train_dataset = datasets.FashionMNIST(
    root='./Documents/data',
    train=True,
    transform=transform,
    download=True,
)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Fresh model, loss and plain SGD optimizer.
model = SimpleNN(input_size=784, hidden_size=128, num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# ---- Training ----
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, labels in train_loader:
        # Collapse every image to one feature vector: (batch, 784).
        images = images.flatten(start_dim=1)

        # Standard step: reset gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean batch loss for the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")


Epoch 1/5, Loss: 0.7992495837559832
Epoch 2/5, Loss: 0.5217999573518981
Epoch 3/5, Loss: 0.4730919840684069
Epoch 4/5, Loss: 0.4464393199633942
Epoch 5/5, Loss: 0.42714410435670475


In [22]:
import torch
import torch.nn as nn

# Dummy batch standing in for 64 single-channel 28x28 images: (64, 1, 28, 28).
batch_size = 64
inputs = torch.randn(batch_size, 1, 28, 28)

# Collapse channel, height and width into one axis per image: (64, 784).
inputs = inputs.flatten(start_dim=1)


In [61]:
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.optim as optim

# Preprocessing: convert to a tensor, then normalise with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# FashionMNIST train / test splits (downloaded into ./data when missing).
train_dataset, test_dataset = (
    datasets.FashionMNIST(root='./data', train=use_train_split, transform=transform, download=True)
    for use_train_split in (True, False)
)

# Batches of 64; only the training split is shuffled.
train_loader, test_loader = (
    DataLoader(split, batch_size=64, shuffle=should_shuffle)
    for split, should_shuffle in ((train_dataset, True), (test_dataset, False))
)

# Fresh SimpleNN instance (the class is defined in an earlier cell).
model = SimpleNN(input_size=784, hidden_size=128, num_classes=10)

# Loss and plain SGD optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# ---- Training ----
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        # Collapse every image to one feature vector: (batch, 784).
        images = images.flatten(start_dim=1)

        optimizer.zero_grad()

        # Forward pass and loss.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")

# ---- Evaluation on the test split ----
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.flatten(start_dim=1)
        outputs = model(images)
        # Predicted class = index of the largest logit.
        predicted = torch.max(outputs, dim=1).indices
        total += labels.shape[0]
        correct += int(predicted.eq(labels).sum())

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")


Epoch 1, Loss: 0.8121033735684494
Epoch 2, Loss: 0.5250133015453688
Epoch 3, Loss: 0.47575459896183725
Epoch 4, Loss: 0.4479793735118563
Epoch 5, Loss: 0.42937870982931114
Test Accuracy: 83.68%
