<a href="https://colab.research.google.com/github/udayPatil45/Computer_Vision_SOC/blob/main/paper_replicating_solved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. What is paper replication and why is it important in ML?
Answer:
Paper replication involves reproducing the results of a published machine learning research paper using your own code. It's important because:

- It validates the research findings.

- It helps you deeply understand the architecture and methods used.

- It builds skill in translating academic concepts into working code.



2. Example: Building a small ViT (Vision Transformer) from scratch in PyTorch

In [None]:
import torch
from torch import nn

class PatchEmbedding(nn.Module):
    """Convert a batch of images into a sequence of ViT input tokens.

    The image is cut into non-overlapping ``patch_size`` x ``patch_size``
    patches, each patch is linearly projected to ``embedding_dim``, a
    learnable class token is placed in front of the patch tokens, and a
    learnable position embedding is added to every token.

    Args:
        in_channels: Number of channels in the input images.
        patch_size: Side length of each square patch, in pixels.
        embedding_dim: Size of every output token vector.
        image_size: Side length of the (square) images the module is
            built for; it fixes the length of the position embedding table.
    """

    def __init__(self, in_channels=3, patch_size=16, embedding_dim=768, image_size=224):
        super().__init__()
        self.patch_size = patch_size

        # A convolution whose stride equals its kernel size visits each
        # patch exactly once, so it acts as a per-patch linear projection.
        self.projection = nn.Conv2d(
            in_channels,
            embedding_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

        # One position per patch, plus one extra slot for the class token.
        patches_per_side = image_size // patch_size
        sequence_length = patches_per_side ** 2 + 1

        self.cls_token = nn.Parameter(torch.randn(1, 1, embedding_dim))
        self.position_embeddings = nn.Parameter(
            torch.randn(1, sequence_length, embedding_dim)
        )

    def forward(self, x):
        """Embed ``x`` of shape (B, in_channels, H, W).

        Returns a tensor of shape (B, num_patches + 1, embedding_dim). The
        input must produce as many patches as the position embedding table
        was sized for, otherwise the final addition fails.
        """
        batch_size = x.size(0)

        patch_grid = self.projection(x)  # (B, embed_dim, H', W')
        # Collapse the spatial grid and move the embedding axis last.
        patch_tokens = torch.flatten(patch_grid, start_dim=2).permute(0, 2, 1)

        # Share the single learned class token across the batch.
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        sequence = torch.cat([cls_tokens, patch_tokens], dim=1)

        return sequence + self.position_embeddings


3. Transformer Encoder Block

In [None]:
class TransformerEncoderBlock(nn.Module):
    """Pre-norm Transformer encoder block.

    Applies LayerNorm followed by multi-head self-attention, then LayerNorm
    followed by a two-layer GELU MLP. Each of the two sub-layers is wrapped
    in a residual connection.

    Args:
        embed_dim: Size of each token vector (input and output).
        num_heads: Number of attention heads.
        mlp_dim: Hidden width of the MLP.
        dropout: Dropout probability used inside the attention module and
            after each linear layer of the MLP.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        # batch_first=True: inputs and outputs are (batch, sequence, embed_dim).
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Run the block on ``x`` and return a tensor of the same shape."""
        # Normalise once and reuse the result as query, key and value;
        # recomputing LayerNorm for each argument repeats identical work.
        normed = self.norm1(x)
        # The attention weights are never used here, so skip computing them.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.norm2(x))
        return x


4. Complete ViT Model

In [None]:
class ViT(nn.Module):
    """Vision Transformer image classifier.

    Pipeline: patch embedding, then a stack of ``depth`` Transformer encoder
    blocks, then a final LayerNorm, then a linear head applied to the class
    token.

    Args:
        image_size: Side length of the square input images.
        patch_size: Side length of each square patch.
        in_channels: Number of input image channels.
        num_classes: Number of output logits.
        embed_dim: Token embedding size used throughout the model.
        depth: Number of encoder blocks.
        heads: Number of attention heads per block.
        mlp_dim: Hidden width of each block's MLP.
        dropout: Dropout probability passed to every encoder block.
    """

    def __init__(self, image_size=224, patch_size=16, in_channels=3,
                 num_classes=10, embed_dim=768, depth=12, heads=12, mlp_dim=3072, dropout=0.1):
        super().__init__()
        self.patch_embed = PatchEmbedding(
            in_channels=in_channels,
            patch_size=patch_size,
            embedding_dim=embed_dim,
            image_size=image_size,
        )
        # Identical encoder blocks applied one after another.
        self.transformer_blocks = nn.Sequential(
            *(
                TransformerEncoderBlock(embed_dim, heads, mlp_dim, dropout)
                for _ in range(depth)
            )
        )
        self.norm = nn.LayerNorm(embed_dim)
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for images ``x``."""
        tokens = self.patch_embed(x)
        encoded = self.norm(self.transformer_blocks(tokens))
        # Index 0 along the sequence axis is the prepended [CLS] token;
        # only that token feeds the classification head.
        cls_representation = encoded[:, 0]
        return self.classifier(cls_representation)


5. Training Summary (typical setup)

In [None]:
from torch.utils.data import DataLoader
from torchvision import transforms, datasets

# Train on an accelerator when one is available; with the default ViT
# configuration (embed_dim=768, depth=12) CPU-only training is very slow.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Resize to the resolution the model's position embeddings are sized for,
# then convert PIL images to float tensors.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Randomly generated images and labels: enough to exercise the training
# pipeline end to end, not to learn anything meaningful.
train_data = datasets.FakeData(size=1000, image_size=(3, 224, 224),
                               num_classes=10, transform=transform)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# Move the model to the device before building the optimizer so the
# optimizer holds references to the parameters that are actually trained.
model = ViT().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# Training loop
model.train()  # make sure dropout is active, e.g. if the cell is re-run after eval()
for epoch in range(5):
    for X, y in train_loader:
        # Inputs and targets must live on the same device as the model.
        X, y = X.to(device), y.to(device)
        preds = model(X)
        loss = loss_fn(preds, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} complete.")
