In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Ask kagglehub for the Intel image-classification dataset and keep whatever
# it returns under a dataset-specific name.
puneet6060_intel_image_classification_path = kagglehub.dataset_download('puneet6060/intel-image-classification')

print('Data source import complete.')


Data source import complete.


In [None]:
import kagglehub

# Download latest version
# NOTE(review): `path` is only printed here; the training cells below use a
# hard-coded `data_dir` string rather than this value.
path = kagglehub.dataset_download("puneet6060/intel-image-classification")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/intel-image-classification


In [None]:
import os
import torch
import torchvision
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from sklearn.metrics import classification_report, confusion_matrix
from transformers import ViTModel, ViTFeatureExtractor
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Set device: use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Paths
data_dir = "/kaggle/input/intel-image-classification"

# Transforms
# The google/vit-base-patch16-224-in21k checkpoint expects 224x224 inputs
# normalised with mean = std = 0.5 per channel (its image-processor config).
# ToTensor alone leaves pixels in [0, 1], so add the matching Normalize step.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Dataset and Dataloader
# Each split lives in a doubly nested folder (seg_train/seg_train, seg_test/seg_test).
train_dataset = ImageFolder(os.path.join(data_dir, 'seg_train', 'seg_train'), transform=transform)
test_dataset = ImageFolder(os.path.join(data_dir, 'seg_test', 'seg_test'), transform=transform)

# Only the training loader shuffles; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

# Load pre-trained ViT
class HybridViTModel(nn.Module):
    """Pretrained ViT followed by a BiLSTM and attention-weighted pooling.

    The ViT token sequence is re-encoded by a bidirectional LSTM, a learned
    scalar score per token is soft-maxed into pooling weights, and the
    weighted sum of LSTM outputs is passed to a linear classifier.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        # 256 hidden units per direction -> 512 features per token.
        self.bilstm = nn.LSTM(
            input_size=768,
            hidden_size=256,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        self.attention = nn.Linear(512, 1)
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        token_states = self.vit(pixel_values=x).last_hidden_state  # (B, 197, 768)
        encoded, _ = self.bilstm(token_states)
        # One score per token, normalised over the token axis.
        scores = self.attention(encoded).squeeze(-1)
        weights = torch.softmax(scores, dim=1).unsqueeze(-1)
        pooled = (weights * encoded).sum(dim=1)
        return self.classifier(pooled)

model = HybridViTModel().to(device)

# Training components
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=3):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Each batch is moved to the notebook-level ``device``, the loss from
    ``criterion`` is back-propagated and ``optimizer`` takes one step. The
    mean batch loss is printed at the end of every epoch.
    """
    model.train()
    for epoch_idx in range(epochs):
        loss_sum = 0.0
        for batch_images, batch_labels in tqdm(train_loader):
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            batch_loss = criterion(model(batch_images), batch_labels)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            loss_sum += batch_loss.item()
        mean_loss = loss_sum / len(train_loader)
        print(f"Epoch [{epoch_idx+1}/{epochs}], Loss: {mean_loss:.4f}")

# Evaluation
def evaluate_model(model, test_loader):
    """Print a classification report and confusion matrix for ``test_loader``.

    Predictions are the arg-max of the model's logits. Class names for the
    report are read from the notebook-level ``test_dataset``.
    """
    model.eval()
    predicted, actual = [], []
    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            logits = model(images.to(device))
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            actual.extend(labels.numpy())
    print("Classification Report:")
    print(classification_report(actual, predicted, target_names=test_dataset.classes))
    print("Confusion Matrix:")
    print(confusion_matrix(actual, predicted))

train_model(model, train_loader, criterion, optimizer, epochs=3)
evaluate_model(model, test_loader)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

100%|██████████| 439/439 [07:31<00:00,  1.03s/it]


Epoch [1/3], Loss: 0.4901


100%|██████████| 439/439 [07:34<00:00,  1.03s/it]


Epoch [2/3], Loss: 0.1389


 48%|████▊     | 209/439 [03:37<03:59,  1.04s/it]


KeyboardInterrupt: 

In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from transformers import ViTModel
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Device setup: use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Paths
data_dir = "/kaggle/input/intel-image-classification"

# Transforms
# The google/vit-base-patch16-224-in21k checkpoint expects inputs normalised
# with mean = std = 0.5 per channel; ToTensor alone leaves pixels in [0, 1].
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Dataset and Dataloader
train_dataset = ImageFolder(os.path.join(data_dir, 'seg_train', 'seg_train'), transform=transform)
test_dataset = ImageFolder(os.path.join(data_dir, 'seg_test', 'seg_test'), transform=transform)

# Only the training loader shuffles; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

# Label Smoothing Loss
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; every other class
    receives ``smoothing / (classes - 1)``.
    """

    def __init__(self, classes, smoothing=0.1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes

    def forward(self, pred, target):
        """Return the batch-mean smoothed loss for logits ``pred`` and class indices ``target``."""
        log_probs = pred.log_softmax(dim=-1)
        # Start every entry at the off-target value, then overwrite the true
        # class column with the confidence mass.
        off_value = self.smoothing / (self.cls - 1)
        soft_targets = torch.full_like(log_probs, off_value)
        soft_targets.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = (-soft_targets * log_probs).sum(dim=-1)
        return per_sample.mean()

# HTHTA-ViT Model
class NovelHTHTAViT(nn.Module):
    """ViT whose patch tokens pass through a BiLSTM and self-attention.

    The projected CLS embedding is concatenated with the mean-pooled,
    attention-refined patch sequence and fed to a linear classifier.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.bilstm = nn.LSTM(
            input_size=768,
            hidden_size=256,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        self.multihead_attn = nn.MultiheadAttention(embed_dim=512, num_heads=4, batch_first=True)
        self.norm = nn.LayerNorm(512)
        self.dropout = nn.Dropout(0.2)
        self.cls_proj = nn.Linear(768, 512)
        self.classifier = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        hidden = self.vit(pixel_values=x).last_hidden_state  # (B, 197, 768)
        # Token 0 is used as the CLS embedding, the rest as patch tokens.
        cls_embedding, patch_embeddings = hidden[:, 0], hidden[:, 1:]

        seq, _ = self.bilstm(patch_embeddings)  # (B, 196, 512)
        attended, _ = self.multihead_attn(seq, seq, seq)  # self-attention
        # Residual connection + layer norm, then average over the patch axis.
        patch_summary = self.norm(attended + seq).mean(dim=1)  # (B, 512)

        cls_summary = self.cls_proj(cls_embedding)
        fused = torch.cat([cls_summary, patch_summary], dim=1)  # (B, 1024)
        return self.classifier(self.dropout(fused))

# Initialize model, criterion, optimizer
model = NovelHTHTAViT().to(device)
criterion = LabelSmoothingLoss(classes=6, smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Training function
def train_model(model, train_loader, criterion, optimizer, scheduler, epochs=3):
    """Train ``model`` for ``epochs`` passes, stepping ``scheduler`` once per epoch.

    Batches are moved to the notebook-level ``device``; the mean batch loss
    of each epoch is printed after the scheduler step.
    """
    model.train()
    for epoch_idx in range(epochs):
        loss_sum = 0.0
        for batch_images, batch_labels in tqdm(train_loader):
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            batch_loss = criterion(model(batch_images), batch_labels)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            loss_sum += batch_loss.item()
        scheduler.step()
        mean_loss = loss_sum / len(train_loader)
        print(f"Epoch [{epoch_idx+1}/{epochs}], Loss: {mean_loss:.4f}")

# Evaluation function
def evaluate_model(model, test_loader):
    """Print a classification report and confusion matrix for ``test_loader``.

    Predictions are the arg-max of the model's logits. Class names for the
    report are read from the notebook-level ``test_dataset``.
    """
    model.eval()
    predicted, actual = [], []
    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            logits = model(images.to(device))
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            actual.extend(labels.numpy())
    print("\nClassification Report:\n")
    print(classification_report(actual, predicted, target_names=test_dataset.classes))
    print("Confusion Matrix:")
    print(confusion_matrix(actual, predicted))

# Run training and evaluation
train_model(model, train_loader, criterion, optimizer, scheduler, epochs=3)
evaluate_model(model, test_loader)


100%|██████████| 439/439 [07:45<00:00,  1.06s/it]


Epoch [1/3], Loss: 0.6375


100%|██████████| 439/439 [07:44<00:00,  1.06s/it]


Epoch [2/3], Loss: 0.5552


100%|██████████| 439/439 [07:44<00:00,  1.06s/it]


Epoch [3/3], Loss: 0.5267


100%|██████████| 94/94 [00:33<00:00,  2.80it/s]


Classification Report:

              precision    recall  f1-score   support

   buildings       0.96      0.94      0.95       437
      forest       1.00      1.00      1.00       474
     glacier       0.90      0.93      0.91       553
    mountain       0.94      0.89      0.91       525
         sea       0.97      0.99      0.98       510
      street       0.95      0.97      0.96       501

    accuracy                           0.95      3000
   macro avg       0.95      0.95      0.95      3000
weighted avg       0.95      0.95      0.95      3000

Confusion Matrix:
[[412   0   1   0   1  23]
 [  0 474   0   0   0   0]
 [  0   1 512  29   8   3]
 [  0   1  53 466   4   1]
 [  1   0   3   2 503   1]
 [ 17   0   0   0   0 484]]





In [None]:
import torch
import torch.nn as nn
from transformers import ViTModel
import torch.nn.functional as F

class NovelHTHTAViT(nn.Module):
    """ViT whose patch tokens pass through a BiLSTM and self-attention.

    The projected CLS embedding is concatenated with the mean-pooled,
    attention-refined patch sequence and fed to a linear classifier.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.bilstm = nn.LSTM(
            input_size=768,
            hidden_size=256,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

        # Self-attention over the BiLSTM outputs, with norm and dropout.
        self.multihead_attn = nn.MultiheadAttention(embed_dim=512, num_heads=4, batch_first=True)
        self.norm = nn.LayerNorm(512)
        self.dropout = nn.Dropout(0.2)

        # Projects the CLS embedding to the width of the pooled patch features.
        self.cls_proj = nn.Linear(768, 512)

        # Classifier over the two concatenated 512-wide feature vectors.
        self.classifier = nn.Linear(512 * 2, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        hidden = self.vit(pixel_values=x).last_hidden_state  # (B, 197, 768)
        cls_embedding, patch_embeddings = hidden[:, 0], hidden[:, 1:]

        seq, _ = self.bilstm(patch_embeddings)  # (B, 196, 512)
        attended, _ = self.multihead_attn(seq, seq, seq)
        # Residual + norm, then average over the patch axis.
        patch_summary = self.norm(attended + seq).mean(dim=1)  # (B, 512)

        # Fuse CLS and pooled patch features.
        cls_summary = self.cls_proj(cls_embedding)
        fused = torch.cat([cls_summary, patch_summary], dim=1)  # (B, 1024)
        return self.classifier(self.dropout(fused))


In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Optional: Label smoothing
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; every other class
    receives ``smoothing / (classes - 1)``.
    """

    def __init__(self, classes, smoothing=0.1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes

    def forward(self, pred, target):
        """Return the batch-mean smoothed loss for logits ``pred`` and class indices ``target``."""
        log_probs = pred.log_softmax(dim=-1)
        # Off-target mass everywhere, confidence mass at the true class.
        off_value = self.smoothing / (self.cls - 1)
        soft_targets = torch.full_like(log_probs, off_value)
        soft_targets.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = (-soft_targets * log_probs).sum(dim=-1)
        return per_sample.mean()

criterion = LabelSmoothingLoss(classes=6, smoothing=0.1)


In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from transformers import ViTModel
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import random

# Set seed
def set_seed(seed=42):
    """Seed PyTorch, NumPy and Python's ``random`` module with ``seed``."""
    for seeder in (torch.manual_seed, np.random.seed, random.seed):
        seeder(seed)

set_seed()

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Transforms (224x224 compatible with ViT) ===
# The google/vit-base-patch16-224-in21k checkpoint expects inputs normalised
# with mean = std = 0.5 per channel; ToTensor alone leaves pixels in [0, 1].
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# === Dataset ===
data_dir = "/kaggle/input/intel-image-classification"
train_dataset = ImageFolder(os.path.join(data_dir, 'seg_train', 'seg_train'), transform=transform)
test_dataset = ImageFolder(os.path.join(data_dir, 'seg_test', 'seg_test'), transform=transform)
# Only the training loader shuffles; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

# === Label Smoothing Loss ===
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; every other class
    receives ``smoothing / (classes - 1)``.
    """

    def __init__(self, classes, smoothing=0.1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes

    def forward(self, pred, target):
        """Return the batch-mean smoothed loss for logits ``pred`` and class indices ``target``."""
        log_probs = pred.log_softmax(dim=-1)
        # Off-target mass everywhere, confidence mass at the true class.
        off_value = self.smoothing / (self.cls - 1)
        soft_targets = torch.full_like(log_probs, off_value)
        soft_targets.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample = (-soft_targets * log_probs).sum(dim=-1)
        return per_sample.mean()

# === Mixup Augmentation ===
def mixup_data(x, y, alpha=1.0):
    """Blend each sample in a batch with a randomly chosen partner (mixup).

    Args:
        x: input batch; the first dimension indexes samples.
        y: labels aligned with ``x``.
        alpha: parameter of the ``Beta(alpha, alpha)`` distribution the mixing
            coefficient is drawn from. Values ``<= 0`` disable mixing.

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x = lam * x + (1 - lam) * x[perm]``,
        ``y_a`` is ``y``, ``y_b`` is ``y[perm]`` and ``lam`` is the coefficient.
    """
    # np.random.beta rejects non-positive parameters, so alpha <= 0 means
    # "mixup off": lam = 1 keeps every sample unchanged.
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    batch_size = x.size(0)
    # Drawn on the CPU and then moved, as before, so the random stream used
    # for the permutation does not change.
    index = torch.randperm(batch_size).to(x.device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Combine the losses against both mixup label sets, weighted by ``lam`` and ``1 - lam``."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

# === HTHTA-ViT++ Model ===
class HTHTAViTPlus(nn.Module):
    """ViT with a BiGRU + self-attention branch over the patch tokens.

    The projected CLS embedding (256-d) is concatenated with a bottlenecked,
    mean-pooled summary of the attended patch sequence (256-d) and classified
    by a single linear layer.
    """

    def __init__(self, num_classes=6):
        super().__init__()
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

        # Freeze early ViT layers: any parameter whose name contains one of
        # these markers stops receiving gradients.
        frozen_markers = ("encoder.layer.0", "embeddings")
        for param_name, param in self.vit.named_parameters():
            if any(marker in param_name for marker in frozen_markers):
                param.requires_grad = False

        self.gru = nn.GRU(768, 256, batch_first=True, bidirectional=True)
        self.attn = nn.MultiheadAttention(embed_dim=512, num_heads=4, batch_first=True)
        self.norm = nn.LayerNorm(512)
        self.bottleneck = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
        )
        self.cls_proj = nn.Linear(768, 256)
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        hidden = self.vit(pixel_values=x).last_hidden_state
        # Token 0 is used as the CLS embedding, the rest as patch tokens.
        cls_embedding, patch_embeddings = hidden[:, 0], hidden[:, 1:]

        seq, _ = self.gru(patch_embeddings)
        attended, _ = self.attn(seq, seq, seq)
        # Residual + norm, then average over the patch axis.
        patch_summary = self.norm(attended + seq).mean(dim=1)

        cls_features = self.cls_proj(cls_embedding)
        patch_features = self.bottleneck(patch_summary)
        return self.classifier(torch.cat([cls_features, patch_features], dim=1))

# === Initialize model and components ===
model = HTHTAViTPlus().to(device)
criterion = LabelSmoothingLoss(classes=6, smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)

# === Training Loop with Mixup ===
def train_model(model, loader, criterion, optimizer, scheduler, epochs=5):
    """Train ``model`` with mixup for ``epochs`` passes over ``loader``.

    Every batch is moved to the notebook-level ``device``, blended with
    ``mixup_data`` and scored with ``mixup_criterion``. ``scheduler`` is
    stepped once per epoch and the mean batch loss is printed.
    """
    model.train()
    for epoch_idx in range(epochs):
        loss_sum = 0
        for batch_images, batch_labels in tqdm(loader):
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            mixed_images, labels_a, labels_b, lam = mixup_data(batch_images, batch_labels)
            logits = model(mixed_images)
            batch_loss = mixup_criterion(criterion, logits, labels_a, labels_b, lam)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            loss_sum += batch_loss.item()
        scheduler.step()
        mean_loss = loss_sum / len(loader)
        print(f"Epoch {epoch_idx+1} | Loss: {mean_loss:.4f}")

# === Evaluation with Test-Time Augmentation (TTA) ===
def tta_predict(model, image):
    """Average softmax probabilities over an input and its horizontal mirror.

    Args:
        model: classifier returning logits of shape ``(N, num_classes)``.
        image: a single image ``(C, H, W)`` or a batch ``(B, C, H, W)``.

    Returns:
        Probabilities of shape ``(num_classes,)`` for a single image, or
        ``(B, num_classes)`` for a batch.
    """
    model.eval()
    single = image.dim() == 3
    batch = image.unsqueeze(0) if single else image
    # Stack the originals and their mirrored copies (dim 3 is the width axis)
    # so both views share one forward pass.
    views = torch.cat([batch, torch.flip(batch, dims=[3])], dim=0).to(device)
    with torch.no_grad():
        probs = F.softmax(model(views), dim=1)
    # Rows [0, B) are the originals, rows [B, 2B) the flips; average the pair.
    probs = probs.reshape(2, batch.size(0), -1).mean(dim=0)
    return probs[0] if single else probs

def evaluate_model(model, loader):
    """Evaluate ``model`` on ``loader`` with flip TTA and print the metrics.

    TTA runs one forward pass per batch rather than one per image. Class names
    for the report are taken from ``loader.dataset.classes`` when available,
    so the report matches whichever loader is passed in.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for images, labels in tqdm(loader):
            probs = tta_predict(model, images)
            all_preds.extend(probs.argmax(dim=1).cpu().numpy())
            all_labels.extend(labels.numpy())
    # Without class names, classification_report falls back to label indices.
    class_names = getattr(getattr(loader, "dataset", None), "classes", None)
    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=class_names))
    print("Confusion Matrix:")
    print(confusion_matrix(all_labels, all_preds))

# === Run training and evaluation ===
train_model(model, train_loader, criterion, optimizer, scheduler, epochs=5)
evaluate_model(model, test_loader)


100%|██████████| 439/439 [07:17<00:00,  1.00it/s]


Epoch 1 | Loss: 1.0942


100%|██████████| 439/439 [07:16<00:00,  1.01it/s]


Epoch 2 | Loss: 0.9783


100%|██████████| 439/439 [07:16<00:00,  1.01it/s]


Epoch 3 | Loss: 0.9681


100%|██████████| 439/439 [07:16<00:00,  1.01it/s]


Epoch 4 | Loss: 0.9479


100%|██████████| 439/439 [07:16<00:00,  1.01it/s]


Epoch 5 | Loss: 0.9265


100%|██████████| 94/94 [01:29<00:00,  1.05it/s]


Classification Report:
              precision    recall  f1-score   support

   buildings       0.95      0.95      0.95       437
      forest       0.99      1.00      1.00       474
     glacier       0.94      0.88      0.91       553
    mountain       0.90      0.93      0.91       525
         sea       0.97      0.99      0.98       510
      street       0.96      0.96      0.96       501

    accuracy                           0.95      3000
   macro avg       0.95      0.95      0.95      3000
weighted avg       0.95      0.95      0.95      3000

Confusion Matrix:
[[417   0   0   0   2  18]
 [  0 474   0   0   0   0]
 [  0   3 489  52   8   1]
 [  1   1  29 488   6   0]
 [  1   0   3   2 503   1]
 [ 19   0   0   0   0 482]]





In [None]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Resize CIFAR images to 224x224 and convert them to tensors.
transform_cifar = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# CIFAR-10 train / test splits rooted at ./data (download=True).
cifar10_train = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_cifar)
cifar10_test = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_cifar)

# NOTE(review): these loaders are named `cifar_*`, whereas the later cells
# build and use `cifar10_*` loaders over the same datasets.
cifar_train_loader = DataLoader(cifar10_train, batch_size=32, shuffle=True, num_workers=2)
cifar_test_loader = DataLoader(cifar10_test, batch_size=32, shuffle=False, num_workers=2)


100%|██████████| 170M/170M [00:04<00:00, 42.5MB/s]


In [None]:
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# CIFAR-10 transform (ViT requires 224x224)
transform_cifar = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# CIFAR-10 Datasets (train / test splits rooted at ./data)
cifar10_train = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_cifar)
cifar10_test = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_cifar)

# DataLoaders: batches of 32, only the training loader shuffles
cifar10_train_loader = DataLoader(cifar10_train, batch_size=32, shuffle=True, num_workers=2)
cifar10_test_loader = DataLoader(cifar10_test, batch_size=32, shuffle=False, num_workers=2)

# Class names, read from the training split
cifar10_classes = cifar10_train.classes


In [None]:
# CIFAR-100 Datasets (reuse `transform_cifar`, which must already be defined)
cifar100_train = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform_cifar)
cifar100_test = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform_cifar)

# DataLoaders: batches of 32, only the training loader shuffles
cifar100_train_loader = DataLoader(cifar100_train, batch_size=32, shuffle=True, num_workers=2)
cifar100_test_loader = DataLoader(cifar100_test, batch_size=32, shuffle=False, num_workers=2)

# Class names, read from the training split
cifar100_classes = cifar100_train.classes


In [None]:
model = HTHTAViTPlus(num_classes=10).to(device)  # For CIFAR-10
# model = HTHTAViTPlus(num_classes=100).to(device)  # For CIFAR-100
# model = HTHTAViTPlus(num_classes=200).to(device)  # For Tiny-ImageNet


In [None]:
!pip install transformers ptflops seaborn

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from transformers import ViTModel
from tqdm import tqdm
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from ptflops import get_model_complexity_info

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")




In [None]:
# The google/vit-base-patch16-224-in21k checkpoint expects 224x224 inputs
# normalised with mean = std = 0.5 per channel; ToTensor alone leaves pixels
# in [0, 1], so add the matching Normalize step.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# CIFAR-10
cifar10_train = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
cifar10_test = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
cifar10_train_loader = DataLoader(cifar10_train, batch_size=32, shuffle=True, num_workers=2)
cifar10_test_loader = DataLoader(cifar10_test, batch_size=32, shuffle=False, num_workers=2)

# CIFAR-100
cifar100_train = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
cifar100_test = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)
cifar100_train_loader = DataLoader(cifar100_train, batch_size=32, shuffle=True, num_workers=2)
cifar100_test_loader = DataLoader(cifar100_test, batch_size=32, shuffle=False, num_workers=2)




In [None]:
!wget http://cs231n.stanford.edu/tiny-imagenet-200.zip
!unzip -q tiny-imagenet-200.zip


--2025-05-17 15:45:22--  http://cs231n.stanford.edu/tiny-imagenet-200.zip
Resolving cs231n.stanford.edu (cs231n.stanford.edu)... 171.64.64.64
Connecting to cs231n.stanford.edu (cs231n.stanford.edu)|171.64.64.64|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cs231n.stanford.edu/tiny-imagenet-200.zip [following]
--2025-05-17 15:45:22--  https://cs231n.stanford.edu/tiny-imagenet-200.zip
Connecting to cs231n.stanford.edu (cs231n.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 248100043 (237M) [application/zip]
Saving to: ‘tiny-imagenet-200.zip’


2025-05-17 15:45:24 (105 MB/s) - ‘tiny-imagenet-200.zip’ saved [248100043/248100043]



In [None]:
import os

tiny_train = torchvision.datasets.ImageFolder(root="./tiny-imagenet-200/train", transform=transform)
tiny_test = torchvision.datasets.ImageFolder(root="./tiny-imagenet-200/val", transform=transform)

# The val split is not laid out one-folder-per-class: every image sits in
# val/images/ and its WordNet id is listed in val/val_annotations.txt
# (tab-separated: file name, wnid, bounding box). ImageFolder alone therefore
# assigns all validation images to a single "images" class. Relabel them with
# the training split's class index so train and val labels agree.
val_wnids = {}
with open("./tiny-imagenet-200/val/val_annotations.txt") as annotations:
    for row in annotations:
        fields = row.rstrip("\n").split("\t")
        if len(fields) >= 2:
            val_wnids[fields[0]] = fields[1]

tiny_test.classes = tiny_train.classes
tiny_test.class_to_idx = tiny_train.class_to_idx
tiny_test.samples = [
    (img_path, tiny_train.class_to_idx[val_wnids[os.path.basename(img_path)]])
    for img_path, _ in tiny_test.samples
]
# Keep the aliases ImageFolder exposes in sync with the relabelled samples.
tiny_test.imgs = tiny_test.samples
tiny_test.targets = [label for _, label in tiny_test.samples]

tiny_train_loader = DataLoader(tiny_train, batch_size=32, shuffle=True, num_workers=2)
tiny_test_loader = DataLoader(tiny_test, batch_size=32, shuffle=False, num_workers=2)

tiny_classes = tiny_train.classes


In [None]:
class HTHTAViTPlus(nn.Module):
    """ViT with a BiGRU + self-attention branch over the patch tokens.

    The projected CLS embedding (256-d) is concatenated with a bottlenecked,
    mean-pooled summary of the attended patch sequence (256-d) and classified
    by a single linear layer.

    Args:
        num_classes: number of output logits. Defaults to 6, matching the
            earlier definition of this class in the notebook, so
            ``HTHTAViTPlus()`` keeps working after this cell has run.
    """

    def __init__(self, num_classes=6):
        super(HTHTAViTPlus, self).__init__()
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.gru = nn.GRU(768, 256, batch_first=True, bidirectional=True)
        self.attn = nn.MultiheadAttention(embed_dim=512, num_heads=4, batch_first=True)
        self.norm = nn.LayerNorm(512)
        self.cls_proj = nn.Linear(768, 256)
        self.bottleneck = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        vit_out = self.vit(pixel_values=x).last_hidden_state
        # Token 0 is used as the CLS embedding, the rest as patch tokens.
        cls_token = vit_out[:, 0]
        tokens = vit_out[:, 1:]
        gru_out, _ = self.gru(tokens)
        attn_out, _ = self.attn(gru_out, gru_out, gru_out)
        # Residual + norm, then average over the patch axis.
        attn_out = self.norm(attn_out + gru_out)
        pooled = attn_out.mean(dim=1)
        fused = torch.cat([self.cls_proj(cls_token), self.bottleneck(pooled)], dim=1)
        return self.classifier(fused)

    def get_attention_weights(self, x):
        """Return the self-attention weights over the patch tokens of ``x``.

        This is the second value returned by ``nn.MultiheadAttention`` called
        with its defaults, i.e. one (query x key) map per sample, already
        averaged over the heads.
        """
        vit_out = self.vit(pixel_values=x).last_hidden_state
        tokens = vit_out[:, 1:]
        gru_out, _ = self.gru(tokens)
        _, attn_weights = self.attn(gru_out, gru_out, gru_out)
        return attn_weights


In [None]:
def train_model(model, loader, criterion, optimizer, scheduler, epochs=5):
    """Train ``model`` for ``epochs`` passes, stepping ``scheduler`` once per epoch.

    Batches are moved to the notebook-level ``device``; the mean batch loss
    of each epoch is printed after the scheduler step.
    """
    model.train()
    for epoch_idx in range(epochs):
        loss_sum = 0
        for batch_images, batch_labels in tqdm(loader):
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            batch_loss = criterion(model(batch_images), batch_labels)
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            loss_sum += batch_loss.item()
        scheduler.step()
        mean_loss = loss_sum / len(loader)
        print(f"Epoch {epoch_idx+1} | Loss: {mean_loss:.4f}")

def evaluate_model(model, loader, class_names):
    """Evaluate ``model`` on ``loader`` and report its metrics.

    Prints a classification report, draws the confusion matrix as an
    annotated heatmap labelled with ``class_names``, then prints accuracy
    and macro-averaged F1.
    """
    model.eval()
    y_pred, y_true = [], []
    with torch.no_grad():
        for images, labels in tqdm(loader):
            logits = model(images.to(device))
            y_pred.extend(logits.argmax(dim=1).cpu().numpy())
            y_true.extend(labels.numpy())

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=class_names))
    print("Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    heatmap_options = dict(annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
    sns.heatmap(matrix, **heatmap_options)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    print(f"\nAccuracy: {accuracy:.4f}, Macro F1 Score: {macro_f1:.4f}")


In [None]:
def analyze_flops(model):
    """Print the model's multiply-accumulate count and parameter count.

    Uses ptflops' ``get_model_complexity_info`` on a single 3x224x224 input.
    """
    import contextlib

    # torch.cuda.device(0) cannot be entered on a machine without CUDA. The
    # rest of the notebook falls back to the CPU in that case, so do the same.
    if torch.cuda.is_available():
        device_ctx = torch.cuda.device(0)
    else:
        device_ctx = contextlib.nullcontext()
    with device_ctx:
        macs, params = get_model_complexity_info(
            model, (3, 224, 224), as_strings=True, print_per_layer_stat=False
        )
    # ptflops reports multiply-accumulate operations (MACs), so label them as
    # such rather than as FLOPs.
    print(f"MACs: {macs}")
    print(f"Parameters: {params}")


In [None]:
def visualize_attention(model, loader):
    """Plot the patch-by-patch attention map for the first sample of ``loader``.

    Takes the first image of the first batch, asks the model for its attention
    weights and draws them as a heatmap (rows and columns index patches).
    """
    model.eval()
    sample = next(iter(loader))[0][0].unsqueeze(0).to(device)
    # Visualisation only: no autograd graph is needed.
    with torch.no_grad():
        attn_weights = model.get_attention_weights(sample)[0]
    # nn.MultiheadAttention averages over heads by default, so the map for one
    # sample is already 2-D (patch x patch). Averaging over axis 0 again would
    # collapse it to a vector; only do so if per-head weights were returned.
    if attn_weights.dim() == 3:
        attn_weights = attn_weights.mean(0)
    attn_weights = attn_weights.cpu().numpy()

    plt.figure(figsize=(10, 8))
    sns.heatmap(attn_weights, cmap='viridis')
    plt.title("GRU + Multihead Attention Weights (Patch × Patch)")
    plt.xlabel("Patch Index")
    plt.ylabel("Patch Index")
    plt.show()


In [None]:
# === Choose Dataset ===
# Active selection: CIFAR-10. The commented blocks below are the equivalent
# bindings for the other two datasets.
train_loader = cifar10_train_loader
test_loader = cifar10_test_loader
class_names = cifar10_train.classes
num_classes = 10

# For CIFAR-100:
# train_loader = cifar100_train_loader
# test_loader = cifar100_test_loader
# class_names = cifar100_train.classes
# num_classes = 100

# For Tiny-ImageNet:
# train_loader = tiny_train_loader
# test_loader = tiny_test_loader
# class_names = tiny_train.classes
# num_classes = 200

# === Run Training and Evaluation ===
# A fresh model, loss, optimizer and scheduler are created for this run.
model = HTHTAViTPlus(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)

# Train, report test metrics, then print model complexity and plot attention.
train_model(model, train_loader, criterion, optimizer, scheduler, epochs=5)
evaluate_model(model, test_loader, class_names)
analyze_flops(model)
visualize_attention(model, test_loader)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
100%|██████████| 1563/1563 [27:27<00:00,  1.05s/it]


Epoch 1 | Loss: 0.1651


 12%|█▏        | 183/1563 [03:12<24:21,  1.06s/it]

In [None]:


# For CIFAR-100:
# (The first assignment used to carry a stray leading space, which made the
# whole cell fail with an IndentationError.)
train_loader = cifar100_train_loader
test_loader = cifar100_test_loader
class_names = cifar100_train.classes
num_classes = 100

# For Tiny-ImageNet:
# train_loader = tiny_train_loader
# test_loader = tiny_test_loader
# class_names = tiny_train.classes
# num_classes = 200

# === Run Training and Evaluation ===
# A fresh model, loss, optimizer and scheduler are created for this run.
model = HTHTAViTPlus(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)

train_model(model, train_loader, criterion, optimizer, scheduler, epochs=5)
evaluate_model(model, test_loader, class_names)
analyze_flops(model)
visualize_attention(model, test_loader)

In [None]:


# For Tiny-ImageNet:
# Point the generic loader / class-name variables at the Tiny-ImageNet objects.
train_loader = tiny_train_loader
test_loader = tiny_test_loader
class_names = tiny_train.classes
num_classes = 200

# === Run Training and Evaluation ===
# A fresh model, loss, optimizer and scheduler are created for this run.
model = HTHTAViTPlus(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)

# Train, report test metrics, then print model complexity and plot attention.
train_model(model, train_loader, criterion, optimizer, scheduler, epochs=5)
evaluate_model(model, test_loader, class_names)
analyze_flops(model)
visualize_attention(model, test_loader)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np

# ✅ y_test = true values, y_pred = model predictions
# NOTE(review): neither name is assigned by any cell in this notebook; both
# must be defined before this cell is run, otherwise it stops with NameError.

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Optional: only use MAPE if target values are not 0
# scikit-learn does not raise for zero targets (it returns an arbitrarily
# large value), so a try/except never detected them and a bare `except` would
# also hide unrelated errors. Check for zeros explicitly instead.
if np.any(np.asarray(y_test) == 0):
    mape = None
else:
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100

# Display results
print("🚀 Final Model Performance on Test Data:")
print(f"✅ MAE:  {mae:.2f} MW")
print(f"✅ RMSE: {rmse:.2f} MW")
print(f"✅ R² Score: {r2:.3f}")
if mape is not None:
    print(f"✅ MAPE: {mape:.2f}%")
else:
    print("⚠️  MAPE skipped due to zero values in targets")


NameError: name 'y_test' is not defined

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_model(model, loader, class_names=None):
    """Print a classification report and confusion matrix for ``loader``.

    Args:
        model: classifier returning logits of shape ``(N, num_classes)``.
        loader: iterable of ``(images, labels)`` batches.
        class_names: optional label names for the report. When omitted, the
            notebook-level ``class_names`` variable is used if it exists.
            Accepting it as an argument keeps the three-argument calls
            elsewhere in the notebook working after this cell has run.
    """
    if class_names is None:
        class_names = globals().get("class_names")
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for images, labels in tqdm(loader):
            images = images.to(device)
            outputs = model(images)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.numpy())
    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=class_names))
    print("Confusion Matrix:")
    print(confusion_matrix(all_labels, all_preds))


In [None]:
evaluate_model(model, test_loader)


  0%|          | 0/94 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x768 and 512x6)

In [None]:
class ViTOnlyModel(nn.Module):
    """Baseline: pretrained ViT with a linear classifier on the first token."""

    def __init__(self, num_classes=6):
        super().__init__()
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        hidden = self.vit(pixel_values=x).last_hidden_state
        # Only token 0 (used as the CLS embedding) feeds the classifier.
        return self.classifier(hidden[:, 0])


In [None]:
class ViTOnlyModel(nn.Module):
    """Baseline: pretrained ViT with a linear classifier on the first token."""

    def __init__(self, num_classes=6):
        super().__init__()
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        # Input width must match the CLS token dimension (768).
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        hidden = self.vit(pixel_values=x).last_hidden_state
        cls_embedding = hidden[:, 0]  # (B, 768)
        return self.classifier(cls_embedding)



In [None]:
class ViTGRUFusionModel(nn.Module):
    """ViT + BiGRU variant without the attention branch.

    The projected CLS embedding (256-d) is concatenated with the mean of the
    BiGRU outputs over the patch tokens (512-d).
    """

    def __init__(self, num_classes=6):
        super().__init__()
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.gru = nn.GRU(768, 256, batch_first=True, bidirectional=True)
        self.cls_proj = nn.Linear(768, 256)
        # NOTE(review): the fused vector built in forward() is 256 + 512 = 768
        # wide, but this layer takes 512 inputs -- confirm the intended sizes.
        self.classifier = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        hidden = self.vit(pixel_values=x).last_hidden_state
        cls_embedding, patch_embeddings = hidden[:, 0], hidden[:, 1:]
        seq, _ = self.gru(patch_embeddings)
        patch_summary = seq.mean(dim=1)
        cls_features = self.cls_proj(cls_embedding)
        return self.classifier(torch.cat([cls_features, patch_summary], dim=1))


In [None]:
import matplotlib.pyplot as plt

def plot_accuracy_over_epochs(history_dict):
    """Plot one accuracy curve per entry of ``history_dict``.

    Args:
        history_dict: mapping from a curve label to its per-epoch accuracies;
            epochs are numbered from 1 on the x axis.
    """
    plt.figure()
    for label in history_dict:
        accuracies = history_dict[label]
        epoch_numbers = range(1, len(accuracies) + 1)
        plt.plot(epoch_numbers, accuracies, label=label)
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.title("Accuracy over Epochs")
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
import seaborn as sns
import numpy as np

def plot_confusion_matrix(conf_matrix, class_names, title="Confusion Matrix"):
    """Draw ``conf_matrix`` as an annotated heatmap labelled with ``class_names``."""
    heatmap_options = dict(
        annot=True,
        fmt='d',
        xticklabels=class_names,
        yticklabels=class_names,
        cmap='Blues',
    )
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, **heatmap_options)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(title)
    plt.show()
