In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/


In [None]:
# Download the `kaustubhb999/tomatoleaf` dataset archive with the Kaggle CLI.
# Per the recorded output, the CLI skips the download when a more recent local
# copy of tomatoleaf.zip exists (`--force` overrides that).
!kaggle datasets download -d kaustubhb999/tomatoleaf

tomatoleaf.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import zipfile

# Extract the archive fetched by the Kaggle CLI. Paths are relative to the
# working directory, which is where the CLI wrote the zip and where
# `load_dataset(..., data_dir="tomato")` later looks for the images.
# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile("tomatoleaf.zip", "r") as zip_ref:
    zip_ref.extractall(".")

In [None]:
pip install datasets



In [None]:
import torch
import torch.nn as nn
from torchvision import *
from transformers import ViTModel, ViTForImageClassification, AutoImageProcessor
from datasets import load_dataset

In [None]:
# Load the images found under `tomato/` with the generic `imagefolder` builder.
# The recorded output shows a `train` and a `validation` split being generated.
dataset = load_dataset("imagefolder", data_dir="tomato")

Resolving data files:   0%|          | 0/10001 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1000 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
from datasets import load_metric

# `load_metric` downloads and runs a metric script from the Hub. Opting in
# explicitly with `trust_remote_code=True` silences the warning recorded below
# and is announced there as mandatory from the next major `datasets` release.
metric = load_metric("accuracy", trust_remote_code=True)

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# Display the DatasetDict to inspect split names, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 1000
    })
})

In [None]:
# Class names as stored on the `label` feature of the training split.
labels = dataset["train"].features["label"].names

# Bidirectional lookups between class name and integer id. Comprehensions are
# used so that no loop variables (`i`, `label`) are left in the notebook's
# global namespace, where later cells could pick up stale values by accident.
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for idx, name in enumerate(labels)}

# Spot-check one entry of the mapping.
id2label[2]

'Tomato___Late_blight'

## Preprocessing the data

In [None]:
# Hub checkpoint whose preprocessing configuration is loaded in the next cell.
# NOTE(review): the ViT backbone loaded further down uses
# "google/vit-base-patch16-224" instead - confirm the two are meant to differ.
model_checkpoint = 'google/vit-base-patch16-224-in21k'
# Mini-batch size setting for training.
batch_size = 32

In [None]:
# Load the preprocessing configuration published with the checkpoint
# (resize size, rescale factor, normalization mean/std - see the output below).
image_processor = AutoImageProcessor.from_pretrained(model_checkpoint)
# Echo the processor so its settings are visible in the cell output.
image_processor

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

ViTImageProcessor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [None]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

# Channel-wise normalization driven by the mean/std stored on the image processor.
normalize = Normalize(image_processor.image_mean, image_processor.image_std)

In [None]:
# Derive the resize target (`size`) and the crop size (`crop_size`) from the
# processor's `size` dict, which is keyed either by explicit height/width or
# by shortest edge.
if "height" in image_processor.size:
    size = (image_processor.size["height"], image_processor.size["width"])
    crop_size = size
    max_size = None

elif "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
    crop_size = (size, size)
    max_size = image_processor.size.get("longest_edge")

else:
    # Fail here with a clear message instead of a NameError on `size` /
    # `crop_size` in a later cell.
    raise ValueError(
        f"Unsupported image_processor.size keys: {sorted(image_processor.size)}"
    )

In [None]:
# Training pipeline: random resized crop and random horizontal flip, followed
# by tensor conversion and normalization.
train_transforms = Compose([
    RandomResizedCrop(crop_size),
    RandomHorizontalFlip(),
    ToTensor(),
    normalize,
])

# Validation pipeline: resize and center crop, followed by the same tensor
# conversion and normalization.
val_transforms = Compose([
    Resize(size),
    CenterCrop(crop_size),
    ToTensor(),
    normalize,
])

In [None]:
# Defining preprocess functions

def preprocess_train(example_batch):
    """Add a ``pixel_values`` entry built with the training transforms.

    Every item of ``example_batch["image"]`` is converted to RGB and passed
    through ``train_transforms``. The batch is modified in place and returned.
    """
    rgb_images = (img.convert("RGB") for img in example_batch["image"])
    example_batch["pixel_values"] = list(map(train_transforms, rgb_images))
    return example_batch

def preprocess_val(example_batch):
    """Add a ``pixel_values`` entry built with the validation transforms.

    Every item of ``example_batch["image"]`` is converted to RGB and passed
    through ``val_transforms``. The batch is modified in place and returned.
    """
    pixel_values = []
    for img in example_batch["image"]:
        pixel_values.append(val_transforms(img.convert("RGB")))
    example_batch["pixel_values"] = pixel_values
    return example_batch

In [None]:
# Hold out 10% of the training split for validation. The fixed seed makes the
# partition identical on every Restart & Run All.
# NOTE(review): `dataset["validation"]` is loaded above but not used here -
# confirm whether it is meant to serve as a separate test set.
splits = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_ds = splits['train']
val_ds = splits['test']

In [None]:
# Peek at one training example before any transform is attached.
train_ds[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=256x256>,
 'label': 1}

In [None]:
# Attach the preprocessing functions to the two datasets: the training data
# gets `preprocess_train`, the held-out data gets `preprocess_val`.
train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)

In [None]:
# Access the same example again: the output now also carries the
# `pixel_values` tensor produced by the attached transform.
train_ds[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=256x256>,
 'label': 1,
 'pixel_values': tensor([[[ 0.1137,  0.0431,  0.0431,  ..., -0.1059, -0.1059, -0.0902],
          [ 0.1137,  0.0196,  0.0353,  ..., -0.1137, -0.0667, -0.1686],
          [ 0.1216,  0.0118,  0.0667,  ..., -0.0980, -0.0353, -0.0824],
          ...,
          [ 0.1686,  0.1451,  0.1451,  ...,  0.0196, -0.0353,  0.0196],
          [ 0.1216,  0.1922,  0.2000,  ..., -0.1059, -0.0824,  0.0431],
          [ 0.1451,  0.1765,  0.1216,  ..., -0.1686, -0.0431,  0.0039]],
 
         [[ 0.1137,  0.0431,  0.0431,  ..., -0.0980, -0.0980, -0.0902],
          [ 0.1137,  0.0196,  0.0353,  ..., -0.1059, -0.0588, -0.1686],
          [ 0.1216,  0.0118,  0.0667,  ..., -0.0902, -0.0275, -0.0902],
          ...,
          [ 0.2000,  0.1765,  0.1765,  ...,  0.0353, -0.0196,  0.0353],
          [ 0.1529,  0.2235,  0.2314,  ..., -0.0902, -0.0667,  0.0588],
          [ 0.1765,  0.2078,  0.1529,  ..., -0.1529, -0.0275,  0.0196]]

In [None]:
from torchvision.models import ResNet50_Weights, resnet50

# `pretrained=True` is deprecated in torchvision's multi-weight API. The
# IMAGENET1K_V1 enum selects the same checkpoint file that appears in the
# download log below (resnet50-0676ba61.pth).
cnn_base = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:03<00:00, 28.7MB/s]


In [None]:
# Freeze the CNN: turn off gradient tracking for every parameter of the module.
# The trailing semicolon suppresses the module repr in the cell output.
cnn_base.requires_grad_(False);

In [None]:
# Drop the final fully connected classification layer so the CNN acts as a
# feature extractor. The isinstance guard makes the cell idempotent: once the
# head is gone the last child is no longer an `nn.Linear`, so re-running the
# cell does not strip further layers.
_backbone_children = list(cnn_base.children())
if isinstance(_backbone_children[-1], nn.Linear):
    cnn_base = nn.Sequential(*_backbone_children[:-1])

In [None]:
# Load the ViT backbone without its pooling head. The checkpoint has no pooler
# weights (see the "newly initialized" warning recorded below), and the hybrid
# model only consumes the encoder's hidden states, so a randomly initialised
# pooler would be dead weight.
# NOTE(review): `model_checkpoint` (used for the image processor) names the
# "-in21k" variant - confirm the two checkpoints are meant to differ.
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224", add_pooling_layer=False)

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Connect CNN to ViT
num_classes = 10


class HybridModel(nn.Module):
    """CNN feature extractor whose output is fed to the ViT transformer encoder.

    The CNN feature map is turned into a token sequence, projected to the
    transformer width, prefixed with the ViT's [CLS] token and passed through
    the ViT encoder stack. The encoded [CLS] token is classified.

    The ViT patch embedding is bypassed on purpose: it expects raw pixel
    values of shape (batch, channels, height, width), which is why passing
    flattened CNN features to ``vit_model(...)`` raised
    "not enough values to unpack (expected 4, got 2)".

    Args:
        cnn_backbone: Module mapping images to a (batch, channels, h, w)
            feature map. Defaults to the notebook-level ``cnn_base``.
        vit_encoder: Object exposing ``config.hidden_size``,
            ``embeddings.cls_token``, ``encoder`` and ``layernorm`` (as a
            Hugging Face ``ViTModel`` does). Defaults to the notebook-level
            ``vit_model``.
        cnn_out_channels: Channel count of the CNN feature map
            (2048 for ResNet50).
        n_classes: Number of output classes. Defaults to ``num_classes``.
    """

    def __init__(self, cnn_backbone=None, vit_encoder=None, cnn_out_channels=2048, n_classes=None):
        super().__init__()
        self.cnn_base = cnn_base if cnn_backbone is None else cnn_backbone
        self.vit_model = vit_model if vit_encoder is None else vit_encoder
        hidden_size = self.vit_model.config.hidden_size
        # Maps CNN channels to the transformer's hidden size.
        self.feature_proj = nn.Linear(cnn_out_channels, hidden_size)
        self.classifier = nn.Linear(
            hidden_size, num_classes if n_classes is None else n_classes
        )

    def forward(self, x):
        """Return class logits of shape (batch, n_classes) for image batch ``x``."""
        feature_map = self.cnn_base(x)                              # (B, C, h, w)
        # One token per spatial location: (B, C, h, w) -> (B, h*w, C).
        tokens = feature_map.flatten(start_dim=2).transpose(1, 2)
        tokens = self.feature_proj(tokens)                          # (B, h*w, hidden)
        # Prepend the [CLS] token. No position embedding is added to the CNN tokens.
        cls_token = self.vit_model.embeddings.cls_token.expand(tokens.shape[0], -1, -1)
        tokens = torch.cat((cls_token, tokens), dim=1)              # (B, 1 + h*w, hidden)
        encoded = self.vit_model.encoder(tokens)[0]
        encoded = self.vit_model.layernorm(encoded)
        # Classify from the [CLS] position only, giving one logit vector per
        # image as nn.CrossEntropyLoss expects.
        return self.classifier(encoded[:, 0])

In [None]:
# Instantiate the hybrid CNN + ViT network defined in the previous cell.
model = HybridModel()

In [None]:
# Loss: cross-entropy. Optimizer: Adam over all parameters of `model`, lr 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
import torchvision

In [None]:
from torch.utils.data import DataLoader


def collate_fn(examples):
    """Merge transformed dataset rows into one ``(pixel_values, labels)`` batch.

    Only the tensor and the integer label are collated; the PIL image that
    each row still carries cannot be stacked and is left out.
    """
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    targets = torch.tensor([example["label"] for example in examples])
    return pixel_values, targets


num_epochs = 3
# Train on shuffled mini-batches of `batch_size` rather than one image at a time.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

for epoch in range(num_epochs):
    running_loss = 0.0
    for image_tensor, label_tensor in train_loader:
        optimizer.zero_grad()
        outputs = model(image_tensor)
        # Labels come from the batch itself, not from a leftover global.
        loss = criterion(outputs, label_tensor)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"epoch {epoch + 1}/{num_epochs} - mean loss: {running_loss / max(len(train_loader), 1):.4f}")



ValueError: not enough values to unpack (expected 4, got 2)

In [None]:
# Debug helper: turn one raw PIL image into a (1, C, H, W) tensor.
# `image` is taken explicitly from the first training example so the cell does
# not rely on a variable left over from an earlier kernel session.
image = train_ds[0]["image"]
image_tensor = torchvision.transforms.ToTensor()(image)
image_tensor = image_tensor.unsqueeze(0)

In [None]:


# Inspect the shape of one transformed example. Indexing a single row runs the
# transform on that row only. `train_ds['image']` would instead push the whole
# split through the transform and return a list, which has no `.unsqueeze`.
example = train_ds[0]
image_tensor = example["pixel_values"].unsqueeze(0)
# Integer class id of the example (the variable name is kept from the original cell).
pixel = example["label"]
print(image_tensor.shape)

KeyboardInterrupt: 