In [1]:
import json
import tensorflow as tf
from transformers import ViTImageProcessor
from transformers import AutoImageProcessor, AutoModel
from transformers import ViTForImageClassification
from transformers import TrainingArguments
from transformers import Trainer
from datasets import Dataset, DatasetDict, Features, ClassLabel, Value, Image
from PIL import Image as PILImage
import evaluate
import os
import torch
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("config.json", "r") as f:
    config = json.load(f)

config

{'batch_size': 32, 'width': 256, 'height': 256}

In [5]:
def create_custom_dataset(directory, limit=None):
    # Define lists to hold file paths, images, and labels
    file_paths = []
    images = []
    labels = []

    # Map folder names to label indices
    label_mapping = {
        "Fire": 0,
        "No_Fire": 1,
        "Smoke": 2
    }

    # Traverse the directory
    for label_name, label_idx in label_mapping.items():
        folder_path = os.path.join(directory, label_name)
        if not os.path.isdir(folder_path):
            continue

        # Get the image files in the folder
        for file_name in tqdm(os.listdir(folder_path), desc=f"Loading {folder_path}"):
            file_path = os.path.join(folder_path, file_name)
            if os.path.isfile(file_path):
                # Add the file path
                file_paths.append(file_path)

                # Open the image and add it to the list
                images.append(PILImage.open(file_path).convert("RGB"))

                # Add the label
                labels.append(label_idx)

                if limit is not None and len(file_path) >= limit:
                    break

    print("Creating dictionary")
    # Create a dictionary suitable for Dataset.from_dict
    data_dict = {
        "image_file_path": file_paths,
        "image": images,
        "label": labels
    }

    print("Defining features")
    # Define features for the dataset
    features = Features({
        "image_file_path": Value("string"),
        "image": Image(),
        "label": ClassLabel(names=["Fire", "No_Fire", "Smoke"])
    })

    print("Creating the dataset")
    # Create the Dataset
    return Dataset.from_dict(data_dict, features=features)

# Create datasets for training and testing
train_dataset = create_custom_dataset("data/train", limit=100)
test_dataset = create_custom_dataset("data/test", limit=100)

# Create a DatasetDict to hold the train and test datasets
ds = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Display the dataset
print(ds)

Loading data/train/Fire: 100%|██████████| 5004/5004 [00:03<00:00, 1401.50it/s]
Loading data/train/No_Fire: 100%|██████████| 4590/4590 [00:03<00:00, 1300.94it/s]
Loading data/train/Smoke:   0%|          | 0/4353 [00:00<?, ?it/s]


Creating dictionary
Defining features
Creating the dataset


KeyboardInterrupt: 

In [4]:
# model_path = "pretrained"
# processor = ViTImageProcessor.from_pretrained(model_path)

In [5]:
# Load model directly
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
model = AutoModel.from_pretrained("google/vit-base-patch16-224-in21k")

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [6]:
def process_example(example):
    inputs = processor(example['image'], return_tensors='pt')
    inputs['labels'] = example['labels']
    return inputs

In [7]:
model

ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTSdpaAttention(
          (attention): ViTSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUAct

In [8]:
def process_example(example):
    inputs = processor(example['image'], return_tensors='pt')
    inputs['labels'] = example['labels']
    return inputs


def transform(example_batch):
    # Take a list of PIL images and turn them to pixel values
    inputs = processor([x for x in example_batch['image']], return_tensors='pt')

    # Don't forget to include the labels!
    inputs['labels'] = example_batch['labels']
    return inputs


def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }


metric = evaluate.load("accuracy")
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)

In [9]:
prepared_ds = ds.with_transform(transform)

In [10]:
# labels = ["Fire", "No_Fire", "Smoke"]
labels = ds['train'].features['labels'].names

model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)}
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = TrainingArguments(
  output_dir="./vit-base-beans",
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=4,
  fp16=False,
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=2e-4,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=False,
  report_to='tensorboard',
  load_best_model_at_end=True,
)



In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["validation"],
    tokenizer=processor,
)

In [13]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

  0%|          | 0/2792 [00:00<?, ?it/s]

KeyError: 'image'

In [16]:
ds

DatasetDict({
    train: Dataset({
        features: ['image_file_path', 'label'],
        num_rows: 11157
    })
    validation: Dataset({
        features: ['image_file_path', 'label'],
        num_rows: 2790
    })
    test: Dataset({
        features: ['image_file_path', 'label'],
        num_rows: 2136
    })
})

In [24]:
prepared_ds["train"][0]

KeyError: 'image'

In [22]:
dataset = load_dataset('beans')
dataset

DatasetDict({
    train: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 1034
    })
    validation: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 133
    })
    test: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 128
    })
})

In [25]:
dataset["train"][0]

{'image_file_path': '/home/albert/.cache/huggingface/datasets/downloads/extracted/967f0d9f61a7a8de58892c6fab6f02317c06faf3e19fba6a07b0885a9a7142c7/train/angular_leaf_spot/angular_leaf_spot_train.0.jpg',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x500>,
 'labels': 0}