In [1]:
import torch
import torch.nn as nn
from transformers import ViTFeatureExtractor, ViTForImageClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
from evaluate import load
import numpy as np
from transformers import DefaultDataCollator
from torchvision import transforms
from pynvml import *
import nvidia_smi

from accelerate import Accelerator

In [2]:
# Use the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one GPU, as reported by NVML.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device that was previously hard-coded.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        # The used amount is floor-divided by 1024**2 and reported as whole MB.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Balance nvmlInit() so the NVML session is released even if a query fails.
        nvmlShutdown()

In [4]:
# Record GPU memory usage at this point in the notebook, before the model is created.
print_gpu_utilization()

GPU memory occupied: 452 MB.


In [5]:
# Hugging Face checkpoint id of the pretrained Vision Transformer (ViT) to fine-tune
model_name = 'google/vit-base-patch16-224-in21k'
# run identifier; combined with output_dir when the TrainingArguments are built
run_name = model_name + '_run0'

In [6]:
# build dataset: take the first 10,000 training images and hold out 20% for evaluation
# NOTE(review): a contiguous slice may not cover every class — confirm this subset is intended.
food_dataset = load_dataset("food101", split="train[:10000]")
# A fixed seed keeps the train/test partition identical across kernel restarts.
food_dataset = food_dataset.train_test_split(test_size=0.2, seed=42)

Downloading builder script:   0%|          | 0.00/6.21k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading and preparing dataset food101/default to /home/ref2156/.cache/huggingface/datasets/food101/default/0.0.0/7cebe41a80fb2da3f08fcbef769c8874073a86346f7fb96dc0847d4dfc318295...


Downloading data:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/489k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/75750 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/25250 [00:00<?, ? examples/s]

Dataset food101 downloaded and prepared to /home/ref2156/.cache/huggingface/datasets/food101/default/0.0.0/7cebe41a80fb2da3f08fcbef769c8874073a86346f7fb96dc0847d4dfc318295. Subsequent calls will reuse this data.


In [7]:
# Class names in label-index order, plus the two lookup tables handed to from_pretrained.
# Ids are stored as strings in both directions.
labels = food_dataset["train"].features["label"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [8]:
# Preprocessing configuration (normalisation statistics, input size) for the checkpoint.
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

# Load the checkpoint with a classification head sized to this dataset's label set.
model = ViTForImageClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# Move the weights to the selected device; as the cell's last expression this
# also displays the module structure.
model.to(device)

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



Downloading (…)lve/main/config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_

In [9]:
# define fine-tuning hyper parameters (all consumed when TrainingArguments is built)
epochs = 8                # passed as num_train_epochs
per_dev_batch_size = 32   # used for both the per-device train and eval batch size
output_dir = './vit'      # base of the Trainer output location; combined with run_name
lr = 5e-5                 # passed as learning_rate

In [10]:
# Collator handed to the Trainer to assemble individual examples into batches.
data_collator = DefaultDataCollator()

In [11]:
# define image transformations
normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)

# Target resolution comes from the feature extractor config rather than a literal 224.
# A single "shortest_edge" value is expanded to a square (height, width) pair so that
# Resize always produces fixed-size tensors that can be stacked into a batch.
if "shortest_edge" in feature_extractor.size:
    size = (feature_extractor.size["shortest_edge"],) * 2
else:
    size = (feature_extractor.size["height"], feature_extractor.size["width"])

# Order is kept as before (tensor conversion, normalisation, then resize) so the
# pixel values fed to the model are unchanged for a 224x224 configuration.
img_transforms = transforms.Compose([
    transforms.ToTensor(),
    normalize,
    transforms.Resize(size),
])


# define function to prepare dataset for huggingface implementation
def transform_data(examples):
    """Turn a batch of images into model-ready tensors.

    Each entry of ``examples["image"]`` is converted to RGB and passed through
    ``img_transforms``; the results are stored under ``"pixel_values"`` and the
    ``"image"`` entry is removed. The batch is modified in place and returned.
    """
    pixel_values = []
    for picture in examples["image"]:
        pixel_values.append(img_transforms(picture.convert("RGB")))
    examples["pixel_values"] = pixel_values
    del examples["image"]
    return examples

In [12]:
# Attach transform_data to both splits; presumably applied on the fly when rows are read — see the datasets docs.
food_dataset = food_dataset.with_transform(transform_data)

In [13]:
# Metric names reported after every evaluation, in output order.
_METRIC_NAMES = ("accuracy", "precision", "recall", "f1")
# Metric objects are loaded once and reused instead of being reloaded per evaluation.
_loaded_metrics = {}


def compute_metrics(pred, metrics=None):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: A ``(logits, labels)`` pair (anything that unpacks into two items).
            The predicted class is the argmax over the last axis of ``logits``.
        metrics: Optional mapping from metric name ("accuracy", "precision",
            "recall", "f1") to an object exposing ``compute(predictions=...,
            references=..., ...)``. When omitted, the metrics are loaded with
            ``evaluate.load`` on first use and cached for later calls.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    if metrics is None:
        for metric_name in _METRIC_NAMES:
            if metric_name not in _loaded_metrics:
                # `load` is evaluate.load, the replacement for datasets.load_metric.
                _loaded_metrics[metric_name] = load(metric_name)
        metrics = _loaded_metrics

    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)

    res = {"accuracy": metrics["accuracy"].compute(predictions=predictions, references=labels)["accuracy"]}
    # The remaining metrics are averaged across classes, weighted by class support.
    for metric_name in ("precision", "recall", "f1"):
        res[metric_name] = metrics[metric_name].compute(
            predictions=predictions, references=labels, average="weighted"
        )[metric_name]
    return res

In [14]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the best accuracy when training finishes.
training_args = TrainingArguments(
    # Join with an explicit separator; plain concatenation produced "./vitgoogle/...".
    output_dir=f"{output_dir}/{run_name}",
    # presumably required so the "image" column survives for transform_data — confirm
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=lr,
    per_device_train_batch_size=per_dev_batch_size,
    # gradients from 4 consecutive batches are accumulated per optimizer step
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=per_dev_batch_size,
    num_train_epochs=epochs,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    # must match a key returned by compute_metrics
    metric_for_best_model="accuracy",
)

# Wire the model, data splits, collator and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=food_dataset["train"],
    eval_dataset=food_dataset["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [15]:
# Report total/free/used memory for every GPU visible to NVML.
nvidia_smi.nvmlInit()
try:
    deviceCount = nvidia_smi.nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        device_name = nvidia_smi.nvmlDeviceGetName(handle)
        # The name can come back as bytes; decode it so the report shows plain
        # text instead of a b'...' repr.
        if isinstance(device_name, bytes):
            device_name = device_name.decode("utf-8", errors="replace")
        print("Device {}: {}, Memory : ({:.2f}% free): {}(total), {} (free), {} (used)".format(i, device_name, 100*info.free/info.total, info.total, info.free, info.used))
finally:
    # Always release the NVML session, even if one of the queries above fails.
    nvidia_smi.nvmlShutdown()

Device 0: b'Tesla T4', Memory : (86.47% free): 16106127360(total), 13927645184 (free), 2178482176 (used)
Device 1: b'Tesla T4', Memory : (97.05% free): 16106127360(total), 15631581184 (free), 474546176 (used)
Device 2: b'Tesla T4', Memory : (97.05% free): 16106127360(total), 15631581184 (free), 474546176 (used)
Device 3: b'Tesla T4', Memory : (97.05% free): 16106127360(total), 15631581184 (free), 474546176 (used)


In [16]:
# Run fine-tuning; the returned object exposes the aggregate training metrics via .metrics.
train_results = trainer.train()
# Save the model, then log and save the training metrics under the "train" prefix.
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
# Save the trainer state as the last step.
trainer.save_state()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,4.5541,4.110492,0.487,0.74135,0.487,0.515036
1,3.6802,3.458952,0.7975,0.852971,0.7975,0.803177
2,3.3353,2.991396,0.8815,0.894359,0.8815,0.874625
4,2.7956,2.644207,0.894,0.898884,0.894,0.886704
4,2.5882,2.422663,0.9045,0.907693,0.9045,0.901956
5,2.311,2.278132,0.91,0.912017,0.91,0.908784
6,2.1671,2.207101,0.913,0.915073,0.913,0.912157
7,2.1327,2.193189,0.9155,0.917116,0.9155,0.914674


  


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


***** train metrics *****
  epoch                    =         7.62
  total_flos               = 4405700773GF
  train_loss               =       2.9512
  train_runtime            =   0:20:06.00
  train_samples_per_second =       53.068
  train_steps_per_second   =          0.1


In [18]:
# evaluate() is called without arguments, so it presumably runs on the eval_dataset given to the Trainer.
metrics = trainer.evaluate()
# Log and save the evaluation metrics under the "eval" prefix.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)



***** eval metrics *****
  epoch                   =       7.62
  eval_accuracy           =     0.9155
  eval_f1                 =     0.9147
  eval_loss               =     2.1932
  eval_precision          =     0.9171
  eval_recall             =     0.9155
  eval_runtime            = 0:00:21.15
  eval_samples_per_second =     94.521
  eval_steps_per_second   =      0.756
