In [3]:
# Load the IPython extensions used by this notebook: autoreload and the
# TensorBoard magic.
%load_ext autoreload
%load_ext tensorboard
# Autoreload mode 2, so edits to imported local modules (e.g. custom_vit) are
# picked up without restarting the kernel.
%autoreload 2

In [None]:
# Debug aid only: leave this cell unexecuted for normal runs.
# Sets CUDA_LAUNCH_BLOCKING=1 in the process environment. The cell sits ahead
# of the torch import cell so the variable is already set when CUDA is first used.
# NOTE(review): expected to make CUDA errors surface at the failing call at the
# cost of speed; confirm against the CUDA / PyTorch documentation.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [4]:
import bisect
import gc
import math
import random
from copy import deepcopy

import evaluate
import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import AutoProcessor, Trainer, TrainingArguments, ViTForImageClassification as RealViT

from custom_vit import ViTForImageClassification

In [5]:
# Reference model: the pretrained ViT-B/16 classifier, loaded in bfloat16.
base_model = RealViT.from_pretrained(
    "google/vit-base-patch16-224",
    torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
)
# Move it to the GPU, then switch it to evaluation mode.
base_model = base_model.to("cuda")
base_model = base_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [6]:
# Build the custom ViT from a copy of the pretrained model's config, placed on
# the GPU in bfloat16.
new_model = ViTForImageClassification(deepcopy(base_model.config)).to(device='cuda', dtype=torch.bfloat16)

# Seed the new model from the pretrained weights. Parameter names containing
# one of the fragments below are looked up under the corresponding pretrained
# encoder layer; every other name is looked up unchanged.
layer_name_map = {
    ".input_layer.": ".layer.0.",
    ".output_layer.": ".layer.11.",
    ".middle_layer.": ".layer.1.",
}
base_model_named_parameter_dict = dict(base_model.named_parameters())

# Copy in place under no_grad() instead of going through `.data`, so the
# writes are not recorded by autograd and no tensor internals are bypassed.
with torch.no_grad():
    for new_model_param_key, new_model_param_val in new_model.named_parameters():
        if "time_embedding" in new_model_param_key:
            # No pretrained counterpart exists; keep the fresh initialization.
            continue

        mapped_key = new_model_param_key
        for new_fragment, base_fragment in layer_name_map.items():
            if new_fragment in new_model_param_key:
                mapped_key = new_model_param_key.replace(new_fragment, base_fragment)
                break

        if mapped_key not in base_model_named_parameter_dict:
            raise KeyError(
                f"no pretrained parameter {mapped_key!r} to initialize {new_model_param_key!r}"
            )
        new_model_param_val.copy_(base_model_named_parameter_dict[mapped_key])

In [7]:
# prompt: write a function that returns total memory footprint of a pytorch module

def get_model_size(model):
    """Return the number of bytes occupied by a module's parameters.

    Only the tensors yielded by ``model.parameters()`` are counted; anything
    else the module holds is not included in the total.

    Args:
        model: Object exposing ``parameters()``, whose items provide
            ``numel()`` and ``element_size()`` (e.g. a PyTorch module).

    Returns:
        The sum, over all parameters, of element count times bytes per
        element.
    """
    return sum(p.numel() * p.element_size() for p in model.parameters())

In [8]:
# Report the parameter footprint of the new model, scaled from bytes by 1024 * 1024.
bytes_per_mb = 1024 * 1024
new_model_size = get_model_size(new_model) / bytes_per_mb
print(f"Model size: {new_model_size:.2f} MB")

Model size: 44.59 MB


In [9]:
# Image preprocessor loaded from the same checkpoint id as the base model;
# use_fast=True is forwarded to from_pretrained.
processor = AutoProcessor.from_pretrained("google/vit-base-patch16-224", use_fast=True)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [10]:
# Load ImageNet-1k once in streaming mode and take both splits from the same
# result, instead of resolving the dataset a second time for validation.
imagenet_splits = load_dataset("ILSVRC/imagenet-1k", streaming=True)
train_ds = imagenet_splits["train"]
test_ds = imagenet_splits["validation"]
print(train_ds)
print(test_ds)

README.md:   0%|          | 0.00/85.4k [00:00<?, ?B/s]

imagenet-1k.py:   0%|          | 0.00/4.58k [00:00<?, ?B/s]

classes.py:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

The repository for ILSVRC/imagenet-1k contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ILSVRC/imagenet-1k.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
IterableDataset({
    features: ['image', 'label'],
    num_shards: 5
})
IterableDataset({
    features: ['image', 'label'],
    num_shards: 1
})


In [11]:
# Data collator: preprocesses the input images and decides how many layers (max_steps) to use for each batch.
def find_prob_idx(prob_array: list[float], p) -> int:
    """Locate the interval of a sorted boundary array that contains ``p``.

    ``prob_array`` is read as ascending interval boundaries (for example a
    cumulative distribution starting at 0). The result is an index ``i`` such
    that ``prob_array[i] <= p <= prob_array[i + 1]``.

    Args:
        prob_array: Boundaries sorted in ascending order.
        p: Value to locate.

    Returns:
        The interval index, or -1 when ``p`` lies outside
        ``[prob_array[0], prob_array[-1]]`` or when the array has fewer than
        two entries (so no interval exists).
    """
    # Fewer than two boundaries means there is no interval to report; the
    # hand-written search could raise IndexError for such input.
    if len(prob_array) < 2:
        return -1

    # Written as a negated chain so that an unordered value (NaN) is rejected too.
    if not (prob_array[0] <= p <= prob_array[-1]):
        return -1  # p is out of bounds

    # bisect_right counts the boundaries <= p, so the containing interval
    # starts one position earlier. The clamp maps p == prob_array[-1] onto the
    # last interval rather than one past it.
    return min(bisect.bisect_right(prob_array, p) - 1, len(prob_array) - 2)

def create_data_collator(proc, min_steps: int, max_steps: int, use_max_steps: bool = True):
    """Build a batch collator that preprocesses images and samples a step count.

    When ``use_max_steps`` is true, one step count in ``[min_steps, max_steps]``
    is drawn per batch. The k-th candidate (1-based, counting up from
    ``min_steps``) is weighted proportionally to k, so larger step counts are
    drawn more often.

    Args:
        proc: Callable image processor. It is invoked with a list of RGB images
            plus ``return_tensors="pt"`` and ``return_dict=True``, and its
            result must support item assignment.
        min_steps: Smallest step count that can be drawn.
        max_steps: Largest step count that can be drawn.
        use_max_steps: When False, no step count is drawn and the
            ``"max_steps"`` entry is left out of the batch.

    Returns:
        A function mapping a list of samples (each with ``"image"`` and
        ``"label"`` entries) to the processor output extended with ``"labels"``
        and, when enabled, ``"max_steps"``.

    Raises:
        ValueError: If ``max_steps`` is smaller than ``min_steps``.
    """
    if max_steps < min_steps:
        raise ValueError(
            f"max_steps ({max_steps}) must not be smaller than min_steps ({min_steps})"
        )

    # Cumulative boundaries over the candidates min_steps..max_steps, where the
    # k-th candidate (k = 1..steps_count) has probability k / base.
    steps_count = max_steps - min_steps + 1
    base = steps_count * (steps_count + 1) / 2
    prob_array = [0]
    # Iterate over the number of candidates, not up to max_steps: running to
    # max_steps appended boundaries above 1 whenever min_steps > 1.
    for i in range(1, steps_count + 1):
        prob_array.append(prob_array[-1] + i / base)
    # Pin the final boundary so float rounding can never leave a draw from
    # [0, 1) outside the table.
    prob_array[-1] = 1.0

    def data_collator(samples):
        cpu_inputs: dict = proc(
            [x["image"].convert('RGB') for x in samples],
            return_tensors="pt",
            return_dict=True,
        )

        if use_max_steps:
            # One draw per batch: every sample in the batch shares the step count.
            time_p = random.random()
            time_t = find_prob_idx(prob_array, time_p)
            # NOTE(review): whether the model needs this as a tensor is unconfirmed.
            cpu_inputs["max_steps"] = torch.IntTensor([time_t + min_steps] * len(samples))
        cpu_inputs["labels"] = torch.LongTensor([x["label"] for x in samples])

        return cpu_inputs

    return data_collator

# Training collator: step counts are drawn between half the configured layer
# count and two less than the full layer count.
dc = create_data_collator(processor, new_model.config.num_hidden_layers // 2, new_model.config.num_hidden_layers - 2)

In [None]:
# Training hyper-parameters for the Hugging Face Trainer.
# The training data is streamed, so the schedule is expressed in optimizer
# steps derived from an assumed dataset size.
ds_size = 1_281_167  # presumably the ImageNet-1k train split size; verify
train_bs = 512
single_bs_lr = 1.5625e-05  # learning rate for batch size 1, scaled below
num_epochs = 9
steps_per_epoch = ds_size // train_bs
training_args = TrainingArguments(
    output_dir="./output",
    max_steps=num_epochs * ds_size // train_bs,
    per_device_train_batch_size=train_bs,
    per_device_eval_batch_size=train_bs * 2,
    gradient_accumulation_steps=1,
    logging_dir="./logs",
    logging_steps=2,
    # Checkpoint and evaluate once per pass over ds_size examples.
    save_steps=steps_per_epoch,
    eval_steps=steps_per_epoch,
    # NOTE(review): newer transformers releases may expect `eval_strategy`
    # instead; confirm against the installed version.
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",
    warmup_ratio=0.005,
    report_to="tensorboard",
    # Keep the raw "image"/"label" columns; the collator reads them directly.
    remove_unused_columns=False,
    # Square-root scaling of the per-sample learning rate with the batch size.
    learning_rate=single_bs_lr * math.sqrt(train_bs),
    disable_tqdm=False,
)

# Accuracy metric loaded through the `evaluate` library; compute_metrics calls it.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(logits, labels)``. The arg-max is taken over the
            last axis of ``logits``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        compared against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

# Trainer for the custom model: trains on the streamed train split, evaluates
# on the validation split, and batches with the step-sampling collator `dc`.
trainer = Trainer(
    model=new_model,
    args=training_args,
    data_collator=dc,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)



Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

max_steps is given, it will override any value given in num_train_epochs


In [13]:
# Baseline: evaluate the pretrained model with the same arguments and data.
# use_max_steps=False keeps the "max_steps" entry out of the batch, so the
# step bounds passed here do not affect what the base model receives.
base_dc = create_data_collator(processor, new_model.config.num_hidden_layers // 2, new_model.config.num_hidden_layers - 2, False)
base_model_trainer = Trainer(
    model=base_model,
    args=training_args,
    data_collator=base_dc,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)
# Only the evaluation result is wanted; this trainer is never used for training.
base_model_trainer.evaluate()

max_steps is given, it will override any value given in num_train_epochs


{'eval_loss': 0.7556625008583069,
 'eval_model_preparation_time': 0.0032,
 'eval_accuracy': 0.8027,
 'eval_runtime': 656.9699,
 'eval_samples_per_second': 76.107,
 'eval_steps_per_second': 0.075}

In [14]:
# The baseline trainer is no longer needed once its evaluation has run; drop
# the name before evaluating the new model.
del base_model_trainer

In [15]:
# Evaluate the new model before any training, for comparison with the
# baseline evaluation of the pretrained model.
trainer.evaluate()

{'eval_loss': 7.015890121459961,
 'eval_model_preparation_time': 0.0014,
 'eval_accuracy': 0.00178,
 'eval_runtime': 596.3807,
 'eval_samples_per_second': 83.839,
 'eval_steps_per_second': 0.082}

In [16]:
# Release the pretrained model before training to free up some memory.
# base_model_named_parameter_dict (built for the weight copy) still references
# every one of its parameter tensors, so deleting base_model alone cannot
# release them; both names have to go.
del base_model, base_model_named_parameter_dict
gc.collect()  # also break any reference cycles that still hold the weights
torch.cuda.empty_cache()  # return the freed blocks from the CUDA caching allocator

In [17]:
# Train the new model; the step budget, logging, evaluation and checkpoint
# cadence all come from training_args.
trainer.train()

Step,Training Loss,Validation Loss,Model Preparation Time,Accuracy
2502,2.1641,2.27391,0.0014,0.48228
5004,1.7344,2.017615,0.0014,0.53068
7506,1.5117,1.928825,0.0014,0.54824
10008,1.4297,1.87843,0.0014,0.55836




Step,Training Loss,Validation Loss,Model Preparation Time,Accuracy
2502,2.1641,2.27391,0.0014,0.48228
5004,1.7344,2.017615,0.0014,0.53068
7506,1.5117,1.928825,0.0014,0.54824
10008,1.4297,1.87843,0.0014,0.55836


KeyboardInterrupt: 