In [3]:
!pip3 install pyspark datasets evaluate accelerate -U
import numpy as np
import torch
from pyspark.sql import SparkSession
from pyspark.ml.torch.distributor import TorchDistributor
from transformers import Trainer, TrainingArguments
from transformers.training_args import OptimizerNames
from datasets import load_dataset
import evaluate
from transformers import AutoImageProcessor, AutoModelForImageClassification
# !wget -c https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
# !tar -xvzf cifar-10-python.tar.gz
# Directory passed as `cache_dir` to `load_dataset` when the CIFAR-10 splits are loaded.
dataset_path = 'dataset/'

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting transformers[torch]
  Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
def get_fine_tuning_trainer_args(output_path, hyperparameters=None):
    """Build the `TrainingArguments` used for fine-tuning.

    Args:
        output_path: Base directory for run artefacts. Checkpoints go to
            `<output_path>/training/` and logs to `<output_path>/logs/`.
            A trailing slash is optional.
        hyperparameters: Optional mapping of `TrainingArguments` keyword names
            to values. Entries override the defaults defined below; `None` or
            an empty mapping leaves the defaults untouched.

    Returns:
        A `TrainingArguments` instance.
    """
    import os

    training_kwargs = dict(
        # os.path.join keeps the paths valid whether or not `output_path`
        # ends with a separator.
        output_dir=os.path.join(output_path, 'training/'),
        logging_dir=os.path.join(output_path, 'logs/'),
        per_device_train_batch_size=5,
        per_device_eval_batch_size=5,
        evaluation_strategy="steps",
        num_train_epochs=1,
        # save_steps and eval_steps are kept equal so that every saved
        # checkpoint has an evaluation result, which `load_best_model_at_end`
        # relies on to pick the best checkpoint.
        save_steps=40,
        eval_steps=40,
        logging_steps=1,
        learning_rate=5.e-05,
        # NOTE(review): per the Transformers documentation a positive
        # `warmup_steps` overrides `warmup_ratio`, so the ratio only takes
        # effect if `warmup_steps` is overridden to 0 via `hyperparameters`.
        warmup_ratio=0.1,
        warmup_steps=1,
        weight_decay=0,
        save_total_limit=2,
        metric_for_best_model='accuracy',
        greater_is_better=True,
        optim=OptimizerNames.ADAMW_HF,
        # The datasets are transformed lazily, so the raw columns must be kept
        # for the transform to read them.
        remove_unused_columns=False,
        push_to_hub=False,
        load_best_model_at_end=True,
        seed=42,
        gradient_accumulation_steps=4,
    )

    # Caller-supplied values win over the defaults above.
    if hyperparameters:
        training_kwargs.update(hyperparameters)

    return TrainingArguments(**training_kwargs)

def build_metrics():
    """Create the metric callback handed to the trainer.

    Returns:
        A function taking a prediction object `p` (read via `p.predictions`
        and `p.label_ids`) and returning the result of the combined
        "accuracy" metric's `compute`.
    """
    # The loaded module is discarded; presumably this call only exists to
    # populate the "metrics/" cache directory -- TODO confirm.
    evaluate.load("accuracy", cache_dir="metrics/", trust_remote_code=True)

    accuracy_metric = evaluate.combine(["accuracy"])

    def compute_metrics(p):
        # Reduce per-class scores to a predicted class index along axis 1.
        predicted_classes = np.argmax(p.predictions, axis=1)
        return accuracy_metric.compute(
            predictions=predicted_classes,
            references=p.label_ids,
        )

    return compute_metrics

def collate_fn(batch):
    """Merge a list of examples into one batch dictionary.

    Args:
        batch: Sequence of mappings, each providing a 'pixel_values' tensor
            and a 'label' value.

    Returns:
        A dict with 'pixel_values' (the per-example tensors stacked along a
        new leading dimension) and 'labels' (a tensor built from the
        per-example labels).
    """
    pixel_tensors = []
    for example in batch:
        pixel_tensors.append(example['pixel_values'])

    label_values = []
    for example in batch:
        label_values.append(example['label'])

    collated = {}
    collated['pixel_values'] = torch.stack(pixel_tensors)
    collated['labels'] = torch.tensor(label_values)
    return collated

In [84]:
def startTraining(model_name, distributed_training=False):
    """Fine-tune an image-classification checkpoint on a CIFAR-10 slice.

    Loads the first five examples of the CIFAR-10 train and test splits, the
    pretrained model `model_name` (re-headed for 10 labels) and its image
    processor, then runs one fine-tuning pass followed by an evaluation.

    Args:
        model_name: Checkpoint identifier passed to
            `AutoModelForImageClassification.from_pretrained` and
            `AutoImageProcessor.from_pretrained`.
        distributed_training: When False, training runs directly in the
            current process. When True, a local Spark session is started and
            the training function is launched through `TorchDistributor` with
            one process per visible CUDA device.

    Returns:
        The value returned by the inner training function (the output of
        `Trainer.train()`), or whatever `TorchDistributor.run` returns for it
        in distributed mode.

    Raises:
        RuntimeError: If `distributed_training` is True and no CUDA device is
            visible.
    """
    train_dataset = load_dataset('cifar10', split="train[:5]", verification_mode='no_checks',
                                 cache_dir=dataset_path)

    test_dataset = load_dataset('cifar10', split="test[:5]", verification_mode='no_checks',
                                cache_dir=dataset_path)

    pretrained_model = AutoModelForImageClassification.from_pretrained(
        model_name, cache_dir='models/', num_labels=10, ignore_mismatched_sizes=True)

    feature_extractor = AutoImageProcessor.from_pretrained(model_name, cache_dir='models/')

    def trainer():
        print(f"Starting fine-tuning on model {model_name}:")

        def preprocess(batchImage):
            # Applied lazily by `with_transform`: converts the 'img' column
            # into model inputs and carries the label along.
            inputs = feature_extractor(batchImage['img'], return_tensors='pt')
            inputs['label'] = batchImage['label']
            return inputs

        train_split = train_dataset.with_transform(preprocess)
        eval_split = test_dataset.with_transform(preprocess)

        fine_tune_args = get_fine_tuning_trainer_args("results/")

        fine_tune_trainer = Trainer(
            model=pretrained_model,
            args=fine_tune_args,
            data_collator=collate_fn,
            compute_metrics=build_metrics(),
            train_dataset=train_split,
            eval_dataset=eval_split,
        )

        try:
            train_results = fine_tune_trainer.train()

            fine_tune_trainer.save_model(output_dir='results/models')

            fine_tune_trainer.log_metrics("train", train_results.metrics)
            fine_tune_trainer.save_state()

            metrics = fine_tune_trainer.evaluate(eval_split)
            fine_tune_trainer.log_metrics("eval", metrics)
            fine_tune_trainer.save_metrics("eval", metrics)
        finally:
            # Only tear down a process group that actually exists; in the
            # single-process path none is ever created, and an unconditional
            # destroy would fail there.
            if torch.distributed.is_available() and torch.distributed.is_initialized():
                torch.distributed.destroy_process_group()

        return train_results

    if not distributed_training:
        return trainer()

    # `torch` is deliberately NOT re-imported inside this function: a local
    # `import torch.distributed` would make `torch` a local name of
    # startTraining and leave it unbound for `trainer` on the
    # non-distributed path.
    #
    # No process group is created here in the driver either:
    # `init_process_group` with the default env:// rendezvous needs
    # RANK / WORLD_SIZE / MASTER_ADDR, which are only set inside the worker
    # processes that TorchDistributor launches.
    num_processes = torch.cuda.device_count()
    if num_processes < 1:
        raise RuntimeError(
            "distributed_training=True requires at least one visible CUDA device"
        )

    spark = (
        SparkSession.builder
        .appName("ImageClassification")
        .master("local[*]")
        .config("spark.executor.memory", "16G")
        .config("spark.driver.memory", "16G")
        .config("spark.executor.resource.gpu.amount", "14")
        .config("spark.driver.resource.gpu.amount", "1")
        .config("spark.task.resource.gpu.amount", "1")
        .config("spark.executor.resource.gpu.discoveryScript", "/content/dis.sh")
        .config("spark.driver.resource.gpu.discoveryScript", "/content/dis.sh")
        .getOrCreate()
    )
    try:
        return TorchDistributor(
            num_processes=num_processes, local_mode=True, use_gpu=True
        ).run(trainer)
    finally:
        # Always release the Spark session, even if training raised.
        spark.stop()

In [73]:
# Single-process run: distributed_training=False makes startTraining call its training function directly.
startTraining('facebook/deit-base-patch16-224', False)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at facebook/deit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fine-tuning on model facebook/deit-base-patch16-224:




Step,Training Loss,Validation Loss,Accuracy
40,0.3106,0.247193,0.9407
80,0.1578,0.144176,0.9608


***** train metrics *****
  epoch                    =      0.9923
  total_flos               = 896127456GF
  train_loss               =      0.5092
  train_runtime            =  0:12:29.83
  train_samples_per_second =       16.67
  train_steps_per_second   =       0.129


***** eval metrics *****
  epoch                   =     0.9923
  eval_accuracy           =     0.9608
  eval_loss               =     0.1442
  eval_runtime            = 0:02:18.74
  eval_samples_per_second =     72.074
  eval_steps_per_second   =      2.256


In [74]:
# Distributed run: distributed_training=True routes training through a local Spark session and TorchDistributor.
startTraining('facebook/deit-base-patch16-224', True)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at facebook/deit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:TorchDistributor:Started local training with 1 processes


2024-04-20 19:53:17.984582: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-20 19:53:17.984635: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-20 19:53:17.986121: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Starting fine-tuning on model facebook/deit-base-patch16-224:
  1%|          | 1/97 [00:05<08:06,  5.07s/it]  2%|▏         | 2/97 [00:09<07:36,  4.81s/it]  3%|▎         | 3/97 [00:14<07:27,  4.76s/it]  4%|▍         | 4/97 [00:19<07:23,  4.77s/it]  5%|▌         | 5/97 [00:24<07:20,  4.79s/it]  6%|▌         | 6/97 [00:29<07:22,  4.86s/it]  7%|▋         | 7/9

INFO:TorchDistributor:Finished local training with 1 processes


  0%|          | 0/313 [00:00<?, ?it/s]  1%|          | 2/313 [00:00<01:04,  4.81it/s]  1%|          | 3/313 [00:00<01:31,  3.39it/s]  1%|▏         | 4/313 [00:01<01:46,  2.91it/s]  2%|▏         | 5/313 [00:01<01:54,  2.68it/s]  2%|▏         | 6/313 [00:02<01:59,  2.56it/s]  2%|▏         | 7/313 [00:02<02:02,  2.50it/s]  3%|▎         | 8/313 [00:02<02:04,  2.46it/s]  3%|▎         | 9/313 [00:03<02:05,  2.43it/s]  3%|▎         | 10/313 [00:03<02:05,  2.41it/s]  4%|▎         | 11/313 [00:04<02:05,  2.40it/s]  4%|▍         | 12/313 [00:04<02:07,  2.36it/s]  4%|▍         | 13/313 [00:05<02:08,  2.34it/s]  4%|▍         | 14/313 [00:05<02:08,  2.32it/s]  5%|▍         | 15/313 [00:05<02:08,  2.32it/s]  5%|▌         | 16/313 [00:06<02:07,  2.33it/s]  5%|▌         | 17/313 [00:06<02:07,  2.33it/s]  6%|▌         | 18/313 [00:07<02:08,  2.29it/s]  6%|▌         | 19/313 [00:07<02:11,  2.24it/s]  6%|▋         | 20/313 [00:08<02:12,  2.21it/s]  7%|▋         | 21/313 [00:08<02:12