In [17]:
! pip install datasets tensorboard 

Collecting tensorboard
  Downloading tensorboard-2.11.2-py3-none-any.whl (6.0 MB)
     ---------------------------------------- 6.0/6.0 MB 16.6 MB/s eta 0:00:00
Collecting markdown>=2.6.8
  Using cached Markdown-3.4.1-py3-none-any.whl (93 kB)
Collecting google-auth<3,>=1.6.3
  Downloading google_auth-2.16.0-py2.py3-none-any.whl (177 kB)
     ---------------------------------------- 177.8/177.8 kB ? eta 0:00:00
Collecting google-auth-oauthlib<0.5,>=0.4.1
  Using cached google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
Collecting tensorboard-plugin-wit>=1.6.0
  Using cached tensorboard_plugin_wit-1.8.1-py3-none-any.whl (781 kB)
Collecting absl-py>=0.4
  Downloading absl_py-1.4.0-py3-none-any.whl (126 kB)
     ---------------------------------------- 126.5/126.5 kB ? eta 0:00:00
Collecting protobuf<4,>=3.9.2
  Using cached protobuf-3.20.3-cp38-cp38-win_amd64.whl (904 kB)
Collecting tensorboard-data-server<0.7.0,>=0.6.0
  Using cached tensorboard_data_server-0.6.1-py3-none-any.whl (2

In [5]:
# source:
# https://www.philschmid.de/image-classification-huggingface-transformers-keras

# data:
# https://www.kaggle.com/code/paultimothymooney/display-meme-data/data

# meme dataset:
# https://www.kaggle.com/code/paultimothymooney/display-meme-data/data



import os
import datasets

def create_image_folder_dataset(root_path):
    """Build a `datasets.Dataset` from an image-folder layout.

    The expected layout is one sub-directory per class, each holding the
    image files of that class::

        root_path/
            <class_a>/img1.jpg
            <class_b>/img2.jpg

    Args:
        root_path: Directory whose immediate sub-directories are the classes.

    Returns:
        A `datasets.Dataset` with three columns: ``image`` (an `Image`
        feature built from the file path), ``labels`` (a `ClassLabel` over
        the class folder names) and ``image_file_path`` (the path as a string).
    """
    # Only directories are classes: a stray file next to the class folders
    # (.DS_Store, Thumbs.db, a README) would otherwise become a bogus label
    # and make the per-class listdir below fail.
    # Sorting pins the label ids; os.listdir returns entries in arbitrary order.
    class_names = sorted(
        entry for entry in os.listdir(root_path)
        if os.path.isdir(os.path.join(root_path, entry))
    )

    # Column schema of the resulting dataset.
    features = datasets.Features({
        "image_file_path": datasets.Value(dtype='string'),
        "image": datasets.Image(decode=True, id=None),
        "labels": datasets.features.ClassLabel(names=class_names),
    })

    # Parallel lists: one path and one class name per image file.
    image_paths = []
    image_labels = []
    for class_name in class_names:
        class_dir = os.path.join(root_path, class_name)
        for file_name in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file_name)
            # Nested folders are not images; skip them instead of failing later.
            if not os.path.isfile(file_path):
                continue
            image_paths.append(file_path)
            image_labels.append(class_name)

    return datasets.Dataset.from_dict(
        {
            "image": image_paths,
            "labels": image_labels,
            "image_file_path": list(image_paths),
        },
        features=features,
    )

from PIL import Image

def _flatten_to_rgb(image, bg_colour=(255, 255, 255)):
    """Composite `image` onto an opaque `bg_colour` canvas and return it in RGB mode.

    Args:
        image: PIL image in any mode that can be converted to RGBA.
        bg_colour: RGB 3-tuple used wherever the source is transparent.

    Returns:
        A new RGB image with the same size as `image`.
    """
    # Converting to RGBA first yields an alpha band for every source mode
    # (palette, LA, greyscale, ...), see http://stackoverflow.com/a/1963146.
    alpha = image.convert('RGBA').split()[-1]

    # The canvas is RGB, so it takes a plain 3-tuple fill colour.
    canvas = Image.new('RGB', image.size, bg_colour)
    canvas.paste(image, mask=alpha)
    return canvas


def process_image(sample, target_size=None):
    """Normalise the ``image`` entry of a dataset sample to an opaque RGB image.

    Images that are already RGB are left untouched; every other mode is
    flattened onto a white background. The sample is updated in place and
    returned, which is the shape `Dataset.map` expects.

    Args:
        sample: Mapping with an ``image`` entry holding a PIL image.
        target_size: Optional ``(width, height)``. When given, the image is
            resized to it with Lanczos resampling; the default keeps the
            original size.

    Returns:
        The same `sample` mapping with ``image`` replaced by the processed image.
    """
    image = sample['image']

    # Compare the mode directly: `mode not in ('RGB')` is a substring test
    # against the string 'RGB', not a tuple membership test.
    if image.mode != 'RGB':
        image = _flatten_to_rgb(image)

    if target_size is not None:
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        image = image.resize(tuple(target_size), Image.LANCZOS)

    sample['image'] = image
    return sample

In [6]:
# Root folder holding one sub-folder of images per class.
MEME_DATASET_DIR = "./meme_dataset/"

ds = create_image_folder_dataset(MEME_DATASET_DIR)

In [7]:
# Hold out 15% of the whole dataset for evaluation.
test_size = .15

# Fixed seed so the same images land in train/test on every run; with an
# unseeded shuffle/split the reported metrics cannot be reproduced.
split_seed = 42

ds = ds.shuffle(seed=split_seed).train_test_split(test_size=test_size, seed=split_seed)
ds

DatasetDict({
    train: Dataset({
        features: ['image', 'labels', 'image_file_path'],
        num_rows: 5655
    })
    test: Dataset({
        features: ['image', 'labels', 'image_file_path'],
        num_rows: 998
    })
})

In [8]:
# Run `process_image` over both splits (same order as before: test, then train).
for split_name in ('test', 'train'):
    ds[split_name] = ds[split_name].map(process_image)

  0%|          | 0/998 [00:00<?, ?ex/s]

  0%|          | 0/5655 [00:00<?, ?ex/s]

In [9]:
from transformers import ViTFeatureExtractor

# Checkpoint used for the feature extractor here and for the classifier below.
# Alternative checkpoint: 'facebook/deit-base-distilled-patch16-224'
model_name_or_path = 'google/vit-base-patch16-224-in21k'

feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)

Downloading:   0%|          | 0.00/160 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [10]:
def process_example(example):
    """Run the feature extractor on a single example and attach its label.

    Args:
        example: Mapping with an ``image`` entry and a ``labels`` entry.

    Returns:
        The value returned by `feature_extractor` for the image (requested
        with ``return_tensors='pt'``), with ``labels`` copied from `example`.
    """
    encoded = feature_extractor(example['image'], return_tensors='pt')
    encoded['labels'] = example['labels']
    return encoded

In [11]:
def transform(example_batch):
    """Turn a batch of images into feature-extractor output plus labels.

    Args:
        example_batch: Mapping with an ``image`` entry (iterable of images)
            and a ``labels`` entry for the same batch.

    Returns:
        The value returned by `feature_extractor` for the list of images
        (requested with ``return_tensors='pt'``), with the batch's ``labels``
        attached under the ``labels`` key.
    """
    # Materialise the images as a plain list before handing them over.
    images = list(example_batch['image'])
    batch_inputs = feature_extractor(images, return_tensors='pt')

    # The labels travel alongside the pixel values.
    batch_inputs['labels'] = example_batch['labels']
    return batch_inputs

# Attach `transform` to the dataset; `prepared_ds` is what the Trainer is given.
prepared_ds = ds.with_transform(transform=transform)

In [12]:
import torch

def collate_fn(batch):
    """Merge a list of per-example dicts into one batch dict.

    Args:
        batch: List of mappings, each with a ``pixel_values`` tensor and a
            ``labels`` value.

    Returns:
        Dict with ``pixel_values`` stacked along a new leading dimension and
        ``labels`` gathered into a single tensor.
    """
    pixel_values = torch.stack([item['pixel_values'] for item in batch])
    labels = torch.tensor([item['labels'] for item in batch])
    return {'pixel_values': pixel_values, 'labels': labels}

In [13]:
import numpy as np
from datasets import load_metric

# F1 is the score computed at every evaluation step.
metric = load_metric(path="f1")
def compute_metrics(p):
    """Score a prediction object with the module-level `metric`.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference labels).

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class of each row
        against the reference labels.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=predicted_ids, references=p.label_ids)

  metric = load_metric("f1")


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [14]:
from transformers import ViTForImageClassification

# Class names in label-id order, read from the dataset's `labels` feature.
labels = ds['train'].features['labels'].names

# Both mappings carry the id as a string.
id2label = {}
label2id = {}
for index, class_name in enumerate(labels):
    id2label[str(index)] = class_name
    label2id[class_name] = str(index)

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and the final model are written.
    output_dir="./vit-base-patch16-224-in21k-meme",
    push_to_hub=False,
    report_to='tensorboard',

    # Optimisation.
    num_train_epochs=4,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    fp16=True,

    # Evaluate and checkpoint on the same 100-step cadence; log every 10 steps.
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,

    # `transform` reads the `image` column — presumably the Trainer would drop
    # it otherwise; verify against the transformers docs.
    remove_unused_columns=False,
)

In [18]:
from transformers import Trainer

trainer = Trainer(
    # What to train and how.
    model=model,
    args=training_args,
    # Data: the transformed splits plus the function that batches them.
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["test"],
    data_collator=collate_fn,
    # Scoring at evaluation time.
    compute_metrics=compute_metrics,
    # Passing the feature extractor here gets it saved with each checkpoint
    # (see "Feature extractor saved in ..." in the training log).
    tokenizer=feature_extractor,
)

Using cuda_amp half precision backend


In [19]:
train_results = trainer.train()

# Persist the model first, then the training metrics, then the trainer state.
trainer.save_model()

train_metrics = train_results.metrics
trainer.log_metrics("train", train_metrics)
trainer.save_metrics("train", train_metrics)

trainer.save_state()

***** Running training *****
  Num examples = 5655
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1416
  Number of trainable parameters = 85800194


Step,Training Loss,Validation Loss,F1
100,0.1382,0.156566,0.952195
200,0.1526,0.127657,0.96146
300,0.0797,0.117153,0.959764
400,0.0405,0.096651,0.971202
500,0.1239,0.093541,0.970378
600,0.0213,0.085315,0.975952
700,0.0242,0.085416,0.976
800,0.0117,0.086551,0.975952
900,0.0406,0.082886,0.977823
1000,0.0239,0.087334,0.975659


***** Running Evaluation *****
  Num examples = 998
  Batch size = 8
Saving model checkpoint to ./vit-base-patch16-224-in21k-meme\checkpoint-100
Configuration saved in ./vit-base-patch16-224-in21k-meme\checkpoint-100\config.json
Model weights saved in ./vit-base-patch16-224-in21k-meme\checkpoint-100\pytorch_model.bin
Feature extractor saved in ./vit-base-patch16-224-in21k-meme\checkpoint-100\preprocessor_config.json
***** Running Evaluation *****
  Num examples = 998
  Batch size = 8
Saving model checkpoint to ./vit-base-patch16-224-in21k-meme\checkpoint-200
Configuration saved in ./vit-base-patch16-224-in21k-meme\checkpoint-200\config.json
Model weights saved in ./vit-base-patch16-224-in21k-meme\checkpoint-200\pytorch_model.bin
Feature extractor saved in ./vit-base-patch16-224-in21k-meme\checkpoint-200\preprocessor_config.json
***** Running Evaluation *****
  Num examples = 998
  Batch size = 8
Saving model checkpoint to ./vit-base-patch16-224-in21k-meme\checkpoint-300
Configuration s

***** train metrics *****
  epoch                    =          4.0
  total_flos               = 1632486288GF
  train_loss               =       0.0698
  train_runtime            =   0:51:38.20
  train_samples_per_second =        7.301
  train_steps_per_second   =        0.457


In [20]:
# Score the held-out split, then both print and persist the result.
metrics = trainer.evaluate(eval_dataset=prepared_ds['test'])
for report in (trainer.log_metrics, trainer.save_metrics):
    report("eval", metrics)

***** Running Evaluation *****
  Num examples = 998
  Batch size = 8


***** eval metrics *****
  epoch                   =        4.0
  eval_f1                 =     0.9778
  eval_loss               =     0.0829
  eval_runtime            = 0:00:32.32
  eval_samples_per_second =     30.877
  eval_steps_per_second   =      3.867


In [5]:
from transformers import ViTForImageClassification
from transformers import ViTFeatureExtractor
from PIL import Image


# Directory the training run wrote to (its `output_dir`).
saved_model_dir = './vit-base-patch16-224-in21k-meme'

feature_extractor = ViTFeatureExtractor.from_pretrained(saved_model_dir)
model = ViTForImageClassification.from_pretrained(saved_model_dir)


In [40]:
import torch

# Training images were flattened to RGB, so do the same here; the `with`
# block also closes the file handle once the pixels are loaded.
with Image.open('./examples/322715824_889438175516307_5513650897881487462_n.jpg') as image_file:
    image = image_file.convert('RGB')

inputs = feature_extractor(images=image, return_tensors="pt")

# Inference only: no need to record the autograd graph.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# The fine-tuned head scores the classes listed in `model.config.id2label`
# (not the ImageNet classes of the base checkpoint).
probabilities = logits.softmax(-1)[0]
predicted_class_idx = logits.argmax(-1).item()
conf_score = probabilities[predicted_class_idx].item()
print("Predicted class:", model.config.id2label[predicted_class_idx], ' with prediction score:', conf_score)



Predicted class: nomeme  with prediction score: 0.9702327251434326
