# Fine Tune CLIP on Tweets

- CLIP on huggingface: https://huggingface.co/openai/clip-vit-base-patch32
- Dataset: https://huggingface.co/datasets/AlekseyDorkin/extended_tweet_emojis/tree/main

## 1. Install Dependencies

In [2]:
# you might want to restart the kernel
# coupling between torch and torchvision: https://pypi.org/project/torchvision/
!pip install torchvision==0.11.1 torch==1.10.0

Collecting torchvision==0.11.1
  Using cached torchvision-0.11.1-cp38-cp38-manylinux1_x86_64.whl (23.3 MB)
Collecting torch==1.10.0
  Using cached torch-1.10.0-cp38-cp38-manylinux1_x86_64.whl (881.9 MB)
Installing collected packages: torch, torchvision
  Attempting uninstall: torch
    Found existing installation: torch 1.10.2+cu113
    Uninstalling torch-1.10.2+cu113:
      Successfully uninstalled torch-1.10.2+cu113
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.11.3
    Uninstalling torchvision-0.11.3:
      Successfully uninstalled torchvision-0.11.3
Successfully installed torch-1.10.0 torchvision-0.11.1
[0m

In [3]:
# you might want to restart the kernel after installation is complete.
!pip install transformers datasets pillow ipywidgets requests jupyter jupyter_client wandb --upgrade --quiet

[0m

## 2. Init Variables and Tools

In [20]:
pwd = !pwd
data_path = pwd[0] + "/emojis"
data_path

'/root/emoji-predictor/fine-tune/emojis'

In [21]:
import wandb
from transformers import TrainingArguments, Trainer

wandb.init(project="emoji-predictor", entity="drift-ai")

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

## 3. Setup Data Preprocessors and Trainer

In [24]:
import torch
from torchvision.transforms import Resize, InterpolationMode, ConvertImageDtype, CenterCrop
from transformers import CLIPProcessor, CLIPModel, Trainer
from transformers import default_data_collator, TrainingArguments
from datasets import load_dataset
from torchvision.io import read_image, ImageReadMode
from pathlib import Path

# Loading Data
dataset = load_dataset("AlekseyDorkin/extended_tweet_emojis")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]
column_names = train_dataset.column_names
assert "label" in column_names
assert "text" in column_names
image_column = "label"
caption_column = "text"

# Loading Model and Processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
config = model.config
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = processor.tokenizer
feature_extractor = processor.feature_extractor

MAX_TEXT_LENGTH=77
IMAGE_SIZE = config.vision_config.image_size

# Preprocess Text
def tokenize_captions(examples):
    captions = [caption for caption in examples[caption_column]]
    text_inputs = tokenizer(captions, max_length=MAX_TEXT_LENGTH, padding="max_length", truncation=True)
    examples["input_ids"] = text_inputs.input_ids
    examples["attention_mask"] = text_inputs.attention_mask
    return examples


train_dataset = train_dataset.map(
    function=tokenize_captions,
    batched=True,
    remove_columns=[col for col in column_names if col != image_column],
    num_proc=None,
    load_from_cache_file=False,
    desc="Running tokenizer on train dataset",
)

# Preprocess Images
class Transform(torch.nn.Module):
    def __init__(self, image_size):
        super().__init__()
        self.transforms = torch.nn.Sequential(
            # resize and then crop the image to the image_size
            Resize([image_size], interpolation=InterpolationMode.BICUBIC),
            CenterCrop(image_size),
            # convert RGB to floats
            ConvertImageDtype(torch.float),
        )

    def forward(self, x) -> torch.Tensor:
        with torch.no_grad():
            x = self.transforms(x)
        return x


image_transformations = Transform(
    IMAGE_SIZE
)
image_transformations = torch.jit.script(image_transformations)


def transform_images(examples):
    # https://pytorch.org/vision/stable/_modules/torchvision/io/image.html#ImageReadMode
    images = [read_image(str(Path(data_path,f"{c}.png")), ImageReadMode.RGB) for c in examples[image_column]]
    examples["pixel_values"] = [image_transformations(image) for image in images]
    return examples


train_dataset.set_transform(transform_images)


def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    input_ids = torch.tensor([example["input_ids"] for example in examples], dtype=torch.long)
    attention_mask = torch.tensor([example["attention_mask"] for example in examples], dtype=torch.long)
    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "return_loss": True,
    }


trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="./checkpoints",
                           weight_decay=0.1,
                           dataloader_num_workers=0,
                           per_device_eval_batch_size=64,
                           per_device_train_batch_size=64,
                           num_train_epochs=3.0,
                           warmup_steps=0,
                           learning_rate=5e-05,
                           report_to="wandb"
                           ),
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn
)


Using custom data configuration AlekseyDorkin--extended_tweet_emojis-4eb99ce06d465da0
Reusing dataset parquet (/root/.cache/huggingface/datasets/AlekseyDorkin___parquet/AlekseyDorkin--extended_tweet_emojis-4eb99ce06d465da0/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3b789e359eb706935804a4c9db0b23519332fa3e1fad47b153db4fa81203a261.050ce7e9cbf1d6b484dcac0a3f0777b408c22a3e8bd14260000acef5fbc88aee
text_config_dict is None. Initializing the CLIPTextConfig with default values.
vision_config_dict is None. initializing the CLIPVisionConfig with default values.
Model config CLIPConfig {
  "_name_or_path": "openai/clip-vit-base-patch32",
  "architectures": [
    "CLIPModel"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "clip",
  "projection_dim": 512,
  "text_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty"

Running tokenizer on train dataset:   0%|          | 0/48 [00:00<?, ?ba/s]

PyTorch: setting up devices


## 4. Train the Model

In [None]:
trainer.train()


***** Running training *****
  Num examples = 47436
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2226
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
500,3.2507
1000,2.9014
1500,2.6745
2000,2.0944


Saving model checkpoint to ./checkpoints/checkpoint-500
Configuration saved in ./checkpoints/checkpoint-500/config.json
Model weights saved in ./checkpoints/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./checkpoints/checkpoint-1000
Configuration saved in ./checkpoints/checkpoint-1000/config.json
Model weights saved in ./checkpoints/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./checkpoints/checkpoint-1500
Configuration saved in ./checkpoints/checkpoint-1500/config.json
Model weights saved in ./checkpoints/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./checkpoints/checkpoint-2000
Configuration saved in ./checkpoints/checkpoint-2000/config.json
Model weights saved in ./checkpoints/checkpoint-2000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `CLIPModel.forward` and have been ignored: text. If text are n

KeyError: 'pixel_values'

In [26]:
metrics = trainer.evaluate()
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

The following columns in the evaluation set don't have a corresponding argument in `CLIPModel.forward` and have been ignored: text. If text are not expected by `CLIPModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5258
  Batch size = 64


KeyError: 'pixel_values'

In [40]:
dir(trainer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_gather_and_numpify',
 '_get_collator_with_removed_columns',
 '_get_eval_sampler',
 '_get_learning_rate',
 '_get_train_sampler',
 '_globalstep_last_logged',
 '_hp_search_setup',
 '_inner_training_loop',
 '_load_best_model',
 '_load_from_checkpoint',
 '_load_optimizer_and_scheduler',
 '_load_rng_state',
 '_loggers_initialized',
 '_maybe_log_save_evaluate',
 '_memory_tracker',
 '_move_model_to_device',
 '_nested_gather',
 '_pad_across_processes',
 '_prepare_input',
 '_prepare_inputs',
 '_push_from_checkpoint',
 '_remove_unused_columns',
 '_report_to_hp_search',
 '_rotate_checkpoints',
 '_save',
 '_save_checkpoint',
 '_save

In [31]:
kwargs = {
    "finetuned_from": "emoji-predictor", 
    "tasks": "contrastive-image-text-modeling", 
    "dataset": "AlekseyDorkin/extended_tweet_emojis"
}
trainer.push_to_hub(**kwargs)

OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).

In [30]:
!git lfs install

git: 'lfs' is not a git command. See 'git --help'.

The most similar command is
	log


## 5. Evaluate

In [None]:
trainer