# Fine Tune CLIP on Tweets

- CLIP on huggingface: https://huggingface.co/openai/clip-vit-base-patch32
- Dataset: https://huggingface.co/datasets/AlekseyDorkin/extended_tweet_emojis/tree/main

## 1. Install Dependencies

In [2]:
# you might want to restart the kernel
# coupling between torch and torchvision: https://pypi.org/project/torchvision/
!pip install torchvision==0.11.1 torch==1.10.0 --quiet

In [7]:
# you might want to restart the kernel after installation is complete.
!pip install transformers datasets pillow ipywidgets requests jupyter jupyter_client wandb sklearn --upgrade --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## 2. Init Variables and Tools

In [1]:
pwd = !pwd
data_path = pwd[0] + "/emojis"
data_path

'/Users/vincent/Workspace/emoji-predictor/fine-tune/emojis'

In [4]:
import wandb
from transformers import TrainingArguments, Trainer

wandb.init(project="emoji-predictor", entity="drift-ai")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/vincent/.netrc


## 3. Setup Data Preprocessors and Trainer

In [55]:
from pathlib import Path

import torch
from torchvision.transforms import Resize, InterpolationMode, ConvertImageDtype, CenterCrop
from torchvision.io import read_image, ImageReadMode
from transformers import CLIPProcessor, CLIPModel, Trainer
from transformers import default_data_collator, TrainingArguments
from datasets import load_dataset, Dataset

# Loading Data
dataset = load_dataset("vincentclaes/emoji-predictor")
train_dataset = dataset["train"].select(range(16))
val_dataset = dataset["validation"].select(range(16))
test_dataset = dataset["test"]

column_names = train_dataset.column_names
assert "label" in column_names
assert "text" in column_names
image_column = "label"
caption_column = "text"


Using custom data configuration vincentclaes--emoji-predictor-84ee9ecf6ec78809
Reusing dataset parquet (/Users/vincent/.cache/huggingface/datasets/vincentclaes___parquet/vincentclaes--emoji-predictor-84ee9ecf6ec78809/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [56]:
# Loading Model and Processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
config = model.config
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = processor.tokenizer
feature_extractor = processor.feature_extractor

MAX_TEXT_LENGTH=77
IMAGE_SIZE = config.vision_config.image_size

# Preprocess Text
def tokenize_captions(examples):
    captions = [caption for caption in examples[caption_column]]
    text_inputs = tokenizer(captions, max_length=MAX_TEXT_LENGTH, padding="max_length", truncation=True)
    examples["input_ids"] = text_inputs.input_ids
    examples["attention_mask"] = text_inputs.attention_mask
    return examples


train_dataset = train_dataset.map(
    function=tokenize_captions,
    batched=True,
    remove_columns=[col for col in column_names if col != image_column],
    num_proc=None,
    load_from_cache_file=False,
    desc="Running tokenizer on train dataset",
)

val_dataset = val_dataset.map(
    function=tokenize_captions,
    batched=True,
    remove_columns=[col for col in column_names if col != image_column],
    num_proc=None,
    load_from_cache_file=False,
    desc="Running tokenizer on val dataset",
)

test_dataset = test_dataset.map(
    function=tokenize_captions,
    batched=True,
    remove_columns=[col for col in column_names if col != image_column],
    num_proc=None,
    load_from_cache_file=False,
    desc="Running tokenizer on test dataset",
)

# Preprocess Images
class Transform(torch.nn.Module):
    def __init__(self, image_size):
        super().__init__()
        self.transforms = torch.nn.Sequential(
            # resize and then crop the image to the image_size
            Resize([image_size], interpolation=InterpolationMode.BICUBIC),
            CenterCrop(image_size),
            # convert RGB to floats
            ConvertImageDtype(torch.float),
        )

    def forward(self, x) -> torch.Tensor:
        with torch.no_grad():
            x = self.transforms(x)
        return x


image_transformations = Transform(
    IMAGE_SIZE
)
image_transformations = torch.jit.script(image_transformations)


def transform_images(examples):
    # https://pytorch.org/vision/stable/_modules/torchvision/io/image.html#ImageReadMode
    images = [read_image(str(Path(data_path,f"{c}.png")), ImageReadMode.RGB) for c in examples[image_column]]
    examples["pixel_values"] = [image_transformations(image) for image in images]
    return examples


train_dataset.set_transform(transform_images)
val_dataset.set_transform(transform_images)
test_dataset.set_transform(transform_images)

def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    input_ids = torch.tensor([example["input_ids"] for example in examples], dtype=torch.long)
    attention_mask = torch.tensor([example["attention_mask"] for example in examples], dtype=torch.long)
    # labels = torch.tensor([example["label"] for example in examples])
    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        # "labels": labels,
        "return_loss": True,
    }

# def compute_metrics(p):
#     is_regression = False
#     preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
#     preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
#     if data_args.task_name is not None:
#         result = metric.compute(predictions=preds, references=p.label_ids)
#         if len(result) > 1:
#             result["combined_score"] = np.mean(list(result.values())).item()
#         return result
#     elif is_regression:
#         return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
#     else:
#         return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

from datasets import load_metric
import numpy as np
metric = load_metric("precision")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="./checkpoints",
                           weight_decay=0.1,
                           dataloader_num_workers=0,
                           per_device_eval_batch_size=8,
                           per_device_train_batch_size=8,
                           num_train_epochs=1,
                           warmup_steps=0,
                           learning_rate=5e-05,
                           report_to="wandb",
                           metric_for_best_model="accuracy",
                           label_smoothing_factor=0.0
                           ),
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    compute_metrics=compute_metrics,
    data_collator=collate_fn
)


loading configuration file https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/config.json from cache at /Users/vincent/.cache/huggingface/transformers/3b789e359eb706935804a4c9db0b23519332fa3e1fad47b153db4fa81203a261.050ce7e9cbf1d6b484dcac0a3f0777b408c22a3e8bd14260000acef5fbc88aee
text_config_dict is None. Initializing the CLIPTextConfig with default values.
vision_config_dict is None. initializing the CLIPVisionConfig with default values.
Model config CLIPConfig {
  "_name_or_path": "openai/clip-vit-base-patch32",
  "architectures": [
    "CLIPModel"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "clip",
  "projection_dim": 512,
  "text_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity

Running tokenizer on train dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on val dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

PyTorch: setting up devices


In [11]:
for batch in trainer.get_eval_dataloader(val_dataset):
    batch

In [12]:
print(batch)

{'pixel_values': tensor([[[[0.2784, 0.2784, 0.2784,  ..., 0.2784, 0.2784, 0.2784],
          [0.2784, 0.2784, 0.2784,  ..., 0.2784, 0.2784, 0.2784],
          [0.2784, 0.2784, 0.2784,  ..., 0.2784, 0.2784, 0.2784],
          ...,
          [0.2784, 0.2784, 0.2784,  ..., 0.2784, 0.2784, 0.2784],
          [0.2784, 0.2784, 0.2784,  ..., 0.2784, 0.2784, 0.2784],
          [0.2784, 0.2784, 0.2784,  ..., 0.2784, 0.2784, 0.2784]],

         [[0.4392, 0.4392, 0.4392,  ..., 0.4392, 0.4392, 0.4392],
          [0.4392, 0.4392, 0.4392,  ..., 0.4392, 0.4392, 0.4392],
          [0.4392, 0.4392, 0.4392,  ..., 0.4392, 0.4392, 0.4392],
          ...,
          [0.4392, 0.4392, 0.4392,  ..., 0.4392, 0.4392, 0.4392],
          [0.4392, 0.4392, 0.4392,  ..., 0.4392, 0.4392, 0.4392],
          [0.4392, 0.4392, 0.4392,  ..., 0.4392, 0.4392, 0.4392]],

         [[0.2980, 0.2980, 0.2980,  ..., 0.2980, 0.2980, 0.2980],
          [0.2980, 0.2980, 0.2980,  ..., 0.2980, 0.2980, 0.2980],
          [0.2980, 0.2980

## 4. Train the Model

In [57]:
from transformers.trainer_utils import get_last_checkpoint
# train_result = trainer.train(resume_from_checkpoint=get_last_checkpoint("./checkpoints"))
train_result = trainer.train()


***** Running training *****
  Num examples = 16
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




In [58]:
train_result.metrics

{'train_runtime': 14.1957,
 'train_samples_per_second': 1.127,
 'train_steps_per_second': 0.141,
 'total_flos': 930671967456.0,
 'train_loss': 3.525783061981201,
 'epoch': 1.0}

In [60]:
trainer.evaluate(ignore_keys=["text_model_output", "vision_model_output"])


***** Running Evaluation *****
  Num examples = 16
  Batch size = 8


KeyboardInterrupt: 

In [None]:
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()

In [None]:
# metrics = trainer.evaluate()
# trainer.log_metrics("eval", metrics)
# trainer.save_metrics("eval", metrics)

In [None]:
kwargs = {
    "finetuned_from": "emoji-predictor", 
    "tasks": "contrastive-image-text-modeling", 
    "dataset": "AlekseyDorkin/extended_tweet_emojis"
}
trainer.push_to_hub(**kwargs)