In [1]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [2]:
!lscpu

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          48 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   12
  On-line CPU(s) list:    0-11
Vendor ID:                AuthenticAMD
  Model name:             AMD Ryzen 5 5600X 6-Core Processor
    CPU family:           25
    Model:                33
    Thread(s) per core:   2
    Core(s) per socket:   6
    Socket(s):            1
    Stepping:             2
    Frequency boost:      enabled
    CPU(s) scaling MHz:   95%
    CPU max MHz:          4651.0000
    CPU min MHz:          550.0000
    BogoMIPS:             7402.24
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall
                           nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep
                          _good nopl xtopology nonstop_tsc cpu

In [5]:
!pip install pandas torch torchvision transformers pytorch-lightning tqdm ipywidgets

Collecting ipywidgets
  Obtaining dependency information for ipywidgets from https://files.pythonhosted.org/packages/22/2d/9c0b76f2f9cc0ebede1b9371b6f317243028ed60b90705863d493bae622e/ipywidgets-8.1.5-py3-none-any.whl.metadata
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Obtaining dependency information for widgetsnbextension~=4.0.12 from https://files.pythonhosted.org/packages/21/02/88b65cc394961a60c43c70517066b6b679738caf78506a5da7b88ffcb643/widgetsnbextension-4.0.13-py3-none-any.whl.metadata
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Obtaining dependency information for jupyterlab-widgets~=3.0.12 from https://files.pythonhosted.org/packages/a9/93/858e87edc634d628e5d752ba944c2833133a28fa87bb093e6832ced36a3e/jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata
  Downloading jupyterlab_widgets-3.0.13-py3-none-any

In [7]:
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification
from tqdm import tqdm

In [5]:
!cp /kaggle/input/goldenleaf/image_descriptions.csv .

  pid, fd = os.forkpty()


In [6]:
!mkdir -p src/resources
!cp -r /kaggle/input/goldenleaf/images src/resources

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load model directly

processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
model = AutoModelForZeroShotImageClassification.from_pretrained("openai/clip-vit-large-patch14")

In [14]:
data = pd.read_csv("image_descriptions.csv")


class ImageTextDataset(Dataset):
    def __init__(self, data, processor):
        self.data = data
        self.processor = processor
        self.images = self.load_images()

    def load_images(self):
            loaded_images = {}
            for idx in tqdm(range(len(self.data)), desc="Loading Images"):
                image_path = self.data.iloc[idx]["image"]
                image = Image.open(image_path).convert("RGB")
                image = image.resize((224, 224))
                loaded_images[image_path] = image
            return loaded_images

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_path = self.data.iloc[idx]["image"]
        description = self.data.iloc[idx]["description"]
        image = self.images[image_path]

        return {
            "image": image,
            "text": description
        }


In [15]:
dataset = ImageTextDataset(data, processor)

train_size = int(0.8 * len(data))
val_size = len(data) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

Loading Images: 100%|██████████| 115/115 [00:26<00:00,  4.42it/s]


In [17]:
def collate_fn(batch):
    texts = [item['text'] for item in batch]
    images = [item['image'] for item in batch]

    # Utilisation du processor pour traiter les données
    inputs = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=77
    )

    return inputs


batch_size = 4
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,  # Shuffle pour l'entraînement
    collate_fn=collate_fn,
    num_workers=2,
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,  # Pas besoin de shuffle pour la validation
    collate_fn=collate_fn,
    num_workers=2,
)


In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class CLIPFinetuner(pl.LightningModule):
    def __init__(self, model, lr=1e-5):
        super().__init__()
        self.model = model
        self.lr = lr
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, pixel_values, input_ids, attention_mask):
        return self.model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

    def compute_loss(self, logits_per_image, logits_per_text):
        labels = torch.arange(logits_per_image.size(0), device=self.device)
        loss_img_to_txt = self.loss_fn(logits_per_image, labels)
        loss_txt_to_img = self.loss_fn(logits_per_text, labels)
        return (loss_img_to_txt + loss_txt_to_img) / 2

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = self.compute_loss(
            logits_per_image=outputs.logits_per_image,
            logits_per_text=outputs.logits_per_text,
        )
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = self.compute_loss(
            logits_per_image=outputs.logits_per_image,
            logits_per_text=outputs.logits_per_text,
        )
        self.log("val_loss", loss, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.lr)

        # Define a learning rate scheduler
        scheduler = {
            'scheduler': optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1),
            'monitor': 'val_loss',
            'interval': 'epoch',  # Adjust every epoch
            'frequency': 1,  # Apply scheduler every epoch
        }

        return [optimizer], [scheduler]



In [32]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

# Initialisation du modèle Lightning
model = CLIPFinetuner(model)

# Configuration des callbacks (sauvegarde du meilleur modèle)
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss", mode="min", save_top_k=1, filename="clip_finetuned-{epoch:02d}-{val_loss:.4f}"
)

# Configuration du logger pour suivre les métriques
logger = CSVLogger("logs", name="clip_finetuning")

# Initialisation du trainer
trainer = Trainer(
    max_epochs=10,
    accelerator="cpu",  # Utilise le GPU si disponible
    devices=1,          # Nombre de GPU
    callbacks=[checkpoint_callback],
    logger=logger,
    log_every_n_steps=8,
    enable_progress_bar=True,
    accumulate_grad_batches=4,
)

# Entraînement
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | CLIPFinetuner    | 427 M  | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
427 M     Trainable params
0         Non-trainable params
427 M     Total params
1,710.466 Total estimated model params size (MB)
5         Modules in train mode
462       Modules in eval mode


Epoch 0: 100%|██████████| 23/23 [02:04<00:00,  0.19it/s, v_num=5, train_loss_step=1.390]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:  17%|█▋        | 1/6 [00:01<00:08,  0.59it/s][A
Validation DataLoader 0:  33%|███▎      | 2/6 [00:03<00:06,  0.59it/s][A
Validation DataLoader 0:  50%|█████     | 3/6 [00:05<00:05,  0.59it/s][A
Validation DataLoader 0:  67%|██████▋   | 4/6 [00:06<00:03,  0.59it/s][A
Validation DataLoader 0:  83%|████████▎ | 5/6 [00:08<00:01,  0.59it/s][A
Validation DataLoader 0: 100%|██████████| 6/6 [00:09<00:00,  0.62it/s][A
Epoch 1: 100%|██████████| 23/23 [02:07<00:00,  0.18it/s, v_num=5, train_loss_step=1.320, val_loss=1.380, train_loss_epoch=1.390]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/6 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/6 [00:00<?, ?it/s][A
Va

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 23/23 [02:10<00:00,  0.18it/s, v_num=5, train_loss_step=0.419, val_loss=2.880, train_loss_epoch=0.338]


In [34]:
from pytorch_lightning import Trainer

# Load model from checkpoint
checkpoint_path = "logs/clip_finetuning/version_5/checkpoints/clip_finetuned-epoch=00-val_loss=1.3814.ckpt"
model = CLIPFinetuner.load_from_checkpoint(checkpoint_path, model=model)

RuntimeError: Error(s) in loading state_dict for CLIPFinetuner:
	Missing key(s) in state_dict: "model.model.model.model.model.model.model.model.logit_scale", "model.model.model.model.model.model.model.model.text_model.embeddings.token_embedding.weight", "model.model.model.model.model.model.model.model.text_model.embeddings.position_embedding.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.0.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.1.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.2.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.3.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.4.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.5.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.6.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.7.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.8.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.9.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.10.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.layer_norm1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.layer_norm1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.mlp.fc1.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.mlp.fc1.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.mlp.fc2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.mlp.fc2.bias", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.layer_norm2.weight", "model.model.model.model.model.model.model.model.text_model.encoder.layers.11.layer_norm2.bias", "model.model.model.model.model.model.model.model.text_model.final_layer_norm.weight", "model.model.model.model.model.model.model.model.text_model.final_layer_norm.bias", "model.model.model.model.model.model.model.model.vision_model.embeddings.class_embedding", "model.model.model.model.model.model.model.model.vision_model.embeddings.patch_embedding.weight", "model.model.model.model.model.model.model.model.vision_model.embeddings.position_embedding.weight", "model.model.model.model.model.model.model.model.vision_model.pre_layrnorm.weight", "model.model.model.model.model.model.model.model.vision_model.pre_layrnorm.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.0.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.1.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.2.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.3.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.4.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.5.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.6.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.7.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.8.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.9.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.10.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.11.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.12.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.13.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.14.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.15.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.16.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.17.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.18.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.19.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.20.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.21.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.22.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.k_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.k_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.v_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.v_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.q_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.q_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.out_proj.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.out_proj.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.layer_norm1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.layer_norm1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.mlp.fc1.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.mlp.fc1.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.mlp.fc2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.mlp.fc2.bias", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.layer_norm2.weight", "model.model.model.model.model.model.model.model.vision_model.encoder.layers.23.layer_norm2.bias", "model.model.model.model.model.model.model.model.vision_model.post_layernorm.weight", "model.model.model.model.model.model.model.model.vision_model.post_layernorm.bias", "model.model.model.model.model.model.model.model.visual_projection.weight", "model.model.model.model.model.model.model.model.text_projection.weight". 
	Unexpected key(s) in state_dict: "model.model.model.model.model.model.model.logit_scale", "model.model.model.model.model.model.model.text_model.embeddings.token_embedding.weight", "model.model.model.model.model.model.model.text_model.embeddings.position_embedding.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.0.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.0.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.0.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.0.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.0.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.0.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.0.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.0.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.0.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.1.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.1.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.1.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.1.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.1.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.1.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.1.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.1.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.1.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.2.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.2.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.2.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.2.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.2.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.2.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.2.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.2.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.2.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.3.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.3.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.3.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.3.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.3.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.3.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.3.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.3.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.3.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.4.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.4.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.4.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.4.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.4.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.4.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.4.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.4.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.4.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.5.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.5.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.5.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.5.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.5.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.5.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.5.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.5.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.5.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.6.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.6.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.6.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.6.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.6.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.6.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.6.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.6.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.6.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.7.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.7.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.7.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.7.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.7.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.7.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.7.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.7.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.7.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.8.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.8.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.8.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.8.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.8.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.8.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.8.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.8.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.8.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.9.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.9.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.9.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.9.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.9.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.9.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.9.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.9.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.9.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.10.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.10.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.10.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.10.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.10.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.10.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.10.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.10.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.10.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.k_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.k_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.v_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.v_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.q_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.q_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.out_proj.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.11.self_attn.out_proj.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.11.layer_norm1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.11.layer_norm1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.11.mlp.fc1.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.11.mlp.fc1.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.11.mlp.fc2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.11.mlp.fc2.bias", "model.model.model.model.model.model.model.text_model.encoder.layers.11.layer_norm2.weight", "model.model.model.model.model.model.model.text_model.encoder.layers.11.layer_norm2.bias", "model.model.model.model.model.model.model.text_model.final_layer_norm.weight", "model.model.model.model.model.model.model.text_model.final_layer_norm.bias", "model.model.model.model.model.model.model.vision_model.embeddings.class_embedding", "model.model.model.model.model.model.model.vision_model.embeddings.patch_embedding.weight", "model.model.model.model.model.model.model.vision_model.embeddings.position_embedding.weight", "model.model.model.model.model.model.model.vision_model.pre_layrnorm.weight", "model.model.model.model.model.model.model.vision_model.pre_layrnorm.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.0.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.1.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.2.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.3.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.4.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.5.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.6.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.7.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.8.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.9.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.10.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.11.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.12.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.13.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.14.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.15.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.16.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.17.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.18.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.19.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.20.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.21.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.22.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.k_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.k_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.v_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.v_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.q_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.q_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.out_proj.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.self_attn.out_proj.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.layer_norm1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.layer_norm1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.mlp.fc1.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.mlp.fc1.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.mlp.fc2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.mlp.fc2.bias", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.layer_norm2.weight", "model.model.model.model.model.model.model.vision_model.encoder.layers.23.layer_norm2.bias", "model.model.model.model.model.model.model.vision_model.post_layernorm.weight", "model.model.model.model.model.model.model.vision_model.post_layernorm.bias", "model.model.model.model.model.model.model.visual_projection.weight", "model.model.model.model.model.model.model.text_projection.weight". 

In [33]:
torch.save(model.state_dict(), "clip_finetuned.pth")