In [1]:
import os
from itertools import islice

from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch
from transformers import pipeline, CLIPModel, CLIPProcessor

# Configuration
LOADER_PATCH_SIZE = 32
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Cuda Availability:{torch.cuda.is_available()} Training on {device}")



Cuda Availability:True Training on cuda


# Notebook for interactive testing for CLIP

In [4]:

class Cfg:
    model_id: str = "openai/clip-vit-base-patch32"
    batch_size: int = 32
    epochs: int = 20
    seed:   int = 42

    # -------- Optim & Loss ----------
    lr_head: float = 1e-3      # 线性头
    wd_head: float = 1e-4
    lr_lora: float = 1e-4      # LoRA 注入层
    wd_lora: float = 1e-2
    lambda_text: float = 0.3   # 文本对齐辅助损失权重

    # -------- LoRA ----------
    lora_rank: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.0
    lora_target: tuple = ("q_proj","k_proj","v_proj","out_proj")  # 只对注意力投影层做LoRA
    # 也可扩展到 MLP 内部 proj，但注意稳定性

    amp: bool = True

cfg = Cfg()

torch.manual_seed(cfg.seed)
model_id = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(model_id).to(device).eval()
processor  = CLIPProcessor.from_pretrained(model_id)

In [5]:
# ---------- 2. Load Dataset ----------
# TODO Switch from processing in epoches to preprocessing
train_set = datasets.Flowers102(root="./data", split="train", download=True)
val_set   = datasets.Flowers102(root="./data", split="val",   download=True)
test_set  = datasets.Flowers102(root="./data", split="test",  download=True)
classname = val_set.classes
classname




['pink primrose',
 'hard-leaved pocket orchid',
 'canterbury bells',
 'sweet pea',
 'english marigold',
 'tiger lily',
 'moon orchid',
 'bird of paradise',
 'monkshood',
 'globe thistle',
 'snapdragon',
 "colt's foot",
 'king protea',
 'spear thistle',
 'yellow iris',
 'globe-flower',
 'purple coneflower',
 'peruvian lily',
 'balloon flower',
 'giant white arum lily',
 'fire lily',
 'pincushion flower',
 'fritillary',
 'red ginger',
 'grape hyacinth',
 'corn poppy',
 'prince of wales feathers',
 'stemless gentian',
 'artichoke',
 'sweet william',
 'carnation',
 'garden phlox',
 'love in the mist',
 'mexican aster',
 'alpine sea holly',
 'ruby-lipped cattleya',
 'cape flower',
 'great masterwort',
 'siam tulip',
 'lenten rose',
 'barbeton daisy',
 'daffodil',
 'sword lily',
 'poinsettia',
 'bolero deep blue',
 'wallflower',
 'marigold',
 'buttercup',
 'oxeye daisy',
 'common dandelion',
 'petunia',
 'wild pansy',
 'primula',
 'sunflower',
 'pelargonium',
 'bishop of llandaff',
 'gaura',

In [6]:
# ---------- 3. DataLoader ----------
# ---------- Process of images is put on epoch loops
def collate_pil(batch):
    # batch: List[ (PIL.Image.Image, int) ]
    images, labels = zip(*batch)           # images: tuple of PIL, labels: tuple of int
    return list(images), torch.tensor(labels)  # 让 processor 接收 list[PIL]，labels 变成 LongTensor

train_loader = DataLoader(
    train_set, batch_size=cfg.batch_size, shuffle=True,
    num_workers=0, pin_memory=True, collate_fn=collate_pil
    #workers should be 4, but got problems in notebook
)
val_loader = DataLoader(
    val_set, batch_size=cfg.batch_size, shuffle=False,
    num_workers=0, pin_memory=True, collate_fn=collate_pil
)
test_loader = DataLoader(
    test_set, batch_size=cfg.batch_size, shuffle=False,
    num_workers=0, pin_memory=True, collate_fn=collate_pil
)

---
## Model Setting and Training

In [12]:
# Word Prompt Embedding
import torch
from tqdm import tqdm

promptTemplate = {
    "A photo of {}.",
    "A photo of flower {}.",
    "Botanic picture of {}",
    "A example picture of type {}"
}
# Use more templates to reduce sensitivity to other contexts

@torch.no_grad()
def build_text_embeddings(names):
    embs = []
    for name in tqdm(names, desc="TextEmbed"):
        prompts = [t.format(name.replace("_"," ")) for t in promptTemplate] # insert class names
        inputs = processor(text=prompts, return_tensors="pt", padding=True).to(device)
        te = clip_model.get_text_features(**inputs)     # [T, D]
        te = te / te.norm(dim=-1, keepdim=True)
        embs.append(te.mean(dim=0))                     # [D]
    text = torch.stack(embs, dim=0)                     # [C, D]
    return text / text.norm(dim=-1, keepdim=True)

text_embs = build_text_embeddings(classname)          # [102, D], 固定不训练



TextEmbed: 100%|██████████| 102/102 [00:01<00:00, 97.86it/s]


----
## Building CLIP model with LoRA and word embedding.

Have to implement a LoRA linear layer ourselves.

In [14]:
# LoRA Injection (Trains LoRA matries only)
import torch, torch.nn as nn
from transformers.models.clip.modeling_clip import CLIPVisionModel

# y = w0 + x*(BA)*alpha/rank
# Shape of A: din by rank / Shape of B: rank by dout
class LoRALinearLayer(nn.Module):
    def __init__(self, base: nn.Linear, r=8, alpha=16, dropout=0.0):

        super().__init__()
        self.base = base # linear layer frozen for training LoRA parameters
        self.r = r
        self.scaling = alpha / r
        dev = base.weight.device
        dt  = base.weight.dtype

        

        if r > 0:
            self.lora_A = nn.Linear(base.in_features, r, bias=False).to(dev, dtype=dt)
            self.lora_B = nn.Linear(r, base.out_features, bias=False).to(dev, dtype=dt)
            self.dropout = nn.Dropout(dropout)
            nn.init.kaiming_uniform_(self.lora_A.weight,a=5**0.5)
            nn.init.zeros_(self.lora_B.weight) # set B to 0, avoid any bias introduced.
        else:
            self.lora_A = None
            self.lora_B = None
            self.dropout = nn.Identity()

            #Frozen
        for p in self.base.parameters():
            p.requires_grad = False

    def forward(self, x):
        if self.r > 0:
            return self.base(x) + self.dropout(self.lora_B(self.lora_A(x))) * self.scaling
        else:
            return self.base(x)


# LoRA Injection with warped LoRA layer shown above.

def lora_injection(clip_model: nn.Module, target_names=("q_proj","k_proj","v_proj","out_proj")):
    """
    """
    assert isinstance(clip_model.vision_model, CLIPVisionModel.__mro__[0].__class__) or hasattr(clip_model, "vision_model")
    lora_params = []
    for name, module in clip_model.vision_model.named_modules():
        # injection to clip/transformer attention layer: q_proj/k_proj/v_proj/out_proj
        for t in target_names:
            if hasattr(module, t):
                lin = getattr(module, t)
                if isinstance(lin, nn.Linear):
                    lora_lin = LoRALinearLayer(lin, r=cfg.lora_rank, alpha=cfg.lora_alpha, dropout=cfg.lora_dropout)
                    setattr(module, t, lora_lin)
                    lora_params += list(lora_lin.lora_A.parameters()) + list(lora_lin.lora_B.parameters())
    # Freeze the parameters
    for p in clip_model.vision_model.parameters():
        p.requires_grad = False
    for p in lora_params:
        p.requires_grad = True
    return lora_params

def build_head_and_optim(clip_model: CLIPModel):
    feat_dim = clip_model.config.projection_dim  # ViT-B/32 = 512
    head = nn.Linear(feat_dim, 102).to(device)

    lora_params = lora_injection(clip_model, target_names=cfg.lora_target)
    clip_model.to(device)

    # 2 parameter groups: LoRA and linear head
    optim = torch.optim.AdamW(
        [
            {"params": head.parameters(),      "lr": cfg.lr_head, "weight_decay": cfg.wd_head},
            {"params": lora_params,            "lr": cfg.lr_lora, "weight_decay": cfg.wd_lora},
        ]
    )
    scaler = torch.amp.GradScaler(enabled=(device=="cuda" and cfg.amp))
    return head, optim, scaler


In [15]:
head, optimizer, scaler = build_head_and_optim(clip_model)
ce = torch.nn.CrossEntropyLoss()

def get_image_feats(images):
    inputs = processor(images=images, return_tensors="pt").to(device)
    feats = clip_model.get_image_features(**inputs)           # [B, D]
    feats = feats / feats.norm(dim=-1, keepdim=True)
    return feats

def supervised_logits(feats):
    return head(feats)                                        # [B, 102]

def text_logits(feats):
    # perform cosine similarity with text embedding.
    return (feats @ text_embs.T) * clip_model.logit_scale.exp()

----
## Main Training Epoch

In [16]:
def run_epoch(loader: DataLoader, train: bool=True):
    if train:
        head.train()
        clip_model.train()
    else:
        head.eval()
        clip_model.eval()

    total, correct_cls, correct_txt = 0, 0, 0
    loss_sum = 0.0
    for images, labels in tqdm(loader, desc="Train" if train else "Eval"):
        labels = labels.to(device)
        with torch.amp.autocast(device_type=device,enabled=(device=="cuda" and cfg.amp)):
            feats = get_image_feats(images)                   # [B, D]

            logits_cls = supervised_logits(feats) # logits of classification score from linear layer head
            loss_cls = ce(logits_cls, labels)

            logits_txt = text_logits(feats) # logits of text embedding trained in transformer
            loss_txt = ce(logits_txt, labels)
            # Alignment between text and img.

            loss = loss_cls + cfg.lambda_text * loss_txt # weighted


        if train:
            # backward propagation
            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()


        #Stats

        loss_sum += loss.item() * labels.size(0)
        total += labels.size(0)
        correct_cls += (logits_cls.argmax(dim=-1) == labels).sum().item()
        correct_txt += (logits_txt.argmax(dim=-1) == labels).sum().item()

    return {
    "loss": loss_sum/total,
    "acc_cls": correct_cls/total,   # 线性头准确率
    "acc_txt": correct_txt/total,   # 文本读出准确率（zero-shot 风格）
}



In [None]:
best_val = -1.0
best_head = None

for ep in range(1,cfg.epochs+1):
    training = run_epoch(train_loader, train=True)
    val = run_epoch(val_loader, train=False)
    print(f"[{ep}/{cfg.epochs}] "
          f"Train: loss={training['loss']:.4f} acc_cls={training['acc_cls']:.4f} acc_txt={training['acc_txt']:.4f} | "
          f"Val:   loss={val['loss']:.4f} acc_cls={val['acc_cls']:.4f} acc_txt={val['acc_txt']:.4f}")


    if val["acc_cls"] > best_val:
        best_val = val["acc_cls"]
        best_head = { k: v.detach().cpu() for k, v in head.state_dict().items() } # Detach the parameters from autograd (keeps weights only)


if best_head is not None:
    head.load_state_dict({k: v.to(device) for k, v in best_head.items()})
te = run_epoch(test_loader, train=False)
print(f"Test: loss={te['loss']:.4f}  acc_cls={te['acc_cls']:.4f}  acc_txt={te['acc_txt']:.4f}")


Train:  47%|████▋     | 15/32 [00:04<00:05,  3.11it/s]

## Save trained model
### Save head layer and LoRA layers only

In [48]:
# Save Trained weights, as only head layer is trained
def save_light_model(path,clip_model,head):
    # sort out LoRA layers
    lora_states = {k:v for k,v in clip_model.state_dict().items() if "lora_" in k}
    checkpoint = {
        "clip_name": "openai/clip-vit-base-patch32",
        "num_of_classes": head.out_features,
        "head_state_dict": head.state_dict(),
        "lora_states": lora_states,
    }
    torch.save(checkpoint,path)
    print(f"Light weight model (only contains head and LoRA) saved to {path}")


    # Return clip_lora, head
def load_light_simple(path, device=device, lora_targets=("q_proj","k_proj","v_proj","out_proj")):
    checkpoint = torch.load(path, map_location=device)
    clip_model = CLIPModel.from_pretrained(checkpoint["clip_name"]).to(device)

    # Injection again
    lora_injection(clip_model, target_names=lora_targets)
    # Load weights to injected layers
    missing, unexpected = clip_model.load_state_dict(checkpoint["lora_states"], strict=False) # set to false allow partial loading
    print(missing, unexpected)

    feat_dim = clip_model.config.projection_dim
    head = nn.Linear(feat_dim,checkpoint["num_of_classes"]).to(device)
    head.load_state_dict(checkpoint["head_state_dict"])

    print(f"[light] loaded ← {path}")
    return clip_model, head



from pathlib import Path



# Transformer Style Saving

def save_full_dir(output_dir, clip_model, head):
    output = Path(output_dir)
    output.mkdir(parents=True, exist_ok=True)
    clip_model.save_pretrained(output)           # 保存到目录（含 LoRA 参数）
    torch.save({"num_classes": head.out_features,
                "state_dict": head.state_dict()}, output/"head.pt")
    print(f"[full-dir] saved → {output}")

def load_full_dir(output_dir, device=device, lora_targets=("q_proj","k_proj","v_proj","out_proj")):
    from transformers import CLIPModel
    output = Path(output_dir)

    clip_model = CLIPModel.from_pretrained(output).to(device)  # Load from folder
    # 如你的保存目录包含了 LoRA 权重（因为它在 state_dict 里），这一步就不用再注入；
    # 如果恢复后发现没有 LoRA 结构，可与上面相同：先注入再 load。

    head_ckpt = torch.load(output/"head.pt", map_location=device)
    import torch.nn as nn
    head = nn.Linear(clip_model.config.projection_dim, head_ckpt["num_classes"]).to(device)
    head.load_state_dict(head_ckpt["state_dict"])
    print(f"[full-dir] loaded ← {output}")
    return clip_model, head







# Single File
def save_full_model(path,clip_model,head):
    checkpoint = {
        "clip_name": "openai/clip-vit-base-patch32",
        "num_of_classes": head.out_features,
        "clip_state_dict": clip_model.state_dict(),
        "head_state_dict": head.state_dict(),
    }
    torch.save(checkpoint,path)
    print(f"Full  model  saved to {path}")

def load_full_model(path,device=device, lora_targets=("q_proj","k_proj","v_proj","out_proj")):
    checkpoint = torch.load(path, map_location=device)

    print(checkpoint["clip_name"])
    clip_model = CLIPModel.from_pretrained(checkpoint["clip_name"]).to(device)

    # Injection again
    lora_injection(clip_model, target_names=lora_targets)
    # Load weights to injected layers
    clip_model.load_state_dict(checkpoint["clip_state_dict"], strict=True) # This time true cause loading full model

    head = nn.Linear(clip_model.config.projection_dim,checkpoint["num_of_classes"]).to(device)
    head.load_state_dict(checkpoint["head_state_dict"])

    print(f"Full  model  loaded from {path}")
    return clip_model, head


In [15]:
# Actual Saving Code
os.makedirs("model",exist_ok=True)
save_light_model("model/clip_weights.pt", clip_model, head)

Light weight model (only contains head and LoRA) saved to model/clip_weights.pt


In [27]:
# Try loading one
clip_model, head = load_light_simple("model/clip_weights.pt")

from itertools import islice
clip_model.eval()
head.eval()
processor = CLIPProcessor.from_pretrained(getattr(clip_model, "name_or_path", "openai/clip-vit-base-patch32"))

def collate_pil(batch):
    imgs, labels = zip(*batch)
    return list(imgs), torch.tensor(labels, dtype=torch.long)

val_set  = datasets.Flowers102(root="./data", split="val",  download=True)
test_set = datasets.Flowers102(root="./data", split="test", download=True)
val_loader  = DataLoader(val_set,  batch_size=64, shuffle=False, num_workers=0, collate_fn=collate_pil)
test_loader = DataLoader(test_set, batch_size=64, shuffle=False, num_workers=0, collate_fn=collate_pil)

@torch.no_grad()
def evaluate_first(loader):
    total, correct = 0, 0
    for images, labels in loader:
        labels = labels.to(device)
        inputs = processor(images=images, return_tensors="pt").to(device)
        feats = clip_model.get_image_features(**inputs)          # [B, D]
        feats = feats / feats.norm(dim=-1, keepdim=True)
        logits = head(feats)                                     # [B, C]
        pred = logits.argmax(dim=-1)
        correct += (pred == labels).sum().item()
        total   += labels.size(0)
    return correct / total


val_acc  = evaluate_first(val_loader)
test_acc = evaluate_first(test_loader)
print(f"Val Acc = {val_acc:.4f} | Test Acc = {test_acc:.4f}")

['logit_scale', 'text_model.embeddings.token_embedding.weight', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encode

IndexError: Dimension specified as 0 but tensor has no dimensions

In [49]:
# This block saves full model

save_full_model("model/full_model.pt", clip_model, head)

Full  model  saved to model/full_model.pt


In [50]:
clip_model, head = load_full_model("model/full_model.pt")

openai/clip-vit-base-patch32


RuntimeError: Error(s) in loading state_dict for CLIPModel:
	Missing key(s) in state_dict: "vision_model.encoder.layers.0.self_attn.k_proj.base.weight", "vision_model.encoder.layers.0.self_attn.k_proj.base.bias", "vision_model.encoder.layers.0.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.0.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.0.self_attn.v_proj.base.weight", "vision_model.encoder.layers.0.self_attn.v_proj.base.bias", "vision_model.encoder.layers.0.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.0.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.0.self_attn.q_proj.base.weight", "vision_model.encoder.layers.0.self_attn.q_proj.base.bias", "vision_model.encoder.layers.0.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.0.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.0.self_attn.out_proj.base.weight", "vision_model.encoder.layers.0.self_attn.out_proj.base.bias", "vision_model.encoder.layers.0.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.0.self_attn.out_proj.lora_B.weight", "vision_model.encoder.layers.1.self_attn.k_proj.base.weight", "vision_model.encoder.layers.1.self_attn.k_proj.base.bias", "vision_model.encoder.layers.1.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.1.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.1.self_attn.v_proj.base.weight", "vision_model.encoder.layers.1.self_attn.v_proj.base.bias", "vision_model.encoder.layers.1.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.1.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.1.self_attn.q_proj.base.weight", "vision_model.encoder.layers.1.self_attn.q_proj.base.bias", "vision_model.encoder.layers.1.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.1.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.1.self_attn.out_proj.base.weight", "vision_model.encoder.layers.1.self_attn.out_proj.base.bias", "vision_model.encoder.layers.1.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.1.self_attn.out_proj.lora_B.weight", "vision_model.encoder.layers.2.self_attn.k_proj.base.weight", "vision_model.encoder.layers.2.self_attn.k_proj.base.bias", "vision_model.encoder.layers.2.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.2.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.2.self_attn.v_proj.base.weight", "vision_model.encoder.layers.2.self_attn.v_proj.base.bias", "vision_model.encoder.layers.2.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.2.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.2.self_attn.q_proj.base.weight", "vision_model.encoder.layers.2.self_attn.q_proj.base.bias", "vision_model.encoder.layers.2.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.2.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.2.self_attn.out_proj.base.weight", "vision_model.encoder.layers.2.self_attn.out_proj.base.bias", "vision_model.encoder.layers.2.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.2.self_attn.out_proj.lora_B.weight", "vision_model.encoder.layers.3.self_attn.k_proj.base.weight", "vision_model.encoder.layers.3.self_attn.k_proj.base.bias", "vision_model.encoder.layers.3.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.3.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.3.self_attn.v_proj.base.weight", "vision_model.encoder.layers.3.self_attn.v_proj.base.bias", "vision_model.encoder.layers.3.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.3.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.3.self_attn.q_proj.base.weight", "vision_model.encoder.layers.3.self_attn.q_proj.base.bias", "vision_model.encoder.layers.3.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.3.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.3.self_attn.out_proj.base.weight", "vision_model.encoder.layers.3.self_attn.out_proj.base.bias", "vision_model.encoder.layers.3.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.3.self_attn.out_proj.lora_B.weight", "vision_model.encoder.layers.4.self_attn.k_proj.base.weight", "vision_model.encoder.layers.4.self_attn.k_proj.base.bias", "vision_model.encoder.layers.4.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.4.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.4.self_attn.v_proj.base.weight", "vision_model.encoder.layers.4.self_attn.v_proj.base.bias", "vision_model.encoder.layers.4.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.4.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.4.self_attn.q_proj.base.weight", "vision_model.encoder.layers.4.self_attn.q_proj.base.bias", "vision_model.encoder.layers.4.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.4.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.4.self_attn.out_proj.base.weight", "vision_model.encoder.layers.4.self_attn.out_proj.base.bias", "vision_model.encoder.layers.4.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.4.self_attn.out_proj.lora_B.weight", "vision_model.encoder.layers.5.self_attn.k_proj.base.weight", "vision_model.encoder.layers.5.self_attn.k_proj.base.bias", "vision_model.encoder.layers.5.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.5.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.5.self_attn.v_proj.base.weight", "vision_model.encoder.layers.5.self_attn.v_proj.base.bias", "vision_model.encoder.layers.5.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.5.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.5.self_attn.q_proj.base.weight", "vision_model.encoder.layers.5.self_attn.q_proj.base.bias", "vision_model.encoder.layers.5.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.5.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.5.self_attn.out_proj.base.weight", "vision_model.encoder.layers.5.self_attn.out_proj.base.bias", "vision_model.encoder.layers.5.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.5.self_attn.out_proj.lora_B.weight", "vision_model.encoder.layers.6.self_attn.k_proj.base.weight", "vision_model.encoder.layers.6.self_attn.k_proj.base.bias", "vision_model.encoder.layers.6.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.6.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.6.self_attn.v_proj.base.weight", "vision_model.encoder.layers.6.self_attn.v_proj.base.bias", "vision_model.encoder.layers.6.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.6.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.6.self_attn.q_proj.base.weight", "vision_model.encoder.layers.6.self_attn.q_proj.base.bias", "vision_model.encoder.layers.6.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.6.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.6.self_attn.out_proj.base.weight", "vision_model.encoder.layers.6.self_attn.out_proj.base.bias", "vision_model.encoder.layers.6.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.6.self_attn.out_proj.lora_B.weight", "vision_model.encoder.layers.7.self_attn.k_proj.base.weight", "vision_model.encoder.layers.7.self_attn.k_proj.base.bias", "vision_model.encoder.layers.7.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.7.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.7.self_attn.v_proj.base.weight", "vision_model.encoder.layers.7.self_attn.v_proj.base.bias", "vision_model.encoder.layers.7.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.7.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.7.self_attn.q_proj.base.weight", "vision_model.encoder.layers.7.self_attn.q_proj.base.bias", "vision_model.encoder.layers.7.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.7.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.7.self_attn.out_proj.base.weight", "vision_model.encoder.layers.7.self_attn.out_proj.base.bias", "vision_model.encoder.layers.7.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.7.self_attn.out_proj.lora_B.weight", "vision_model.encoder.layers.8.self_attn.k_proj.base.weight", "vision_model.encoder.layers.8.self_attn.k_proj.base.bias", "vision_model.encoder.layers.8.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.8.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.8.self_attn.v_proj.base.weight", "vision_model.encoder.layers.8.self_attn.v_proj.base.bias", "vision_model.encoder.layers.8.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.8.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.8.self_attn.q_proj.base.weight", "vision_model.encoder.layers.8.self_attn.q_proj.base.bias", "vision_model.encoder.layers.8.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.8.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.8.self_attn.out_proj.base.weight", "vision_model.encoder.layers.8.self_attn.out_proj.base.bias", "vision_model.encoder.layers.8.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.8.self_attn.out_proj.lora_B.weight", "vision_model.encoder.layers.9.self_attn.k_proj.base.weight", "vision_model.encoder.layers.9.self_attn.k_proj.base.bias", "vision_model.encoder.layers.9.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.9.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.9.self_attn.v_proj.base.weight", "vision_model.encoder.layers.9.self_attn.v_proj.base.bias", "vision_model.encoder.layers.9.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.9.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.9.self_attn.q_proj.base.weight", "vision_model.encoder.layers.9.self_attn.q_proj.base.bias", "vision_model.encoder.layers.9.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.9.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.9.self_attn.out_proj.base.weight", "vision_model.encoder.layers.9.self_attn.out_proj.base.bias", "vision_model.encoder.layers.9.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.9.self_attn.out_proj.lora_B.weight", "vision_model.encoder.layers.10.self_attn.k_proj.base.weight", "vision_model.encoder.layers.10.self_attn.k_proj.base.bias", "vision_model.encoder.layers.10.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.10.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.10.self_attn.v_proj.base.weight", "vision_model.encoder.layers.10.self_attn.v_proj.base.bias", "vision_model.encoder.layers.10.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.10.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.10.self_attn.q_proj.base.weight", "vision_model.encoder.layers.10.self_attn.q_proj.base.bias", "vision_model.encoder.layers.10.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.10.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.10.self_attn.out_proj.base.weight", "vision_model.encoder.layers.10.self_attn.out_proj.base.bias", "vision_model.encoder.layers.10.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.10.self_attn.out_proj.lora_B.weight", "vision_model.encoder.layers.11.self_attn.k_proj.base.weight", "vision_model.encoder.layers.11.self_attn.k_proj.base.bias", "vision_model.encoder.layers.11.self_attn.k_proj.lora_A.weight", "vision_model.encoder.layers.11.self_attn.k_proj.lora_B.weight", "vision_model.encoder.layers.11.self_attn.v_proj.base.weight", "vision_model.encoder.layers.11.self_attn.v_proj.base.bias", "vision_model.encoder.layers.11.self_attn.v_proj.lora_A.weight", "vision_model.encoder.layers.11.self_attn.v_proj.lora_B.weight", "vision_model.encoder.layers.11.self_attn.q_proj.base.weight", "vision_model.encoder.layers.11.self_attn.q_proj.base.bias", "vision_model.encoder.layers.11.self_attn.q_proj.lora_A.weight", "vision_model.encoder.layers.11.self_attn.q_proj.lora_B.weight", "vision_model.encoder.layers.11.self_attn.out_proj.base.weight", "vision_model.encoder.layers.11.self_attn.out_proj.base.bias", "vision_model.encoder.layers.11.self_attn.out_proj.lora_A.weight", "vision_model.encoder.layers.11.self_attn.out_proj.lora_B.weight". 
	Unexpected key(s) in state_dict: "vision_model.encoder.layers.0.self_attn.k_proj.weight", "vision_model.encoder.layers.0.self_attn.k_proj.bias", "vision_model.encoder.layers.0.self_attn.v_proj.weight", "vision_model.encoder.layers.0.self_attn.v_proj.bias", "vision_model.encoder.layers.0.self_attn.q_proj.weight", "vision_model.encoder.layers.0.self_attn.q_proj.bias", "vision_model.encoder.layers.0.self_attn.out_proj.weight", "vision_model.encoder.layers.0.self_attn.out_proj.bias", "vision_model.encoder.layers.1.self_attn.k_proj.weight", "vision_model.encoder.layers.1.self_attn.k_proj.bias", "vision_model.encoder.layers.1.self_attn.v_proj.weight", "vision_model.encoder.layers.1.self_attn.v_proj.bias", "vision_model.encoder.layers.1.self_attn.q_proj.weight", "vision_model.encoder.layers.1.self_attn.q_proj.bias", "vision_model.encoder.layers.1.self_attn.out_proj.weight", "vision_model.encoder.layers.1.self_attn.out_proj.bias", "vision_model.encoder.layers.2.self_attn.k_proj.weight", "vision_model.encoder.layers.2.self_attn.k_proj.bias", "vision_model.encoder.layers.2.self_attn.v_proj.weight", "vision_model.encoder.layers.2.self_attn.v_proj.bias", "vision_model.encoder.layers.2.self_attn.q_proj.weight", "vision_model.encoder.layers.2.self_attn.q_proj.bias", "vision_model.encoder.layers.2.self_attn.out_proj.weight", "vision_model.encoder.layers.2.self_attn.out_proj.bias", "vision_model.encoder.layers.3.self_attn.k_proj.weight", "vision_model.encoder.layers.3.self_attn.k_proj.bias", "vision_model.encoder.layers.3.self_attn.v_proj.weight", "vision_model.encoder.layers.3.self_attn.v_proj.bias", "vision_model.encoder.layers.3.self_attn.q_proj.weight", "vision_model.encoder.layers.3.self_attn.q_proj.bias", "vision_model.encoder.layers.3.self_attn.out_proj.weight", "vision_model.encoder.layers.3.self_attn.out_proj.bias", "vision_model.encoder.layers.4.self_attn.k_proj.weight", "vision_model.encoder.layers.4.self_attn.k_proj.bias", "vision_model.encoder.layers.4.self_attn.v_proj.weight", "vision_model.encoder.layers.4.self_attn.v_proj.bias", "vision_model.encoder.layers.4.self_attn.q_proj.weight", "vision_model.encoder.layers.4.self_attn.q_proj.bias", "vision_model.encoder.layers.4.self_attn.out_proj.weight", "vision_model.encoder.layers.4.self_attn.out_proj.bias", "vision_model.encoder.layers.5.self_attn.k_proj.weight", "vision_model.encoder.layers.5.self_attn.k_proj.bias", "vision_model.encoder.layers.5.self_attn.v_proj.weight", "vision_model.encoder.layers.5.self_attn.v_proj.bias", "vision_model.encoder.layers.5.self_attn.q_proj.weight", "vision_model.encoder.layers.5.self_attn.q_proj.bias", "vision_model.encoder.layers.5.self_attn.out_proj.weight", "vision_model.encoder.layers.5.self_attn.out_proj.bias", "vision_model.encoder.layers.6.self_attn.k_proj.weight", "vision_model.encoder.layers.6.self_attn.k_proj.bias", "vision_model.encoder.layers.6.self_attn.v_proj.weight", "vision_model.encoder.layers.6.self_attn.v_proj.bias", "vision_model.encoder.layers.6.self_attn.q_proj.weight", "vision_model.encoder.layers.6.self_attn.q_proj.bias", "vision_model.encoder.layers.6.self_attn.out_proj.weight", "vision_model.encoder.layers.6.self_attn.out_proj.bias", "vision_model.encoder.layers.7.self_attn.k_proj.weight", "vision_model.encoder.layers.7.self_attn.k_proj.bias", "vision_model.encoder.layers.7.self_attn.v_proj.weight", "vision_model.encoder.layers.7.self_attn.v_proj.bias", "vision_model.encoder.layers.7.self_attn.q_proj.weight", "vision_model.encoder.layers.7.self_attn.q_proj.bias", "vision_model.encoder.layers.7.self_attn.out_proj.weight", "vision_model.encoder.layers.7.self_attn.out_proj.bias", "vision_model.encoder.layers.8.self_attn.k_proj.weight", "vision_model.encoder.layers.8.self_attn.k_proj.bias", "vision_model.encoder.layers.8.self_attn.v_proj.weight", "vision_model.encoder.layers.8.self_attn.v_proj.bias", "vision_model.encoder.layers.8.self_attn.q_proj.weight", "vision_model.encoder.layers.8.self_attn.q_proj.bias", "vision_model.encoder.layers.8.self_attn.out_proj.weight", "vision_model.encoder.layers.8.self_attn.out_proj.bias", "vision_model.encoder.layers.9.self_attn.k_proj.weight", "vision_model.encoder.layers.9.self_attn.k_proj.bias", "vision_model.encoder.layers.9.self_attn.v_proj.weight", "vision_model.encoder.layers.9.self_attn.v_proj.bias", "vision_model.encoder.layers.9.self_attn.q_proj.weight", "vision_model.encoder.layers.9.self_attn.q_proj.bias", "vision_model.encoder.layers.9.self_attn.out_proj.weight", "vision_model.encoder.layers.9.self_attn.out_proj.bias", "vision_model.encoder.layers.10.self_attn.k_proj.weight", "vision_model.encoder.layers.10.self_attn.k_proj.bias", "vision_model.encoder.layers.10.self_attn.v_proj.weight", "vision_model.encoder.layers.10.self_attn.v_proj.bias", "vision_model.encoder.layers.10.self_attn.q_proj.weight", "vision_model.encoder.layers.10.self_attn.q_proj.bias", "vision_model.encoder.layers.10.self_attn.out_proj.weight", "vision_model.encoder.layers.10.self_attn.out_proj.bias", "vision_model.encoder.layers.11.self_attn.k_proj.weight", "vision_model.encoder.layers.11.self_attn.k_proj.bias", "vision_model.encoder.layers.11.self_attn.v_proj.weight", "vision_model.encoder.layers.11.self_attn.v_proj.bias", "vision_model.encoder.layers.11.self_attn.q_proj.weight", "vision_model.encoder.layers.11.self_attn.q_proj.bias", "vision_model.encoder.layers.11.self_attn.out_proj.weight", "vision_model.encoder.layers.11.self_attn.out_proj.bias". 