In [None]:
# # Cell 1 — Install & login
# !pip uninstall -y diffusers -q
# !pip install --upgrade --pre diffusers accelerate peft safetensors transformers huggingface-hub datasets sentencepiece protobuf torchvision pillow tqdm

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()  # click link, paste your HF token

In [None]:
# # Cell 2 — Download & patch SD3’s transformer folder
# from huggingface_hub import snapshot_download
# import json, os

# transformer_dir = snapshot_download(
#     repo_id="stabilityai/stable-diffusion-3-medium-diffusers",
#     subfolder="transformer",
#     use_auth_token=True
# )

# cfg_path = os.path.join(transformer_dir, "config.json")
# cfg = json.load(open(cfg_path))
# cfg.pop("num_attention_heads", None)   # remove broken field
# with open(cfg_path, "w") as f:
#     json.dump(cfg, f, indent=2)
# print("✅ Patched config.json in", transformer_dir)

In [None]:
!ls

In [None]:
# !pip uninstall -y numpy
# !pip install numpy
# !pip install --upgrade --force-reinstall "numpy<2.3.0"

In [1]:
import numpy as np
print("NumPy version:", np.__version__)

NumPy version: 2.2.6


In [2]:
# Cell 3 — Load UNet and wrap in LoRA
import torch
from diffusers import UNet2DConditionModel
from peft import LoraConfig, get_peft_model

transformer_dir = "sd3_local/transformer"

# 1) Load full SD3 UNet into CPU, then move to GPU
unet = UNet2DConditionModel.from_pretrained(
    transformer_dir,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=False,
    device_map=None,
    local_files_only=True
)
unet = unet.to("cuda")

# 2) Configure and attach LoRA
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["to_q","to_k","to_v"],  # only Q/K/V projections
    lora_dropout=0.05,
    bias="none"
)
unet_lora = get_peft_model(unet, lora_cfg)
print("✅ LoRA wrapped. LoRA params:", sum(p.numel() for p in unet_lora.parameters() if p.requires_grad))


Some weights of the model checkpoint at sd3_local/transformer were not used when initializing UNet2DConditionModel: 
 ['transformer_blocks.11.attn.to_add_out.bias, transformer_blocks.10.attn.to_out.0.bias, transformer_blocks.1.attn.to_add_out.weight, transformer_blocks.21.attn.to_k.bias, transformer_blocks.21.ff.net.2.bias, transformer_blocks.15.ff_context.net.0.proj.weight, transformer_blocks.22.attn.add_q_proj.bias, transformer_blocks.15.attn.add_v_proj.weight, transformer_blocks.20.attn.to_k.weight, transformer_blocks.5.ff.net.2.bias, transformer_blocks.14.ff_context.net.2.weight, transformer_blocks.7.attn.to_k.bias, transformer_blocks.17.attn.to_k.bias, transformer_blocks.7.norm1.linear.bias, transformer_blocks.3.attn.add_k_proj.weight, transformer_blocks.20.attn.add_q_proj.weight, transformer_blocks.12.ff.net.0.proj.bias, context_embedder.weight, transformer_blocks.6.attn.add_v_proj.bias, transformer_blocks.19.norm1_context.linear.weight, transformer_blocks.22.ff.net.0.proj.weight

✅ LoRA wrapped. LoRA params: 2652160


In [3]:
# Cell 4 — Fine-tune LoRA on your dataset
import torch, torch.nn.functional as F
from accelerate import Accelerator
from diffusers import StableDiffusion3Pipeline
from torch.utils.data import DataLoader
from transformers import CLIPTokenizer
from datasets import load_dataset
from torchvision import transforms
from PIL import Image
import os

# ───── CONFIG ─────
MODEL_ID    = "stabilityai/stable-diffusion-3-medium-diffusers"
TRAIN_JSON  = "lora_dataset.json"
OUTPUT_DIR  = "lora_sd3_panel"
BATCH_SIZE  = 4
EPOCHS      = 3
LR          = 1e-4
WARMUP      = 0.03
MAX_LEN     = 64
SEED        = 42
# ───────────────────

torch.manual_seed(SEED)
accelerator = Accelerator(mixed_precision="fp16")
device = accelerator.device

# 1) Load pipeline pieces (VAE, scheduler, text encoder)
pipe = StableDiffusion3Pipeline.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, use_auth_token=True
).to(device)
vae, scheduler, text_encoder = pipe.vae, pipe.scheduler, pipe.text_encoder

tokenizer = CLIPTokenizer.from_pretrained(
    MODEL_ID, subfolder="tokenizer", use_auth_token=True
)
img_tf = transforms.Compose([
    transforms.Resize((512,512), transforms.InterpolationMode.BILINEAR),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3),
])


model_index.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

Fetching 26 files:   0%|          | 0/26 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/740 [00:00<?, ?B/s]

text_encoder/model.safetensors:   0%|          | 0.00/247M [00:00<?, ?B/s]

text_encoder_2/model.safetensors:   0%|          | 0.00/1.39G [00:00<?, ?B/s]

text_encoder_3/model-00002-of-00002.safe(…):   0%|          | 0.00/4.53G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/19.9k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/588 [00:00<?, ?B/s]

text_encoder_3/model-00001-of-00002.safe(…):   0%|          | 0.00/4.99G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/705 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/576 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_3/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

transformer/diffusion_pytorch_model.safe(…):   0%|          | 0.00/4.17G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/739 [00:00<?, ?B/s]

vae/diffusion_pytorch_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Keyword arguments {'use_auth_token': True} are not expected by StableDiffusion3Pipeline and will be ignored.


Loading pipeline components...:   0%|          | 0/9 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [4]:

# 2) Dataset → DataLoader
ds = load_dataset("json", data_files=TRAIN_JSON)["train"]
def preprocess(ex):
    img = Image.open(ex["image"]).convert("RGB")
    ex["pixel_values"] = img_tf(img)
    ex["input_ids"] = tokenizer(
        ex["caption"],
        padding="max_length", truncation=True, max_length=MAX_LEN,
        return_tensors="pt"
    ).input_ids[0]
    return ex

ds = ds.map(preprocess, remove_columns=["image","caption"])
ds.set_format(type="torch", columns=["pixel_values","input_ids"])
loader = DataLoader(
    ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True,
    collate_fn=lambda b: {
        "pixel_values": torch.stack([x["pixel_values"] for x in b]),
        "input_ids":    torch.stack([x["input_ids"]    for x in b])
    }
)



Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/844 [00:00<?, ? examples/s]

In [5]:
# 3) Optimizer & LR scheduler on LoRA params
optimizer = torch.optim.AdamW(unet_lora.parameters(), lr=LR)
total_steps = len(loader) * EPOCHS
lr_scheduler = torch.optim.lr_scheduler.LinearLR(
    optimizer, start_factor=WARMUP, total_iters=total_steps
)

# 4) Prepare for fp16/multi-GPU
unet_lora, optimizer, loader, lr_scheduler = accelerator.prepare(
    unet_lora, optimizer, loader, lr_scheduler
)

In [12]:
# ─── LoRA Fine‑Tuning on Local SD3 Pipeline ───

import os
import torch
import torch.nn.functional as F
from accelerate import Accelerator
from diffusers import StableDiffusion3Pipeline, DDPMScheduler
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader
from datasets import load_dataset
from torchvision import transforms
from PIL import Image

# ─── Configuration ─────────────────────────────────────────
LOCAL_PATH   = "sd3_local"           # your fully‑cloned SD3 repo
TRAIN_JSON   = "lora_dataset.json"   # JSON with {"image","caption"}
OUTPUT_DIR   = "lora_sd3_panel"
BATCH_SIZE   = 4
EPOCHS       = 3
LEARNING_RATE= 1e-4
LR_WARMUP    = 0.03
MAX_LEN      = 64
SEED         = 42
# ───────────────────────────────────────────────────────────

torch.manual_seed(SEED)
device = "cuda"

from diffusers import StableDiffusion3Pipeline
import torch

device = "cuda"

pipe = StableDiffusion3Pipeline.from_pretrained(
    "sd3_local",
    torch_dtype=torch.float16,
    local_files_only=True,
    low_cpu_mem_usage=False,
    ignore_mismatched_sizes=True
).to(device)

vae          = pipe.vae
unet         = pipe.unet
text_encoder = pipe.text_encoder
tokenizer    = pipe.tokenizer

print("✅ Pipeline loaded with ignore_mismatched_sizes, UNet on", next(unet.parameters()).device)


# 2) Attach LoRA to the UNet (only Q/K/V projections)
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["to_q", "to_k", "to_v"],
    lora_dropout=0.05,
    bias="none"
)
unet_lora = get_peft_model(unet, lora_cfg)

# 3) Load a DDPM scheduler from local clone
noise_scheduler = DDPMScheduler.from_pretrained(
    LOCAL_PATH,
    subfolder="scheduler",
    torch_dtype=torch.float16,
    local_files_only=True
)

# 4) Prepare your dataset & DataLoader
img_tf = transforms.Compose([
    transforms.Resize((512,512), transforms.InterpolationMode.BILINEAR),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3),
])

ds = load_dataset("json", data_files=TRAIN_JSON)["train"]
def preprocess(ex):
    img = Image.open(ex["image"]).convert("RGB")
    ex["pixel_values"] = img_tf(img)
    ex["input_ids"]    = tokenizer(
        ex["caption"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    ).input_ids[0]
    return ex

ds = ds.map(preprocess, remove_columns=["image","caption"])
ds.set_format(type="torch", columns=["pixel_values","input_ids"])

loader = DataLoader(
    ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    collate_fn=lambda batch: {
        "pixel_values": torch.stack([x["pixel_values"] for x in batch]),
        "input_ids":    torch.stack([x["input_ids"]    for x in batch])
    }
)

# 5) Optimizer & learning‑rate scheduler (on LoRA params)
optimizer   = torch.optim.AdamW(unet_lora.parameters(), lr=LEARNING_RATE)
total_steps = len(loader) * EPOCHS
lr_scheduler= torch.optim.lr_scheduler.LinearLR(
    optimizer,
    start_factor=LR_WARMUP,
    total_iters=total_steps
)

# 6) Set frozen modules to eval
vae.eval()
text_encoder.eval()

# 7) Prepare for mixed‑precision training
accelerator = Accelerator(mixed_precision="fp16")
unet_lora, optimizer, loader, lr_scheduler = accelerator.prepare(
    unet_lora, optimizer, loader, lr_scheduler
)

# 8) Training loop
for epoch in range(1, EPOCHS + 1):
    unet_lora.train()
    running_loss = 0.0

    for batch in loader:
        # a) Move to GPU
        pixel_values = batch["pixel_values"].to(device, dtype=torch.float16)
        input_ids    = batch["input_ids"].to(device)

        # b) VAE encode → latents
        with torch.no_grad():
            latents = vae.encode(pixel_values).latent_dist.sample()
            latents = latents * vae.config.scaling_factor

        # c) Add noise
        noise      = torch.randn_like(latents)
        timesteps  = torch.randint(
            0,
            noise_scheduler.config.num_train_timesteps,
            (latents.shape[0],),
            device=device
        ).long()
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        # d) Text → embeddings
        text_outputs = text_encoder(input_ids, return_dict=True)
        encoder_hidden_states = text_outputs.last_hidden_state

        # e) UNet LoRA forward → predict noise
        model_out = unet_lora(
            noisy_latents,
            timesteps,
            encoder_hidden_states=encoder_hidden_states
        )
        noise_pred = model_out.sample

        # f) Loss & backward
        loss = F.mse_loss(noise_pred, noise)
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        running_loss += loss.item()

    avg_loss = running_loss / len(loader)
    print(f"Epoch {epoch}/{EPOCHS} — avg loss {avg_loss:.4f}")

# 9) Save LoRA adapters
os.makedirs(OUTPUT_DIR, exist_ok=True)
unet_lora.save_pretrained(OUTPUT_DIR)
print("✅ LoRA adapters saved to", OUTPUT_DIR)


Keyword arguments {'ignore_mismatched_sizes': True} are not expected by StableDiffusion3Pipeline and will be ignored.


Loading pipeline components...:   0%|          | 0/9 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: Error(s) in loading state_dict for SD3Transformer2DModel:
	size mismatch for pos_embed.pos_embed: copying a param with shape torch.Size([1, 36864, 1536]) from checkpoint, the shape in current model is torch.Size([1, 36864, 1152]).
	size mismatch for pos_embed.proj.weight: copying a param with shape torch.Size([1536, 16, 2, 2]) from checkpoint, the shape in current model is torch.Size([1152, 16, 2, 2]).
	size mismatch for pos_embed.proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for time_text_embed.timestep_embedder.linear_1.weight: copying a param with shape torch.Size([1536, 256]) from checkpoint, the shape in current model is torch.Size([1152, 256]).
	size mismatch for time_text_embed.timestep_embedder.linear_1.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for time_text_embed.timestep_embedder.linear_2.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for time_text_embed.timestep_embedder.linear_2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for time_text_embed.text_embedder.linear_1.weight: copying a param with shape torch.Size([1536, 2048]) from checkpoint, the shape in current model is torch.Size([1152, 2048]).
	size mismatch for time_text_embed.text_embedder.linear_1.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for time_text_embed.text_embedder.linear_2.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for time_text_embed.text_embedder.linear_2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.0.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.0.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.0.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.0.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.0.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.0.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.0.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.0.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.0.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.0.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.0.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.0.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.0.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.0.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.0.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.0.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.0.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.0.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.0.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.0.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.0.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.0.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.0.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.0.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.0.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.0.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.0.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.0.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.1.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.1.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.1.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.1.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.1.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.1.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.1.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.1.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.1.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.1.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.1.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.1.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.1.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.1.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.1.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.1.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.1.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.1.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.1.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.1.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.1.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.1.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.1.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.1.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.1.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.1.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.1.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.1.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.2.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.2.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.2.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.2.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.2.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.2.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.2.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.2.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.2.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.2.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.2.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.2.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.2.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.2.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.2.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.2.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.2.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.2.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.2.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.2.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.2.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.2.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.2.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.2.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.2.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.2.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.2.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.2.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.3.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.3.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.3.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.3.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.3.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.3.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.3.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.3.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.3.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.3.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.3.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.3.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.3.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.3.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.3.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.3.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.3.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.3.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.3.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.3.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.3.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.3.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.3.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.3.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.3.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.3.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.3.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.3.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.4.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.4.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.4.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.4.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.4.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.4.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.4.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.4.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.4.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.4.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.4.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.4.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.4.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.4.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.4.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.4.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.4.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.4.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.4.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.4.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.4.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.4.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.4.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.4.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.4.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.4.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.4.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.4.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.5.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.5.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.5.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.5.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.5.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.5.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.5.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.5.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.5.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.5.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.5.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.5.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.5.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.5.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.5.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.5.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.5.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.5.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.5.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.5.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.5.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.5.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.5.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.5.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.5.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.5.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.5.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.5.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.6.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.6.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.6.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.6.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.6.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.6.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.6.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.6.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.6.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.6.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.6.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.6.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.6.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.6.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.6.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.6.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.6.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.6.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.6.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.6.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.6.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.6.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.6.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.6.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.6.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.6.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.6.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.6.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.7.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.7.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.7.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.7.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.7.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.7.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.7.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.7.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.7.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.7.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.7.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.7.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.7.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.7.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.7.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.7.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.7.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.7.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.7.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.7.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.7.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.7.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.7.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.7.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.7.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.7.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.7.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.7.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.8.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.8.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.8.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.8.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.8.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.8.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.8.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.8.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.8.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.8.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.8.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.8.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.8.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.8.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.8.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.8.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.8.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.8.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.8.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.8.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.8.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.8.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.8.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.8.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.8.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.8.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.8.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.8.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.9.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.9.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.9.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.9.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.9.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.9.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.9.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.9.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.9.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.9.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.9.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.9.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.9.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.9.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.9.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.9.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.9.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.9.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.9.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.9.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.9.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.9.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.9.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.9.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.9.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.9.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.9.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.9.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.10.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.10.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.10.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.10.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.10.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.10.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.10.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.10.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.10.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.10.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.10.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.10.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.10.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.10.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.10.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.10.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.10.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.10.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.10.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.10.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.10.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.10.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.10.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.10.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.10.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.10.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.10.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.10.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.11.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.11.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.11.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.11.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.11.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.11.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.11.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.11.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.11.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.11.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.11.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.11.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.11.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.11.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.11.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.11.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.11.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.11.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.11.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.11.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.11.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.11.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.11.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.11.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.11.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.11.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.11.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.11.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.12.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.12.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.12.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.12.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.12.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.12.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.12.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.12.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.12.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.12.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.12.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.12.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.12.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.12.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.12.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.12.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.12.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.12.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.12.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.12.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.12.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.12.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.12.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.12.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.12.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.12.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.12.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.12.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.13.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.13.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.13.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.13.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.13.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.13.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.13.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.13.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.13.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.13.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.13.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.13.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.13.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.13.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.13.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.13.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.13.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.13.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.13.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.13.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.13.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.13.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.13.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.13.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.13.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.13.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.13.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.13.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.14.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.14.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.14.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.14.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.14.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.14.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.14.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.14.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.14.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.14.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.14.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.14.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.14.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.14.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.14.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.14.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.14.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.14.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.14.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.14.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.14.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.14.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.14.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.14.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.14.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.14.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.14.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.14.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.15.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.15.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.15.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.15.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.15.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.15.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.15.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.15.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.15.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.15.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.15.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.15.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.15.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.15.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.15.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.15.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.15.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.15.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.15.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.15.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.15.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.15.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.15.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.15.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.15.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.15.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.15.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.15.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.16.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.16.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.16.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.16.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.16.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.16.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.16.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.16.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.16.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.16.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.16.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.16.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.16.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.16.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.16.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.16.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.16.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.16.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.16.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.16.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.16.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.16.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.16.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.16.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.16.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.16.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.16.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.16.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.17.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.17.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.17.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.17.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.17.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.17.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.17.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.17.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.17.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.17.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.17.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.17.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.17.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.17.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.17.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.17.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.17.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.17.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.17.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.17.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.17.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.17.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.17.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.17.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.17.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.17.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.17.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.17.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.18.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.18.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.18.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.18.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.18.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.18.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.18.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.18.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.18.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.18.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.18.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.18.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.18.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.18.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.18.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.18.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.18.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.18.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.18.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.18.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.18.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.18.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.18.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.18.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.18.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.18.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.18.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.18.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.19.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.19.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.19.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.19.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.19.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.19.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.19.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.19.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.19.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.19.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.19.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.19.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.19.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.19.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.19.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.19.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.19.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.19.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.19.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.19.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.19.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.19.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.19.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.19.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.19.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.19.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.19.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.19.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.20.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.20.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.20.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.20.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.20.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.20.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.20.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.20.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.20.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.20.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.20.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.20.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.20.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.20.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.20.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.20.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.20.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.20.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.20.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.20.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.20.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.20.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.20.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.20.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.20.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.20.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.20.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.20.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.21.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.21.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.21.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.21.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.21.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.21.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.21.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.21.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.21.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.21.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.21.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.21.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.21.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.21.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.21.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.21.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.21.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.21.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.21.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.21.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.21.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.21.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.21.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.21.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.21.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.21.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.21.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.21.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.22.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.22.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.22.norm1_context.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.22.norm1_context.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.22.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.22.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.22.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.22.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.22.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.22.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.22.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.22.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.22.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.22.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.22.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.22.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.22.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.22.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.22.attn.to_add_out.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.22.attn.to_add_out.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.22.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.22.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.22.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.22.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.22.ff_context.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.22.ff_context.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.22.ff_context.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.22.ff_context.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.23.norm1.linear.weight: copying a param with shape torch.Size([9216, 1536]) from checkpoint, the shape in current model is torch.Size([6912, 1152]).
	size mismatch for transformer_blocks.23.norm1.linear.bias: copying a param with shape torch.Size([9216]) from checkpoint, the shape in current model is torch.Size([6912]).
	size mismatch for transformer_blocks.23.norm1_context.linear.weight: copying a param with shape torch.Size([3072, 1536]) from checkpoint, the shape in current model is torch.Size([2304, 1152]).
	size mismatch for transformer_blocks.23.norm1_context.linear.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]).
	size mismatch for transformer_blocks.23.attn.to_q.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.23.attn.to_q.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.23.attn.to_k.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.23.attn.to_k.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.23.attn.to_v.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.23.attn.to_v.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.23.attn.add_k_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.23.attn.add_k_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.23.attn.add_v_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.23.attn.add_v_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.23.attn.add_q_proj.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.23.attn.add_q_proj.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.23.attn.to_out.0.weight: copying a param with shape torch.Size([1536, 1536]) from checkpoint, the shape in current model is torch.Size([1152, 1152]).
	size mismatch for transformer_blocks.23.attn.to_out.0.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for transformer_blocks.23.ff.net.0.proj.weight: copying a param with shape torch.Size([6144, 1536]) from checkpoint, the shape in current model is torch.Size([4608, 1152]).
	size mismatch for transformer_blocks.23.ff.net.0.proj.bias: copying a param with shape torch.Size([6144]) from checkpoint, the shape in current model is torch.Size([4608]).
	size mismatch for transformer_blocks.23.ff.net.2.weight: copying a param with shape torch.Size([1536, 6144]) from checkpoint, the shape in current model is torch.Size([1152, 4608]).
	size mismatch for transformer_blocks.23.ff.net.2.bias: copying a param with shape torch.Size([1536]) from checkpoint, the shape in current model is torch.Size([1152]).
	size mismatch for norm_out.linear.weight: copying a param with shape torch.Size([3072, 1536]) from checkpoint, the shape in current model is torch.Size([2304, 1152]).
	size mismatch for norm_out.linear.bias: copying a param with shape torch.Size([3072]) from checkpoint, the shape in current model is torch.Size([2304]).
	size mismatch for proj_out.weight: copying a param with shape torch.Size([64, 1536]) from checkpoint, the shape in current model is torch.Size([64, 1152]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [None]:

# 6) Save adapters
os.makedirs(OUTPUT_DIR, exist_ok=True)
unet_lora.save_pretrained(OUTPUT_DIR)
print("✅ LoRA adapters saved to", OUTPUT_DIR)