# Frozen Model Experiments
## Trainable Vision Encoder + Frozen LLM

This notebook demonstrates training and evaluating the Frozen architecture.

In [70]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '6'
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPTNeoForCausalLM
from torchvision import transforms
from PIL import Image
import timm
from typing import List, Union, Optional
from dataclasses import dataclass
from PIL import Image
import matplotlib.pyplot as plt
# --- Configuration ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on {DEVICE}")

Running on cuda


In [71]:
@dataclass
class FrozenConfig:
    vision_encoder_name: str = "resnet50"
    language_model_name: str = "gpt2-large"
    visual_prefix_length: int = 2
    vision_hidden_dim: int = 2048
    lm_hidden_dim: int = 1280

class VisionEncoder(nn.Module):
    def __init__(self, config: FrozenConfig):
        super().__init__()
        self.config = config
        self.backbone = timm.create_model(
            config.vision_encoder_name, pretrained=False, num_classes=0, global_pool=''
        )
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.projection = nn.Linear(config.vision_hidden_dim, config.visual_prefix_length * config.lm_hidden_dim)

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        features = self.backbone(images)
        features = self.pool(features).flatten(1)
        prefix = self.projection(features)
        return prefix.view(-1, self.config.visual_prefix_length, self.config.lm_hidden_dim)

class FrozenModel(nn.Module):
    def __init__(self, config: FrozenConfig):
        super().__init__()
        self.config = config
        self.vision_encoder = VisionEncoder(config)
        if "gpt-neo" in config.language_model_name.lower():
            self.language_model = GPTNeoForCausalLM.from_pretrained(config.language_model_name)
        else:
            self.language_model = GPT2LMHeadModel.from_pretrained(config.language_model_name)

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

In [73]:
# --- Load Model ---
CHECKPOINT_PATH = "../frozen_checkpoints/frozen_checkpoint_epoch_3.pt" # UPDATE THIS to your actual checkpoint

config = FrozenConfig()
print(f"Initializing model ({config.language_model_name})...")
tokenizer = GPT2Tokenizer.from_pretrained(config.language_model_name)
model = FrozenModel(config)

if os.path.exists(CHECKPOINT_PATH):
    print(f"Loading weights from {CHECKPOINT_PATH}...")
    checkpoint = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
    # Load trained vision encoder weights
    if 'vision_encoder_state_dict' in checkpoint:
         model.vision_encoder.load_state_dict(checkpoint['vision_encoder_state_dict'])
    else:
        model.load_state_dict(checkpoint) # Fallback for full model checkpoints
        
    model.to(DEVICE)
    model.eval()
    print("âœ“ Model loaded successfully.")
else:
    print(f"WARNING: Checkpoint not found at {CHECKPOINT_PATH}. Using random vision weights.")
    model.to(DEVICE)
    model.eval()

Initializing model (gpt2-large)...


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 23.68 GiB of which 128.00 KiB is free. Process 8744 has 14.52 GiB memory in use. Including non-PyTorch memory, this process has 9.16 GiB memory in use. Of the allocated memory 7.90 GiB is allocated by PyTorch, and 156.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [67]:
def generate_from_interleaved(
    model: frozen_model,
    tokenizer: GPT2Tokenizer,
    prompt_items: List[Union[str, Image.Image]],
    max_new_tokens: int = 80,
    temperature: float = 0.0, # 0.0 for greedy decoding
    device: str = DEVICE
) -> str:
    """
    Generates text from a mixed sequence of images and text.
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    embeddings_list = []
    with torch.no_grad():
        for item in prompt_items:
            if isinstance(item, Image.Image):
                img_tensor = transform(item).unsqueeze(0).to(device)
                # Corrected: use vision_encoder directly
                visual_prefix = model.vision_encoder(img_tensor)
                embeddings_list.append(visual_prefix)
            elif isinstance(item, str):
                tokens = tokenizer(item, return_tensors="pt", add_special_tokens=False).input_ids.to(device)
                # Corrected: generic way to get embeddings
                text_embeds = model.get_input_embeddings()(tokens)
                embeddings_list.append(text_embeds)

        inputs_embeds = torch.cat(embeddings_list, dim=1)
        generated_tokens = []

        for _ in range(max_new_tokens):
            # Corrected: use language_model attribute
            outputs = model.language_model(inputs_embeds=inputs_embeds)
            next_token_logits = outputs.logits[:, -1, :]
            
            if temperature == 0:
                next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)
            else:
                next_token_logits = next_token_logits / temperature
                probs = F.softmax(next_token_logits, dim=-1)
                next_token_id = torch.multinomial(probs, num_samples=1)

            if next_token_id.item() == tokenizer.eos_token_id:
                break
                
            generated_tokens.append(next_token_id.item())
            next_token_embed = model.get_input_embeddings()(next_token_id)
            inputs_embeds = torch.cat([inputs_embeds, next_token_embed], dim=1)

    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

In [69]:
# Ensure you have these images or update paths
try:
    img_phone = Image.open("images/car_2.jpg").convert("RGB")
    
    prompt = [img_phone, "Question: What is this object? Answer:"]
    output = generate_from_interleaved(frozen_model, GPT2Tokenizer, prompt, max_new_tokens=20)
    print(f"Output: {output}")
    
except FileNotFoundError as e:
    print(f"Error loading test images: {e}")
    print("Please ensure 'demo_images/' folder exists with sample images.")

AttributeError: 'FrozenMultimodalModel' object has no attribute 'vision_encoder'