In [None]:
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [None]:
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    Gemma3ForConditionalGeneration,
    OPTForCausalLM,
    Qwen2_5_VLForConditionalGeneration,
    VisionEncoderDecoderConfig,
    VisionEncoderDecoderModel,
)
# from src.models.config import VisionLanguageConfig

### GEMMA3

In [3]:
# Load the Gemma 3 checkpoint (config, processor and model) and print the
# module tree so the layer layout can be inspected.
model_name = "google/gemma-3-4b-it"
# NOTE(review): trust_remote_code=True permits running code shipped in the
# model repository; it is probably unnecessary for an architecture that is
# imported directly from transformers above — confirm and drop if so.
# NOTE(review): model_config is not used in this cell (the config= argument
# below is commented out); it may be used by later cells — confirm.
model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_name,
    # config=model_config,
    # Only the model call passes cache_dir; the config and processor calls
    # above do not pass one.
    cache_dir="./.cache",
)
# Prints the nested module representation of the loaded model.
print(model)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 2 files: 100%|██████████| 2/2 [14:11<00:00, 425.85s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.16s/it]


Gemma3ForConditionalGeneration(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(4096, 1152)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-26): 27 x SiglipEncoderLayer(
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (self_attn): SiglipAttention(
              (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
            

In [None]:
# Lookup table from a config's `model_type` string to the transformers class
# used to instantiate it. Insertion order matters: get_language_model_class
# walks the table in this order when it falls back to substring matching.
MODEL_TYPE_MAPPING = {
    "gemma3": Gemma3ForConditionalGeneration,
    "opt": OPTForCausalLM,
    "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
}

def get_language_model_class(config):
    """
    Select a model class for ``config`` using the MODEL_TYPE_MAPPING table.

    Args:
        config: Either a dict-like object exposing ``.get('model_type')`` or
            an object carrying a ``model_type`` attribute.

    Returns:
        The class registered in MODEL_TYPE_MAPPING for the model type, or
        ``AutoModelForCausalLM`` when nothing matches (including a missing,
        empty or ``None`` model type).
    """
    # Dict-like configs are read through .get() exactly as before; any other
    # object is read through its `model_type` attribute instead of raising
    # AttributeError on the missing .get().
    if hasattr(config, 'get'):
        model_type = config.get('model_type', '')
    else:
        model_type = getattr(config, 'model_type', '')
    # An explicit None would otherwise crash on .lower() below.
    model_type = model_type or ''
    normalized = model_type.lower()

    # Exact match first: the raw value, then its lower-cased form so the
    # exact lookup is as case-insensitive as the partial lookup.
    for candidate in (model_type, normalized):
        if candidate in MODEL_TYPE_MAPPING:
            return MODEL_TYPE_MAPPING[candidate]

    # Partial match: the first table key (in insertion order) that occurs as
    # a substring of the lower-cased model type wins.
    for key, model_class in MODEL_TYPE_MAPPING.items():
        if key in normalized:
            return model_class

    # Default when the model type is unknown.
    return AutoModelForCausalLM