In [1]:
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import requests
from accelerate import init_empty_weights
from accelerate import load_checkpoint_and_dispatch

In [2]:
model_name = '/media/drdo/DATA/LLMs/Molmo-7B-D-0924'
checkpoint = model_name

In [7]:
# load the processor
processor = AutoProcessor.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='cuda'
)

In [8]:
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype='auto',
        device_map='cuda'
    )

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 518.00 MiB. GPU 0 has a total capacity of 23.61 GiB of which 264.94 MiB is free. Process 1745730 has 677.57 MiB memory in use. Including non-PyTorch memory, this process has 21.15 GiB memory in use. Of the allocated memory 20.70 GiB is allocated by PyTorch, and 11.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [5]:
model = load_checkpoint_and_dispatch(
    model,
    checkpoint=checkpoint,
    device_map="cuda"
)

You shouldn't move a model that is dispatched using accelerate hooks.


In [6]:
model.device

device(type='cpu')

In [6]:
# process the image and text
inputs = processor.process(
    images=[Image.open(requests.get("https://picsum.photos/id/237/536/354", stream=True).raw)],
    text="Describe this image."
)

In [7]:
# move inputs to the correct device and make a batch of size 1
inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

In [8]:
# allenai/Molmo-7B-O-0924 -> 9min 5s
# allenai/MolmoE-1B-0924 -> 

In [9]:
%%time
# generate output; maximum 200 new tokens; stop generation when <|endoftext|> is generated
output = model.generate_from_batch(
    inputs,
    GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
    tokenizer=processor.tokenizer
)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
# only get generated tokens; decode them to text
generated_tokens = output[0,inputs['input_ids'].size(1):]
generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

# print the generated text
print(generated_text)

# >>> This photograph captures an adorable black Labrador puppy sitting on a weathered
#     wooden deck. The deck's planks, which are a mix of light and dark brown with ...

In [1]:
from nnsight import CONFIG

CONFIG.set_default_api_key("nnsight_api_key")

In [2]:
import os

# llama3.1 70b is a gated model and you need access via your huggingface token
os.environ['HF_TOKEN'] = "hf_key"

In [None]:
from nnsight import LanguageModel

# We'll never actually load the parameters so no need to specify a device_map.
llama = LanguageModel("meta-llama/Meta-Llama-3.1-70B")

# All we need to specify using NDIF vs executing locally is remote=True.
with llama.trace("The Eiffel Tower is in the city of", remote=True) as runner:
    hidden_states = llama.model.layers[-1].output.save()
    output = llama.output.save()

print(hidden_states)
print(output["logits"])

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

2024-10-23 22:19:49,708 6d114891-ffce-4c04-8212-05967923667e - RECEIVED: Your job has been received and is waiting approval.
2024-10-23 22:19:49,735 6d114891-ffce-4c04-8212-05967923667e - APPROVED: Your job was approved and is waiting to be run.
