In [3]:
import torch
from transformers import AutoConfig, AutoTokenizer, LlamaForCausalLM
import os
import gc

if "CUDA_VISIBLE_DEVICES" in os.environ:
    print(os.environ["CUDA_VISIBLE_DEVICES"])

# Load the model and tokenizer
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "../huggingface/llama-2-7b-hf"
# model_path = "../output/base1-lr.25"

In [4]:
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False,legacy=False)
model: LlamaForCausalLM = LlamaForCausalLM.from_pretrained(model_path, config=config, device_map='cpu',torch_dtype=torch.float16,)
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
def get_tensor_memory_size(tensor: torch.Tensor):
    """
    Calculate the memory size of a tensor in bytes.
    """
    return tensor.element_size() * tensor.nelement()

In [2]:
# dataloader = torch.load('../cache/llama/wikitext2_128/dataloader.cache')
inps0: torch.Tensor = torch.load(f'../cache/llama/wikitext2_128/inps_0.cache',map_location='cpu')

In [13]:
model.to(dev)

inp = inps0[0].to(dev)
print(inp)
out = model(inp)
print(out)

# Free up memory
del inp
del out
gc.collect()
torch.cuda.empty_cache()

tensor([[-0.0292, -0.0015,  0.0145,  ..., -0.0018,  0.0266, -0.0109],
        [-0.0012,  0.0057, -0.0074,  ..., -0.0071, -0.0054,  0.0093],
        [-0.0043,  0.0106, -0.0044,  ...,  0.0034, -0.0136,  0.0223],
        ...,
        [ 0.0154,  0.0178,  0.0023,  ..., -0.0134,  0.0190,  0.0032],
        [-0.0061,  0.0127, -0.0095,  ..., -0.0065,  0.0168, -0.0006],
        [-0.0198,  0.0035,  0.0143,  ..., -0.0118, -0.0069, -0.0058]],
       device='cuda:0', dtype=torch.float16)


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.HalfTensor instead (while checking arguments for embedding)

In [None]:
# print(
#     torch.cuda.max_memory_allocated(),
#     torch.cuda.memory_allocated(),
#     torch.cuda.memory_reserved(),
# )
inp = inps0[0].unsqueeze(0).to(dev)

with torch.no_grad():
    for i in range(32):
        layer = model.model.layers[i].to(dev)
        inp: torch.Tensor = layer(inp)[0]
        layer.cpu()

        gc.collect()
        torch.cuda.empty_cache()

1589789184 42074112 54525952
0 1589789184 42074112 71303168
1 1589789184 42074112 75497472
2 1589789184 42074112 75497472
3 1589789184 42074112 75497472
4 1589789184 42074112 75497472
5 1589789184 42074112 75497472
6 1589789184 42074112 75497472
7 1589789184 42074112 75497472
8 1589789184 42074112 75497472
9 1589789184 42074112 75497472
10 1589789184 42074112 75497472
11 1589789184 42074112 75497472
12 1589789184 42074112 75497472
13 1589789184 42074112 75497472
14 1589789184 42074112 75497472
15 1589789184 42074112 75497472
16 1589789184 42074112 75497472
17 1589789184 42074112 75497472
18 1589789184 42074112 75497472
19 1589789184 42074112 75497472
20 1589789184 42074112 75497472
21 1589789184 42074112 75497472
22 1589789184 42074112 75497472
23 1589789184 42074112 75497472
24 1589789184 42074112 75497472
25 1589789184 42074112 75497472
26 1589789184 42074112 75497472
27 1589789184 42074112 75497472
28 1589789184 42074112 75497472
29 1589789184 42074112 75497472
30 1589789184 4207411

In [22]:
model.lm_head.to(dev)
with torch.no_grad():
    logits = model.lm_head(inp)
    logits = logits[0].argmax(dim=1)
    print(logits)
    print(tokenizer.decode(logits, skip_special_tokens=True))
model.lm_head.cpu()

tensor([29889, 29889,    13,  ..., 29889, 29889, 29889], device='cuda:0')
..
.......
...
.....
.
...
...

..........
.............
....
................
...
...
.

......
.
.......
...
.
.................................
............
...............................
................................
............
.
...
.
..
......
....
..
...
.....
....

.
......
....
.......
.....
............
....
..
.............
....
....
.......
...........
........
.
...
.....
.............
..
....
.....
..........

.....
........
.........
...........
...
....
....

................................
..
...............
.......................
...........
....
.

...
.............

..................
........
........
.......


...............

.
.
..
..
.................

.........
........

.................................
..........
.
.
.
.....
..............
...
.......
.............
..................................
............
...........................
.............
......
........
.....
..

Linear(in_features=4096, out_features=32000, bias=False)