In [1]:
import numpy as np
import torch
# torch.backends.cuda.enable_mem_efficient_sdp(False)
from transformers import LlamaForSequenceClassification
# Suffix shared by the merged-LoRA checkpoint directory and the score-head file.
version = "5"

# Load the merged checkpoint as a single-label sequence classifier on the GPU
# and put it in inference mode.
model = LlamaForSequenceClassification.from_pretrained(
    f'../Model/PRM_LORA_merge{version}_code',
    num_labels=1,
    device_map="cuda",
    torch_dtype="auto",
)
model.eval()

# Handle on the backbone (the classifier without its `score` head).
base_model = model.model

# The score head is stored in a separate file, so its weights are loaded here.
model.score.load_state_dict(torch.load(f'../Model/model_score{version}_code.pth'))

# From here on `model` is the bare backbone, whose outputs expose
# `last_hidden_state` and `past_key_values`.
model = base_model

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ../Model/PRM_LORA_merge5_code and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Random token ids: an 800-token prefix and a 400-token continuation.
input1 = torch.randint(0, 50000, (1, 800), device='cuda')
input2 = torch.randint(0, 50000, (1, 400), device='cuda')

with torch.no_grad():
    # Prefix pass: keep the key/value cache it produces.
    outputs1 = model(input1, use_cache=True)
    past_key_values = outputs1.past_key_values

    # Continuation pass that reuses the cached prefix.
    outputs2 = model(input2, past_key_values=past_key_values, use_cache=True)

    # Reference pass over prefix + continuation without any cache.
    combined_input = torch.cat([input1, input2], dim=1)
    outputs_combined = model(combined_input)

# Hidden states of the continuation tokens from both paths.
last_hidden_state_1_2 = outputs2.last_hidden_state
last_hidden_state_combined = outputs_combined.last_hidden_state
reference_tail = last_hidden_state_combined[:, -input2.size(1):]

# torch.allclose defaults to rtol=1e-5 / atol=1e-8, which is far tighter than
# a low-precision dtype such as bfloat16 can resolve, so it would report a
# mismatch for pure rounding noise. Scale the tolerance to the machine epsilon
# of the dtype actually produced by the model instead.
tolerance = torch.finfo(last_hidden_state_1_2.dtype).eps
comparison = torch.allclose(
    last_hidden_state_1_2, reference_tail, rtol=tolerance, atol=tolerance
)

print(f"Are the outputs the same? {comparison}")
# Mean absolute difference, shown as the cell output.
torch.mean(torch.abs(last_hidden_state_1_2 - reference_tail))

Are the outputs the same? False


tensor(0.0010, device='cuda:0', dtype=torch.bfloat16)

In [53]:
# Put both token tensors back in host memory; the timed cells that follow
# perform the host-to-device copy themselves.
input2, combined_input = input2.cpu(), combined_input.cpu()

In [54]:
%%time
with torch.no_grad():
    input2 = input2.to('cuda')
    outputs2 = model(input2, past_key_values=past_key_values, use_cache=True)

CPU times: user 12.4 ms, sys: 8.31 ms, total: 20.7 ms
Wall time: 20.4 ms


In [55]:
%%time
with torch.no_grad():
    combined_input = combined_input.to('cuda')
    outputs_combined = model(combined_input)

CPU times: user 34.9 ms, sys: 2.69 ms, total: 37.6 ms
Wall time: 37.4 ms


In [56]:
def convert_past_key_values(past_key_values, target_device):
    """Move every tensor of a per-layer key/value cache to ``target_device``.

    Args:
        past_key_values: Iterable with one entry per layer, each entry being an
            iterable of tensors, e.g.
            ``((key_layer1, value_layer1), (key_layer2, value_layer2), ...)``.
        target_device: Device handed to ``Tensor.to`` for each tensor.

    Returns:
        A tuple of tuples with the same layout as the input, holding the
        result of ``tensor.to(target_device)`` for every tensor.
    """
    return tuple(
        tuple(tensor.to(target_device) for tensor in layer)
        for layer in past_key_values
    )

In [57]:
# Round-trip the cache GPU -> CPU -> GPU, then repeat the continuation pass
# with the round-tripped copy.
past_key_values2 = convert_past_key_values(convert_past_key_values(past_key_values, 'cpu'), 'cuda')
# Inference only: run under no_grad so no autograd graph is built and kept alive.
with torch.no_grad():
    outputs2_new = model(input2, past_key_values=past_key_values2, use_cache=True)

In [61]:
# Mean absolute difference between the hidden states obtained with the
# round-tripped cache and with the original cache.
(outputs2_new.last_hidden_state - outputs2.last_hidden_state).abs().mean()

tensor(0., device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)

In [64]:
%%time
convert_past_key_values(convert_past_key_values(past_key_values,'cpu'),'cuda');

CPU times: user 62.3 ms, sys: 54.5 ms, total: 117 ms
Wall time: 118 ms


In [65]:
# Using the KV cache during evaluation-time inference requires moving the cache
# between CPU and GPU, and that one-off conversion is too expensive: it cancels
# out the per-pass saving. All figures are wall times in milliseconds recorded
# by the %%time cells in this notebook.
CACHED_FORWARD_MS = 20.4    # continuation pass reusing the cached prefix
FULL_FORWARD_MS = 37.4      # uncached pass over prefix + continuation
CACHE_ROUND_TRIP_MS = 118   # one GPU -> CPU -> GPU conversion of the cache
N_FORWARD_PASSES = 6        # NOTE(review): presumably forward passes per evaluation -- confirm

with_cache_ms = CACHED_FORWARD_MS * N_FORWARD_PASSES + CACHE_ROUND_TRIP_MS
without_cache_ms = FULL_FORWARD_MS * N_FORWARD_PASSES
with_cache_ms, without_cache_ms

(240.39999999999998, 224.39999999999998)