diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index a9d899ad29e1..3127a36098e2 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -76,6 +76,8 @@ def hf_model_weights_iterator( state = torch.load(bin_file, map_location="cpu") for name, param in state.items(): yield name, param + del state + torch.cuda.empty_cache() def load_tensor_parallel_weights(