vllm-project · zhuohan123 · Aug 4, 2023 · Aug 4, 2023
diff --git a/vllm/config.py b/vllm/config.py
@@ -98,9 +98,11 @@ def get_num_heads(self, parallel_config: "ParallelConfig") -> int:
         # Note: for falcon, when new_decoder_architecture is True, the
         # multi_query flag is ignored and we use n_head_kv for the number of
         # KV heads.
-        if (getattr(self.hf_config, "multi_query", False) and
-            (self.hf_config.model_type == "falcon" and
-             not getattr(self.hf_config, "new_decoder_architecture", False))):
+        new_decoder_arch_falcon = (
+            self.hf_config.model_type == "falcon"
+            and getattr(self.hf_config, "new_decoder_architecture", False))
+        if not new_decoder_arch_falcon and getattr(self.hf_config,
+                                                   "multi_query", False):
             # Multi-query attention, only one KV head.
             return 1
         # For Falcon: