From 83756fb653cde74908718814d88d975b06542b7e Mon Sep 17 00:00:00 2001
From: Wen Sun
Date: Fri, 4 Aug 2023 19:38:54 +0800
Subject: [PATCH] fix: incorrect bigcode attention heads num

---
 vllm/config.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index bd3dd6a21ae5..2e8d58411181 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -98,9 +98,11 @@ def get_num_heads(self, parallel_config: "ParallelConfig") -> int:
         # Note: for falcon, when new_decoder_architecture is True, the
         # multi_query flag is ignored and we use n_head_kv for the number of
         # KV heads.
-        if (getattr(self.hf_config, "multi_query", False) and
-                (self.hf_config.model_type == "falcon" and
-                 not getattr(self.hf_config, "new_decoder_architecture", False))):
+        new_decoder_arch_falcon = (
+            self.hf_config.model_type == "falcon"
+            and getattr(self.hf_config, "new_decoder_architecture", False))
+        if not new_decoder_arch_falcon and getattr(self.hf_config,
+                                                   "multi_query", False):
             # Multi-query attention, only one KV head.
             return 1
         # For Falcon:
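
For reference, a minimal standalone sketch of why the old condition missed bigcode-style multi-query models. This is not the vLLM code itself; the helper functions and the `SimpleNamespace` config below are hypothetical stand-ins that only reproduce the two boolean conditions from the hunk above.

    # Minimal sketch, assuming a config object exposing the same attributes
    # the patch checks (model_type, multi_query, new_decoder_architecture).
    from types import SimpleNamespace


    def old_returns_one_kv_head(hf_config) -> bool:
        # Pre-patch condition: only ever true for falcon without the new
        # decoder architecture, so bigcode (multi_query=True) fell through
        # and was treated as having more than one KV head.
        return (getattr(hf_config, "multi_query", False) and
                (hf_config.model_type == "falcon" and
                 not getattr(hf_config, "new_decoder_architecture", False)))


    def new_returns_one_kv_head(hf_config) -> bool:
        # Post-patch condition: any multi-query model gets one KV head,
        # except falcon with the new decoder architecture (which uses
        # n_head_kv instead).
        new_decoder_arch_falcon = (
            hf_config.model_type == "falcon"
            and getattr(hf_config, "new_decoder_architecture", False))
        return (not new_decoder_arch_falcon
                and getattr(hf_config, "multi_query", False))


    bigcode = SimpleNamespace(model_type="gpt_bigcode", multi_query=True)
    print(old_returns_one_kv_head(bigcode))  # False -- the reported bug
    print(new_returns_one_kv_head(bigcode))  # True  -- one KV head, as intended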