From 6c696fc7eee7bb35ac1cf7d381419f3c9bdb9609 Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Sun, 22 Mar 2026 23:41:44 -0700 Subject: [PATCH] feat(semantic): add exponential backoff retry for LLM rate limiting Wrap LLM calls in semantic_processor.py with retry logic that handles 429/TooManyRequests/RequestBurstTooFast errors with exponential backoff and jitter. Previously, a single rate limit error during batch ingestion caused permanent failure. Now retries up to 3 times with delays of 0.5s, 1s, 2s before giving up gracefully. Addresses the ingestion pain reported in #350 where 4435 sections trigger RequestBurstTooFast with Doubao 2.0. Relates to #350 Co-Authored-By: Claude Opus 4.6 --- .../storage/queuefs/semantic_processor.py | 52 +++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/openviking/storage/queuefs/semantic_processor.py b/openviking/storage/queuefs/semantic_processor.py index 16ec22912..49027e0f3 100644 --- a/openviking/storage/queuefs/semantic_processor.py +++ b/openviking/storage/queuefs/semantic_processor.py @@ -3,6 +3,7 @@ """SemanticProcessor: Processes messages from SemanticQueue, generates .abstract.md and .overview.md.""" import asyncio +import random import threading from contextlib import nullcontext from dataclasses import dataclass, field @@ -186,6 +187,43 @@ def _detect_file_type(self, file_name: str) -> str: # Default to other return FILE_TYPE_OTHER + async def _llm_with_retry( + self, + prompt: str, + llm_sem: asyncio.Semaphore, + max_retries: int = 3, + ) -> str: + """Call VLM with exponential backoff on rate limit errors.""" + vlm = get_openviking_config().vlm + for attempt in range(max_retries + 1): + try: + async with llm_sem: + return await vlm.get_completion_async(prompt) + except Exception as e: + error_str = str(e) + is_rate_limit = ( + "429" in error_str + or "TooManyRequests" in error_str + or "RateLimit" in error_str + or "RequestBurstTooFast" in 
error_str + ) + if is_rate_limit and attempt < max_retries: + delay = min(0.5 * (2**attempt), 8.0) + random.uniform(0, 0.5) + logger.warning( + "LLM rate limited (attempt %d/%d), retrying in %.1fs", + attempt + 1, + max_retries, + delay, + ) + await asyncio.sleep(delay) + else: + if attempt > 0: + logger.error("LLM call failed after %d attempts: %s", attempt + 1, e) + else: + logger.error("LLM call failed: %s", e) + return "" + return "" + async def _check_file_content_changed( self, file_path: str, target_file: str, ctx: Optional[RequestContext] = None ) -> bool: @@ -669,8 +707,7 @@ async def _generate_text_summary( "semantic.code_ast_summary", {"file_name": file_name, "skeleton": skeleton_text}, ) - async with llm_sem: - summary = await vlm.get_completion_async(prompt) + summary = await self._llm_with_retry(prompt, llm_sem) return {"name": file_name, "summary": summary.strip()} if skeleton_text is None: logger.info("AST unsupported language, fallback to LLM: %s", file_path) @@ -682,8 +719,7 @@ async def _generate_text_summary( "semantic.code_summary", {"file_name": file_name, "content": content}, ) - async with llm_sem: - summary = await vlm.get_completion_async(prompt) + summary = await self._llm_with_retry(prompt, llm_sem) return {"name": file_name, "summary": summary.strip()} elif file_type == FILE_TYPE_DOCUMENTATION: @@ -696,8 +732,7 @@ async def _generate_text_summary( {"file_name": file_name, "content": content}, ) - async with llm_sem: - summary = await vlm.get_completion_async(prompt) + summary = await self._llm_with_retry(prompt, llm_sem) return {"name": file_name, "summary": summary.strip()} async def _generate_single_file_summary( @@ -912,8 +947,6 @@ async def _single_generate_overview( """Generate overview from a single prompt (small directories).""" import re - vlm = get_openviking_config().vlm - try: prompt = render_prompt( "semantic.overview_generation", @@ -924,7 +957,8 @@ async def _single_generate_overview( }, ) - overview = await 
vlm.get_completion_async(prompt)
+            llm_sem = asyncio.Semaphore(self.max_concurrent_llm)
+            overview = await self._llm_with_retry(prompt, llm_sem)
             # Post-process: replace [number] with actual file name
             def replace_index(match):