Skip to content

Commit ecc590c

Browse files
slayoffer and claude authored
fix(docker): add retry logic for ML model downloads (#248)
- Add 3 retries with exponential backoff (10s -> 20s -> 40s)
- Set HF_HUB_DOWNLOAD_TIMEOUT=600 for longer timeout
- Fixes transient network failures during HuggingFace downloads
- Applied to both api-only and standalone stages

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 381c96c commit ecc590c

File tree

1 file changed

+38
-4
lines changed

1 file changed

+38
-4
lines changed

docker/standalone/Dockerfile

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -169,16 +169,33 @@ ENV PATH="/app/api/.venv/bin:${PATH}"
169169

170170
# Pre-download ML models to avoid runtime download (conditional)
171171
# Only runs if both PRELOAD_ML_MODELS=true AND INCLUDE_LOCAL_MODELS=true
172+
# Includes retry logic with exponential backoff for transient network failures
172173
ARG PRELOAD_ML_MODELS
173174
ARG INCLUDE_LOCAL_MODELS
175+
ENV HF_HUB_DOWNLOAD_TIMEOUT=600
174176
RUN if [ "$PRELOAD_ML_MODELS" = "true" ] && [ "$INCLUDE_LOCAL_MODELS" = "true" ]; then \
175-
/app/api/.venv/bin/python -c "\
177+
MAX_RETRIES=3; \
178+
RETRY_DELAY=10; \
179+
for i in $(seq 1 $MAX_RETRIES); do \
180+
echo "Attempt $i/$MAX_RETRIES: Downloading ML models..."; \
181+
/app/api/.venv/bin/python -c "\
182+
import os; os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '600'; \
176183
from sentence_transformers import SentenceTransformer, CrossEncoder; \
177184
print('Downloading embedding model...'); \
178185
SentenceTransformer('BAAI/bge-small-en-v1.5'); \
179186
print('Downloading cross-encoder model...'); \
180187
CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2'); \
181-
print('Models cached successfully')"; \
188+
print('Models cached successfully')" && break; \
189+
if [ $i -lt $MAX_RETRIES ]; then \
190+
echo "Attempt $i failed, retrying in ${RETRY_DELAY}s..."; \
191+
sleep $RETRY_DELAY; \
192+
RETRY_DELAY=$((RETRY_DELAY * 2)); \
193+
fi; \
194+
done; \
195+
# Post-retry check: confirm BOTH models load from the local cache, so a partial download cannot pass.
# HF_HUB_OFFLINE=1 keeps this from acting as an unlogged extra download attempt; stderr is left visible so the failure cause reaches the build log.
if [ "$i" -eq "$MAX_RETRIES" ] && ! HF_HUB_OFFLINE=1 /app/api/.venv/bin/python -c "from sentence_transformers import SentenceTransformer, CrossEncoder; SentenceTransformer('BAAI/bge-small-en-v1.5'); CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')"; then \
196+
echo "ERROR: Failed to download models after $MAX_RETRIES attempts"; \
197+
exit 1; \
198+
fi; \
182199
elif [ "$INCLUDE_LOCAL_MODELS" != "true" ]; then echo "Skipping ML model preload (local-models not included)"; \
183200
else echo "Skipping ML model preload"; fi
184201

@@ -277,16 +294,33 @@ ENV PATH="/app/api/.venv/bin:${PATH}"
277294

278295
# Pre-download ML models to avoid runtime download (conditional)
279296
# Only runs if both PRELOAD_ML_MODELS=true AND INCLUDE_LOCAL_MODELS=true
297+
# Includes retry logic with exponential backoff for transient network failures
280298
ARG PRELOAD_ML_MODELS
281299
ARG INCLUDE_LOCAL_MODELS
300+
ENV HF_HUB_DOWNLOAD_TIMEOUT=600
282301
RUN if [ "$PRELOAD_ML_MODELS" = "true" ] && [ "$INCLUDE_LOCAL_MODELS" = "true" ]; then \
283-
/app/api/.venv/bin/python -c "\
302+
MAX_RETRIES=3; \
303+
RETRY_DELAY=10; \
304+
for i in $(seq 1 $MAX_RETRIES); do \
305+
echo "Attempt $i/$MAX_RETRIES: Downloading ML models..."; \
306+
/app/api/.venv/bin/python -c "\
307+
import os; os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '600'; \
284308
from sentence_transformers import SentenceTransformer, CrossEncoder; \
285309
print('Downloading embedding model...'); \
286310
SentenceTransformer('BAAI/bge-small-en-v1.5'); \
287311
print('Downloading cross-encoder model...'); \
288312
CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2'); \
289-
print('Models cached successfully')"; \
313+
print('Models cached successfully')" && break; \
314+
if [ $i -lt $MAX_RETRIES ]; then \
315+
echo "Attempt $i failed, retrying in ${RETRY_DELAY}s..."; \
316+
sleep $RETRY_DELAY; \
317+
RETRY_DELAY=$((RETRY_DELAY * 2)); \
318+
fi; \
319+
done; \
320+
# Post-retry check: confirm BOTH models load from the local cache, so a partial download cannot pass.
# HF_HUB_OFFLINE=1 keeps this from acting as an unlogged extra download attempt; stderr is left visible so the failure cause reaches the build log.
if [ "$i" -eq "$MAX_RETRIES" ] && ! HF_HUB_OFFLINE=1 /app/api/.venv/bin/python -c "from sentence_transformers import SentenceTransformer, CrossEncoder; SentenceTransformer('BAAI/bge-small-en-v1.5'); CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')"; then \
321+
echo "ERROR: Failed to download models after $MAX_RETRIES attempts"; \
322+
exit 1; \
323+
fi; \
290324
elif [ "$INCLUDE_LOCAL_MODELS" != "true" ]; then echo "Skipping ML model preload (local-models not included)"; \
291325
else echo "Skipping ML model preload"; fi
292326

0 commit comments

Comments
 (0)