diff --git a/Dockerfile b/Dockerfile index 7d1359e..6cf32a7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ -FROM amazonlinux:2018.03.0.20200318.1 +FROM public.ecr.aws/lambda/python:3.9-x86_64 -ENV LEPTONICA_VERSION="1.75.1" +ENV LEPTONICA_VERSION="1.82.0" # docker_build leptonica WORKDIR /tmp/ RUN yum install clang wget zip gzip tar autoconf xz libpng-devel libtiff-devel zlib-devel libwebp-devel libjpeg-turbo-devel make libtool pkgconfig -y @@ -15,4 +15,4 @@ RUN cd autoconf-archive-2019.01.06 && cp m4/* /usr/share/aclocal/ COPY build_tesseract.sh /tmp/build_tesseract.sh RUN chmod +x /tmp/build_tesseract.sh -CMD sh /tmp/build_tesseract.sh \ No newline at end of file +ENTRYPOINT ["sh" ,"/tmp/build_tesseract.sh"] diff --git a/build_tesseract.sh b/build_tesseract.sh index 193aefd..d63c7f3 100644 --- a/build_tesseract.sh +++ b/build_tesseract.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -TESSERACT_VERSION="4.1.1" +TESSERACT_VERSION="5.1.0" # docker_build tesseract cd ~ @@ -18,15 +18,19 @@ mkdir tesseract-standalone cd tesseract-standalone cp /usr/local/bin/tesseract . mkdir lib -cp /usr/local/lib/libtesseract.so.4 lib/ +cp /usr/local/lib/libtesseract.so.5 lib/ cp /usr/local/lib/liblept.so.5 lib/ cp /usr/lib64/libjpeg.so.62 lib/ cp /usr/lib64/libwebp.so.4 lib/ cp /usr/lib64/libpng15.so.15 lib/ +cp /usr/lib64/libtiff.so.5 lib/ +cp /usr/lib64/libgomp.so.1 lib/ +cp /usr/lib64/libjbig.so.2.0 lib/ + mkdir tessdata cd tessdata -wget https://github.com/tesseract-ocr/tessdata_fast/raw/master/eng.traineddata +wget https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata mkdir configs cp /usr/local/share/tessdata/configs/pdf configs/ diff --git a/serverless.yml b/serverless.yml index 31bc97b..cc546f3 100644 --- a/serverless.yml +++ b/serverless.yml @@ -1,8 +1,10 @@ service: tesseract-aws-lambda +frameworkVersion: '3' + provider: name: aws - runtime: python3.7 + runtime: python3.9 package: exclude: @@ -19,7 +21,6 @@ package: - test.jpg - test_handler.py - use_ocr_as_a_service.py - - layer/** layers: OCR: @@ -27,11 +28,11 @@ layers: name: ocr-layer description: Layer with Tesseract compatibleRuntimes: - - python3.7 + - python3.9 retain: false package: include: - - layer/** + - tesseract functions: ocr: