From 6496debdfd7db8b30ffe5fb75751d80e4672c8fc Mon Sep 17 00:00:00 2001 From: AnastaciusWright Date: Sun, 17 Apr 2022 14:04:12 +0200 Subject: [PATCH 1/4] added libraries, updated library libtesseract from 4 to 5, and updated tesseract version to 5 --- build_tesseract.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/build_tesseract.sh b/build_tesseract.sh index 193aefd..d63c7f3 100644 --- a/build_tesseract.sh +++ b/build_tesseract.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -TESSERACT_VERSION="4.1.1" +TESSERACT_VERSION="5.1.0" # docker_build tesseract cd ~ @@ -18,15 +18,19 @@ mkdir tesseract-standalone cd tesseract-standalone cp /usr/local/bin/tesseract . mkdir lib -cp /usr/local/lib/libtesseract.so.4 lib/ +cp /usr/local/lib/libtesseract.so.5 lib/ cp /usr/local/lib/liblept.so.5 lib/ cp /usr/lib64/libjpeg.so.62 lib/ cp /usr/lib64/libwebp.so.4 lib/ cp /usr/lib64/libpng15.so.15 lib/ +cp /usr/lib64/libtiff.so.5 lib/ +cp /usr/lib64/libgomp.so.1 lib/ +cp /usr/lib64/libjbig.so.2.0 lib/ + mkdir tessdata cd tessdata -wget https://github.com/tesseract-ocr/tessdata_fast/raw/master/eng.traineddata +wget https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata mkdir configs cp /usr/local/share/tessdata/configs/pdf configs/ From 25cbaad53ab46915171746c11fb718db2e3d98ce Mon Sep 17 00:00:00 2001 From: AnastaciusWright Date: Sun, 17 Apr 2022 14:05:16 +0200 Subject: [PATCH 2/4] updated image of dockerfile to run with python 3.9, added mandatory entrypoint --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7d1359e..6cf32a7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ -FROM amazonlinux:2018.03.0.20200318.1 +FROM public.ecr.aws/lambda/python:3.9-x86_64 -ENV LEPTONICA_VERSION="1.75.1" +ENV LEPTONICA_VERSION="1.82.0" # docker_build leptonica WORKDIR /tmp/ RUN yum install clang wget zip gzip tar autoconf xz libpng-devel libtiff-devel zlib-devel libwebp-devel libjpeg-turbo-devel make libtool pkgconfig -y @@ -15,4 +15,4 @@ RUN cd autoconf-archive-2019.01.06 && cp m4/* /usr/share/aclocal/ COPY build_tesseract.sh /tmp/build_tesseract.sh RUN chmod +x /tmp/build_tesseract.sh -CMD sh /tmp/build_tesseract.sh \ No newline at end of file +ENTRYPOINT ["sh" ,"/tmp/build_tesseract.sh"] From c7b807bcd416531647d9cc1057ad4ca91b548f52 Mon Sep 17 00:00:00 2001 From: AnastaciusWright Date: Sun, 17 Apr 2022 14:07:09 +0200 Subject: [PATCH 3/4] updated include packages for layer, it will not accept tesseract file by default, so it has to be added as an include, and it is resolved from layer path --- serverless.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serverless.yml b/serverless.yml index 31bc97b..9192471 100644 --- a/serverless.yml +++ b/serverless.yml @@ -31,7 +31,7 @@ layers: retain: false package: include: - - layer/** + - tesseract functions: ocr: From aab54f98dbe1c3d2c740c1c9163341914efe289b Mon Sep 17 00:00:00 2001 From: AnastaciusWright Date: Sun, 17 Apr 2022 14:08:37 +0200 Subject: [PATCH 4/4] modified serverless to use python 3.9 --- serverless.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/serverless.yml b/serverless.yml index 9192471..cc546f3 100644 --- a/serverless.yml +++ b/serverless.yml @@ -1,8 +1,10 @@ service: tesseract-aws-lambda +frameworkVersion: '3' + provider: name: aws - runtime: python3.7 + runtime: python3.9 package: exclude: @@ -19,7 +21,6 @@ package: - test.jpg - test_handler.py - use_ocr_as_a_service.py - - layer/** layers: OCR: @@ -27,7 +28,7 @@ layers: name: ocr-layer description: Layer with Tesseract compatibleRuntimes: - - python3.7 + - python3.9 retain: false package: include: