From a18a94c8f70868bf18167366f8318fe122c2affc Mon Sep 17 00:00:00 2001 From: Virantha Ekanayake Date: Thu, 18 Feb 2016 13:41:27 -0500 Subject: [PATCH] Fix for win32 multiprocessing with pyinstaller --- pypdfocr/pypdfocr.py | 7 ++++ pypdfocr/pypdfocr_multiprocessing.py | 56 ++++++++++++++++++++++++++++ pypdfocr/pypdfocr_pdffiler.py | 6 +++ pypdfocr/pypdfocr_tesseract.py | 4 +- 4 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 pypdfocr/pypdfocr_multiprocessing.py diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py index 4ef136b..9b68250 100644 --- a/pypdfocr/pypdfocr.py +++ b/pypdfocr/pypdfocr.py @@ -25,6 +25,12 @@ from PIL import Image import yaml +import multiprocessing +# Replace the Popen routine to allow win32 pyinstaller to build +from multiprocessing import forking +from pypdfocr_multiprocessing import _Popen +forking.Popen = _Popen + from pypdfocr_pdf import PyPdf from pypdfocr_tesseract import PyTesseract from pypdfocr_gs import PyGs @@ -452,6 +458,7 @@ def _convert_and_file_email(self, pdf_filename): self._send_email(pdf_filename, ocr_pdffilename, filing) def main(): # pragma: no cover + multiprocessing.freeze_support() script = PyPDFOCR() script.go(sys.argv[1:]) diff --git a/pypdfocr/pypdfocr_multiprocessing.py b/pypdfocr/pypdfocr_multiprocessing.py new file mode 100644 index 0000000..3666268 --- /dev/null +++ b/pypdfocr/pypdfocr_multiprocessing.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python2.7 +# Copyright 2013 Virantha Ekanayake All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import sys, os, multiprocessing.forking +import logging + +""" Special work-around to support multiprocessing and pyinstaller --onefile on Windows systems + + https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing +""" + +import multiprocessing.forking as forking +import os +import sys + +class _Popen(multiprocessing.forking.Popen): + def __init__(self, *args, **kw): + if hasattr(sys, 'frozen'): + # We have to set original _MEIPASS2 value from sys._MEIPASS + # to get --onefile mode working. + os.putenv('_MEIPASS2', sys._MEIPASS) + try: + super(_Popen, self).__init__(*args, **kw) + finally: + if hasattr(sys, 'frozen'): + # On some platforms (e.g. AIX) 'os.unsetenv()' is not + # available. In those cases we cannot delete the variable + # but only set it to the empty string. The bootloader + # can handle this case. + if hasattr(os, 'unsetenv'): + os.unsetenv('_MEIPASS2') + else: + os.putenv('_MEIPASS2', '') + +forking.Popen = _Popen + +#class Process(multiprocessing.Process): + #_Popen = _Popen + +# ... + +if __name__ == '__main__': + # On Windows calling this function is necessary. 
+ multiprocessing.freeze_support() diff --git a/pypdfocr/pypdfocr_pdffiler.py b/pypdfocr/pypdfocr_pdffiler.py index 6f26b88..1bb23f5 100644 --- a/pypdfocr/pypdfocr_pdffiler.py +++ b/pypdfocr/pypdfocr_pdffiler.py @@ -26,6 +26,7 @@ from PyPDF2 import PdfFileReader from pypdfocr_filer import PyFiler +from pypdfocr_filer_dirs import PyFilerDirs class PyPdfFiler(object): def __init__(self, filer): @@ -72,3 +73,8 @@ def move_to_matching_folder(self, filename): tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder) return tgt_file +if __name__ == '__main__': + p = PyPdfFiler(PyFilerDirs()) + for page_text in p.iter_pdf_page_text("scan_ocr.pdf"): + print (page_text) + diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index 476d5cd..d1411a3 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -24,8 +24,10 @@ import subprocess import glob from subprocess import CalledProcessError + from multiprocessing import Pool + def error(text): print("ERROR: %s" % text) sys.exit(-1) @@ -129,8 +131,8 @@ def make_hocr_from_pnms(self, fns): # Glob it #fns = glob.glob(img_filename) + logging.debug("Making pool for tesseract") pool = Pool(processes=self.threads) - print("Making pool") hocr_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns)) pool.close() pool.join()