diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index de6b20f..ed6d76e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,6 +14,7 @@ If you need to make asset changes: * `npm install` * Add the missing values from `.env` to a `.env.local` file * Use `https://vision.googleapis.com/` as the `APP_GOOGLE_CLOUD_VISION_ENDPOINT`, with your own [Cloud Vision API](https://cloud.google.com/vision) key as the `APP_GOOGLE_CLOUD_VISION_KEY`. Google gives you 1,000 free lookups per month. +* Install [Tesseract](https://tesseract-ocr.github.io) and make sure it's in your `$PATH` * `symfony serve` to start the application * `npm run dev-server` if you need to make JS/CSS changes. * Stop the dev-server and run `npm run build` before committing. diff --git a/composer.json b/composer.json index 6752a92..dc7f2a5 100644 --- a/composer.json +++ b/composer.json @@ -1,6 +1,6 @@ { "name": "wikimedia/wikimedia-ocr", - "description": "A simple wrapper around the Google Cloud Vision API, enabling Wikisources to submit images for OCR and retrieve the resultant text.", + "description": "A simple wrapper around multiple OCR engines, enabling Wikisources to submit images for OCR and retrieve the resultant text.", "type": "project", "license": "GPL-3.0-or-later", "require": { @@ -16,6 +16,7 @@ "symfony/twig-bundle": "5.2.*", "symfony/webpack-encore-bundle": "^1.11", "symfony/yaml": "5.2.*", + "thiagoalessio/tesseract_ocr": "^2.11", "twig/extra-bundle": "^2.12|^3.0", "twig/twig": "^2.12|^3.0", "wikimedia/toolforge-bundle": "^1.3", diff --git a/composer.lock b/composer.lock index 531f34c..3379905 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "9975eb0e515bbf4e75cd46c2c422259b", + "content-hash": "0a32e76d4f81151e949751efbb611fc5", "packages": [ { "name": "doctrine/annotations", @@ -247,33 +247,32 @@ }, { "name": 
"doctrine/dbal", - "version": "2.10.4", + "version": "2.13.0", "source": { "type": "git", "url": "https://github.com/doctrine/dbal.git", - "reference": "47433196b6390d14409a33885ee42b6208160643" + "reference": "67d56d3203b33db29834e6b2fcdbfdc50535d796" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/doctrine/dbal/zipball/47433196b6390d14409a33885ee42b6208160643", - "reference": "47433196b6390d14409a33885ee42b6208160643", + "url": "https://api.github.com/repos/doctrine/dbal/zipball/67d56d3203b33db29834e6b2fcdbfdc50535d796", + "reference": "67d56d3203b33db29834e6b2fcdbfdc50535d796", "shasum": "" }, "require": { "doctrine/cache": "^1.0", + "doctrine/deprecations": "^0.5.3", "doctrine/event-manager": "^1.0", "ext-pdo": "*", - "php": "^7.2" + "php": "^7.1 || ^8" }, "require-dev": { - "doctrine/coding-standard": "^8.1", - "jetbrains/phpstorm-stubs": "^2019.1", - "nikic/php-parser": "^4.4", - "phpstan/phpstan": "^0.12.40", - "phpunit/phpunit": "^8.5.5", - "psalm/plugin-phpunit": "^0.10.0", + "doctrine/coding-standard": "8.2.0", + "jetbrains/phpstorm-stubs": "2020.2", + "phpstan/phpstan": "0.12.81", + "phpunit/phpunit": "^7.5.20|^8.5|9.5.0", "symfony/console": "^2.0.5|^3.0|^4.0|^5.0", - "vimeo/psalm": "^3.14.2" + "vimeo/psalm": "4.6.4" }, "suggest": { "symfony/console": "For helpful console commands such as SQL execution and import of files." 
@@ -282,12 +281,6 @@ "bin/doctrine-dbal" ], "type": "library", - "extra": { - "branch-alias": { - "dev-master": "2.10.x-dev", - "dev-develop": "3.0.x-dev" - } - }, "autoload": { "psr-4": { "Doctrine\\DBAL\\": "lib/Doctrine/DBAL" @@ -352,20 +345,59 @@ "type": "tidelift" } ], - "time": "2020-09-12T21:20:41+00:00" + "time": "2021-03-28T18:10:53+00:00" + }, + { + "name": "doctrine/deprecations", + "version": "v0.5.3", + "source": { + "type": "git", + "url": "https://github.com/doctrine/deprecations.git", + "reference": "9504165960a1f83cc1480e2be1dd0a0478561314" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/doctrine/deprecations/zipball/9504165960a1f83cc1480e2be1dd0a0478561314", + "reference": "9504165960a1f83cc1480e2be1dd0a0478561314", + "shasum": "" + }, + "require": { + "php": "^7.1|^8.0" + }, + "require-dev": { + "doctrine/coding-standard": "^6.0|^7.0|^8.0", + "phpunit/phpunit": "^7.0|^8.0|^9.0", + "psr/log": "^1.0" + }, + "suggest": { + "psr/log": "Allows logging deprecations via PSR-3 logger implementation" + }, + "type": "library", + "autoload": { + "psr-4": { + "Doctrine\\Deprecations\\": "lib/Doctrine/Deprecations" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "description": "A small layer on top of trigger_error(E_USER_DEPRECATED) or PSR-3 logging with options to disable all deprecations or selectively for packages.", + "homepage": "https://www.doctrine-project.org/", + "time": "2021-03-21T12:59:47+00:00" }, { "name": "doctrine/doctrine-bundle", - "version": "2.3.0", + "version": "2.3.1", "source": { "type": "git", "url": "https://github.com/doctrine/DoctrineBundle.git", - "reference": "8b922578bdee2243a26202b13df795e170efaef8" + "reference": "a08bc3b4d8567cdff05e89b272ba1e06e9d71c21" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/doctrine/DoctrineBundle/zipball/8b922578bdee2243a26202b13df795e170efaef8", - "reference": "8b922578bdee2243a26202b13df795e170efaef8", + 
"url": "https://api.github.com/repos/doctrine/DoctrineBundle/zipball/a08bc3b4d8567cdff05e89b272ba1e06e9d71c21", + "reference": "a08bc3b4d8567cdff05e89b272ba1e06e9d71c21", "shasum": "" }, "require": { @@ -377,7 +409,7 @@ "symfony/config": "^4.3.3|^5.0", "symfony/console": "^3.4.30|^4.3.3|^5.0", "symfony/dependency-injection": "^4.3.3|^5.0", - "symfony/doctrine-bridge": "^4.3.7|^5.0", + "symfony/doctrine-bridge": "^4.4.7|^5.0", "symfony/framework-bundle": "^3.4.30|^4.3.3|^5.0", "symfony/service-contracts": "^1.1.1|^2.0" }, @@ -390,25 +422,25 @@ "doctrine/orm": "^2.6", "friendsofphp/proxy-manager-lts": "^1.0", "phpunit/phpunit": "^7.5 || ^8.0 || ^9.3", - "symfony/phpunit-bridge": "^4.2", + "psalm/plugin-phpunit": "^0.15.1", + "psalm/plugin-symfony": "^2.2.4", + "symfony/phpunit-bridge": "^5.2", "symfony/property-info": "^4.3.3|^5.0", "symfony/proxy-manager-bridge": "^3.4|^4.3.3|^5.0", + "symfony/security-bundle": "^4.4|5.0", "symfony/twig-bridge": "^3.4.30|^4.3.3|^5.0", "symfony/validator": "^3.4.30|^4.3.3|^5.0", "symfony/web-profiler-bundle": "^3.4.30|^4.3.3|^5.0", "symfony/yaml": "^3.4.30|^4.3.3|^5.0", - "twig/twig": "^1.34|^2.12|^3.0" + "twig/twig": "^1.34|^2.12|^3.0", + "vimeo/psalm": "^4.7" }, "suggest": { "doctrine/orm": "The Doctrine ORM integration is optional in the bundle.", + "ext-pdo": "*", "symfony/web-profiler-bundle": "To use the data collector." 
}, "type": "symfony-bundle", - "extra": { - "branch-alias": { - "dev-master": "2.3.x-dev" - } - }, "autoload": { "psr-4": { "Doctrine\\Bundle\\DoctrineBundle\\": "" @@ -458,7 +490,7 @@ "type": "tidelift" } ], - "time": "2021-03-16T16:24:04+00:00" + "time": "2021-04-05T14:21:02+00:00" }, { "name": "doctrine/event-manager", @@ -4477,6 +4509,50 @@ ], "time": "2021-03-06T07:59:01+00:00" }, + { + "name": "thiagoalessio/tesseract_ocr", + "version": "2.11.0", + "source": { + "type": "git", + "url": "https://github.com/thiagoalessio/tesseract-ocr-for-php.git", + "reference": "d288750532593ba3eaf2e7446f426fc561caf2a2" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/thiagoalessio/tesseract-ocr-for-php/zipball/d288750532593ba3eaf2e7446f426fc561caf2a2", + "reference": "d288750532593ba3eaf2e7446f426fc561caf2a2", + "shasum": "" + }, + "require": { + "php": "^5.3 || ^7.0 || ^8.0" + }, + "require-dev": { + "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "thiagoalessio\\TesseractOCR\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "thiagoalessio", + "email": "thiagoalessio@me.com" + } + ], + "description": "A wrapper to work with Tesseract OCR inside PHP.", + "keywords": [ + "OCR", + "Tesseract", + "text recognition" + ], + "time": "2021-02-10T17:51:31+00:00" + }, { "name": "twig/extra-bundle", "version": "v3.3.0", @@ -4885,16 +4961,16 @@ }, { "name": "drenso/phan-extensions", - "version": "v3.3.0", + "version": "v3.4.0", "source": { "type": "git", "url": "https://github.com/Drenso/PhanExtensions.git", - "reference": "bd479813f4d9275262a04b655f966a6503cafa3b" + "reference": "af53f4a4edf9e12ca74780a32e63c7112c026476" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/Drenso/PhanExtensions/zipball/bd479813f4d9275262a04b655f966a6503cafa3b", - "reference": "bd479813f4d9275262a04b655f966a6503cafa3b", + 
"url": "https://api.github.com/repos/Drenso/PhanExtensions/zipball/af53f4a4edf9e12ca74780a32e63c7112c026476", + "reference": "af53f4a4edf9e12ca74780a32e63c7112c026476", "shasum": "" }, "require-dev": { @@ -4925,7 +5001,7 @@ "stubs", "symfony" ], - "time": "2021-03-28T13:04:31+00:00" + "time": "2021-04-07T19:40:11+00:00" }, { "name": "felixfbecker/advanced-json-rpc", diff --git a/config/services.yaml b/config/services.yaml index 9aac21f..7a5d374 100644 --- a/config/services.yaml +++ b/config/services.yaml @@ -26,12 +26,10 @@ services: resource: '../src/Controller/' tags: ['controller.service_arguments'] - App\Controller\OcrController: + App\Engine\GoogleCloudVisionEngine: arguments: - - '@request_stack' - - '@Krinkle\Intuition\Intuition' - - '%env(APP_GOOGLE_CLOUD_VISION_ENDPOINT)%' - - '%env(APP_GOOGLE_CLOUD_VISION_KEY)%' + $endpoint: '%env(APP_GOOGLE_CLOUD_VISION_ENDPOINT)%' + $key: '%env(APP_GOOGLE_CLOUD_VISION_KEY)%' # please note that last definitions always *replace* previous ones # add more service definitions when explicit configuration is needed diff --git a/i18n/en.json b/i18n/en.json index 1749fde..8db8af0 100644 --- a/i18n/en.json +++ b/i18n/en.json @@ -1,14 +1,19 @@ { - "title": "Wikisource Google OCR", + "title": "Wikimedia OCR", "image-url": "Image URL:", "image-url-help": "This must start with 'https://upload.wikimedia.org/' and be a full URL to an actual image file.", "image-url-error": "Image URL must begin with '$1'", "image-alt-text": "The original image", "language-code": "Two-letter language code (optional):", "language-code-help": "The ISO639 code of the language of the text in the image.", + "engine": "OCR engine:", + "engine-help": "Choose between the open-source Wikimedia-hosted Tesseract, and Google's Cloud Vision API.", + "engine-name-google": "Google", + "engine-name-tesseract": "Tesseract", "submit": "Go", "copy-to-clipboard": "Copy to clipboard", "copied-to-clipboard": "Copied!", "more-info": "For more information, see:", - 
"limit-exceeded": "Limit exceeded: $1" + "limit-exceeded": "Limit exceeded: $1", + "image-retrieval-failed": "Image retrieval failed: $1" } diff --git a/src/Controller/OcrController.php b/src/Controller/OcrController.php index 1ce41b6..fc47bba 100644 --- a/src/Controller/OcrController.php +++ b/src/Controller/OcrController.php @@ -3,7 +3,9 @@ namespace App\Controller; -use App\Engine\GoogleCloudVisionEngine; +use App\Engine\EngineBase; +use App\Engine\EngineFactory; +use App\Engine\TesseractEngine; use App\Exception\OcrException; use Krinkle\Intuition\Intuition; use Symfony\Bundle\FrameworkBundle\Controller\AbstractController; @@ -17,7 +19,7 @@ class OcrController extends AbstractController /** @var Intuition */ protected $intuition; - /** @var GoogleCloudVisionEngine */ + /** @var EngineBase */ protected $engine; /** @var mixed[] Output params for the view or API response. */ @@ -33,16 +35,18 @@ class OcrController extends AbstractController * OcrController constructor. * @param RequestStack $requestStack * @param Intuition $intuition - * @param string $endpoint - * @param string $key + * @param EngineFactory $engineFactory */ - public function __construct(RequestStack $requestStack, Intuition $intuition, string $endpoint, string $key) + public function __construct(RequestStack $requestStack, Intuition $intuition, EngineFactory $engineFactory) { - $request = $requestStack->getCurrentRequest(); - // Dependencies. $this->intuition = $intuition; - $this->engine = new GoogleCloudVisionEngine($endpoint, $key); + + $request = $requestStack->getCurrentRequest(); + + // Engine. + $this->engine = $engineFactory->get($request->get('engine', 'google')); + $this->params['engine'] = $this->engine instanceof TesseractEngine ? 'tesseract' : 'google'; // Parameters. 
$this->imageUrl = (string)$request->query->get('image'); diff --git a/src/Engine/EngineBase.php b/src/Engine/EngineBase.php new file mode 100644 index 0000000..796f285 --- /dev/null +++ b/src/Engine/EngineBase.php @@ -0,0 +1,30 @@ + */ + private $engines; + + public function __construct(GoogleCloudVisionEngine $cloudVisionEngine, TesseractEngine $tesseractEngine) + { + $this->engines = [ + 'google' => $cloudVisionEngine, + 'tesseract' => $tesseractEngine, + ]; + } + + public function get(string $name): EngineBase + { + if (!isset($this->engines[$name])) { + throw new Exception('Engine not found: '.$name); + } + return $this->engines[$name]; + } +} diff --git a/src/Engine/GoogleCloudVisionEngine.php b/src/Engine/GoogleCloudVisionEngine.php index 8fc278a..251c637 100644 --- a/src/Engine/GoogleCloudVisionEngine.php +++ b/src/Engine/GoogleCloudVisionEngine.php @@ -7,7 +7,7 @@ use Wikisource\GoogleCloudVisionPHP\GoogleCloudVision; use Wikisource\GoogleCloudVisionPHP\LimitExceededException; -class GoogleCloudVisionEngine +class GoogleCloudVisionEngine extends EngineBase { /** @var string The API key. */ protected $key; @@ -28,19 +28,6 @@ public function __construct(string $endpoint, string $key) $this->gcv->setEndpoint($endpoint); } - /** - * Checks that the given image URL is valid. - * @param string $imageUrl - * @throws OcrException - */ - public function checkImageUrl(string $imageUrl): void - { - $uploadUrl = 'https://upload.wikimedia.org/'; - if (substr($imageUrl, 0, strlen($uploadUrl)) !== $uploadUrl) { - throw new OcrException('image-url-error', [$uploadUrl]); - } - } - /** * Get transcribed text from the given image. 
* @param string $imageUrl diff --git a/src/Engine/TesseractEngine.php b/src/Engine/TesseractEngine.php new file mode 100644 index 0000000..4d898a7 --- /dev/null +++ b/src/Engine/TesseractEngine.php @@ -0,0 +1,48 @@ +httpClient = $httpClient; + } + + public function getText(string $imageUrl, ?string $lang = null): string + { + // Check the URL and fetch the image data. + $this->checkImageUrl($imageUrl); + $imageResponse = $this->httpClient->request('GET', $imageUrl); + try { + $imageContent = $imageResponse->getContent(); + } catch (ClientException $exception) { + throw new OcrException('image-retrieval-failed', [$exception->getMessage()]); + } + + // Sanitize the language code: keep ASCII letters only and strip every other character. + $cleanLang = preg_replace('/[^a-zA-Z]+/', '', (string)$lang); + + // Run OCR. The size is that of the body actually received; a Content-Length header may be absent. + $ocr = new TesseractOCR(); + // @TODO Remove this after this is resolved: https://github.com/thiagoalessio/tesseract-ocr-for-php/issues/210 + FriendlyErrors::checkTesseractPresence($ocr->command->executable); + $ocr->imageData($imageContent, strlen($imageContent)); + if ($cleanLang) { + $ocr->lang($cleanLang); + } + $text = $ocr->run(); + return $text; + } +} diff --git a/symfony.lock b/symfony.lock index c4a0155..75caa8c 100644 --- a/symfony.lock +++ b/symfony.lock @@ -26,6 +26,9 @@ "doctrine/dbal": { "version": "3.0.0" }, + "doctrine/deprecations": { + "version": "v0.5.3" + }, "doctrine/doctrine-bundle": { "version": "2.0", "recipe": { @@ -356,6 +359,9 @@ "symfony/yaml": { "version": "v5.2.4" }, + "thiagoalessio/tesseract_ocr": { + "version": "2.11.0" + }, "twig/extra-bundle": { "version": "v3.3.0" }, diff --git a/templates/output.html.twig b/templates/output.html.twig index 1e9f45f..7fa4252 100644 --- a/templates/output.html.twig +++ b/templates/output.html.twig @@ -12,6 +12,14 @@

{{ msg( 'language-code-help' ) }}

+
+ + +

{{ msg( 'engine-help' ) }}

+