From bf3a2dd0d83f571dc6bbdc0fcae71aef8f0329b1 Mon Sep 17 00:00:00 2001 From: Sam Wilson Date: Thu, 21 Feb 2019 18:03:31 +0800 Subject: [PATCH] Add pagelist option that uses the API Only works for pages that exist, and doesn't fetch their labels, but is more robust (doesn't use HTML scraping). Also bump PHP version to 7.2. Bug: T216677 --- .travis.yml | 9 ++------- composer.json | 17 ++++++++++++---- src/IndexPage.php | 40 ++++++++++++++++++++++++++++++++++++-- src/Wikisource.php | 48 +++++++++++++++++++++++++++++++++++++--------- 4 files changed, 92 insertions(+), 22 deletions(-) mode change 100755 => 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml old mode 100755 new mode 100644 index dcc315d..861db7f --- a/.travis.yml +++ b/.travis.yml @@ -1,15 +1,10 @@ language: php php: - - 5.6 - - 7.0 - - 7.1 + - 7.2 install: - composer install script: - - vendor/bin/phpcs -s - - vendor/bin/parallel-lint . --exclude vendor - - vendor/bin/phpdoc -q --template=checkstyle - - (! grep -B 1 error docs/api/checkstyle.xml) + - composer test diff --git a/composer.json b/composer.json index 766a310..3f91ef7 100644 --- a/composer.json +++ b/composer.json @@ -2,7 +2,7 @@ "name": "wikisource/api", "description": "A PHP API to Wikisources (all languages).", "type": "library", - "license": "GPL-2.0+", + "license": "GPL-2.0-or-later", "support": { "irc": "irc://irc.freenode.org/wikisource", "issues": "https://phabricator.wikimedia.org/tag/wikisource-api/", @@ -19,19 +19,28 @@ } }, "require": { + "ext-json": "*", + "ext-simplexml": "*", "psr/cache": "^1.0", "psr/log": "^1.0", "dflydev/dot-access-data": "^1.0", "addwiki/mediawiki-api": "^0.7", - "symfony/dom-crawler": "^3.1" + "symfony/dom-crawler": "^4.2" }, "require-dev": { + "mediawiki/minus-x": "^0.3", "jakub-onderka/php-parallel-lint": "^0.9", "mediawiki/mediawiki-codesniffer": "^13.0", - "phpunit/phpunit": "^5.6", - "phpdocumentor/phpdocumentor": "^2.9", "tedivm/stash": "^0.14", "monolog/monolog": "^1.21", "eloquent/asplode": "^2.2" + }, + "scripts": { + "test": [ + "composer validate", + "parallel-lint . --exclude vendor", + "minus-x check .", + "phpcs -s" + ] } } diff --git a/src/IndexPage.php b/src/IndexPage.php index f0e5f10..3627849 100644 --- a/src/IndexPage.php +++ b/src/IndexPage.php @@ -178,15 +178,51 @@ protected function getHtmlCrawler() { * Get a list of all pages: their numbers, labels, statuses, and URLs. Currently doing this in a * pretty clunky way that probably makes quite a few assumptions based on English Wikisource. * This method sends a request to Wikisource. + * + * @param bool $onlyExisting Only return info about pages that exist. This does not return the + * pages' label (it is set to the page number). * @return string[] Array of arrays with keys 'num', 'label', 'status', 'url'. */ - public function getPageList() { + public function getPageList( $onlyExisting = false ) { + $pagelist = []; + + // Use the API to get a list of all existing pages. + if ( $onlyExisting ) { + $req = new FluentRequest(); + $req->setAction( 'query' ); + $indexNsLocalName = $this->wikisource + ->getNamespaceLocalName( Wikisource::NS_NAME_INDEX ); + $title = substr( $this->getTitle(), strlen( $indexNsLocalName ) + 1 ); + $ns = $this->wikisource->getNamespaceId( Wikisource::NS_NAME_PAGE ); + $reqParams = [ + 'prop' => 'proofread', + 'generator' => 'prefixsearch', + 'gpssearch' => $title, + 'gpsnamespace' => $ns, + 'gpslimit' => 500, + ]; + $req->addParams( $reqParams ); + $res = $this->wikisource->sendApiRequest( $req, 'query.pages' ); + foreach ( $res as $page ) { + $subpage = substr( $page['title'], strrpos( $page['title'], '/' ) + 1 ); + // @TODO The label can not currently be retrieved from the API. + $pagelist['page-'.$subpage] = [ + 'label' => $subpage, + 'num' => $subpage, + 'url' => 'https://'.$this->getWikisource()->getDomainName().'/wiki/'.$page['title'], + 'quality' => $page['proofread']['quality'], + 'title' => $page['title'], + ]; + } + return $pagelist; + } + + // If we need non-existing pages as well, we have to scrape the HTML. :-( preg_match( '/(.*wikisource.org)/', $this->pageInfo['canonicalurl'], $matches ); $baseUrl = isset( $matches[1] ) ? $matches[1] : false; $pageCrawler = $this->getHtmlCrawler(); $pagelistAnchors = $pageCrawler->filterXPath( "//div[contains(@class, 'index-pagelist')]//a" ); - $pagelist = []; foreach ( $pagelistAnchors as $pageLink ) { // Get page URL (which is relative, starting with /w/index.php) and page number. $anchorHref = $pageLink->getAttribute( 'href' ); diff --git a/src/Wikisource.php b/src/Wikisource.php index 89b1b3e..b6dbab7 100644 --- a/src/Wikisource.php +++ b/src/Wikisource.php @@ -20,6 +20,9 @@ class Wikisource { /** The canonical name of the 'Index' namespace. */ const NS_NAME_INDEX = 'Index'; + /** The canonical name of the 'Page' namespace. */ + const NS_NAME_PAGE = 'Page'; + /** @var WikisourceApi The parent API object. */ protected $api; @@ -126,6 +129,38 @@ public function getIndexPageFromUrl( $url ) { * @return int The namespace ID, or false if it can't be found. */ public function getNamespaceId( $namespaceName ) { + foreach ( $this->getNamespaces() as $ns ) { + if ( isset( $ns['canonical'] ) && $ns['canonical'] === $namespaceName ) { + return $ns['id']; + } + } + return false; + } + + /** + * Get the local name for a single namespace. + * + * @param string $namespaceName The name of the namespace to get. + * @return string + */ + public function getNamespaceLocalName( $namespaceName ) { + if ( $namespaceName === '' ) { + return ''; + } + $namespaces = $this->getNamespaces(); + foreach ( $namespaces as $ns ) { + if ( isset( $ns['canonical'] ) && $ns['canonical'] === $namespaceName && isset( $ns['*'] ) ) { + return $ns['*']; + } + } + } + + /** + * Get information about the namespaces on this Wikisource. + * + * @return array + */ + public function getNamespaces() { $cacheKey = 'namespaces'.$this->getLanguageCode(); $namespaces = $this->getWikisoureApi()->cacheGet( $cacheKey ); if ( $namespaces !== false ) { @@ -133,18 +168,13 @@ public function getNamespaceId( $namespaceName ) { } else { $this->logger->debug( "Requesting namespace data for ".$this->getLanguageCode() ); $request = FluentRequest::factory() - ->setAction( 'query' ) - ->setParam( 'meta', 'siteinfo' ) - ->setParam( 'siprop', 'namespaces' ); + ->setAction( 'query' ) + ->setParam( 'meta', 'siteinfo' ) + ->setParam( 'siprop', 'namespaces' ); $namespaces = $this->sendApiRequest( $request, 'query.namespaces' ); $this->getWikisoureApi()->cacheSet( $cacheKey, $namespaces ); } - foreach ( $namespaces as $ns ) { - if ( isset( $ns['canonical'] ) && $ns['canonical'] === $namespaceName ) { - return $ns['id']; - } - } - return false; + return $namespaces; } /**