diff --git a/.travis.yml b/.travis.yml old mode 100755 new mode 100644 index dcc315d..861db7f --- a/.travis.yml +++ b/.travis.yml @@ -1,15 +1,10 @@ language: php php: - - 5.6 - - 7.0 - - 7.1 + - 7.2 install: - composer install script: - - vendor/bin/phpcs -s - - vendor/bin/parallel-lint . --exclude vendor - - vendor/bin/phpdoc -q --template=checkstyle - - (! grep -B 1 error docs/api/checkstyle.xml) + - composer test diff --git a/composer.json b/composer.json index 766a310..3f91ef7 100644 --- a/composer.json +++ b/composer.json @@ -2,7 +2,7 @@ "name": "wikisource/api", "description": "A PHP API to Wikisources (all languages).", "type": "library", - "license": "GPL-2.0+", + "license": "GPL-2.0-or-later", "support": { "irc": "irc://irc.freenode.org/wikisource", "issues": "https://phabricator.wikimedia.org/tag/wikisource-api/", @@ -19,19 +19,28 @@ } }, "require": { + "ext-json": "*", + "ext-simplexml": "*", "psr/cache": "^1.0", "psr/log": "^1.0", "dflydev/dot-access-data": "^1.0", "addwiki/mediawiki-api": "^0.7", - "symfony/dom-crawler": "^3.1" + "symfony/dom-crawler": "^4.2" }, "require-dev": { + "mediawiki/minus-x": "^0.3", "jakub-onderka/php-parallel-lint": "^0.9", "mediawiki/mediawiki-codesniffer": "^13.0", - "phpunit/phpunit": "^5.6", - "phpdocumentor/phpdocumentor": "^2.9", "tedivm/stash": "^0.14", "monolog/monolog": "^1.21", "eloquent/asplode": "^2.2" + }, + "scripts": { + "test": [ + "composer validate", + "parallel-lint . --exclude vendor", + "minus-x check .", + "phpcs -s" + ] } } diff --git a/src/IndexPage.php b/src/IndexPage.php index f0e5f10..3627849 100644 --- a/src/IndexPage.php +++ b/src/IndexPage.php @@ -178,15 +178,51 @@ protected function getHtmlCrawler() { * Get a list of all pages: their numbers, labels, statuses, and URLs. Currently doing this in a * pretty clunky way that probably makes quite a few assumptions based on English Wikisource. * This method sends a request to Wikisource. 
+ * + * @param bool $onlyExisting Only return info about pages that exist. This does not return the + * pages' label (it is set to the page number). * @return string[] Array of arrays with keys 'num', 'label', 'status', 'url'. */ - public function getPageList() { + public function getPageList( $onlyExisting = false ) { + $pagelist = []; + + // Use the API to get a list of all existing pages. + if ( $onlyExisting ) { + $req = new FluentRequest(); + $req->setAction( 'query' ); + $indexNsLocalName = $this->wikisource + ->getNamespaceLocalName( Wikisource::NS_NAME_INDEX ); + $title = substr( $this->getTitle(), strlen( $indexNsLocalName ) + 1 ); + $ns = $this->wikisource->getNamespaceId( Wikisource::NS_NAME_PAGE ); + $reqParams = [ + 'prop' => 'proofread', + 'generator' => 'prefixsearch', + 'gpssearch' => $title, + 'gpsnamespace' => $ns, + 'gpslimit' => 500, + ]; + $req->addParams( $reqParams ); + $res = $this->wikisource->sendApiRequest( $req, 'query.pages' ); + foreach ( $res as $page ) { + $subpage = substr( $page['title'], strrpos( $page['title'], '/' ) + 1 ); + // @TODO The label can not currently be retrieved from the API. + $pagelist['page-'.$subpage] = [ + 'label' => $subpage, + 'num' => $subpage, + 'url' => 'https://'.$this->getWikisource()->getDomainName().'/wiki/'.$page['title'], + 'quality' => $page['proofread']['quality'], + 'title' => $page['title'], + ]; + } + return $pagelist; + } + + // If we need non-existing pages as well, we have to scrape the HTML. :-( preg_match( '/(.*wikisource.org)/', $this->pageInfo['canonicalurl'], $matches ); $baseUrl = isset( $matches[1] ) ? $matches[1] : false; $pageCrawler = $this->getHtmlCrawler(); $pagelistAnchors = $pageCrawler->filterXPath( "//div[contains(@class, 'index-pagelist')]//a" ); - $pagelist = []; foreach ( $pagelistAnchors as $pageLink ) { // Get page URL (which is relative, starting with /w/index.php) and page number. 
$anchorHref = $pageLink->getAttribute( 'href' );
diff --git a/src/Wikisource.php b/src/Wikisource.php
index 89b1b3e..b6dbab7 100644
--- a/src/Wikisource.php
+++ b/src/Wikisource.php
@@ -20,6 +20,9 @@ class Wikisource {
 	/** The canonical name of the 'Index' namespace. */
 	const NS_NAME_INDEX = 'Index';
 
+	/** The canonical name of the 'Page' namespace. */
+	const NS_NAME_PAGE = 'Page';
+
 	/** @var WikisourceApi The parent API object. */
 	protected $api;
 
@@ -126,6 +129,38 @@ public function getIndexPageFromUrl( $url ) {
 	 * @return int The namespace ID, or false if it can't be found.
 	 */
 	public function getNamespaceId( $namespaceName ) {
+		foreach ( $this->getNamespaces() as $ns ) {
+			if ( isset( $ns['canonical'] ) && $ns['canonical'] === $namespaceName ) {
+				return $ns['id'];
+			}
+		}
+		return false;
+	}
+
+	/**
+	 * Get the local name of a single namespace, looked up by its canonical name.
+	 *
+	 * @param string $namespaceName The canonical name of the namespace, e.g. self::NS_NAME_INDEX.
+	 * @return string The local name, or an empty string if the name is empty or not found.
+	 */
+	public function getNamespaceLocalName( $namespaceName ) {
+		if ( $namespaceName === '' ) {
+			return '';
+		}
+		foreach ( $this->getNamespaces() as $ns ) {
+			if ( isset( $ns['canonical'], $ns['*'] ) && $ns['canonical'] === $namespaceName ) {
+				return $ns['*'];
+			}
+		}
+		return '';
+	}
+
+	/**
+	 * Get information about the namespaces on this Wikisource.
+ * + * @return array + */ + public function getNamespaces() { $cacheKey = 'namespaces'.$this->getLanguageCode(); $namespaces = $this->getWikisoureApi()->cacheGet( $cacheKey ); if ( $namespaces !== false ) { @@ -133,18 +168,13 @@ public function getNamespaceId( $namespaceName ) { } else { $this->logger->debug( "Requesting namespace data for ".$this->getLanguageCode() ); $request = FluentRequest::factory() - ->setAction( 'query' ) - ->setParam( 'meta', 'siteinfo' ) - ->setParam( 'siprop', 'namespaces' ); + ->setAction( 'query' ) + ->setParam( 'meta', 'siteinfo' ) + ->setParam( 'siprop', 'namespaces' ); $namespaces = $this->sendApiRequest( $request, 'query.namespaces' ); $this->getWikisoureApi()->cacheSet( $cacheKey, $namespaces ); } - foreach ( $namespaces as $ns ) { - if ( isset( $ns['canonical'] ) && $ns['canonical'] === $namespaceName ) { - return $ns['id']; - } - } - return false; + return $namespaces; } /**