Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update CheckIfDead to v1.8 #35

Merged
merged 6 commits into from
May 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 142 additions & 10 deletions src/CheckIfDead.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

namespace Wikimedia\DeadlinkChecker;

define( 'CHECKIFDEADVERSION', '1.7.3' );
define( 'CHECKIFDEADVERSION', '1.8' );

class CheckIfDead {

Expand All @@ -30,7 +30,7 @@ class CheckIfDead {
* UserAgent for the device/browser we are pretending to be
*/
// @codingStandardsIgnoreStart Line exceeds 100 characters
protected $userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36";
protected $userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36";

// @codingStandardsIgnoreEnd

Expand Down Expand Up @@ -88,6 +88,21 @@ class CheckIfDead {
*/
protected $verbose = false;

/**
* The host to connect to when attempting a TOR connection
*/
protected static $socks5Host = "127.0.0.1";

/**
* The port to connect to when attempting a TOR connection
*/
protected static $socks5Port = 9050;

/**
* Whether this environment can route requests through TOR.
* Null until the constructor has run its readiness check, then true or false.
*/
protected static $torEnabled = null;

/**
* Set up the class instance
*
Expand All @@ -102,13 +117,75 @@ public function __construct(
$curlTimeoutFull = 60,
$userAgent = false,
$sequentialTests = true,
$verbose = false
$verbose = false,
$socks5Host = '127.0.0.1',
$socks5Port = false
) {
$this->curlTimeoutNoBody = (int)$curlTimeoutNoBody;
$this->curlTimeoutFull = (int)$curlTimeoutFull;
$this->customUserAgent = $userAgent;
$this->queuedTesting = (bool)$sequentialTests;
$this->verbose = (bool)$verbose;

if ( is_null( self::$torEnabled ) ) {
// Check to see if we have an environment that supports TOR
if ( $this->verbose ) {
echo "Testing for TOR readiness...";
}

self::$socks5Host = $socks5Host;
if ( $socks5Port === false ) {
// If we are using TOR defaults, check OS to determine which defaults to use.
if ( substr( php_uname(), 0, 7 ) == "Windows" ) {
self::$socks5Port = 9150;
} else {
self::$socks5Port = 9050;
}
} else {
self::$socks5Port = $socks5Port;
}

$testURL = "https://check.torproject.org";

// Prepare test
$ch = curl_init();
// Get appropriate curl options

$options = $this->getCurlOptions(
$this->sanitizeURL( $testURL ),
true,
true
);
// Force Tor settings onto the options
$options[CURLOPT_PROXY] = self::$socks5Host . ":" . self::$socks5Port;
$options[CURLOPT_PROXYTYPE] = CURLPROXY_SOCKS5_HOSTNAME;
$options[CURLOPT_HTTPPROXYTUNNEL] = true;
curl_setopt_array(
$ch,
$options
);

$data = curl_exec( $ch );

if ( strpos( $data, "This browser is configured to use Tor." ) !== false ) {
self::$torEnabled = true;
} else {
self::$torEnabled = false;
}

curl_close( $ch );

if ( $this->verbose ) {
if ( self::$torEnabled ) {
echo "Ready\n";
echo "TOR requests can be made in this environment\n";
} else {
echo "Not ready\n";
echo "TOR requests will be ignored\n";
}
}
}

}

/**
Expand Down Expand Up @@ -150,6 +227,9 @@ public function areLinksDead( $urls ) {
$curl_instances = [];
// Array of URLs we want to send in for a full check
$fullCheckURLs = [];
// Maps the destination URL to the requested URL in case we followed a redirect
$fullCheckURLMap = [];

foreach ( $urls as $id => $url ) {
if ( $this->getRequestType( $this->sanitizeURL( $url ) ) != "UNSUPPORTED" ) {
$curl_instances[$id] = curl_init();
Expand All @@ -159,7 +239,7 @@ public function areLinksDead( $urls ) {
// Get appropriate curl options
curl_setopt_array(
$curl_instances[$id],
$this->getCurlOptions( $this->sanitizeURL( $url ), false )
$this->getCurlOptions( $this->sanitizeURL( $url ), false, $this->isOnion( $url ) )
);
// Add the instance handle
curl_multi_add_handle( $multicurl_resource, $curl_instances[$id] );
Expand Down Expand Up @@ -209,8 +289,13 @@ public function areLinksDead( $urls ) {
}
}
// If we got back a null, we should do a full page request
// We need to use the destination URL because cURL does not pass headers through
// when following redirects, which otherwise causes some false positives.
if ( is_null( $deadLinks[$url] ) ) {
$fullCheckURLs[] = $url;
$fullCheckURLs[] = $headers['url'];
if ( $url != $headers['url'] ) {
$fullCheckURLMap[$url] = $headers['url'];
}
}
} else {
$deadLinks[$url] = null;
Expand Down Expand Up @@ -240,6 +325,12 @@ public function areLinksDead( $urls ) {
}
// Merge back results from full requests into our deadlinks array
$deadLinks = array_merge( $deadLinks, $results );

// Use map to change destination URL back to the requested URL
foreach ( $fullCheckURLMap as $requested=>$destination ) {
$deadLinks[$requested] = $deadLinks[$destination];
unset ( $deadLinks[$destination] );
}
}
if ( count( $this->curlQueue ) > 1 ) {
sleep( 1 );
Expand Down Expand Up @@ -274,7 +365,11 @@ protected function performFullRequest( $urls ) {
// Get appropriate curl options
curl_setopt_array(
$curl_instances[$id],
$this->getCurlOptions( $this->sanitizeURL( $url, false, true ), true )
$this->getCurlOptions(
$this->sanitizeURL( $url, false, true ),
true,
$this->isOnion( $url )
)
);
// Add the instance handle
curl_multi_add_handle( $multicurl_resource, $curl_instances[$id] );
Expand Down Expand Up @@ -349,9 +444,10 @@ protected function queueRequests( $urls ) {
*
* @param $url String URL we are testing against
* @param bool $full Is this a request for the full page?
* @param bool $tor Is this request being routed through TOR?
* @return array Options for curl
*/
protected function getCurlOptions( $url, $full = false ) {
protected function getCurlOptions( $url, $full = false, $tor = false ) {
$requestType = $this->getRequestType( $url );
if ( $requestType == "MMS" ) {
$url = str_ireplace( "mms://", "rtsp://", $url );
Expand All @@ -373,14 +469,14 @@ protected function getCurlOptions( $url, $full = false ) {
// Emulate a web browser request but make it accept more than a web browser
$header = [
// @codingStandardsIgnoreStart Line exceeds 100 characters
'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although I'm not sure it matters, "image/apng" looks like a typo.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what I copied out of Safari. I honestly don't think it matters that much but we can change it.

// @codingStandardsIgnoreEnd
'Upgrade-Insecure-Requests: 1',
'Cache-Control: max-age=0',
'Connection: keep-alive',
'Keep-Alive: 300',
'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept-Language: en-us,en;q=0.7,*;q=0.5',
'Accept-Encoding: *',
'Pragma: '
];
if ( $this->customUserAgent === false ) {
Expand All @@ -404,13 +500,22 @@ protected function getCurlOptions( $url, $full = false ) {
$options[CURLOPT_TIMEOUT] = $this->curlTimeoutFull;
$options[CURLOPT_HTTPHEADER] = $header;
if ( $requestType != "MMS" && $requestType != "RTSP" ) {
$options[CURLOPT_ENCODING] = 'gzip,deflate';
$options[CURLOPT_ENCODING] = 'gzip, deflate, br';
}
$options[CURLOPT_USERAGENT] = $this->userAgent;
} else {
$options[CURLOPT_NOBODY] = 1;
}

if ( $tor && self::$torEnabled ) {
$options[CURLOPT_PROXY] = self::$socks5Host . ":" . self::$socks5Port;
$options[CURLOPT_PROXYTYPE] = CURLPROXY_SOCKS5_HOSTNAME;
$options[CURLOPT_HTTPPROXYTUNNEL] = true;

} else {
$options[CURLOPT_PROXYTYPE] = CURLPROXY_HTTP;
}

return $options;
}

Expand All @@ -421,6 +526,10 @@ protected function getCurlOptions( $url, $full = false ) {
* @return string "FTP", "MMS", "RTSP", "HTTP", or "UNSUPPORTED"
*/
protected function getRequestType( $url ) {
if ( $this->isOnion( $url ) && !self::$torEnabled ) {
return "UNSUPPORTED";
}

switch ( strtolower( parse_url( $url, PHP_URL_SCHEME ) ) ) {
case "ftp":
return "FTP";
Expand All @@ -436,6 +545,20 @@ protected function getRequestType( $url ) {
}
}

/**
 * Check if TOR is needed to access url
 *
 * The host is compared case-insensitively. A URL without a host component,
 * or one that parse_url() cannot parse, is never treated as an Onion URL.
 *
 * @param $url String URL we are checking against
 * @return bool True if it's an Onion URL, false otherwise
 */
protected function isOnion( $url ) {
	// parse_url() gives null when the URL has no host and false when the URL
	// is seriously malformed; neither can be an Onion address.
	$host = parse_url( $url, PHP_URL_HOST );
	if ( !is_string( $host ) ) {
		return false;
	}

	// Always return a real boolean so callers never receive null.
	return substr( strtolower( $host ), -6 ) === ".onion";
}

/**
* Process the returned headers
*
Expand Down Expand Up @@ -790,4 +913,13 @@ public function cleanURL( $input ) {
public function getErrors() {
	// Expose the instance's errors property to the caller unchanged.
	$errors = $this->errors;

	return $errors;
}

/**
* Returns the status of TOR readiness
*
* The underlying flag starts out as null and is set by the constructor's
* readiness check, so this returns null if that check has not run yet.
*
* @return bool|null True if TOR requests can be made, false if the environment
*  doesn't support TOR, null if readiness has not been tested yet
*/
public static function isTorEnabled() {
return self::$torEnabled;
}
}
18 changes: 13 additions & 5 deletions tests/checkIfDeadTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,44 +19,45 @@ public function testIsLinkDead( $url, $expect ) {
}

public function provideIsLinkDead() {
// Invoke CheckIfDead to determine TOR readiness
new CheckIfDead( 30, 60, false, true, true );

// @codingStandardsIgnoreStart Line exceeds 100 characters
$tests = [
[ 'https://en.wikipedia.org', false ],
[ '//en.wikipedia.org/wiki/Main_Page', false ],
[ 'https://en.wikipedia.org/w/index.php?title=Republic_of_India', false ],
[ 'ftp://ftp.rsa.com/pub/pkcs/ascii/layman.asc', false ],
[ 'http://www.discogs.com/Various-Kad-Jeknu-Dragačevske-Trube-2/release/1173051', false ],
//[ 'https://astraldynamics.com', false ],
// [ 'https://astraldynamics.com', false ],
[
'http://napavalleyregister.com/news/napa-pipe-plant-loads-its-final-rail-car/article_695e3e0a-8d33-5e3b-917c-07a7545b3594.html',
false
],
[ 'http://content.onlinejacc.org/cgi/content/full/41/9/1633', false ],
[ 'http://flysunairexpress.com/#about', false ],
[ 'http://www.palestineremembered.com/download/VillageStatistics/Table%20I/Haifa/Page-047.jpg', false ],
//[ 'http://list.english-heritage.org.uk/resultsingle.aspx?uid=1284140', false ],
[ 'http://archives.lse.ac.uk/TreeBrowse.aspx?src=CalmView.Catalog&field=RefNo&key=RICHARDS', false ],
[ 'https://en.wikipedia.org/w/index.php?title=Wikipedia:Templates_for_discussion/Holding%20cell&action=edit', false ],
[ 'http://hei.hankyung.com/news/app/newsview.php?aid=2011080869717', false ],
[ 'http://www.musicvf.com/Buck+Owens+%2526+Ringo+Starr.art', false ],
[ 'http://www.beweb.chiesacattolica.it/diocesi/diocesi/503/Aosta', false ],
[ 'http://www.dioceseoflascruces.org/', false ],
// [ 'http://www.dioceseoflascruces.org/', false ],
[ 'http://www.worcesterdiocese.org/', false ],
[ 'http://www.catholicdos.org/', false ],
[ 'http://www.diocesitivoli.it/', false ],
[ 'http://www.victoriadiocese.org/', false ],
[ 'http://www.saginaw.org/', false ],
[ 'http://www.dioceseofprovidence.org/', false ],
[ 'http://www.rcdop.org.uk/', false ],
[ 'mms://200.23.59.10/radiotam', false ],
[ 'mms://200.23.59.10/radiotam', true ],
[ 'http://babel.hathitrust.org/cgi/pt?id=pst.000003356951;view=1up;seq=1', false ],
[ 'http://parlinfo.aph.gov.au/parlInfo/search/display/display.w3p;query=Id%3A%22handbook%2Fnewhandbook%2F2014-10-31%2F0049%22', false ],
[ 'https://www.google.se/maps/@60.0254617,14.9787602,3a,75y,133.6h,84.1t/data=!3m6!1e1!3m4!1sqMn_R4TRF0CerotZfLlg8g!2e0!7i13312!8i6656', false ],

[ 'https://en.wikipedia.org/nothing', true ],
[ '//en.wikipedia.org/nothing', true ],
[ 'http://worldchiropracticalliance.org/resources/greens/green4.htm', true ],
//[ 'http://forums.lavag.org/Industrial-EtherNet-EtherNet-IP-t9041.html', true ],
[
'http://203.221.255.21/opacs/TitleDetails?displayid=137394&collection=all&displayid=0&fieldcode=2&from=BasicSearch&genreid=0&ITEMID=$VARS.getItemId()&original=$VARS.getOriginal()&pageno=1&phrasecode=1&searchwords=Lara%20Saint%20Paul%20&status=2&subjectid=0&index=',
true
Expand All @@ -69,6 +70,13 @@ public function provideIsLinkDead() {
$tests[] = [ 'http://кц.рф/ru/', false ];
}

if ( CheckIfDead::isTorEnabled() ) {
$tests[] = [ 'http://xmh57jrzrnw6insl.onion/', false ];
$tests[] = [ 'https://3g2upl4pq6kufc4m.onion/', false ];
$tests[] = [ 'https://3g2upl4pq6kufc4n.onion/', true ];
$tests[] = [ 'http://xmhqwe3rnw6insl.onion/', true ];
}

return $tests;
}

Expand Down