Skip to content

Commit

Permalink
~ proxydb scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
vantoozz committed Dec 6, 2017
1 parent 2e04d67 commit 5446ac8
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 11 deletions.
45 changes: 38 additions & 7 deletions src/Scrapers/ProxyDbScraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
*/
final class ProxyDbScraper implements ScraperInterface
{
private const PAGE_SIZE = 50;
private const PAGE_SIZE = 15;
private const MAX_OFFSET = 1000;
private const PAGE_URL = 'http://proxydb.net/?limit=%d&offset=%d';
private const PAGE_URL = 'http://proxydb.net?offset=%d';

/**
* @var HttpClientInterface
Expand All @@ -46,22 +46,21 @@ public function get(): \Generator
$offset = 0;
$pageSize = self::PAGE_SIZE;
do {
yield from $this->getPage($offset, $pageSize);
yield from $this->getPage($offset);
$offset += $pageSize;
} while ($offset <= self::MAX_OFFSET);
}

/**
* @param int $offset
* @param int $pageSize
* @return \Generator
* @throws \RuntimeException if the CssSelector Component is not available
* @throws \Vantoozz\ProxyScraper\Exceptions\ScraperException
*/
private function getPage(int $offset, int $pageSize): \Generator
private function getPage(int $offset): \Generator
{
try {
$html = $this->httpClient->get(sprintf(static::PAGE_URL, $pageSize, $offset));
$html = $this->httpClient->get(sprintf(static::PAGE_URL, $offset));
} catch (HttpClientException $e) {
throw new ScraperException($e->getMessage(), $e->getCode(), $e);
}
Expand Down Expand Up @@ -90,9 +89,41 @@ private function getPage(int $offset, int $pageSize): \Generator
*/
private function makeProxy(Dom $row): Proxy
{
$proxy = (new ProxyString(trim($row->filter('td')->eq(0)->text())))->asProxy();
$encoded = trim($row->filter('td')->eq(0)->text());

$proxy = (new ProxyString($this->decode($encoded)))->asProxy();
$proxy->addMetric(new Metric(Metrics::SOURCE, static::class));

return $proxy;
}

/**
 * Rebuilds a plain "host:port" string from the obfuscated inline script
 * found in a proxy table cell.
 *
 * Three JavaScript statements are looked for in the given text:
 *  - `var n = '...'.split('').reverse().join('');` - first part of the host, stored reversed;
 *  - `var yy = atob('...'.replace(...` - rest of the host, base64-encoded, optionally
 *    spelled with `\xNN` hex escapes;
 *  - `var pp = A + B;` - the port, stored as a sum of two integers.
 *
 * Any statement that is not found contributes nothing to the result, so an
 * unrecognised input yields an empty (or partial) string.
 *
 * @param string $encoded raw text of the cell
 * @return string the decoded proxy string
 */
private function decode(string $encoded): string
{
    $result = '';

    $matches = [];
    preg_match("/var n = \'(.+)'\.split\(\'\'\)\.reverse\(\)\.join\(\'\'\);/", $encoded, $matches);

    if (array_key_exists(1, $matches)) {
        $result .= strrev($matches[1]);
    }

    preg_match("/var yy = atob\(\'(.+)\'\.replace/", $encoded, $matches);

    if (array_key_exists(1, $matches)) {
        // In the raw page source the atob() argument is a JavaScript string literal
        // written with \xNN escapes. A browser resolves them while parsing the literal;
        // here they are still plain text and must be resolved before base64-decoding,
        // otherwise base64_decode() drops the backslashes and decodes garbage.
        // Input that is already plain base64 passes through unchanged.
        $base64 = preg_replace_callback(
            '/\\\\x([0-9A-Fa-f]{2})/',
            static function (array $escape): string {
                return chr((int)hexdec($escape[1]));
            },
            $matches[1]
        );

        $result .= base64_decode((string)$base64);
    }

    preg_match("/var pp = (.+)\s\+\s(.+);/", $encoded, $matches);

    if (array_key_exists(2, $matches)) {
        // The port is published as "A + B"; evaluate the sum.
        $result .= ':' . ((int)$matches[1] + (int)$matches[2]);
    }

    return $result;
}
}
37 changes: 33 additions & 4 deletions tests/unit/Scrapers/ProxyDbScraperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,25 @@ public function it_throws_an_exception_on_non_html_response(): void
*/
public function it_returns_source_metric(): void
{
$html = <<<HTML
<table><tbody><tr>
<td>
<script>
var n = '1.631.312'.split('').reverse().join('');
var yy = atob('\x4d\x44\x55\x75\x4e\x6a\x49\x3d'.replace(/\\x([0-9A-Fa-f]{2})/g,function(){return String.fromCharCode(parseInt(arguments[1], 16))}));
var pp = -14920 + 18048;
document.write('<a href="/' + n + yy + '/' + pp + '#http" title="lsocit-213.136.105.62.aviso.ci">' + n + yy + String.fromCharCode(58) + pp + '</a>');
proxies.push(n + yy + String.fromCharCode(58) + pp);
</script>
</td>
</tr></table>
HTML;
/** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */
$httpClient = $this->createMock(HttpClientInterface::class);
$httpClient
->expects(static::once())
->method('get')
->willReturn('<table><tbody><tr><td>46.101.55.200:8118</td></tr></table>');
->willReturn($html);

$scraper = new ProxyDbScraper($httpClient);
$proxy = $scraper->get()->current();
Expand All @@ -76,18 +89,34 @@ public function it_returns_source_metric(): void
*/
public function it_returns_a_proxy(): void
{

$html = <<<HTML
<table><tbody><tr>
<td>
<script>
var n = '1.631.312'.split('').reverse().join('');
var yy = atob('\x4d\x44\x55\x75\x4e\x6a\x49\x3d'.replace(/\\x([0-9A-Fa-f]{2})/g,function(){return String.fromCharCode(parseInt(arguments[1], 16))}));
var pp = -14920 + 18048;
document.write('<a href="/' + n + yy + '/' + pp + '#http" title="lsocit-213.136.105.62.aviso.ci">' + n + yy + String.fromCharCode(58) + pp + '</a>');
proxies.push(n + yy + String.fromCharCode(58) + pp);
</script>
</td>
</tr></table>
HTML;


/** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */
$httpClient = $this->createMock(HttpClientInterface::class);
$httpClient
->expects(static::once())
->method('get')
->willReturn('<table><tbody><tr><td>46.101.55.200:8118</td></tr></table>');
->willReturn($html);

$scraper = new ProxyDbScraper($httpClient);
$proxy = $scraper->get()->current();

static::assertInstanceOf(Proxy::class, $proxy);
static::assertSame('46.101.55.200:8118', (string)$proxy);
static::assertSame('213.136.105.62:3128', (string)$proxy);
}

/**
Expand All @@ -98,7 +127,7 @@ public function it_skips_bad_rows(): void
/** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */
$httpClient = $this->createMock(HttpClientInterface::class);
$httpClient
->expects(static::exactly(21))
->expects(static::exactly(67))
->method('get')
->willReturn('<table><tbody><tr><td>bad proxy string</td></tr></table>');

Expand Down

0 comments on commit 5446ac8

Please sign in to comment.