Skip to content

Commit

Permalink
~ Cool Proxy Scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
vantoozz committed Nov 4, 2019
1 parent b7f49dc commit c1f4457
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 52 deletions.
1 change: 1 addition & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
],
"require": {
"php": "^7.1",
"ext-json": "*",
"php-http/client-implementation": "~1",
"php-http/message-factory": "^1.0.2",
"php-http/httplug": "^1.1",
Expand Down
78 changes: 40 additions & 38 deletions src/Scrapers/CoolProxyScraper.php
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
<?php declare(strict_types = 1);
<?php declare(strict_types=1);

namespace Vantoozz\ProxyScraper\Scrapers;

use Symfony\Component\DomCrawler\Crawler as Dom;
use Generator;
use RuntimeException;
use Throwable;
use Vantoozz\ProxyScraper\Enums\Metrics;
use Vantoozz\ProxyScraper\Exceptions\HttpClientException;
use Vantoozz\ProxyScraper\Exceptions\InvalidArgumentException;
use Vantoozz\ProxyScraper\Exceptions\ScraperException;
use Vantoozz\ProxyScraper\HttpClient\HttpClientInterface;
use Vantoozz\ProxyScraper\Ipv4;
use Vantoozz\ProxyScraper\Metric;
Expand All @@ -17,8 +21,7 @@
*/
final class CoolProxyScraper implements ScraperInterface
{
private const MAX_PAGES_COUNT = 100;
private const PAGE_URL = 'https://www.cool-proxy.net/proxies/http_proxy_list/page:%d';
private const JSON_URL = 'https://www.cool-proxy.net/proxies.json';

/**
* @var HttpClientInterface
Expand All @@ -35,56 +38,55 @@ public function __construct(HttpClientInterface $httpClient)
}

/**
 * Downloads the JSON proxy list and lazily yields one Proxy per usable entry.
 *
 * Entries that are not arrays, or that makeProxy() cannot turn into a Proxy
 * (for example because "ip" or "port" is missing), are skipped so that a
 * single bad row does not abort the whole scrape.
 *
 * @return Generator|Proxy[]
 * @throws RuntimeException
 * @throws ScraperException if the HTTP request fails, the response is not
 *                          valid JSON, or the decoded document is not an array
 */
public function get(): Generator
{
    try {
        // The URL is a fixed string with no placeholders, so it is passed as is.
        $json = $this->httpClient->get(self::JSON_URL);
    } catch (HttpClientException $e) {
        // Re-throw as the scraper's own exception type, keeping the original as "previous".
        throw new ScraperException($e->getMessage(), $e->getCode(), $e);
    }

    $data = json_decode($json, true);

    // Ask the decoder whether it failed rather than testing truthiness:
    // valid documents such as "[]" or "0" decode to falsy values without any error.
    if (json_last_error() !== JSON_ERROR_NONE) {
        throw new ScraperException('Cannot parse json: ' . json_last_error_msg());
    }

    if (!is_array($data)) {
        throw new ScraperException('No data');
    }

    foreach ($data as $item) {
        if (!is_array($item)) {
            // Scalar entries cannot describe a proxy; skip them.
            continue;
        }
        try {
            yield $this->makeProxy($item);
        } catch (Throwable $e) {
            // Deliberate best-effort: makeProxy() throws for unusable rows, which are skipped.
            continue;
        }
    }
}


/**
* @param Dom $row
* @param array $item
* @return Proxy
* @throws \Throwable
* @throws Throwable
*/
private function makeProxy(Dom $row): Proxy
private function makeProxy(array $item): Proxy
{
$ipv4 = base64_decode(str_rot13(explode('"', $row->filter('td')->eq(0)->text())[1]));

$port = (int)$row->filter('td')->eq(1)->text();
if (!isset($item['ip'])) {
throw new InvalidArgumentException('No IP given');
}
if (!isset($item['port'])) {
throw new InvalidArgumentException('No port given');
}

$proxy = new Proxy(new Ipv4($ipv4), new Port($port));
$proxy = new Proxy(new Ipv4($item['ip']), new Port((int)$item['port']));
$proxy->addMetric(new Metric(Metrics::SOURCE, static::class));

return $proxy;
Expand Down
84 changes: 70 additions & 14 deletions tests/unit/Scrapers/CoolProxyScraperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
namespace Vantoozz\ProxyScraper\UnitTests\Scrapers;

use PHPUnit\Framework\TestCase;
use PHPUnit_Framework_MockObject_MockObject;
use Vantoozz\ProxyScraper\Enums\Metrics;
use Vantoozz\ProxyScraper\Exceptions\HttpClientException;
use Vantoozz\ProxyScraper\HttpClient\HttpClientInterface;
use Vantoozz\ProxyScraper\Proxy;
use Vantoozz\ProxyScraper\Scrapers\CoolProxyScraper;
use Vantoozz\ProxyScraper\Scrapers\HideMyIpScraper;

/**
* Class CoolProxyScraperTest
Expand All @@ -17,8 +19,10 @@ final class CoolProxyScraperTest extends TestCase
{
/**
* @test
* @expectedException \Vantoozz\ProxyScraper\Exceptions\ScraperException
* @expectedExceptionMessage error message
*/
public function it_stops_on_http_client_error(): void
public function it_throws_an_exception_on_http_client_error(): void
{
/** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */
$httpClient = $this->createMock(HttpClientInterface::class);
Expand All @@ -28,20 +32,20 @@ public function it_stops_on_http_client_error(): void
->willThrowException(new HttpClientException('error message'));

$scraper = new CoolProxyScraper($httpClient);
static::assertNull($scraper->get()->current());
$scraper->get()->current();
}

/**
* @test
*/
public function it_returns_source_metric(): void
{
/** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */
/** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $httpClient */
$httpClient = $this->createMock(HttpClientInterface::class);
$httpClient
->expects(static::once())
->method('get')
->willReturn('<table><tr><td>"ZGp3YwDmYwH3YwD4"</td><td>2222</td></tr></table>');
->willReturn('[{"ip":"177.43.57.48","port":2222},{"ip":"206.189.220.8","port":80}]');

$scraper = new CoolProxyScraper($httpClient);
$proxy = $scraper->get()->current();
Expand All @@ -57,12 +61,12 @@ public function it_returns_source_metric(): void
*/
public function it_returns_a_proxy(): void
{
/** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */
/** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $httpClient */
$httpClient = $this->createMock(HttpClientInterface::class);
$httpClient
->expects(static::once())
->method('get')
->willReturn('<table><tr><td>"ZGp3YwDmYwH3YwD4"</td><td>2222</td></tr></table>');
->willReturn('[{"ip":"177.43.57.48","port":2222},{"ip":"206.189.220.8","port":80}]');

$scraper = new CoolProxyScraper($httpClient);
$proxy = $scraper->get()->current();
Expand All @@ -74,35 +78,87 @@ public function it_returns_a_proxy(): void
/**
 * An entry that has a port but no "ip" key is expected to be skipped,
 * so the first value read from the generator is null.
 *
 * @test
 */
public function it_skips_rows_with_no_ip(): void
{
    /** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $httpClient */
    $httpClient = $this->createMock(HttpClientInterface::class);
    $httpClient
        ->expects(static::atLeastOnce())
        ->method('get')
        ->willReturn('[{"port":2222}]');

    $scraper = new CoolProxyScraper($httpClient);

    static::assertNull($scraper->get()->current());
}

/**
 * A list entry that is a bare scalar instead of an object is expected to be
 * skipped, so the first value read from the generator is null.
 *
 * @test
 */
public function it_skips_non_array_rows(): void
{
    /** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $httpClient */
    $httpClient = $this->createMock(HttpClientInterface::class);
    $httpClient
        ->expects(static::atLeastOnce())
        ->method('get')
        ->willReturn('[123]');

    $scraper = new CoolProxyScraper($httpClient);

    static::assertNull($scraper->get()->current());
}

/**
 * A row that carries an IP but no port is expected to be dropped,
 * so reading the first value from the generator gives null.
 *
 * @test
 */
public function it_skips_rows_with_no_port(): void
{
    $payload = '[{"ip":"177.43.57.48"}]';

    /** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $client */
    $client = $this->createMock(HttpClientInterface::class);
    $client->expects(self::atLeastOnce())->method('get')->willReturn($payload);

    $firstProxy = (new CoolProxyScraper($client))->get()->current();

    self::assertNull($firstProxy);
}

/**
 * A response body that is not valid JSON is expected to surface as a
 * ScraperException whose message includes the decoder's error text.
 *
 * @test
 */
public function it_throws_an_exception_if_bad_json_got(): void
{
    /** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $httpClient */
    $httpClient = $this->createMock(HttpClientInterface::class);
    $httpClient
        ->expects(static::once())
        ->method('get')
        ->willReturn('var json = dcvsdjh');

    $scraper = new CoolProxyScraper($httpClient);

    // Declare the expectation right before the call that should throw;
    // get() returns a generator, so nothing runs until current() advances it.
    $this->expectException(\Vantoozz\ProxyScraper\Exceptions\ScraperException::class);
    $this->expectExceptionMessage('Cannot parse json: Syntax error');

    $scraper->get()->current();
}

/**
 * A response that is valid JSON but decodes to a scalar rather than a list
 * is expected to surface as a ScraperException with the message "No data".
 *
 * @test
 */
public function it_throws_an_exception_if_no_data_got(): void
{
    /** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $httpClient */
    $httpClient = $this->createMock(HttpClientInterface::class);
    $httpClient
        ->expects(static::once())
        ->method('get')
        ->willReturn('123');

    $scraper = new CoolProxyScraper($httpClient);

    // Declare the expectation right before the call that should throw;
    // get() returns a generator, so nothing runs until current() advances it.
    $this->expectException(\Vantoozz\ProxyScraper\Exceptions\ScraperException::class);
    $this->expectExceptionMessage('No data');

    $scraper->get()->current();
}
}

0 comments on commit c1f4457

Please sign in to comment.