-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
248 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
<?php declare(strict_types=1); | ||
|
||
namespace Vantoozz\ProxyScraper\Scrapers; | ||
|
||
use Generator; | ||
use Symfony\Component\DomCrawler\Crawler as Dom; | ||
use Throwable; | ||
use Vantoozz\ProxyScraper\Enums\Metrics; | ||
use Vantoozz\ProxyScraper\Exceptions\HttpClientException; | ||
use Vantoozz\ProxyScraper\Exceptions\InvalidArgumentException; | ||
use Vantoozz\ProxyScraper\Exceptions\ScraperException; | ||
use Vantoozz\ProxyScraper\HttpClient\HttpClientInterface; | ||
use Vantoozz\ProxyScraper\Metric; | ||
use Vantoozz\ProxyScraper\Proxy; | ||
use Vantoozz\ProxyScraper\ProxyString; | ||
|
||
/**
 * Scrapes proxies from the proxynova.com proxy list page.
 *
 * The page is fetched through the injected HTTP client; every row matched by
 * the `#tbl_proxy_list tbody tr` selector is converted into a Proxy carrying
 * a SOURCE metric that names this class.
 *
 * @package Vantoozz\ProxyScraper\Scrapers
 */
final class ProxynovaScraper implements ScraperInterface, Discoverable
{
    /**
     * Address of the page that holds the proxy table.
     */
    private const URL = 'https://www.proxynova.com/proxy-server-list/';

    /**
     * Number of chunks the first cell must split into on single quotes,
     * i.e. the text looks like <prefix>'<ip>'<suffix>.
     */
    private const EXPECTED_PARTS_COUNT = 3;

    /**
     * @var HttpClientInterface
     */
    private $httpClient;

    /**
     * ProxynovaScraper constructor.
     * @param HttpClientInterface $httpClient client used to download the list page
     */
    public function __construct(HttpClientInterface $httpClient)
    {
        $this->httpClient = $httpClient;
    }

    /**
     * Lazily yields one Proxy per usable table row.
     *
     * Rows for which makeProxy() raises InvalidArgumentException are skipped;
     * a ScraperException raised by makeProxy() is not caught here and ends
     * the iteration.
     *
     * @return Generator|Proxy[]
     * @throws ScraperException when the page cannot be downloaded or a row has unknown markup
     */
    public function get(): Generator
    {
        try {
            // The class is final, so self:: and static:: resolve identically;
            // self:: is the conventional way to read a private constant.
            $html = $this->httpClient->get(self::URL);
        } catch (HttpClientException $e) {
            throw new ScraperException($e->getMessage(), $e->getCode(), $e);
        }

        $rows = (new Dom($html))->filter('#tbl_proxy_list tbody tr');

        foreach ($rows as $row) {
            try {
                yield $this->makeProxy(new Dom($row));
            } catch (InvalidArgumentException $e) {
                // Unreadable row: move on to the next one.
                continue;
            }
        }
    }

    /**
     * Builds a Proxy from a single table row.
     *
     * The first cell is expected to contain the IP wrapped in single quotes,
     * the second cell the port.
     *
     * @param Dom $row crawler positioned on one `tr` element
     * @return Proxy
     * @throws InvalidArgumentException when the cells cannot be read
     *         (presumably also raised by ProxyString for a malformed address - confirm)
     * @throws ScraperException when the first cell does not have the expected quoted shape
     */
    private function makeProxy(Dom $row): Proxy
    {
        try {
            // Query the cells once; kept inside the try so any crawler
            // failure is still reported as a bad row.
            $cells = $row->filter('td');
            $encodedIp4v = trim($cells->eq(0)->text());
            $port = (int)$cells->eq(1)->text();
        } catch (Throwable $e) {
            // getCode() is not guaranteed to be an int for every Throwable;
            // cast so re-wrapping cannot itself fail under strict_types.
            throw new InvalidArgumentException($e->getMessage(), (int)$e->getCode(), $e);
        }

        $parts = explode("'", $encodedIp4v);

        if (self::EXPECTED_PARTS_COUNT !== count($parts)) {
            throw new ScraperException('Unknown markup');
        }

        // $parts[1] is the text between the two single quotes.
        $proxy = (new ProxyString($parts[1] . ':' . $port))->asProxy();
        $proxy->addMetric(new Metric(Metrics::SOURCE, self::class));

        return $proxy;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
<?php declare(strict_types=1); | ||
|
||
namespace Vantoozz\ProxyScraper\IntegrationTests\Scrapers; | ||
|
||
use Vantoozz\ProxyScraper\IntegrationTests\IntegrationTest; | ||
use Vantoozz\ProxyScraper\Scrapers\ProxynovaScraper; | ||
use Vantoozz\ProxyScraper\Scrapers\UsProxyScraper; | ||
|
||
/**
 * Integration test for ProxynovaScraper, run with the HTTP client
 * obtained from $this->httpClient().
 *
 * @package Vantoozz\ProxyScraper\IntegrationTests\Scrapers
 */
final class ProxynovaScraperTest extends IntegrationTest
{
    /**
     * The scraper should produce at least one hundred proxies.
     *
     * @test
     */
    public function it_works(): void
    {
        $scraper = new ProxynovaScraper($this->httpClient());

        // Drain the generator first, then assert on the number of items.
        $found = count(iterator_to_array($scraper->get()));

        static::assertGreaterThanOrEqual(100, $found);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
<?php declare(strict_types=1); | ||
|
||
namespace Vantoozz\ProxyScraper\UnitTests\Scrapers; | ||
|
||
use PHPUnit\Framework\TestCase; | ||
use Vantoozz\ProxyScraper\Enums\Metrics; | ||
use Vantoozz\ProxyScraper\Exceptions\ScraperException; | ||
use Vantoozz\ProxyScraper\Proxy; | ||
use Vantoozz\ProxyScraper\Scrapers\ProxynovaScraper; | ||
use Vantoozz\ProxyScraper\UnitTests\HttpClient\FailingDummyHttpClient; | ||
use Vantoozz\ProxyScraper\UnitTests\HttpClient\PredefinedDummyHttpClient; | ||
|
||
/**
 * Unit tests for ProxynovaScraper driven by dummy HTTP clients.
 *
 * @package Vantoozz\ProxyScraper\UnitTests\Scrapers
 */
final class ProxynovaScraperTest extends TestCase
{
    /**
     * Table with a single well-formed row: quoted IP in the first cell, port in the second.
     */
    private const SINGLE_PROXY_HTML =
        '<table id="tbl_proxy_list"><tbody><tr><td>\'46.101.55.200\'</td><td>8118</td></tr></table>';

    /**
     * An HTTP client failure must surface as a ScraperException with the same message.
     *
     * @test
     */
    public function it_throws_an_exception_on_http_client_error(): void
    {
        $this->expectException(ScraperException::class);
        $this->expectExceptionMessage('error message');

        $failingClient = new FailingDummyHttpClient('error message');

        (new ProxynovaScraper($failingClient))->get()->current();
    }

    /**
     * The first metric of a scraped proxy names the scraper class as its source.
     *
     * @test
     */
    public function it_returns_source_metric(): void
    {
        $proxy = $this->scraperFor(self::SINGLE_PROXY_HTML)->get()->current();

        static::assertInstanceOf(Proxy::class, $proxy);
        /** @var Proxy $proxy */
        $firstMetric = $proxy->getMetrics()[0];
        static::assertSame(Metrics::SOURCE, $firstMetric->getName());
        static::assertSame(ProxynovaScraper::class, $firstMetric->getValue());
    }

    /**
     * A well-formed row is turned into a proxy with the expected address.
     *
     * @test
     */
    public function it_returns_a_proxy(): void
    {
        $proxy = $this->scraperFor(self::SINGLE_PROXY_HTML)->get()->current();

        static::assertInstanceOf(Proxy::class, $proxy);
        static::assertSame('46.101.55.200:8118', (string)$proxy);
    }

    /**
     * A first cell without the quoted IP is reported as unknown markup.
     *
     * NOTE(review): the method name says "markdown" where "markup" is meant;
     * kept unchanged so existing references to the test keep working.
     *
     * @test
     */
    public function it_throws_an_exception_if_unknown_markdown_got(): void
    {
        $this->expectException(ScraperException::class);
        $this->expectExceptionMessage('Unknown markup');

        $html = '<table id="tbl_proxy_list"><tbody><tr><td>111</td><td>111</td></tr></table>';

        $this->scraperFor($html)->get()->current();
    }

    /**
     * A row with a missing port cell yields nothing.
     *
     * @test
     */
    public function it_skips_bad_rows(): void
    {
        $html = '<table id="tbl_proxy_list"><tbody><tr><td>111</td></tr></table>';

        $first = $this->scraperFor($html)->get()->current();

        static::assertNull($first);
    }

    /**
     * Creates a scraper whose HTTP client is a PredefinedDummyHttpClient built from the given HTML.
     *
     * @param string $html markup handed to the dummy client
     * @return ProxynovaScraper
     */
    private function scraperFor(string $html): ProxynovaScraper
    {
        return new ProxynovaScraper(new PredefinedDummyHttpClient($html));
    }
}