diff --git a/composer.json b/composer.json index 14c95eb..1ac5873 100644 --- a/composer.json +++ b/composer.json @@ -11,6 +11,7 @@ ], "require": { "php": "^7.1", + "ext-json": "*", "php-http/client-implementation": "~1", "php-http/message-factory": "^1.0.2", "php-http/httplug": "^1.1", diff --git a/src/Scrapers/CoolProxyScraper.php b/src/Scrapers/CoolProxyScraper.php index 6170c7c..0ae45f2 100644 --- a/src/Scrapers/CoolProxyScraper.php +++ b/src/Scrapers/CoolProxyScraper.php @@ -1,10 +1,14 @@ -getPage($page); - } catch (HttpClientException $e) { - break; - } - $page++; - } while ($page <= static::MAX_PAGES_COUNT); - } - - /** - * @param int $page - * @return \Generator - * @throws \Vantoozz\ProxyScraper\Exceptions\HttpClientException - * @throws \RuntimeException if the CssSelector Component is not available - */ - private function getPage(int $page): \Generator + public function get(): Generator { + try { + $json = $this->httpClient->get(sprintf(static::JSON_URL)); + } catch (HttpClientException $e) { + throw new ScraperException($e->getMessage(), $e->getCode(), $e); + } - $html = $this->httpClient->get(sprintf(static::PAGE_URL, $page)); + $data = json_decode($json, true); + if (!$data) { + throw new ScraperException('Cannot parse json: ' . json_last_error_msg()); + } - $rows = (new Dom($html))->filter('table tr'); + if (!is_array($data)) { + throw new ScraperException('No data'); + } - foreach ($rows as $row) { + foreach ($data as $item) { + if (!is_array($item)) { + $item = []; + } try { - yield $this->makeProxy(new Dom($row)); - } catch (\Throwable $e) { + yield $this->makeProxy($item); + } catch (Throwable $e) { continue; } } } + /** - * @param Dom $row + * @param array $item * @return Proxy - * @throws \Throwable + * @throws Throwable */ - private function makeProxy(Dom $row): Proxy + private function makeProxy(array $item): Proxy { - $ipv4 = base64_decode(str_rot13(explode('"', $row->filter('td')->eq(0)->text())[1])); - - $port = (int)$row->filter('td')->eq(1)->text(); + if (!isset($item['ip'])) { + throw new InvalidArgumentException('No IP given'); + } + if (!isset($item['port'])) { + throw new InvalidArgumentException('No port given'); + } - $proxy = new Proxy(new Ipv4($ipv4), new Port($port)); + $proxy = new Proxy(new Ipv4($item['ip']), new Port((int)$item['port'])); $proxy->addMetric(new Metric(Metrics::SOURCE, static::class)); return $proxy; diff --git a/tests/unit/Scrapers/CoolProxyScraperTest.php b/tests/unit/Scrapers/CoolProxyScraperTest.php index 773f1c6..ad391c3 100644 --- a/tests/unit/Scrapers/CoolProxyScraperTest.php +++ b/tests/unit/Scrapers/CoolProxyScraperTest.php @@ -3,11 +3,13 @@ namespace Vantoozz\ProxyScraper\UnitTests\Scrapers; use PHPUnit\Framework\TestCase; +use PHPUnit_Framework_MockObject_MockObject; use Vantoozz\ProxyScraper\Enums\Metrics; use Vantoozz\ProxyScraper\Exceptions\HttpClientException; use Vantoozz\ProxyScraper\HttpClient\HttpClientInterface; use Vantoozz\ProxyScraper\Proxy; use Vantoozz\ProxyScraper\Scrapers\CoolProxyScraper; +use Vantoozz\ProxyScraper\Scrapers\HideMyIpScraper; /** * Class CoolProxyScraperTest @@ -17,8 +19,10 @@ final class CoolProxyScraperTest extends TestCase { /** * @test + * @expectedException \Vantoozz\ProxyScraper\Exceptions\ScraperException + * @expectedExceptionMessage error message */ - public function it_stops_on_http_client_error(): void + public function it_throws_an_exception_on_http_client_error(): void { /** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */ $httpClient = $this->createMock(HttpClientInterface::class); @@ -28,7 +32,7 @@ public function it_stops_on_http_client_error(): void ->willThrowException(new HttpClientException('error message')); $scraper = new CoolProxyScraper($httpClient); - static::assertNull($scraper->get()->current()); + $scraper->get()->current(); } /** @@ -36,12 +40,12 @@ public function it_stops_on_http_client_error(): void */ public function it_returns_source_metric(): void { - /** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */ + /** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $httpClient */ $httpClient = $this->createMock(HttpClientInterface::class); $httpClient ->expects(static::once()) ->method('get') - ->willReturn('
"ZGp3YwDmYwH3YwD4"2222
'); + ->willReturn('[{"ip":"177.43.57.48","port":2222},{"ip":"206.189.220.8","port":80}]'); $scraper = new CoolProxyScraper($httpClient); $proxy = $scraper->get()->current(); @@ -57,12 +61,12 @@ public function it_returns_source_metric(): void */ public function it_returns_a_proxy(): void { - /** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */ + /** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $httpClient */ $httpClient = $this->createMock(HttpClientInterface::class); $httpClient ->expects(static::once()) ->method('get') - ->willReturn('
"ZGp3YwDmYwH3YwD4"2222
'); + ->willReturn('[{"ip":"177.43.57.48","port":2222},{"ip":"206.189.220.8","port":80}]'); $scraper = new CoolProxyScraper($httpClient); $proxy = $scraper->get()->current(); @@ -74,35 +78,87 @@ public function it_returns_a_proxy(): void /** * @test */ - public function it_fetches_no_more_than_100_pages(): void + public function it_skips_rows_with_no_ip(): void { - /** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */ + /** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $httpClient */ $httpClient = $this->createMock(HttpClientInterface::class); $httpClient ->expects(static::atLeastOnce()) ->method('get') - ->willReturn('
"ZGp3YwDmYwH3YwD4"2222
'); + ->willReturn('[{"port":2222}]'); $scraper = new CoolProxyScraper($httpClient); - $proxies = iterator_to_array($scraper->get(), false); - static::assertCount(100, $proxies); + static::assertNull($scraper->get()->current()); } /** * @test */ - public function it_skips_bad_rows(): void + public function it_skips_non_array_rows(): void { - /** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */ + /** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $httpClient */ $httpClient = $this->createMock(HttpClientInterface::class); $httpClient ->expects(static::atLeastOnce()) ->method('get') - ->willReturn('
aaa2222
'); + ->willReturn('[123]'); $scraper = new CoolProxyScraper($httpClient); static::assertNull($scraper->get()->current()); } + + /** + * @test + */ + public function it_skips_rows_with_no_port(): void + { + /** @var HttpClientInterface|PHPUnit_Framework_MockObject_MockObject $httpClient */ + $httpClient = $this->createMock(HttpClientInterface::class); + $httpClient + ->expects(static::atLeastOnce()) + ->method('get') + ->willReturn('[{"ip":"177.43.57.48"}]'); + + $scraper = new CoolProxyScraper($httpClient); + + static::assertNull($scraper->get()->current()); + } + + /** + * @test + * @expectedException \Vantoozz\ProxyScraper\Exceptions\ScraperException + * @expectedExceptionMessage Cannot parse json: Syntax error + */ + public function it_throws_an_exception_if_bad_json_got(): void + { + /** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */ + $httpClient = $this->createMock(HttpClientInterface::class); + $httpClient + ->expects(static::once()) + ->method('get') + ->willReturn('var json = dcvsdjh'); + + $scraper = new CoolProxyScraper($httpClient); + $scraper->get()->current(); + } + + /** + * @test + * @expectedException \Vantoozz\ProxyScraper\Exceptions\ScraperException + * @expectedExceptionMessage No data + */ + public function it_throws_an_exception_if_no_data_got(): void + { + /** @var HttpClientInterface|\PHPUnit_Framework_MockObject_MockObject $httpClient */ + $httpClient = $this->createMock(HttpClientInterface::class); + $httpClient + ->expects(static::once()) + ->method('get') + ->willReturn('123'); + + $scraper = new CoolProxyScraper($httpClient); + $scraper->get()->current(); + } }