Skip to content

Commit

Permalink
+ proxynova scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
vantoozz committed Apr 25, 2020
1 parent 895ffc2 commit f214f45
Show file tree
Hide file tree
Showing 6 changed files with 248 additions and 27 deletions.
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,34 @@ Library for scraping free proxies lists written in PHP
[![Codacy Badge](https://api.codacy.com/project/badge/Grade/4b3e0816e98d486e9f0eff445a6310c6)](https://www.codacy.com/app/vantoozz/proxy-scraper?utm_source=github.com&utm_medium=referral&utm_content=vantoozz/proxy-scraper&utm_campaign=Badge_Grade)
[![Packagist](https://img.shields.io/packagist/v/vantoozz/proxy-scraper.svg)](https://packagist.org/packages/vantoozz/proxy-scraper)

### Quick start
```bash
composer require vantoozz/proxy-scraper php-http/guzzle6-adapter hanneskod/classtools
```
```php
<?php declare(strict_types = 1);

use function Vantoozz\ProxyScraper\proxyScraper;

require_once __DIR__ . '/vendor/autoload.php';

foreach (proxyScraper()->get() as $proxy) {
echo $proxy . "\n";
}
```

### Older versions
This is version 2 of the library. For version 1, please check the [v1](https://github.com/vantoozz/proxy-scraper/tree/v1) branch.

### Setup

The library is built on top of [HTTPlug](http://httplug.io/) and requires a compatible HTTP client. Available clients are listed on Packagist: https://packagist.org/providers/php-http/client-implementation. To use the library you have to install any of them, e.g.:
The library uses [HTTPlug](http://httplug.io/) and requires a compatible HTTP client.
To use the library, you have to install one such client, e.g.:

```bash
composer require php-http/guzzle6-adapter
```
All available clients are listed on Packagist: https://packagist.org/providers/php-http/client-implementation.

Then install the proxy-scraper library itself:
```bash
Expand Down
1 change: 0 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
"require-dev": {
"php-http/guzzle6-adapter": "~2",
"phpunit/phpunit": "^8.4|~9",
"league/container": "~3",
"hanneskod/classtools": "~1"
},
"suggest": {
Expand Down
91 changes: 91 additions & 0 deletions src/Scrapers/ProxynovaScraper.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
<?php declare(strict_types=1);

namespace Vantoozz\ProxyScraper\Scrapers;

use Generator;
use Symfony\Component\DomCrawler\Crawler as Dom;
use Throwable;
use Vantoozz\ProxyScraper\Enums\Metrics;
use Vantoozz\ProxyScraper\Exceptions\HttpClientException;
use Vantoozz\ProxyScraper\Exceptions\InvalidArgumentException;
use Vantoozz\ProxyScraper\Exceptions\ScraperException;
use Vantoozz\ProxyScraper\HttpClient\HttpClientInterface;
use Vantoozz\ProxyScraper\Metric;
use Vantoozz\ProxyScraper\Proxy;
use Vantoozz\ProxyScraper\ProxyString;

/**
 * Scraper for the proxy list published on proxynova.com.
 *
 * The listing page is downloaded through the injected HTTP client and every
 * row of the `#tbl_proxy_list` table is converted into a Proxy that carries
 * this class name as its SOURCE metric.
 *
 * @package Vantoozz\ProxyScraper\Scrapers
 */
final class ProxynovaScraper implements ScraperInterface, Discoverable
{
    /**
     * Address of the page holding the proxy table.
     */
    private const URL = 'https://www.proxynova.com/proxy-server-list/';

    /**
     * Client used to download the listing page.
     *
     * @var HttpClientInterface
     */
    private $httpClient;

    /**
     * @param HttpClientInterface $httpClient client used to download the listing page
     */
    public function __construct(HttpClientInterface $httpClient)
    {
        $this->httpClient = $httpClient;
    }

    /**
     * Lazily yields one Proxy per usable table row.
     *
     * Rows that raise InvalidArgumentException are skipped silently; a
     * ScraperException raised while handling a row is not caught here and
     * therefore ends the iteration.
     *
     * @return Generator|Proxy[]
     * @throws ScraperException when the page cannot be downloaded or a row has unexpected markup
     */
    public function get(): Generator
    {
        try {
            $page = $this->httpClient->get(self::URL);
        } catch (HttpClientException $failure) {
            // Re-throw as the scraper-level exception, keeping message, code and cause.
            throw new ScraperException($failure->getMessage(), $failure->getCode(), $failure);
        }

        $document = new Dom($page);

        foreach ($document->filter('#tbl_proxy_list tbody tr') as $node) {
            try {
                yield $this->makeProxy(new Dom($node));
            } catch (InvalidArgumentException $skipped) {
                // Deliberately ignored: this row is unusable, move on to the next one.
            }
        }
    }

    /**
     * Builds a Proxy from a single table row.
     *
     * The first cell is expected to contain the address wrapped in single
     * quotes, the second cell the port number.
     *
     * @param Dom $row crawler positioned on one `tr` element
     * @return Proxy
     * @throws InvalidArgumentException when the cells cannot be read (and possibly
     *                                  from ProxyString for a bad value - not visible here)
     * @throws ScraperException when the first cell does not contain exactly one quoted value
     */
    private function makeProxy(Dom $row): Proxy
    {
        try {
            $cells = $row->filter('td');
            $addressCell = trim($cells->eq(0)->text());
            $port = (int)$cells->eq(1)->text();
        } catch (Throwable $failure) {
            // Any failure while reading the cells marks the row as invalid input.
            throw new InvalidArgumentException($failure->getMessage(), $failure->getCode(), $failure);
        }

        // Splitting on single quotes must give: text before, quoted value, text after.
        $pieces = explode("'", $addressCell);
        if (count($pieces) !== 3) {
            throw new ScraperException('Unknown markup');
        }

        $proxy = (new ProxyString("{$pieces[1]}:{$port}"))->asProxy();
        $proxy->addMetric(new Metric(Metrics::SOURCE, self::class));

        return $proxy;
    }
}
25 changes: 25 additions & 0 deletions tests/integration/Scrapers/ProxynovaScraperTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<?php declare(strict_types=1);

namespace Vantoozz\ProxyScraper\IntegrationTests\Scrapers;

use Vantoozz\ProxyScraper\IntegrationTests\IntegrationTest;
use Vantoozz\ProxyScraper\Scrapers\ProxynovaScraper;
use Vantoozz\ProxyScraper\Scrapers\UsProxyScraper;

/**
 * Integration test running ProxynovaScraper against the HTTP client
 * supplied by the IntegrationTest base class.
 *
 * @package Vantoozz\ProxyScraper\IntegrationTests\Scrapers
 */
final class ProxynovaScraperTest extends IntegrationTest
{
    /**
     * The scraper is expected to return at least one hundred proxies.
     *
     * @test
     */
    public function it_works(): void
    {
        $minimumExpected = 100;

        $scraper = new ProxynovaScraper($this->httpClient());
        $found = iterator_to_array($scraper->get());

        self::assertGreaterThanOrEqual($minimumExpected, count($found));
    }
}
41 changes: 16 additions & 25 deletions tests/systemTests.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,49 +5,40 @@
require_once __DIR__ . '/../vendor/autoload.php';

use GuzzleHttp\Client as GuzzleClient;
use hanneskod\classtools\Iterator\ClassIterator;
use Http\Adapter\Guzzle6\Client as HttpAdapter;
use Http\Message\MessageFactory\GuzzleMessageFactory as MessageFactory;
use League\Container\Container;
use League\Container\ReflectionContainer;
use Vantoozz\ProxyScraper\HttpClient\HttpClientInterface;
use Symfony\Component\Finder\Finder;
use Vantoozz\ProxyScraper\HttpClient\HttplugHttpClient;
use Vantoozz\ProxyScraper\Scrapers;
use Vantoozz\ProxyScraper\Scrapers\Discoverable;
use Vantoozz\ProxyScraper\Scrapers\ScraperInterface;
use Vantoozz\ProxyScraper\SystemTests\ProxiesMiner\Cached;
use Vantoozz\ProxyScraper\SystemTests\Reports\CountsReport;
use Vantoozz\ProxyScraper\SystemTests\Reports\DuplicatesReport;
use Vantoozz\ProxyScraper\SystemTests\Reports\ExclusivityReport;
use Vantoozz\ProxyScraper\SystemTests\Reports\ReportsPipeline;


$httpClient = new HttplugHttpClient(
new HttpAdapter(new GuzzleClient([
'connect_timeout' => 2,
'timeout' => 3,
'connect_timeout' => 5,
'timeout' => 10,
])),
new MessageFactory
);

$container = new Container;
$container->delegate(new ReflectionContainer);
$container->add(HttpClientInterface::class, $httpClient, true);

$miner = new ProxiesMiner\ScrapersProxiesMiner;
foreach ([
Scrapers\BlogspotProxyScraper::class,
Scrapers\CheckProxyScraper::class,
Scrapers\CoolProxyScraper::class,
Scrapers\FreeProxyListScraper::class,
Scrapers\HideMyIpScraper::class,
Scrapers\MultiproxyScraper::class,
Scrapers\ProxyServerlistScraper::class,
Scrapers\SocksProxyScraper::class,
Scrapers\SslProxiesScraper::class,
Scrapers\UsProxyScraper::class,
Scrapers\TopProxysScraper::class,
] as $class) {
$miner->addScraper($container->get($class));

// Discover scrapers automatically: scan every file under src/Scrapers and
// keep only the classes selected by ClassIterator::type(Discoverable::class).
$classIterator = new ClassIterator((new Finder)->in(__DIR__ . '/../src/Scrapers'));
foreach ($classIterator->type(Discoverable::class) as $class) {
// Skip anything that cannot be constructed (presumably interfaces and
// abstract classes - $class looks like a ReflectionClass; confirm).
if (!$class->isInstantiable()) {
continue;
}
// Each discovered scraper is built with the shared HTTP client as its only
// constructor argument and registered with the miner.
/** @var ScraperInterface $scraper */
$scraper = $class->newInstance($httpClient);
$miner->addScraper($scraper);
}


$cacheFilename = __DIR__ . '/.cached_proxies';
if (in_array('--refresh', $argv, true)) {
unlink($cacheFilename);
Expand Down
97 changes: 97 additions & 0 deletions tests/unit/Scrapers/ProxynovaScraperTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
<?php declare(strict_types=1);

namespace Vantoozz\ProxyScraper\UnitTests\Scrapers;

use PHPUnit\Framework\TestCase;
use Vantoozz\ProxyScraper\Enums\Metrics;
use Vantoozz\ProxyScraper\Exceptions\ScraperException;
use Vantoozz\ProxyScraper\Proxy;
use Vantoozz\ProxyScraper\Scrapers\ProxynovaScraper;
use Vantoozz\ProxyScraper\UnitTests\HttpClient\FailingDummyHttpClient;
use Vantoozz\ProxyScraper\UnitTests\HttpClient\PredefinedDummyHttpClient;

/**
 * Unit tests for ProxynovaScraper, driven by dummy HTTP clients that either
 * fail or return a predefined HTML body.
 *
 * @package Vantoozz\ProxyScraper\UnitTests\Scrapers
 */
final class ProxynovaScraperTest extends TestCase
{
    /**
     * Listing page with a single well-formed row (quoted address and port).
     */
    private const VALID_PAGE = '<table id="tbl_proxy_list"><tbody><tr><td>\'46.101.55.200\'</td><td>8118</td></tr></table>';

    /**
     * A failure in the HTTP client surfaces as a ScraperException with the same message.
     *
     * @test
     */
    public function it_throws_an_exception_on_http_client_error(): void
    {
        $this->expectException(ScraperException::class);
        $this->expectExceptionMessage('error message');

        $failingClient = new FailingDummyHttpClient('error message');

        (new ProxynovaScraper($failingClient))->get()->current();
    }

    /**
     * The yielded proxy carries the scraper class name as its SOURCE metric.
     *
     * @test
     */
    public function it_returns_source_metric(): void
    {
        $proxy = $this->firstItemFrom(self::VALID_PAGE);

        self::assertInstanceOf(Proxy::class, $proxy);
        /** @var Proxy $proxy */
        $sourceMetric = $proxy->getMetrics()[0];
        self::assertSame(Metrics::SOURCE, $sourceMetric->getName());
        self::assertSame(ProxynovaScraper::class, $sourceMetric->getValue());
    }

    /**
     * A well-formed row becomes an "ip:port" proxy.
     *
     * @test
     */
    public function it_returns_a_proxy(): void
    {
        $proxy = $this->firstItemFrom(self::VALID_PAGE);

        self::assertInstanceOf(Proxy::class, $proxy);
        self::assertSame('46.101.55.200:8118', (string)$proxy);
    }

    /**
     * A row whose first cell holds no quoted value is reported as unknown markup.
     *
     * @test
     */
    public function it_throws_an_exception_if_unknown_markdown_got(): void
    {
        $this->expectException(ScraperException::class);
        $this->expectExceptionMessage('Unknown markup');

        $this->firstItemFrom('<table id="tbl_proxy_list"><tbody><tr><td>111</td><td>111</td></tr></table>');
    }

    /**
     * A row without a port cell is skipped, so nothing is yielded.
     *
     * @test
     */
    public function it_skips_bad_rows(): void
    {
        $firstItem = $this->firstItemFrom('<table id="tbl_proxy_list"><tbody><tr><td>111</td></tr></table>');

        self::assertNull($firstItem);
    }

    /**
     * Runs the scraper over the given HTML and returns the first yielded item.
     *
     * @param string $html body the dummy HTTP client will return
     * @return Proxy|null null when the generator yields nothing
     */
    private function firstItemFrom(string $html)
    {
        $scraper = new ProxynovaScraper(new PredefinedDummyHttpClient($html));

        return $scraper->get()->current();
    }
}

0 comments on commit f214f45

Please sign in to comment.