Skip to content

Commit

Permalink
Scrapers Updated (#3)
Browse files Browse the repository at this point in the history
* + blogspot (RSS feed) scrapers, - deprecated scrapers

* + integration & unit tests, + fix HideMyIpScraper tests

* + Fix bug and add test for BlogSpotProxyScraper & ProxyServerListScraper

* ~ re-arrange code & add try catch stmt

* ~ update try-catch exception type
  • Loading branch information
sleeyax authored and vantoozz committed Jul 24, 2018
1 parent 5610aa3 commit 1e0781b
Show file tree
Hide file tree
Showing 32 changed files with 972 additions and 718 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ $httpClient = new GuzzleHttpClient(new GuzzleClient([
'connect_timeout' => 2,
'timeout' => 3,
]));
$scraper = new Scrapers\SpysMeScraper($httpClient);
$scraper = new Scrapers\FreeProxyListScraper($httpClient);

foreach ($scraper->get() as $proxy) {
echo (string)$proxy . "\n";
Expand Down Expand Up @@ -64,7 +64,7 @@ $compositeScraper = new Scrapers\CompositeScraper;
$compositeScraper->addScraper(new Scrapers\FreeProxyListScraper($httpClient));
$compositeScraper->addScraper(new Scrapers\MultiproxyScraper($httpClient));
$compositeScraper->addScraper(new Scrapers\SocksProxyScraper($httpClient));
$compositeScraper->addScraper(new Scrapers\SpysMeScraper($httpClient));
$compositeScraper->addScraper(new Scrapers\FreeProxyListScraper($httpClient));

foreach ($compositeScraper->get() as $proxy) {
echo (string)$proxy . "\n";
Expand Down Expand Up @@ -178,7 +178,7 @@ $httpClient = new GuzzleHttpClient(new GuzzleClient([
'connect_timeout' => 2,
'timeout' => 3,
]));
$scraper = new Scrapers\SpysMeScraper($httpClient);
$scraper = new Scrapers\FreeProxyListScraper($httpClient);

/** @var \Vantoozz\ProxyScraper\Proxy $proxy */
$proxy = $scraper->get()->current();
Expand All @@ -189,7 +189,7 @@ foreach ($proxy->getMetrics() as $metric) {
```
Will output
```
source: Vantoozz\ProxyScraper\Scrapers\SpysMeScraper
source: Vantoozz\ProxyScraper\Scrapers\FreeProxyListScraper
```


Expand Down
2 changes: 1 addition & 1 deletion examples/01-single_scraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
'connect_timeout' => 2,
'timeout' => 3,
]));
$scraper = new Scrapers\SpysMeScraper($httpClient);
$scraper = new Scrapers\HideMyIpScraper($httpClient);

foreach ($scraper->get() as $proxy) {
echo (string)$proxy . "\n";
Expand Down
1 change: 0 additions & 1 deletion examples/02-composite_scraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
$compositeScraper->addScraper(new Scrapers\FreeProxyListScraper($httpClient));
$compositeScraper->addScraper(new Scrapers\MultiproxyScraper($httpClient));
$compositeScraper->addScraper(new Scrapers\SocksProxyScraper($httpClient));
$compositeScraper->addScraper(new Scrapers\SpysMeScraper($httpClient));

foreach ($compositeScraper->get() as $proxy) {
echo (string)$proxy . "\n";
Expand Down
2 changes: 1 addition & 1 deletion examples/05-metrics.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
'connect_timeout' => 2,
'timeout' => 3,
]));
$scraper = new Scrapers\SpysMeScraper($httpClient);
$scraper = new Scrapers\FreeProxyListScraper($httpClient);

/** @var Proxy $proxy */
$proxy = $scraper->get()->current();
Expand Down
79 changes: 79 additions & 0 deletions src/Scrapers/AbstractRssBloggerScraper.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
<?php declare(strict_types = 1);

namespace Vantoozz\ProxyScraper\Scrapers;

use Vantoozz\ProxyScraper\Enums\Metrics;
use Vantoozz\ProxyScraper\Exceptions\HttpClientException;
use Vantoozz\ProxyScraper\Exceptions\InvalidArgumentException;
use Vantoozz\ProxyScraper\Exceptions\ScraperException;
use Vantoozz\ProxyScraper\HttpClient\HttpClientInterface;
use Vantoozz\ProxyScraper\Metric;
use Vantoozz\ProxyScraper\Proxy;
use Vantoozz\ProxyScraper\ProxyString;
use Vantoozz\ProxyScraper\Text;

/**
 * Class AbstractRssBloggerScraper
 *
 * Base class for scrapers that read a feed over HTTP and extract every
 * "ip:port" pair found in the content of the feed entries. Concrete
 * scrapers only have to provide the feed URL via rssBloggerUrl().
 *
 * @package Vantoozz\ProxyScraper\Scrapers
 */
abstract class AbstractRssBloggerScraper implements ScraperInterface
{
    /**
     * @var HttpClientInterface
     */
    private $httpClient;

    /**
     * AbstractRssBloggerScraper constructor.
     * @param HttpClientInterface $httpClient client used to download the feed
     */
    public function __construct(HttpClientInterface $httpClient)
    {
        $this->httpClient = $httpClient;
    }

    /**
     * Downloads the feed and yields one Proxy per address found in it.
     *
     * This method is itself a generator, so the request is only performed
     * (and exceptions are only thrown) once iteration starts.
     *
     * @return \Generator|Proxy[]
     * @throws ScraperException when the feed cannot be fetched or is not parsable XML
     */
    public function get(): \Generator
    {
        try {
            $xml = $this->httpClient->get($this->rssBloggerUrl());
        } catch (HttpClientException $e) {
            // Re-throw as the scraper-level exception, keeping the original as "previous".
            throw new ScraperException($e->getMessage(), $e->getCode(), $e);
        }

        if (!(new Text($xml))->isXml()) {
            throw new ScraperException('Invalid XML');
        }

        $feed = simplexml_load_string($xml);
        if (false === $feed) {
            // simplexml_load_string() returns false on failure; without this guard the
            // strictly typed fetchFeed() call below would raise a TypeError instead of
            // the documented ScraperException.
            // NOTE(review): Text::isXml() presumably rejects such input already — confirm.
            throw new ScraperException('Invalid XML');
        }

        yield from $this->fetchFeed($feed);
    }

    /**
     * Walks the feed entries and yields a Proxy for every "ip:port" match in their content.
     *
     * @param \SimpleXMLElement $feed parsed feed document
     * @return \Generator|Proxy[]
     */
    private function fetchFeed(\SimpleXMLElement $feed): \Generator
    {
        foreach ($feed->entry as $entry) {
            // Collect every dotted-quad followed by a 1-5 digit port in the entry content.
            preg_match_all('/\d+\.\d+\.\d+\.\d+:\d{1,5}/m', (string)$entry->content, $matches);
            foreach ($matches[0] as $proxyString) {
                try {
                    $proxy = (new ProxyString($proxyString))->asProxy();
                    // static::class records the concrete scraper as the source of the proxy.
                    $proxy->addMetric(new Metric(Metrics::SOURCE, static::class));
                    yield $proxy;
                } catch (InvalidArgumentException $e) {
                    // Best effort: skip this candidate and keep scanning the feed.
                    continue;
                }
            }
        }
    }

    /**
     * URL of the feed to scrape.
     *
     * @return string
     */
    abstract protected function rssBloggerUrl(): string;
}
18 changes: 18 additions & 0 deletions src/Scrapers/BlogspotProxyScraper.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?php declare(strict_types = 1);
namespace Vantoozz\ProxyScraper\Scrapers;

/**
 * Class BlogspotProxyScraper
 *
 * Concrete feed scraper bound to the blogspotproxy.blogspot.com posts feed.
 * All fetching and parsing is inherited from AbstractRssBloggerScraper.
 *
 * @package Vantoozz\ProxyScraper\Scrapers
 */
final class BlogspotProxyScraper extends AbstractRssBloggerScraper
{
    /**
     * Address of the default posts feed of the blog.
     *
     * @return string
     */
    protected function rssBloggerUrl(): string
    {
        $feedUrl = 'https://blogspotproxy.blogspot.com/feeds/posts/default';

        return $feedUrl;
    }
}
18 changes: 0 additions & 18 deletions src/Scrapers/FoxToolsScraper.php

This file was deleted.

2 changes: 1 addition & 1 deletion src/Scrapers/HideMyIpScraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ private function extractData(string $html): array
if ($expectedPartsCount !== count($parts)) {
throw new ScraperException('Unknown markup');
}
$json = trim(explode(";\n\n", $parts[1])[0]);
$json = trim(explode(";", $parts[1])[0]);
$data = json_decode($json, true);
if (!$data) {
throw new ScraperException('Cannot parse json: ' . json_last_error_msg());
Expand Down
75 changes: 0 additions & 75 deletions src/Scrapers/PrimeSpeedScraper.php

This file was deleted.

127 changes: 0 additions & 127 deletions src/Scrapers/ProxyDbScraper.php

This file was deleted.

Loading

0 comments on commit 1e0781b

Please sign in to comment.