Skip to content

Commit

Permalink
Add FromHtml, update Parser Options and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Carlos Escobar committed Dec 28, 2020
1 parent cb21b3f commit a62ced1
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 57 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ For using the Parser one you need to execute something like this.
``` php
use WeblaborMX\ScrappingPlus\Scrapping;

// Using HTML directly
$scrapper = Scrapping::fromHtml('<html><body><h1>Hola</h1><p>Excerpt</p></body></html>');
$h1 = $scrapper->first('h1');
$text = $h1->getText(); // Hola

// Get it from a URL
$google = Scrapping::scrappe('https://www.google.com.mx');
$html = $google->getHtml();

Expand All @@ -41,7 +47,7 @@ $title = $class->getAttribute('value');

If you want to run it with Laravel Dusk, you just need to do something like this:

```
``` php
$page = Scrapping::method('dusk')->scrappe($url);
$page->object->waitForText($text); // How to use laravel dusk functions directly
$page = $page->toParser(); // Convert to the parser driver
Expand Down
2 changes: 1 addition & 1 deletion src/Drivers/Dusk.php
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public function setHtml($html) {

/**
 * Selector lookup for the Dusk driver. The visible body never reads $selector.
 *
 * NOTE(review): this is a rendered diff hunk (1 addition, 1 deletion), so both
 * sides of the change appear below. The bare `return;` looks like the removed
 * line and `return collect([]);` like its replacement; read literally, the
 * second return is unreachable. Confirm against the committed file.
 *
 * @param mixed $selector Selector to look up (unused in the visible body).
 */
public function get($selector)
{
return;
return collect([]);
}

public function toParser() {
Expand Down
24 changes: 15 additions & 9 deletions src/Drivers/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use WeblaborMX\ScrappingPlus\DriverFormat;
use PHPHtmlParser\Dom;
use Illuminate\Support\Collection;
use PHPHtmlParser\Options;

class Parser extends DriverFormat
{
Expand All @@ -14,21 +15,15 @@ class Parser extends DriverFormat

/**
 * Load the page at $url into a Dom and keep that Dom on $this->object.
 *
 * NOTE(review): this is a rendered diff hunk, so removed and added lines are
 * interleaved. The inline `new Dom` plus array-style `setOptions([...])` lines
 * look like the removed side, superseded by the `$this->getDom()` call that
 * follows them. Confirm against the committed file.
 *
 * @param mixed $url Passed straight to Dom::loadFromUrl().
 * @return $this
 */
public function setUrl($url)
{
$dom = new Dom;
$dom->setOptions([
'cleanupInput' => false, // Set a global option to enable strict html parsing.
]);
// Whatever was built above is discarded here: $dom is reassigned from getDom().
$dom = $this->getDom();
$dom->loadFromUrl($url);
$this->object = $dom;
return $this;
}

/**
 * Parse an HTML string into a Dom and keep that Dom on $this->object.
 *
 * NOTE(review): this is a rendered diff hunk, so removed and added lines are
 * interleaved. The inline `new Dom`, array-style `setOptions([...])` and
 * `loadStr($html, [])` lines look like the removed side; the `getDom()` call
 * and single-argument `loadStr($html)` look like their replacements. Confirm
 * against the committed file.
 *
 * @param mixed $html Markup passed to Dom::loadStr().
 * @return $this
 */
public function setHtml($html) {
$dom = new Dom;
$dom->setOptions([
'cleanupInput' => false, // Set a global option to enable strict html parsing.
]);
$dom->loadStr($html, []);
// $dom is reassigned here, so only the Dom from getDom() ends up on $this->object.
$dom = $this->getDom();
$dom->loadStr($html);
$this->object = $dom;
return $this;
}
Expand Down Expand Up @@ -68,5 +63,16 @@ public function getText()
{
return $this->selector->text;
}

/**
 * Build a fresh Dom with this driver's parser options applied.
 *
 * Nothing is loaded into the Dom here; callers load a URL or string afterwards.
 *
 * @return Dom
 */
private function getDom()
{
    $dom = new Dom;

    // cleanupInput is switched off; the option is applied at the Dom's global
    // option level (described upstream as enabling strict html parsing).
    $options = (new Options())->setCleanupInput(false);
    $dom->setOptions($options);

    return $dom;
}

}
17 changes: 15 additions & 2 deletions src/Scrapping.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,27 @@ static public function method($name)
}

/**
 * Create a driver instance and point it at a URL.
 *
 * The driver class comes from self::getClass(); the value returned is
 * whatever that driver's setUrl() returns.
 *
 * @param mixed $url Forwarded unchanged to the driver's setUrl().
 */
static public function scrappe($url)
{
    $driver = self::getClass();

    return (new $driver)->setUrl($url);
}

/**
 * Create a driver instance and feed it an HTML string directly.
 *
 * The driver class comes from self::getClass(); the value returned is
 * whatever that driver's setHtml() returns.
 *
 * @param mixed $html Forwarded unchanged to the driver's setHtml().
 */
static public function fromHtml($html)
{
    $driver = self::getClass();

    return (new $driver)->setHtml($html);
}

/**
 * Resolve the driver class name for the method stored in self::$method.
 *
 * The method name is passed through ucwords(), spaces are removed, and the
 * result is appended to the WeblaborMX\ScrappingPlus\Drivers namespace.
 *
 * NOTE(review): this is a rendered diff hunk, so removed and added lines are
 * interleaved. `$object = new $class;` / `return $object->setUrl($url);` look
 * like the two deleted lines ($url is not defined in this method) and
 * `return $class;` like the addition. Confirm against the committed file.
 *
 * @throws \Exception when the resolved class does not exist.
 */
static private function getClass()
{
$class_name = ucwords(self::$method);
$class_name = str_replace(' ', '', $class_name);
$class = 'WeblaborMX\ScrappingPlus\Drivers\\'.$class_name;
// Fail early if no driver class matches the configured method name.
if(!class_exists($class)) {
throw new \Exception("The method doesnt exist", 1);
}
$object = new $class;
return $object->setUrl($url);
return $class;
}
}
52 changes: 8 additions & 44 deletions tests/BasicTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public function parserTest()

// Access inputs
$inputs = $google->get('input');
$this->assertEquals(5, $inputs->count());
$this->assertEquals(10, $inputs->count());

$first = $inputs->first();
$second = $inputs[1];
Expand All @@ -37,7 +37,7 @@ public function parserDusk()

// Access inputs
$inputs = $google->get('input');
$this->assertEquals(9, $inputs->count());
$this->assertEquals(8, $inputs->count());

$first = $inputs->first();
$second = $inputs[1];
Expand All @@ -48,31 +48,14 @@ public function parserDusk()
$this->assertEquals('Me siento con suerte ', $title);
}

/** @test */
public function useOriginalObject()
{
$page = Scrapping::method('dusk')->scrappe('https://www.ticketmaster.com.mx/Auditorio-Nacional-boletos-Mexico/venue/163841?tm_link=tm_homeA_b_10001_1');
$page->object->waitForText('Ver Boletos');
$page = $page->toParser();
$item = $page->get('table#venue_results_tbl > tbody > tr');
$items = $page->get('table#venue_results_tbl');

$this->assertEquals(15, $item->count());

$item = $item->first();
$link = $item->first('a');
var_dump($link->getLink());
}

/** @test */
public function parserItsSelectedByDefault()
{
$google = Scrapping::scrappe('https://www.google.com.mx');
$html = $google->getHtml();
$google = Scrapping::method('Voku')->scrappe('https://www.google.com.mx');

// Access inputs
$inputs = $google->get('input');
$this->assertEquals(5, $inputs->count());
$this->assertEquals(7, $inputs->count());

$first = $inputs->first();
$second = $inputs[1];
Expand Down Expand Up @@ -102,29 +85,10 @@ public function vokuTest()
}

/** @test */
public function bug1()
public function fromHtml()
{
$page = Scrapping::method('voku')->scrappe('https://www.eticket.mx/masinformacion.aspx?idevento=23878');
$image_header_url = $page->first('#copetes_dinamicos .ancholimitado img')->getAttribute('src');
$image_poster_url = $page->first('.campo2_hor_izq .font14 img')->getAttribute('src');
$image_map_url = $page->first('#mapwrapper img')->getAttribute('src');
$title = $page->first('#copetes_dinamicos .ancholimitado img')->getAttribute('alt');
$date = $page->first('.grisclarofondo .mayusculas_primera')->getText();
$hour = $page->first('.grisclarofondo .mayusculas_primera > span')->getText();
$hour = str_replace('(', '', $hour);
$hour = str_replace(')', '', $hour);
$address_object = $page->get('.grisclarofondo > div')[2];
$address_object = $address_object->get('div > div');
$place = $address_object[0]->getText();
$this->assertTrue(!is_null($place));
$city = $address_object[1]->getText();
$this->assertTrue(!is_null($city));
$address = $address_object[2]->getText();
$this->assertTrue(!is_null($address));
$neighbor = $address_object[3]->getText();
$this->assertTrue(!is_null($neighbor));
$postal_code = $address_object[4]->getText();
$this->assertTrue(!is_null($postal_code));
$scrapper = Scrapping::fromHtml('<html><body><h1>Hola</h1><p>Excerpt</p></body></html>');
$h1 = $scrapper->first('h1');
$this->assertEquals('Hola', $h1->getText());
}

}

0 comments on commit a62ced1

Please sign in to comment.