diff --git a/README.md b/README.md index ee5632f..32aa94f 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,12 @@ For using the Parser one you need to execute something like this. ``` php use WeblaborMX\ScrappingPlus\Scrapping; +// Using html directly +$scrapper = Scrapping::fromHtml('

<h1>Hola</h1>

Excerpt

'); +$h1 = $scrapper->first('h1'); +$text = $h1->getText(); // Hola + +// Get it from an URL $google = Scrapping::scrappe('https://www.google.com.mx'); $html = $google->getHtml(); @@ -41,7 +47,7 @@ $title = $class->getAttribute('value'); And if you want to execute it with laravel dusk you just need to execute something like this: -``` +``` php $page = Scrapping::method('dusk')->scrappe($url); $page->object->waitForText($text); // How to use laravel dusk functions directly $page = $page->toParser(); // Convert to the parser driver diff --git a/src/Drivers/Dusk.php b/src/Drivers/Dusk.php index 5769265..2fe8683 100644 --- a/src/Drivers/Dusk.php +++ b/src/Drivers/Dusk.php @@ -34,7 +34,7 @@ public function setHtml($html) { public function get($selector) { - return; + return collect([]); } public function toParser() { diff --git a/src/Drivers/Parser.php b/src/Drivers/Parser.php index cb185a3..3d8c12f 100644 --- a/src/Drivers/Parser.php +++ b/src/Drivers/Parser.php @@ -5,6 +5,7 @@ use WeblaborMX\ScrappingPlus\DriverFormat; use PHPHtmlParser\Dom; use Illuminate\Support\Collection; +use PHPHtmlParser\Options; class Parser extends DriverFormat { @@ -14,21 +15,15 @@ class Parser extends DriverFormat public function setUrl($url) { - $dom = new Dom; - $dom->setOptions([ - 'cleanupInput' => false, // Set a global option to enable strict html parsing. - ]); + $dom = $this->getDom(); $dom->loadFromUrl($url); $this->object = $dom; return $this; } public function setHtml($html) { - $dom = new Dom; - $dom->setOptions([ - 'cleanupInput' => false, // Set a global option to enable strict html parsing. - ]); - $dom->loadStr($html, []); + $dom = $this->getDom(); + $dom->loadStr($html); $this->object = $dom; return $this; } @@ -68,5 +63,16 @@ public function getText() { return $this->selector->text; } + + private function getDom() + { + $dom = new Dom; + $dom->setOptions( + // this is set as the global option level. 
+ (new Options()) + ->setCleanupInput(false) // Set a global option to enable strict html parsing. + ); + return $dom; + } } \ No newline at end of file diff --git a/src/Scrapping.php b/src/Scrapping.php index 43feb0c..996daec 100644 --- a/src/Scrapping.php +++ b/src/Scrapping.php @@ -13,6 +13,20 @@ static public function method($name) } static public function scrappe($url) + { + $class = self::getClass(); + $object = new $class; + return $object->setUrl($url); + } + + static public function fromHtml($html) + { + $class = self::getClass(); + $object = new $class; + return $object->setHtml($html); + } + + static private function getClass() { $class_name = ucwords(self::$method); $class_name = str_replace(' ', '', $class_name); @@ -20,7 +34,6 @@ static public function scrappe($url) if(!class_exists($class)) { throw new \Exception("The method doesnt exist", 1); } - $object = new $class; - return $object->setUrl($url); + return $class; } } \ No newline at end of file diff --git a/tests/BasicTest.php b/tests/BasicTest.php index 9caceb2..aad9198 100644 --- a/tests/BasicTest.php +++ b/tests/BasicTest.php @@ -15,7 +15,7 @@ public function parserTest() // Access inputs $inputs = $google->get('input'); - $this->assertEquals(5, $inputs->count()); + $this->assertEquals(10, $inputs->count()); $first = $inputs->first(); $second = $inputs[1]; @@ -37,7 +37,7 @@ public function parserDusk() // Access inputs $inputs = $google->get('input'); - $this->assertEquals(9, $inputs->count()); + $this->assertEquals(8, $inputs->count()); $first = $inputs->first(); $second = $inputs[1]; @@ -48,31 +48,14 @@ public function parserDusk() $this->assertEquals('Me siento con suerte ', $title); } - /** @test */ - public function useOriginalObject() - { - $page = Scrapping::method('dusk')->scrappe('https://www.ticketmaster.com.mx/Auditorio-Nacional-boletos-Mexico/venue/163841?tm_link=tm_homeA_b_10001_1'); - $page->object->waitForText('Ver Boletos'); - $page = $page->toParser(); - $item = 
$page->get('table#venue_results_tbl > tbody > tr'); - $items = $page->get('table#venue_results_tbl'); - - $this->assertEquals(15, $item->count()); - - $item = $item->first(); - $link = $item->first('a'); - var_dump($link->getLink()); - } - /** @test */ public function parserItsSelectedByDefault() { - $google = Scrapping::scrappe('https://www.google.com.mx'); - $html = $google->getHtml(); + $google = Scrapping::method('Voku')->scrappe('https://www.google.com.mx'); // Access inputs $inputs = $google->get('input'); - $this->assertEquals(5, $inputs->count()); + $this->assertEquals(7, $inputs->count()); $first = $inputs->first(); $second = $inputs[1]; @@ -102,29 +85,10 @@ public function vokuTest() } /** @test */ - public function bug1() + public function fromHtml() { - $page = Scrapping::method('voku')->scrappe('https://www.eticket.mx/masinformacion.aspx?idevento=23878'); - $image_header_url = $page->first('#copetes_dinamicos .ancholimitado img')->getAttribute('src'); - $image_poster_url = $page->first('.campo2_hor_izq .font14 img')->getAttribute('src'); - $image_map_url = $page->first('#mapwrapper img')->getAttribute('src'); - $title = $page->first('#copetes_dinamicos .ancholimitado img')->getAttribute('alt'); - $date = $page->first('.grisclarofondo .mayusculas_primera')->getText(); - $hour = $page->first('.grisclarofondo .mayusculas_primera > span')->getText(); - $hour = str_replace('(', '', $hour); - $hour = str_replace(')', '', $hour); - $address_object = $page->get('.grisclarofondo > div')[2]; - $address_object = $address_object->get('div > div'); - $place = $address_object[0]->getText(); - $this->assertTrue(!is_null($place)); - $city = $address_object[1]->getText(); - $this->assertTrue(!is_null($city)); - $address = $address_object[2]->getText(); - $this->assertTrue(!is_null($address)); - $neighbor = $address_object[3]->getText(); - $this->assertTrue(!is_null($neighbor)); - $postal_code = $address_object[4]->getText(); - $this->assertTrue(!is_null($postal_code)); 
+ $scrapper = Scrapping::fromHtml('

<h1>Hola</h1>

Excerpt

'); + $h1 = $scrapper->first('h1'); + $this->assertEquals('Hola', $h1->getText()); } - }