In [1]:
import json
import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.firefox import GeckoDriverManager

In [2]:
class Spider:
    """
    This is the constructor class to which you can pass a bunch of parameters. 
    These parameters are stored to the class instance variables so that the
    class functions can access them later.
    
    no_of_pagedowns: number of times sending PAGE-DOWN key
    """
    def __init__(self, no_of_pagedowns=1):
        self.driver = webdriver.Firefox(
            executable_path=GeckoDriverManager().install()
        )
        self.pagedowns = no_of_pagedowns
        self.product_list = []
        self.url_list = []
                
    def get_info(self, url, selectors):
        driver = self.driver
        result = dict()
        driver.get(url)
        result['product_name'] = driver.find_element_by_xpath(
            selectors['product_name']
        ).text
        result['product_chunk'] = driver.find_element_by_xpath(
            selectors['product_chunk']
        ).text
        result['product_image'] = driver.find_element_by_xpath(
            selectors['product_image']
        ).get_attribute('href')
        result['product_description'] = driver.find_element_by_xpath(
            selectors['product_description']
        ).text
        
        driver.find_element_by_xpath(
            selectors['product_more_info']['click']
        ).click()
        result['product_more_info'] = driver.find_element_by_xpath(
            selectors['product_more_info']['info']
        ).text
        
        return result
       
    def get_list(self, url, item_selector, link_selector):
        pagedowns = self.pagedowns
        driver = self.driver
        result = []

        driver.get(url)
        body = driver.find_element_by_tag_name("body")     
        
        for number in range(1, pagedowns+1):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.5)
            
        div_elements = driver.find_elements_by_css_selector(item_selector)
        
        for div in div_elements:
            anchor = div.find_element_by_css_selector(link_selector)
            result.append(anchor.get_attribute('href'))
        
        result = np.unique(result).tolist()
        self.url_list = result
        print(f'{len(result)} links')

    def save_products(self):
        with open("products.json", "w") as outfile:
            json.dump(self.product_list, outfile, indent=2)
        print("Products saved.")
    
    def loop_it(self, selectors):
        iterable = self.url_list
        if len(iterable) == 0:
            raise Exception('Try running "get_list"\n')
        for link in iterable:
            self.product_list.append(self.get_info(link, selectors))
            time.sleep(7)
        print('Loop done.')

In [3]:
my_spider = Spider(no_of_pagedowns=130)

[WDM] - Driver [/home/tonalli/.wdm/drivers/geckodriver/linux64/v0.27.0/geckodriver] found in cache


In [4]:
target_url = 'https://www.yvesrocher.com.mx/YR/Products/Search?id=*'
item_selector = 'div.ProductElement'
link_selector = 'a.btn'

In [5]:
product_selectors = {
    'product_name': '/html/body/div[3]/div[2]/div[3]/div[2]/div/div[2]/p',
    'product_chunk': '/html/body/div[3]/div[2]/div[3]/div[2]/div/div[4]/div/div[1]',
    'product_image': '//*[@id="ProductPhoto"]',
    'product_description': '//*[@id="descrip"]/div/div',
    'product_more_info': {
        'click': "//a[contains(text(),'SECRETO')]",
        'info': '//*[@id="secreto"]/div/div'
    }
}

In [None]:
# test_url = 'https://www.yvesrocher.com.mx/YR/Products/Details/67204'
# my_spider.get_info(test_url, product_selectors)

In [6]:
my_spider.get_list(target_url, item_selector, link_selector)
# my_spider.url_list

294 links


In [7]:
my_spider.loop_it(product_selectors)
# my_spider.product_list

NoSuchElementException: Message: Unable to locate element: //a[contains(text(),'SECRETO')]


In [15]:
my_spider.save_products()

Products saved.
