In [1]:
import json
import time
from pathlib import Path

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.firefox import GeckoDriverManager

In [2]:
class Spider:
    """
    Selenium-driven scraper: collects product links from listing pages and
    then extracts a fixed set of fields from every product page.

    The constructor opens a Firefox session (geckodriver is resolved through
    GeckoDriverManager) and stores its parameters on the instance so the
    other methods can use them later. Call ``close()`` when done to quit the
    browser.

    no_of_pagedowns: number of times sending PAGE-DOWN key to a listing page
        before its links are collected

    Instance attributes:
        driver: the Firefox WebDriver used for every page load
        pagedowns: the value of ``no_of_pagedowns``
        product_list: dicts returned by ``get_info``, appended by ``loop_it``
        url_list: sorted, de-duplicated links gathered by ``get_list``
    """
    def __init__(self, no_of_pagedowns=1):
        self.driver = webdriver.Firefox(
            executable_path=GeckoDriverManager().install()
        )
        self.pagedowns = no_of_pagedowns
        self.product_list = []
        self.url_list = []

    def close(self):
        """Quit the browser session that the constructor started."""
        self.driver.quit()

    def get_info(self, url, selectors):
        """
        Load ``url`` and extract the product fields described by ``selectors``.

        selectors: dict of XPath strings with the keys 'product_name',
            'product_chunk', 'product_image', 'product_description' and
            'product_more_info' (itself a dict with 'click' and 'info').

        Returns a dict with the five 'product_*' keys; a value is None when
        its element could not be read (see ``get_target``).
        """
        self.driver.get(url)

        # (result key, kind of value to read) in the order they are scraped.
        simple_fields = (
            ('product_name', 'text'),
            ('product_chunk', 'text'),
            ('product_image', 'href'),
            ('product_description', 'text'),
        )
        result = {
            key: self.get_target(selectors[key], kind)
            for key, kind in simple_fields
        }

        # The extra info is read only after clicking the element that the
        # 'click' selector points to.
        more_info = selectors['product_more_info']
        self.get_target(more_info['click'], 'click')
        result['product_more_info'] = self.get_target(more_info['info'], 'text')

        return result

    def get_target(self, selector, target):
        """
        Find the element matching the XPath ``selector`` and act on it.

        target: 'text' returns the element text, 'href' returns its ``href``
            attribute, 'click' clicks it and returns the result of the click.

        Best effort: returns None when ``target`` is not one of the values
        above, or when locating/reading the element raises an ordinary
        exception. KeyboardInterrupt and SystemExit are NOT swallowed, so a
        running scrape can still be interrupted.
        """
        if target not in ('text', 'href', 'click'):
            return None
        try:
            element = self.driver.find_element_by_xpath(selector)
            if target == 'text':
                return element.text
            if target == 'href':
                return element.get_attribute('href')
            return element.click()
        except Exception:
            return None

    def get_list(self, url, item_selector, link_selector):
        """
        Load the listing page ``url`` and merge its product links into
        ``self.url_list`` (kept sorted and free of duplicates).

        item_selector: CSS selector matching one element per listed product
        link_selector: CSS selector, relative to an item, matching its link

        Items without a matching link, or whose link has no ``href``, are
        skipped instead of aborting the whole listing. Prints the total
        number of links collected so far.
        """
        driver = self.driver
        driver.get(url)
        body = driver.find_element_by_tag_name("body")

        # Scroll down before collecting the items.
        for _ in range(self.pagedowns):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.5)

        found = set()
        for item in driver.find_elements_by_css_selector(item_selector):
            try:
                href = item.find_element_by_css_selector(
                    link_selector
                ).get_attribute('href')
            except Exception:
                continue
            if href:
                found.add(href)

        self.url_list = sorted(found.union(self.url_list))
        print(f'{len(self.url_list)} links')

    def clear_list(self):
        """Forget every collected link."""
        self.url_list = []

    def clear_products(self):
        """Forget every scraped product."""
        self.product_list = []

    def save_products(self, fname='products.json'):
        """Write ``self.product_list`` to ``fname`` as indented JSON."""
        with open(fname, "w", encoding="utf-8") as outfile:
            json.dump(self.product_list, outfile, indent=2)
        print("Products saved.")

    def loop_it(self, selectors, delay=7):
        """
        Run ``get_info`` on every link in ``self.url_list`` and append each
        result to ``self.product_list``.

        selectors: passed through to ``get_info``
        delay: seconds to wait between two consecutive page loads; no wait
            happens after the last link

        Raises Exception when ``self.url_list`` is empty.
        """
        links = self.url_list
        if len(links) == 0:
            raise Exception('Try running "get_list"\n')
        last_index = len(links) - 1
        for index, link in enumerate(links):
            self.product_list.append(self.get_info(link, selectors))
            if delay > 0 and index < last_index:
                time.sleep(delay)
        print('Loop done.')

In [3]:
# Number of PAGE-DOWN key presses sent to each listing page before its
# links are collected (see Spider.get_list).
LISTING_PAGEDOWNS = 40

# Opens a Firefox window; every later cell reuses this one instance.
my_spider = Spider(no_of_pagedowns=LISTING_PAGEDOWNS)

[WDM] - Driver [/home/tonalli/.wdm/drivers/geckodriver/linux64/v0.27.0/geckodriver] found in cache


In [4]:
# CSS selectors handed to Spider.get_list: one element per listed product,
# and, inside each of those, the anchor carrying the product link.
link_selector = 'a.btn'
item_selector = 'div.ProductElement'

# Search page URL (presumably lists the whole catalogue -- TODO confirm).
target_url = 'https://www.yvesrocher.com.mx/YR/Products/Search?id=*'

In [5]:
# XPath selectors consumed by Spider.get_info, one entry per scraped field.
# 'product_more_info' needs two selectors: the element to click first and
# the element whose text is read afterwards.
product_selectors = dict(
    product_name='/html/body/div[3]/div[2]/div[3]/div[2]/div/div[2]/p',
    product_chunk='/html/body/div[3]/div[2]/div[3]/div[2]/div/div[4]/div/div[1]',
    product_image='//*[@id="ProductPhoto"]',
    product_description='//*[@id="descrip"]/div/div',
    product_more_info=dict(
        click="//a[contains(text(),'SECRETO')]",
        info='//*[@id="secreto"]/div/div',
    ),
)

In [6]:
# Example product page with all fields present
# test_url = 'https://www.yvesrocher.com.mx/YR/Products/Details/67204'

# Example product page with some missing values
# test_url = 'https://www.yvesrocher.com.mx/YR/Products/Details/L36449'

# my_spider.get_info(test_url, product_selectors)

In [7]:
# my_spider.get_list(target_url, item_selector, link_selector)
# my_spider.url_list

In [8]:
# my_spider.loop_it(product_selectors)
# my_spider.product_list

In [9]:
# my_spider.save_products()

In [10]:
# Listing pages to crawl, grouped by product category. The category name is
# also used to build the per-category file names under ./data.
categories = dict(
    faciales=[
        'https://www.yvesrocher.com.mx/YR/Products/Child/4',
        'https://www.yvesrocher.com.mx/YR/Products/Child/5',
        'https://www.yvesrocher.com.mx/YR/Products/Child/3',
    ],
    capilares=[
        'https://www.yvesrocher.com.mx/YR/Products/Child/17',
        'https://www.yvesrocher.com.mx/YR/Products/Child/18',
    ],
    corporales=[
        'https://www.yvesrocher.com.mx/YR/Products/Child/12',
        'https://www.yvesrocher.com.mx/YR/Products/Child/168',
        'https://www.yvesrocher.com.mx/YR/Products/Child/169',
    ],
    maquillaje=[
        'https://www.yvesrocher.com.mx/YR/Products/Child/6',
        'https://www.yvesrocher.com.mx/YR/Products/Child/7',
        'https://www.yvesrocher.com.mx/YR/Products/Child/8',
        'https://www.yvesrocher.com.mx/YR/Products/Child/9',
    ],
    fragancias=[
        'https://www.yvesrocher.com.mx/YR/Products/Child/10',
        'https://www.yvesrocher.com.mx/YR/Products/Child/11',
    ],
    higiene=[
        'https://www.yvesrocher.com.mx/YR/Products/Child/15',
        'https://www.yvesrocher.com.mx/YR/Products/Child/1185',
    ],
)

In [11]:
# Collect the product links of every category and store them, one URL per
# line, in ./data/url_<category>.txt. Product details are scraped from these
# files in a separate pass.
Path('./data').mkdir(parents=True, exist_ok=True)  # np.savetxt does not create directories

for category, links in categories.items():
    for each_link in links:
        print(each_link)
        # get_list merges into my_spider.url_list, so the list accumulates
        # across all listing pages of the current category.
        my_spider.get_list(each_link, item_selector, link_selector)
    np.savetxt(
        f'./data/url_{category}.txt',
        np.array(my_spider.url_list, dtype=str),
        fmt='%s'
    )
    # Start the next category from an empty list so files do not mix categories.
    my_spider.clear_list()

https://www.yvesrocher.com.mx/YR/Products/Child/4
41 links
https://www.yvesrocher.com.mx/YR/Products/Child/5
99 links
https://www.yvesrocher.com.mx/YR/Products/Child/3
99 links
https://www.yvesrocher.com.mx/YR/Products/Child/17
33 links
https://www.yvesrocher.com.mx/YR/Products/Child/18
33 links
https://www.yvesrocher.com.mx/YR/Products/Child/12
30 links
https://www.yvesrocher.com.mx/YR/Products/Child/168
64 links
https://www.yvesrocher.com.mx/YR/Products/Child/169
64 links
https://www.yvesrocher.com.mx/YR/Products/Child/6
77 links
https://www.yvesrocher.com.mx/YR/Products/Child/7
179 links
https://www.yvesrocher.com.mx/YR/Products/Child/8
263 links
https://www.yvesrocher.com.mx/YR/Products/Child/9
290 links
https://www.yvesrocher.com.mx/YR/Products/Child/10
50 links
https://www.yvesrocher.com.mx/YR/Products/Child/11
50 links
https://www.yvesrocher.com.mx/YR/Products/Child/15
119 links
https://www.yvesrocher.com.mx/YR/Products/Child/1185
119 links


In [16]:
# Scrape every product of every category from the URL files under ./data
# and write the results to ./data/<category>.json.
for category in categories:
    print(category)
    my_spider.url_list = np.loadtxt(
        f'./data/url_{category}.txt',
        dtype=str,
        comments=None,  # '#' may legitimately appear in a URL (fragment)
        ndmin=1,        # a one-line file must still yield a list, not a bare str
    ).tolist()
    my_spider.loop_it(product_selectors)
    my_spider.save_products(f'./data/{category}.json')
    # Reset so the next category's JSON holds only its own products.
    my_spider.clear_products()

faciales
Loop done.
Products saved.
capilares
Loop done.
Products saved.
corporales
Loop done.
Products saved.
maquillaje
Loop done.
Products saved.
fragancias
Loop done.
Products saved.
higiene
Loop done.
Products saved.
