In [1]:
import json
import os
import time
from base64 import b64decode as decoder
from io import BytesIO

from PIL import Image
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager

In [2]:
class Spider:
    """
    Selenium-driven scraper that pulls one image per page out of a web viewer.

    A Firefox session is opened when the instance is created; call close()
    (or use the instance in a ``with`` block) to shut it down again.

    Pages are described by a ``page_rules`` dict of format strings. Every
    value is formatted with the page number before it is used:

    url:          address to load
    input_tag:    key under which the scraped value is stored
    output_tag:   file name of the saved image (written under ``img/``)
    target:       what to read from the element (only 'src' is supported)
    css_selector: CSS selector locating the element

    delay: seconds to wait after loading a page, passed to time.sleep()
    """
    def __init__(self, delay=3):
        self.driver = webdriver.Firefox(
            executable_path=GeckoDriverManager().install()
        )
        self.delay = delay
        # Result of the most recent get_raw_page(); consumed by save_page().
        self.current_page = dict()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        '''
        Quit the browser session owned by this spider.
        '''
        self.driver.quit()

    def get_raw_page(self, page_rules, page_id=1):
        '''
        Load page `page_id` and scrape the value described by `page_rules`.

        Returns a one-entry dict {input_tag: value} and also caches it in
        self.current_page. The value is None when the element is not found
        or the target is unsupported.
        '''
        assert isinstance(page_rules, dict)

        # Every rule may carry a '{}' placeholder for the page number.
        rules = {key: value.format(page_id) for key, value in page_rules.items()}

        self.driver.get(rules['url'])
        # Give the viewer time to render the page before querying the DOM.
        time.sleep(self.delay)

        result = {
            rules['input_tag']: self.get_target(
                target=rules['target'],
                css_selector=rules['css_selector']
            )
        }

        self.current_page = result
        return result

    def get_target(self, target='', css_selector=''):
        '''
        Get target from DOM

        Only target == 'src' is supported: returns the `src` attribute of the
        first element matching `css_selector`. Returns None for any other
        target, or when the element lookup fails.
        '''
        if target != 'src':
            return None

        try:
            return self.driver.find_element_by_css_selector(
                css_selector
            ).get_attribute('src')
        except Exception:
            # Best effort: a missing element is reported as None. Unlike a
            # bare `except:`, this lets KeyboardInterrupt/SystemExit through.
            return None

    def save_page(self, page_rules, page_id=1):
        '''
        Decode the cached base64 image and save it as a JPEG under img/.

        If nothing is cached the page is fetched first. The cache is cleared
        afterwards, whether or not saving succeeded, so a failed page is
        never reused for the next call.

        Raises ValueError when no image data was scraped for the page.
        '''
        assert isinstance(page_rules, dict)
        output_name = 'img/' + page_rules['output_tag'].format(page_id)

        if not self.current_page:
            self.get_raw_page(page_rules, page_id)

        # Same key get_raw_page() stores under; falls back to the historical
        # hard-coded key when the rules do not name one.
        input_tag = page_rules.get('input_tag', 'img-base64').format(page_id)

        try:
            # Read the cache only after the fetch above, not before it.
            img_src = self.current_page.get(input_tag)
            if not img_src:
                raise ValueError(
                    f'no image data found for page {page_id} '
                    f'(key {input_tag!r})'
                )

            if img_src.startswith('data:'):
                # Strip any "data:<mime>;base64," header, not only image/jpg.
                _, _, img_base64 = img_src.partition(',')
            else:
                img_base64 = img_src

            os.makedirs(os.path.dirname(output_name), exist_ok=True)
            img = Image.open(BytesIO(decoder(img_base64)))
            img.save(output_name, 'JPEG')
        finally:
            self.current_page = {}

        print(f'...{output_name} saved...')

    def loop_it(self, page_rules, no_of_pages=1):
        '''
        Fetch and save pages 1..no_of_pages (inclusive), one after another.
        '''
        for page_id in range(1, no_of_pages + 1):
            self.get_raw_page(page_rules, page_id)
            self.save_page(page_rules, page_id)
        print('...Loope done...')

In [3]:
# Opens a Firefox session; the spider sleeps 4 seconds after each page load.
my_spider = Spider(delay=4)







[WDM] - Driver [/home/tonalli/.wdm/drivers/geckodriver/linux64/v0.29.0/geckodriver] found in cache


In [4]:
# Scraping rules for the flip-book viewer. Each value is a format string;
# a '{}' placeholder is filled with the page number by Spider.get_raw_page.
page_rules = dict(
    url='http://appstrillas.mx/pdfFlipping/viewer.jsp?id=SbS3AB#page/{}',
    input_tag='img-base64',
    output_tag='page-{:03}.jpg',
    target='src',
    css_selector='.p{} > img:nth-child(2)',
)

In [5]:
# Page number used for the single-page trial run in the following cells.
page_id=77

In [6]:
# Load the page in the browser and scrape the image `src`; the result is
# returned here and also cached on the spider as `my_spider.current_page`.
current_page = my_spider.get_raw_page(page_rules, page_id)

In [7]:
# Decode the page cached by the previous cell and write it as a JPEG under
# img/, named from the 'output_tag' pattern.
my_spider.save_page(page_rules, page_id)

...img/page-077.jpg saved...


In [8]:
# Full run: fetch and save pages 1 through 130 in order.
my_spider.loop_it(page_rules, no_of_pages=130)

...img/page-001.jpg saved...
...img/page-002.jpg saved...
...img/page-003.jpg saved...
...img/page-004.jpg saved...
...img/page-005.jpg saved...
...img/page-006.jpg saved...
...img/page-007.jpg saved...
...img/page-008.jpg saved...
...img/page-009.jpg saved...
...img/page-010.jpg saved...
...img/page-011.jpg saved...
...img/page-012.jpg saved...
...img/page-013.jpg saved...
...img/page-014.jpg saved...
...img/page-015.jpg saved...
...img/page-016.jpg saved...
...img/page-017.jpg saved...
...img/page-018.jpg saved...
...img/page-019.jpg saved...
...img/page-020.jpg saved...
...img/page-021.jpg saved...
...img/page-022.jpg saved...
...img/page-023.jpg saved...
...img/page-024.jpg saved...
...img/page-025.jpg saved...
...img/page-026.jpg saved...
...img/page-027.jpg saved...
...img/page-028.jpg saved...
...img/page-029.jpg saved...
...img/page-030.jpg saved...
...img/page-031.jpg saved...
...img/page-032.jpg saved...
...img/page-033.jpg saved...
...img/page-034.jpg saved...
...img/page-03