In [1]:
import time

from selenium.webdriver.common.by import By

from tintinspider.controller import Controller
from tintinspider.fetchers import SeleniumFetcher

import config

# Shorthand for the settings mapping exposed by the `config` module; the cells
# below read the Selenium options from conf['selenium'].
conf = config.config

In [2]:
# Plain Selenium-backed fetcher, configured from the 'selenium' section of `conf`.
fetcher = SeleniumFetcher(
    driver_path=conf['selenium']['driver_path'],
    user_agent=conf['selenium']['user_agent'],
    proxy=None,
    headless=True,  # set to False to watch the browser while it runs
)

### Tune regex_iurls

In [None]:
from tintinspider.rules import rule_extract_iurls

url = ''  # page URL to scrape; fill in before calling get_item_urls
regex_iurls = [r'']  # candidate item-URL regex strings; try different patterns here

def get_item_urls(url, regex_iurls):
    """Fetch `url` and report which of its links match the item-URL patterns.

    Uses the module-level `fetcher` to load the page and collect its outgoing
    links, then filters them with `rule_extract_iurls`. A short summary (count
    plus up to three sample URLs) is printed to help tune the regexes.

    Args:
        url: URL of the page to load.
        regex_iurls: List of regex strings handed to `rule_extract_iurls`.

    Returns:
        The item URLs produced by `rule_extract_iurls`, so callers can
        accumulate results across several pages.
    """
    html = fetcher.fetch(url)
    next_urls = fetcher.extract_next_urls(html, url)  # only urls in the same domain
    iurls = rule_extract_iurls(next_urls, regex_iurls)
    print(f'Item URLs: {len(iurls)}')
    print('-'*17)
    print('Top 3 Item URLs are: ')
    # Slice rather than index 0..2 so that fewer than three matches
    # (common while a regex is still being tuned) does not raise IndexError.
    for iurl in iurls[:3]:
        print(iurl)
    return iurls


### Tune custom fetcher

In [4]:
class YourFetcher(SeleniumFetcher):  # Name your custom fetcher
    """Template for a site-specific fetcher built on `SeleniumFetcher`.

    Override `fetch_with_custom` with the logic the target site needs.
    """

    def fetch_with_custom(self, url, k):
        """Fetch `url` with site-specific logic and return the page sources.

        The cells that call this method take `len()` of the result and iterate
        over it as HTML strings, so an implementation should return a list of
        HTML documents.

        Args:
            url: Entry URL to load.
            k: Caller-supplied integer (presumably the maximum number of
                pages to load -- confirm when implementing).

        Raises:
            NotImplementedError: Always, until the template is filled in.
                Raising here gives a clear message instead of silently
                returning None and failing later on `len(htmls)`.
        """
        raise NotImplementedError(
            'Implement fetch_with_custom() to return a list of HTML strings'
        )

# Use your custom fetcher; it takes the same Selenium settings as `fetcher`.
fetcher2 = YourFetcher(
    driver_path=conf['selenium']['driver_path'],
    user_agent=conf['selenium']['user_agent'],
    proxy=None,
    headless=True,  # set to False to watch the browser while it runs
)

In [None]:
# Entry URL used to exercise the custom fetcher.
url = 'https://hk.centanet.com/findproperty/list/buy'
# The second argument is the method's `k` parameter (presumably the number of
# pages to load -- confirm against your fetch_with_custom implementation).
htmls = fetcher2.fetch_with_custom(url, 3)
print(f'Fetched {len(htmls)} pages')

In [None]:
# Collect item URLs from every page source returned by the custom fetcher.
iurls = []
for html in htmls:
    # `htmls` already holds fetched page sources, so extract links from each
    # one directly; get_item_urls() expects a URL and would try to fetch it.
    next_urls = fetcher2.extract_next_urls(html, url)
    iurls.extend(rule_extract_iurls(next_urls, regex_iurls))
print(f'Total {len(iurls)} item URLs')
print('-'*17)
print('Top 3 Item URLs are: ')
# Slice so that fewer than three results does not raise IndexError.
for iurl in iurls[:3]:
    print(iurl)

### Add the site to DB

In [None]:
# Build the controller from the project config and the plain Selenium fetcher.
ctrl = Controller(config.config, fetcher)

# Site record intended for ctrl.add_site(); fill in the empty fields first.
site_info = {
    "code" : "",              # Unique site code. Use lower-case letters only
    "name" : "",              # Site name
    "homepage" : "",          # Site homepage, optional
    "rule_pagination" : "",   # "next_url_...", "click", "scroll", "custom"
    "clk_xpath" : "",         # If pagination is "click", the XPath of the "Load more"-style button
    "first_maxpages" : 3,     # Max pagination for first-time scraping
    "revisit_maxpages" : 3,   # Max pagination for periodic scraping
    "revisit_freq" : 14400,   # Revisit frequency in seconds
    "root_curls" : [          # Entry URLs. Pagination will be applied to this list of URLs.
    ],
    "root_curls_nopages" : [  # Another set of entry URLs. No pagination.
    ],
    "regex_iurls" : [         # Regex patterns for item URLs
    ],
    "priority" : 1,           # The higher, the more prioritized. 0 means the site is inactive.
}

# Left commented out so running the notebook does not add a site by accident;
# uncomment once site_info is complete.
# ctrl.add_site(site_info)