In [1]:
import os
import time
from datetime import datetime
import concurrent.futures

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.common.exceptions import WebDriverException

from urllib.parse import urlparse
from urllib.request import urlopen
from urllib.error import URLError
import robotparser as urobot

from datetime import datetime
from selenium.common.exceptions import WebDriverException

import requests
from requests.packages import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from entities import Site, Page, PageData, Link, FrontierEntry, Error
import repository
import helper_functions

## Set Up Parameters

In [2]:

# Selenium set up
# NOTE: "__file__" is a plain string literal here, so abspath() resolves it
# against the current working directory and dirname() yields that directory.
path = os.path.dirname(os.path.abspath("__file__"))
WEB_DRIVER_LOCATION = os.path.join(path, "chromedriver") # chromedriver.exe should be placed inside crawler folder
USER_AGENT = "fri-wier-jernejtim"

# Build the Chrome options shared by every driver created in this notebook.
chrome_options = Options()
for chrome_argument in (
    f"user-agent={USER_AGENT}",
    '--ignore-ssl-errors=yes',
    '--ignore-certificate-errors',
    # "--headless",
):
    chrome_options.add_argument(chrome_argument)

# Crawler set up
LIMIT_DOMAIN = ".gov.si"
# SEED_URLS = ['https://e-uprava.gov.si/?view_mode=0', 'https://e-uprava.gov.si/']
SEED_URLS = ["http://www.gov.si/", "http://www.evem.gov.si/", "http://e-uprava.gov.si/", "http://e-prostor.gov.si/"]
# File extensions that get_type_code() classifies as 'BINARY'.
BINARY_CONTENT = ['pdf', 'doc', 'docx', 'ppt', 'pptx']
# Content types handed to helper_functions.get_all_links() when collecting links.
# The list mirrors BINARY_CONTENT: the .pptx MIME type (presentationml) was
# missing even though 'pptx' is listed above. The spreadsheet type is kept so
# that links accepted before this fix are still accepted.
ALLOWED_LINK_TYPES = [
    'text/html',
    'application/pdf',
    'application/msword',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'application/vnd.ms-powerpoint',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']

NUMBER_OF_WORKERS = 3
TIMEOUT = 5 # Default timeout if no robots.txt
START_CLEAN = False # If set to TRUE it will clear the database and start again from seed urls
STORE_BINARY = False # If set to TRUE it will store binary data of images and files
RESPECT_CRAWL_DELAY = True # If set to true the crawler will respect crawl_delay from robots.txt

In [3]:
def get_type_code(url, html_content):
    """Classify a fetched URL as 'BINARY', 'FRONTIER', 'DUPLICATE' or 'HTML'.

    Args:
        url: The URL that was fetched.
        html_content: Page source passed to repository.check_if_duplicate().

    Returns:
        A ``(type_code, duplicate_url)`` tuple. ``duplicate_url`` is whatever
        repository.check_if_duplicate() returned when the type is 'DUPLICATE',
        and ``None`` for every other type.
    """
    # Take the extension from the URL *path* only, so that query strings and
    # fragments ("report.pdf?download=1", "page?file=x.pdf") neither hide nor
    # fake a binary extension. The comparison is case-insensitive.
    last_segment = urlparse(url).path.rsplit('/', 1)[-1]
    extension = last_segment.rsplit('.', 1)[-1].lower() if '.' in last_segment else ''

    if extension in BINARY_CONTENT:
        return 'BINARY', None
    if url in SEED_URLS:
        return 'FRONTIER', None

    # Only consult the repository when the cheaper checks did not decide.
    duplicate_url = repository.check_if_duplicate(html_content)
    if duplicate_url:
        return 'DUPLICATE', duplicate_url
    return 'HTML', None

In [4]:
def crawl_page(driver: WebDriver, frontier_entry, timeout):
    """Crawl a single frontier entry and persist what was found.

    Steps: read robots.txt for the target, wait, skip pages that already exist
    or are disallowed, fetch the page with Selenium, classify it with
    get_type_code(), then store the page, its images / page data, the link
    from the source page, and the newly discovered frontier entries.

    Args:
        driver: Selenium driver used to load the page.
        frontier_entry: ``(frontier_id, src_url, dest_url)`` triple.
        timeout: Seconds to sleep before and after the fetch. Replaced by the
            robots.txt crawl delay when RESPECT_CRAWL_DELAY is set and the
            site declares one for USER_AGENT.

    Returns:
        ``(frontier_id, driver)`` exactly as received, on every path, so the
        caller can mark the entry and reuse the driver.
    """
    frontier_id, src_url, dest_url = frontier_entry
    o = urlparse(dest_url)

    # Read and parse robots.txt file
    rp, robots_content, sitemap_content = helper_functions.parse_robots_file(dest_url)
    # The parser is treated as optional before can_fetch() below, so guard it
    # here as well instead of failing with AttributeError on a falsy parser.
    if RESPECT_CRAWL_DELAY and rp:
        crawl_delay = rp.crawl_delay(USER_AGENT)
        if crawl_delay:
            timeout = crawl_delay

    time.sleep(timeout)

    # If this page was already crawled
    # -> only record the link and skip this page
    if repository.check_if_page_exists(dest_url):
        print(f"Already Crawled URL '{dest_url}'")
        try:
            repository.create_link(Link(src_url, dest_url))
        except Exception:
            print(f"Link not created dure to error...")
        return frontier_id, driver

    # Add new site if on new domain
    if not repository.check_if_site_exists(o.hostname):
        repository.create_site(Site(o.hostname, robots_content, sitemap_content))

    # If robots.txt doesnt allow for crawling the page
    # -> skip this page
    if rp and not rp.can_fetch(USER_AGENT, dest_url):
        print(f"Prohibited access to URL '{dest_url}'")
        return frontier_id, driver

    # Try and fetch the page with the url
    # If it fails -> skip it
    status_code = None
    try:
        print(f"Retrieving web page with URL '{dest_url}'")
        driver.get(dest_url)
        status_code = requests.get(dest_url, verify=False).status_code
        time.sleep(timeout)
    except WebDriverException as e:
        print(e)
        repository.create_error(Error(dest_url, e.msg, datetime.now()))
        return frontier_id, driver
    except (requests.ConnectionError, requests.TooManyRedirects) as e:
        print(e)
        # strerror is usually None for requests exceptions; fall back to the
        # exception text so the stored error is not empty.
        repository.create_error(Error(dest_url, e.strerror or str(e), datetime.now()))
        return frontier_id, driver
    except Exception:
        # Also reached when no driver was supplied (driver is None).
        print(f"Unknown error wher retrieving '{dest_url}' ... Skipping")
        return frontier_id, driver

    page_source = driver.page_source
    type_code, duplicate_url = get_type_code(dest_url, page_source)

    # Defaults shared by every page type. status_code comes from the guarded
    # request above; it is not re-requested, which would double the traffic
    # and could raise outside any exception handler.
    site_domain = o.hostname
    accessed_time = datetime.now()
    html_content = None
    images = []
    links = []
    page_data = None

    if type_code in ('FRONTIER', 'HTML'):
        html_content = page_source
        images = helper_functions.get_all_images(driver, dest_url, STORE_BINARY)
        links = helper_functions.get_all_links(driver, LIMIT_DOMAIN, ALLOWED_LINK_TYPES)
    elif type_code == 'BINARY':
        page_data = helper_functions.get_page_data(dest_url, STORE_BINARY)
    elif type_code == 'DUPLICATE':
        # The link recorded below then starts at the page this one duplicates.
        src_url = duplicate_url

    # Create the page with gathered data
    # and add it to pages
    page = Page(
        site_domain,
        type_code,
        dest_url,
        html_content,
        status_code,
        accessed_time,
        images,
        page_data,
        links
    )

    try:
        repository.create_page(page)
    except Exception:
        print(f"Not unique url db error '{dest_url}'")
        try:
            repository.create_link(Link(src_url, dest_url))
        except Exception:
            print(f"Link not created due to error...")
        repository.update_frontier_entry_to_crawled(frontier_id)
        return frontier_id, driver

    try:
        if images:
            repository.create_images(images)
        if page_data:
            repository.create_page_data(page_data)
    except Exception:
        print(f"Images of PageData not created due to error...")

    # Add the link between the two pages
    repository.create_link(Link(src_url, dest_url))

    print('\n----------------------------------------- ')
    print(page.__str__())
    # print(links.__str__())
    print('\n')

    # Add gathered links to the frontier
    frontier_entries = [FrontierEntry(dest_url, link) for link in links]
    repository.create_frontier_entries(frontier_entries)
    repository.update_frontier_entry_to_crawled(frontier_id)

    return frontier_id, driver

# Optionally wipe the database and seed the frontier from SEED_URLS
if START_CLEAN:
    repository.clear_db()
    initial_frontier = [FrontierEntry(None, seed_url) for seed_url in SEED_URLS]
    repository.create_frontier_entries(initial_frontier)


# Multi-threaded stuff
# One Chrome driver per worker; all are created first, then configured.
drivers = []
for _ in range(NUMBER_OF_WORKERS):
    drivers.append(webdriver.Chrome(WEB_DRIVER_LOCATION, options=chrome_options))
for worker_driver in drivers:
    worker_driver.implicitly_wait(TIMEOUT)

def update_to_processed(f):
    """Done-callback for a crawl future: mark the entry and recycle the driver.

    Args:
        f: A finished future whose result is a ``(frontier_id, driver)`` tuple.

    Side effects:
        Marks ``frontier_id`` as processed in the repository when it is truthy,
        and appends a usable driver to the module-level ``drivers`` pool. A new
        Chrome driver is created when the task returned none or raised.
    """
    try:
        frontier_id, driver = f.result()
    except Exception as e:
        # result() re-raises whatever the task raised. Without this guard the
        # callback would abort here and the pool would lose a driver for good.
        # The driver the task was using cannot be recovered from the future,
        # so a replacement is created below.
        print(f"Crawl task failed with an unexpected error: {e}")
        frontier_id, driver = None, None

    if frontier_id:
        repository.update_frontier_entry_to_processed(frontier_id)

    if not driver:
        print("An error occured and a driver is no longer available")
        print("Adding a new driver...")
        driver = webdriver.Chrome(WEB_DRIVER_LOCATION, options=chrome_options)
        driver.implicitly_wait(TIMEOUT)

    drivers.append(driver)

# Scheduling loop: pull entries from the frontier and hand each one to a
# worker thread together with a driver taken from the pool. The loop stops
# once the frontier is still empty after all outstanding crawls have finished.
try:
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUMBER_OF_WORKERS) as executor:
        futures = []

        while True:
            f = executor.submit(repository.get_next_frontier_entry)
            next_frontier_entry = f.result()

            if next_frontier_entry is None:
                print("\n\n CURRENT FRONTIER IS EMPTY waiting for futures to complete... \n\n")
                # Running crawls may still add entries, so wait and look again.
                concurrent.futures.wait(futures)
                futures = []

                f = executor.submit(repository.get_next_frontier_entry)
                next_frontier_entry = f.result()

                if next_frontier_entry is None:
                    print("\n\n ----- FRONTIER IS EMPTY ----- \n\n")
                    # Leaving the `with` block shuts the executor down.
                    break

            driver = drivers.pop(0) if drivers else None
            f = executor.submit(crawl_page, driver, next_frontier_entry, TIMEOUT)
            f.add_done_callback(update_to_processed)

            # Keep only unfinished futures so the list does not grow for the
            # whole lifetime of the crawl.
            futures = [pending for pending in futures if not pending.done()]
            futures.append(f)
finally:
    # The executor has been shut down by now (also on KeyboardInterrupt), so
    # every driver handed back by the callbacks is in the pool. Quit them so
    # no Chrome processes are left behind.
    while drivers:
        try:
            drivers.pop().quit()
        except WebDriverException as e:
            print(e)


Already Crawled URL 'https://e-uprava.gov.si/javne-evidence/plovila.html'
Already Crawled URL 'https://e-uprava.gov.si/javne-evidence/motorna-vozila.html'
Already Crawled URL 'https://e-uprava.gov.si/javne-evidence/prosti-termini.html'
Already Crawled URL 'https://e-uprava.gov.si/javne-evidence/druge-javne-evidence.html'
Already Crawled URL 'https://e-uprava.gov.si/javne-evidence/odtujeni-osebni-dokumenti.html'
Already Crawled URL 'https://e-uprava.gov.si/javne-evidence/listine-in-potrdila.html'
Already Crawled URL 'https://e-uprava.gov.si/pomoc-kontakt/pomoc-pri-uporabi.html'
Already Crawled URL 'https://e-uprava.gov.si/pomoc-kontakt/vprasanja-in-mnenja.html'
Already Crawled URL 'https://e-uprava.gov.si/pomoc-kontakt/vodici.html'
Already Crawled URL 'https://e-uprava.gov.si/pomoc-kontakt/predlogi-uporabnikov.html'
Already Crawled URL 'https://e-uprava.gov.si/pomoc-kontakt/izjava-o-dostopnosti.html'
Already Crawled URL 'https://e-uprava.gov.si/pomoc-kontakt/vsa-pogosta-vprasanja.html'


KeyboardInterrupt: 

# Single-Threaded Crawler (legacy version, commented out)

In [None]:
# if START_CLEAN:
#     repository.clear_db()
#     initial_frontier = [FrontierEntry(None, url, False) for url in SEED_URLS.copy()]
#     repository.create_frontier_entries(initial_frontier)

# curr_domain = None # Current domain

# rp = urobot.RobotFileParser() # robot file parser

# driver = webdriver.Chrome(WEB_DRIVER_LOCATION, options=chrome_options)
# # driver.implicitly_wait(TIMEOUT)

# while True:
#     # Get next entry from frontier
#     # -> break if empty
#     next_frontier_entry = repository.get_next_frontier_entry()
#     if not next_frontier_entry:
#         print("\n\n ----- FRONTIER IS EMPTY ----- \n\n")
#         break

#     frontier_id, src_url, dest_url = next_frontier_entry
#     o = urlparse(dest_url)

#     # If the domain is different than from the previous url
#     # -> Read and parse robots.txt file again
#     if o.hostname != curr_domain:
#         print(f"Reading 'robots.txt' for domain '{o.hostname}'")
#         rp, robots_content, sitemap_content = helper_functions.parse_robots_file(dest_url)
#         if RESPECT_CRAWL_DELAY:
#             crawl_delay = rp.crawl_delay(USER_AGENT)
#             if crawl_delay: TIMEOUT = crawl_delay
#         curr_domain = o.hostname

#         # Add new site if on new domain
#         if not repository.check_if_site_exists(curr_domain):
#             new_site = Site(o.hostname, robots_content, sitemap_content)
#             repository.create_site(new_site)

#     # If this page was already crawled
#     # -> skip this page
#     if repository.check_if_page_exists(dest_url):
#         print(f"Already Crawled URL '{dest_url}'")
#         repository.create_link(Link(src_url, dest_url))
#         repository.update_frontier_entry_to_crawled(frontier_id)
#         continue

#     # If robots.txt doesnt allow for crawling the page
#     # -> skip this page
#     if not rp.can_fetch(USER_AGENT, dest_url):
#         print(f"Prohibited access to URL '{dest_url}'")
#         repository.update_frontier_entry_to_crawled(frontier_id)
#         continue
    
#     # Try and fetch the page with the url
#     # If it fails -> skip it
#     try:
#         print(f"Retrieving web page URL '{dest_url}'")
#         driver.get(dest_url)
#         time.sleep(TIMEOUT)
#     except WebDriverException as e:
#         print(e)
#         repository.create_error(Error(dest_url, e, datetime.now()))
#         repository.update_frontier_entry_to_crawled(frontier_id)
#         continue


#     type_code, duplicate_url = get_type_code(dest_url, driver.page_source)

#     if type_code == 'FRONTIER':
#         site_domain = curr_domain
#         html_content = driver.page_source
#         status_code = requests.get(dest_url, verify=False).status_code
#         accessed_time = datetime.now()
#         images = helper_functions.get_all_images(driver, dest_url, STORE_BINARY)
#         links = helper_functions.get_all_links(driver, LIMIT_DOMAIN, ALLOWED_LINK_TYPES)
#         page_data = None
    
#     elif type_code == 'HTML':
#         site_domain = curr_domain
#         html_content = driver.page_source
#         status_code = requests.get(dest_url, verify=False).status_code
#         accessed_time = datetime.now()
#         images = helper_functions.get_all_images(driver, dest_url, STORE_BINARY)
#         links = helper_functions.get_all_links(driver, LIMIT_DOMAIN, ALLOWED_LINK_TYPES)
#         page_data = None

#     elif type_code == 'BINARY':
#         site_domain = curr_domain
#         html_content = None
#         status_code = requests.get(dest_url, verify=False).status_code
#         accessed_time = datetime.now()
#         images = []
#         links = []        
#         page_data = helper_functions.get_page_data(dest_url, STORE_BINARY)

#     elif type_code == 'DUPLICATE':
#         site_domain = curr_domain
#         html_content = None
#         status_code = requests.get(dest_url, verify=False).status_code
#         accessed_time = datetime.now()
#         images = []
#         links = []        
#         page_data = None
#         src_url = duplicate_url


#     # Create the page with gathered data 
#     # and add it to pages
#     page = Page(
#         site_domain,
#         type_code, 
#         dest_url, 
#         html_content, 
#         status_code, 
#         accessed_time, 
#         images, 
#         page_data, 
#         links
#     )

#     repository.create_page(page)
#     if images: repository.create_images(images)
#     if page_data: repository.create_page_data(page_data)

#     # Add the link between the two pages
#     repository.create_link(Link(src_url, dest_url))

#     print('\n----------------------------------------- ')
#     print(page)
#     print(links)
#     print('\n')

#     # Add gathered links to the frontier
#     frontier_entries = [FrontierEntry(dest_url, link) for link in links]
#     repository.create_frontier_entries(frontier_entries)
#     repository.update_frontier_entry_to_crawled(frontier_id)

# driver.close()

In [None]:
# driver = webdriver.Chrome(WEB_DRIVER_LOCATION, options=chrome_options)
# driver.get('https://spot.gov.si/')

# images = helper_functions.get_all_images(driver, 'https://spot.gov.si/')

# for i in images:
#     print(i)