In [11]:
import os
import time
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException

from urllib.parse import urlparse
from urllib.error import URLError
import urllib.robotparser as urobot

from urllib.error import URLError
from datetime import datetime
from selenium.common.exceptions import WebDriverException

import requests
from requests.packages import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from entities import Site, Page, PageData
import repository
from helper_functions import get_all_links, get_all_images, get_robots_data, write_file, append_file

## Set Up Parameters

In [12]:

# Selenium set up
path = os.path.dirname(os.path.abspath("__file__"))
WEB_DRIVER_LOCATION = os.path.join(path, "chromedriver") # chromedriver.exe should be placed inside crawler folder
USER_AGENT = "fri-wier-jernejtim"

chrome_options = Options()
chrome_options.add_argument(f"user-agent={USER_AGENT}")
chrome_options.add_argument('--ignore-ssl-errors=yes')
chrome_options.add_argument('--ignore-certificate-errors')
# chrome_options.add_argument("--headless")

# Crawler set up
# FRONTIER_FILENAME = "frontier.txt"
# CRAWLED_FILENAME = "crawled.txt"

LIMIT_DOMAIN = ".gov.si"
SEED_URLS = ["http://www.gov.si", "http://www.evem.gov.si", "http://e-uprava.gov.si", "http://e-prostor.gov.si"]
BINARY_CONTENT = ['pdf', 'doc', 'docx', 'ppt', 'pptx']
BANNED_FILETYPES = ['zip', 'jsp']
ALLOWED_LINK_TYPES = [
    'text/html', 
    'application/pdf',
    'application/msword',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'application/vnd.ms-powerpoint',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']

TIMEOUT = 3

In [13]:
# if not os.path.isfile(FRONTIER_FILENAME):
#     write_file(FRONTIER_FILENAME, SEED_URLS)

# if not os.path.isfile(CRAWLED_FILENAME):
#     write_file(CRAWLED_FILENAME, '')

In [14]:
def get_robots_url(url):
    domain_url = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    robots_url = domain_url + '/robots.txt'
    return robots_url

def get_type_code(url):
    if url.split('.')[-1] in BINARY_CONTENT:
        return 'BINARY'
    elif url in SEED_URLS:
        return 'FRONTIER'
    elif False:
        # TODO: Check if duplicate html content exists in DB
        return 'DUPLICATE'
    else:
        return 'HTML'

In [15]:
frontier = [(None, url) for url in SEED_URLS.copy()]

curr_domain = None # Current domain (for updating robots.txt)
# TODO: These data structures have to be replaced by DB
sites = {} # Site Objects (key=<site domain> e.g. 'e-prostor.gov.si')
pages = {} # Page Objects (key=<page url> e.g. 'https://www.e-prostor.gov.si/informacije/')
edge_list = [] # An edge list for storing links (will be replaced by a database)
errors = []

rp = urobot.RobotFileParser() # robot file parser

driver = webdriver.Chrome(WEB_DRIVER_LOCATION, options=chrome_options)
driver.implicitly_wait(TIMEOUT)

while frontier:
    # Get a pair of source url and destination url from frontier
    src_url, dest_url = frontier.pop(0)
    o = urlparse(dest_url)

    # If the domain is different than from the previous url
    # -> Read and parse robots.txt file again
    if o.hostname != curr_domain:
        print(f"Reading 'robots.txt' for domain '{o.hostname}'")
        curr_domain = o.hostname
        robots_url = get_robots_url(dest_url)
        rp.set_url(robots_url)
        try:
            rp.read()
        except URLError as e:
            print(e)
            errors.append(e)
            continue

    # If this is a new site domain
    # -> create a new site entry in dict
    if o.hostname not in sites.keys():
        print(f"NEW SITE: '{o.hostname}'")
        
        robots_url = get_robots_url(dest_url)
        site_maps = rp.site_maps()
        robots_content, sitemap_content = get_robots_data(driver, robots_url, site_maps)

        new_site = Site(o.hostname, robots_content, sitemap_content)
        sites[o.hostname] = new_site
        repository.create_site(new_site)

    # If this page was already crawled
    # -> skip this page
    if dest_url in pages.keys():
        print(f"Already Crawled URL '{dest_url}'")
        edge_list.append((src_url, dest_url))
        continue

    # If robots.txt doesnt allow for crawling the page
    # -> skip this page
    if not rp.can_fetch(USER_AGENT, dest_url):
        print(f"Prohibited access to URL '{dest_url}'")
        continue
    
    # Try and fetch the page with the url
    # If it fails -> skip it
    try:
        print(f"Retrieving web page URL '{dest_url}'")
        driver.get(dest_url)
        # time.sleep(TIMEOUT)
    except WebDriverException as e:
        print(e)
        errors.append(e)
        continue

    # Get request info and data
    status_code = requests.get(dest_url, verify=False).status_code
    type_code = get_type_code(dest_url)
    accessed_time = datetime.now()
    if type_code in ['FRONTIER', 'HTML']:
        html_content = driver.page_source
        images = get_all_images(driver)
        links = get_all_links(driver, LIMIT_DOMAIN, ALLOWED_LINK_TYPES)
        page_data = None
    elif type_code == 'BINARY':
        html_content, images, links = None, [], []
        data_type_code = dest_url.split('.')[-1].upper()
        data = requests.get(dest_url, verify=False).content
        page_data = PageData(data_type_code, data)
    else:
        html_content, images, links = None, [], []
        page_data = None


    # Create the page with gathered data 
    # and add it to pages
    page = Page(curr_domain, type_code, dest_url, html_content, status_code, accessed_time, images, page_data)
    pages[dest_url] = page

    # Add the link between the two pages
    edge_list.append((src_url, dest_url))

    print(page)
    print(links)
    print('\n')

    # Add gathered links to the frontier
    d_links = [(dest_url, link) for link in links]
    frontier.extend(d_links)

driver.close()


Reading 'robots.txt' for domain 'www.gov.si'
NEW SITE: 'www.gov.si'
Retrieving web page URL 'http://www.gov.si'
Site: www.gov.si 
Type: FRONTIER 
URL: http://www.gov.si 
HTML: 29545 
Status: 200 
Time: 2022-03-23 21:54:47.182045 
Images: 10 
PageData: None
['https://www.gov.si/#content', 'https://www.gov.si/', 'https://www.gov.si/podrocja/', 'https://www.gov.si/drzavni-organi/', 'https://www.gov.si/zbirke/', 'https://www.gov.si/dogodki/', 'https://www.gov.si/novice/', 'https://www.gov.si/sodelujte/', 'https://www.gov.si/dostopnost/', 'https://www.gov.si/o-spletnem-mestu/', 'https://www.gov.si/drzavni-organi/vlada/', 'https://www.gov.si/zbirke/delovna-mesta/', 'https://www.gov.si/drzave/', 'https://www.gov.si/teme/drzavni-prazniki-in-dela-prosti-dnevi/', 'https://www.gov.si/zbirke/javne-objave/', 'https://www.gov.si/zbirke/projekti-in-programi/pregled-vladnih-projektov-za-razvoj-obcin/', 'https://www.gov.si/teme/pomoc-slovenije-drzavljanom-ukrajine/', 'https://www.gov.si/podrocja/druzin

KeyboardInterrupt: 

In [None]:
print(f"Number of sites: {len(sites)}")
print(len(visited_pages))

for e in errors:
    print(e)

# for s in sites:
#     print(f"\n\n {sites[s]}")
#     print('-------------------------------')
#     for p in sites[s].pages:
#         print(p)
#         # if p.status_code != 200:
#         #     print(p)
#     print('-------------------------------')

Number of sites: 17
505
Message: unknown error: net::ERR_BAD_SSL_CLIENT_AUTH_CERT
  (Session info: chrome=99.0.4844.51)

<urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>
<urlopen error [SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:1122)>
<urlopen error [Errno 11001] getaddrinfo failed>
<urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>
<urlopen error [SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:1122)>
<urlopen error [Errno 11001] getaddrinfo failed>
<urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established con