In [1]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os
from helper_functions import get_all_links, get_all_images, get_robots_data, write_file, append_file
from urllib.parse import urlparse
import urllib.robotparser as urobot
from entities import Site, Page, Image
from bs4 import BeautifulSoup
import requests
from datetime import datetime
from selenium.common.exceptions import NoSuchElementException

## Set Up Parameters

In [2]:

# Selenium set up
# The chromedriver binary is looked up in the directory the kernel is running in
# (the current working directory), not relative to any source file.
path = os.getcwd()
WEB_DRIVER_LOCATION = os.path.join(path, "chromedriver") # chromedriver.exe should be placed inside crawler folder
# Identifier sent to servers by the browser and used for the robots.txt permission checks
USER_AGENT = "fri-wier-jernejtim"

chrome_options = Options()
chrome_options.add_argument(f"user-agent={USER_AGENT}")
# chrome_options.add_argument("--headless")

# Crawler set up
# FRONTIER_FILENAME = "frontier.txt"
# CRAWLED_FILENAME = "crawled.txt"

WEB_PAGE_ADDRESS = "http://evem.gov.si"
# Initial contents of the crawl frontier
SEED_URLS = ["http://www.gov.si/", "http://evem.gov.si", "http://e-uprava.gov.si", "http://e-prostor.gov.si"]

# Seconds passed to the driver's implicit wait
TIMEOUT = 3

# File extensions (lower-case, without the dot) classified as binary documents
BINARY_CONTENT = ['pdf', 'doc', 'docx', 'ppt', 'pptx']

In [3]:
# if not os.path.isfile(FRONTIER_FILENAME):
#     write_file(FRONTIER_FILENAME, SEED_URLS)

# if not os.path.isfile(CRAWLED_FILENAME):
#     write_file(CRAWLED_FILENAME, '')

In [4]:
def get_robots_url(url):
    """Return the robots.txt address for the site that serves `url`.

    Only the scheme and the network location of `url` are kept; its path,
    query string and fragment are discarded.
    """
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}/robots.txt"

def get_type_code(url, binary_extensions=None, seed_urls=None):
    """Classify a URL into one of the crawler's page type codes.

    Parameters:
        url: Absolute URL of the page being classified.
        binary_extensions: Lower-case file extensions (without the dot) that
            mark a binary document. Defaults to the module-level BINARY_CONTENT.
        seed_urls: URLs reported as 'FRONTIER'. Defaults to the module-level
            SEED_URLS.

    Returns:
        'BINARY' when the last path segment ends in one of the binary
        extensions, 'FRONTIER' when `url` is one of the seed URLs,
        otherwise 'HTML'.
    """
    if binary_extensions is None:
        binary_extensions = BINARY_CONTENT
    if seed_urls is None:
        seed_urls = SEED_URLS

    # Inspect only the last path segment, so that a query string, a fragment or
    # the dots of the host name are never mistaken for a file extension.
    filename = urlparse(url).path.rsplit('/', 1)[-1]
    _, dot, extension = filename.rpartition('.')

    # Extensions are compared case-insensitively ("REPORT.PDF" is still a pdf)
    if dot and extension.lower() in binary_extensions:
        return 'BINARY'
    if url in seed_urls:
        return 'FRONTIER'
    # TO-DO: return 'DUPLICATE' once duplicate html content can be checked in the DB
    return 'HTML'

In [5]:
# Crawl every URL in the frontier: read robots.txt once per host, skip URLs the
# rules prohibit, load the page in the browser and record it on its Site.
frontier = SEED_URLS.copy()
visited_sites = []

# One robots.txt parser per hostname. A single shared parser would only hold the
# rules of the host that was read last, so a URL from an earlier host would be
# checked against the wrong rules.
robot_parsers = {}
rp = urobot.RobotFileParser()

# Upper bound (seconds) for the plain HTTP request that fetches the status code;
# without an explicit timeout requests may wait indefinitely.
REQUEST_TIMEOUT = 10

sites = {}

driver = webdriver.Chrome(WEB_DRIVER_LOCATION, options=chrome_options)
driver.implicitly_wait(TIMEOUT)

try:
    while frontier:
        url = frontier.pop(0)
        o = urlparse(url)

        # If this is a new site
        # -> read the robots.txt
        # -> create a new site entry in dict
        if o.hostname not in robot_parsers:
            print(f"Reading 'robots.txt' for domain '{o.hostname}'")

            robots_url = get_robots_url(url)
            rp = urobot.RobotFileParser()
            rp.set_url(robots_url)
            rp.read()
            site_maps = rp.site_maps()

            robots_content, sitemap_content = get_robots_data(driver, robots_url, site_maps)

            sites[o.hostname] = Site(o.hostname, robots_content, sitemap_content)

            # Register the host only after its robots data was read successfully
            robot_parsers[o.hostname] = rp
            visited_sites.append(o.hostname)

        # Always check against the rules that belong to this URL's host
        rp = robot_parsers[o.hostname]
        if not rp.can_fetch(USER_AGENT, url):
            print(f"Prohibited access to URL '{url}'")
            continue

        print(f"Retrieving web page URL '{url}'")
        driver.get(url)

        # The status code comes from a separate plain HTTP request, sent with the
        # crawler's own User-Agent.
        try:
            response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # One unreachable page should not abort the whole crawl
            print(f"Failed to retrieve status code for URL '{url}': {error}")
            continue

        status_code = response.status_code
        type_code = get_type_code(url)
        html_content = None if type_code in ('DUPLICATE', 'BINARY') else driver.page_source
        images = get_all_images(driver)
        links = get_all_links(driver)
        # NOTE(review): .time() keeps only the time of day, the date is dropped -
        # confirm this is what Page expects.
        accessed_time = datetime.now().time()

        page = Page(type_code, url, html_content, status_code, accessed_time, images, links)
        curr_site:Site = sites[o.hostname]
        curr_site.add_page(page)

        # frontier.extend(links)
finally:
    # quit() ends the whole browser session and the chromedriver process, even
    # when the loop above raised.
    driver.quit()


Reading 'robots.txt' for domain 'www.gov.si'
Retrieving web page URL 'http://www.gov.si/'
Reading 'robots.txt' for domain 'evem.gov.si'
Retrieving web page URL 'http://evem.gov.si'
Reading 'robots.txt' for domain 'e-uprava.gov.si'
Retrieving web page URL 'http://e-uprava.gov.si'
Reading 'robots.txt' for domain 'e-prostor.gov.si'
Retrieving web page URL 'http://e-prostor.gov.si'


In [6]:
# Show the hostname -> Site mapping filled in by the crawl loop
print(sites)

{'www.gov.si': <entities.Site object at 0x00000279C4363550>, 'evem.gov.si': <entities.Site object at 0x00000279C4346310>, 'e-uprava.gov.si': <entities.Site object at 0x00000279C434F6D0>, 'e-prostor.gov.si': <entities.Site object at 0x00000279C43F76D0>}
