In [1]:
import re
from urllib.parse import urljoin
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
from requests.exceptions import HTTPError
import os, codecs
from urllib.parse import unquote

In [16]:
class Scheduler:
    """Single-threaded, breadth-first web crawler that mirrors pages to disk.

    URLs waiting to be crawled live in ``frontier_q`` (a FIFO list) and URLs
    already taken from the frontier live in ``visited_q``. ``run`` repeatedly
    dequeues a URL, downloads it, extracts and normalises its links, saves the
    page under ``save_root`` and enqueues the links not seen before.
    """

    # Extensions (lower-case, without the dot) that are stored under their own
    # file name; every other URL is stored as ``<url path>/index.html``.
    whitelist_file_type = ['html', 'htm']

    # Headers sent with every download request.
    headers = {
        'User-Agent': 'OporBot',
        'From': 'thitiwat.tha@ku.th'
    }

    # Directory (relative to the working directory) that receives saved pages.
    save_root = 'html'

    # Only these URL schemes are kept by ``normalization_urls``.
    _crawlable_schemes = ('http', 'https')

    def __init__(self, seed_url, num_crawler):
        """Create a crawler.

        Args:
            seed_url: First URL to crawl; it is the initial frontier content.
            num_crawler: Maximum number of URLs ``run`` will process.
        """
        self.seed_url = self.get_base_url(seed_url)
        self.frontier_q = [seed_url]
        self.visited_q = []
        self.num_crawler = num_crawler

    def enqueue(self, links):
        """Append to the frontier every link that is neither queued nor visited.

        Args:
            links: Iterable of URL strings extracted from a page.
        """
        # One set per call replaces a linear scan of both lists for each link.
        seen = set(self.frontier_q)
        seen.update(self.visited_q)
        for link in links:
            if link not in seen:
                seen.add(link)
                self.frontier_q.append(link)

    def dequeue(self):
        """Remove and return the oldest URL in the frontier (FIFO).

        Raises:
            IndexError: If the frontier is empty.
        """
        return self.frontier_q.pop(0)

    def link_parser(self, raw_html):
        """Return the ``href`` value of every ``<a href=...>`` tag in ``raw_html``."""
        soup = BeautifulSoup(raw_html, 'html.parser')
        return [anchor.get('href') for anchor in soup.find_all('a', href=True)]

    def de_duplicate_urls(self, urls):
        """Return ``urls`` without duplicates, keeping first-seen order.

        Keeping the order (instead of going through ``set``) makes the crawl
        order reproducible from one run to the next.
        """
        return list(dict.fromkeys(urls))

    def get_base_url(self, url):
        """Return ``scheme://host[:port]`` of ``url`` without a trailing slash."""
        return urljoin(url, '/')[:-1]

    def save_to_disk(self, url, raw_html):
        """Write ``raw_html`` below ``save_root``, mirroring the URL's host and path.

        A URL whose path ends in a whitelisted extension is stored under its
        own file name (``html/<host>/<dir>/<file>``); any other URL is stored
        as ``html/<host>/<path>/index.html``. URLs without a hostname, and
        paths that would land outside ``save_root``, are skipped.

        Args:
            url: Absolute URL the content was downloaded from.
            raw_html: Text to write (encoded as UTF-8).
        """
        parsed = urlparse(url)
        hostname = parsed.hostname
        if not hostname:
            # e.g. 'mailto:' or 'javascript:' links have no host to mirror.
            print(f'skip saving, no hostname in url: {url}')
            return
        url_path = parsed.path

        extension = os.path.splitext(url_path)[1][1:].lower()
        if extension in self.whitelist_file_type:
            # '/docs/page.html' -> folder '/docs', file 'page.html'
            folder_part, save_filename = url_path.rsplit('/', 1)
        else:
            folder_part, save_filename = url_path, 'index.html'

        save_folder_path = (self.save_root + '/' + hostname + folder_part).rstrip('/')
        save_abs_path = save_folder_path + '/' + save_filename

        # The path comes from crawled (untrusted) pages and has been unquoted,
        # so '..' segments could otherwise point outside the mirror directory.
        root = os.path.abspath(self.save_root)
        if os.path.commonpath([root, os.path.abspath(save_abs_path)]) != root:
            print(f'skip saving, path escapes {self.save_root}/: {url}')
            return

        print(f'savepath: {save_folder_path}')
        print(f'save filename: {save_filename}')
        print(f'save_abs_path: {save_abs_path}')

        os.makedirs(save_folder_path, 0o755, exist_ok=True)
        # newline='' writes the text unchanged, without newline translation.
        with open(save_abs_path, 'w', encoding='utf-8', newline='') as out_file:
            out_file.write(raw_html)

    def normalization_urls(self, urls, target_url):
        """Turn raw ``href`` values into absolute, comparable http(s) URLs.

        Each link is resolved against ``target_url`` (the page it was found
        on), its ``#fragment`` is removed, percent-escapes are decoded and
        trailing slashes are stripped. Links that cannot be parsed or that do
        not use an http/https scheme are dropped.

        Args:
            urls: Raw ``href`` strings.
            target_url: URL of the page the links were extracted from.

        Returns:
            List of normalised URLs in input order (duplicates are kept).
        """
        normalized = []
        for url in urls:
            try:
                absolute = urljoin(target_url, url)
                scheme = urlparse(absolute).scheme
            except ValueError:
                # A malformed href must not abort the whole crawl.
                continue
            if scheme not in self._crawlable_schemes:
                continue

            # remove # (self reference)
            absolute = absolute.split('#', 1)[0]
            # decode percent-escapes to utf8 text
            absolute = unquote(absolute)
            # strip trailing / (forward slash)
            normalized.append(absolute.rstrip('/'))
        return normalized

    def get_raw_html(self, url):
        """Download ``url`` and return the response text, or ``''`` on any error.

        Errors (HTTP status errors as well as any other exception raised by
        the request) are printed and not re-raised.
        """
        text = ''
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            # If the response was successful, no Exception will be raised
            response.raise_for_status()
        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
        except Exception as err:
            print(f'Other error occurred: {err}')
        else:
            text = response.text
        return text

    def crawler_url(self, url):
        """Download one page, save it, and return its outgoing links.

        Args:
            url: Absolute URL to crawl.

        Returns:
            De-duplicated list of normalised URLs found on the page (empty if
            the download failed).
        """
        print(f'crawler: {url}')

        # Downloader
        raw_html = self.get_raw_html(url)

        # Analyzer
        urls = self.link_parser(raw_html)
        urls = self.normalization_urls(urls, url)
        urls = self.de_duplicate_urls(urls)

        # Store to disk; '' is get_raw_html's failure value, so nothing is
        # written for pages that could not be downloaded.
        if raw_html:
            self.save_to_disk(url, raw_html)

        return urls

    def run(self):
        """Crawl until ``num_crawler`` URLs were processed or the frontier is empty."""
        crawled = 0
        while crawled < self.num_crawler and self.frontier_q:
            current_url = self.dequeue()
            self.visited_q.append(current_url)

            urls = self.crawler_url(current_url)
            self.enqueue(urls)
            crawled += 1
            print('\n')


In [17]:
# Start a crawl from https://www.ku.ac.th; num_crawler bounds how many URLs are processed.
Scheduler(seed_url='https://www.ku.ac.th', num_crawler=10).run()

crawler: https://www.ku.ac.th
savepath: html/www.ku.ac.th
save filename: index.html
save_abs_path: html/www.ku.ac.th/index.html


crawler: https://www.ku.ac.th/th/community-home
savepath: html/www.ku.ac.th/th/community-home
save filename: index.html
save_abs_path: html/www.ku.ac.th/th/community-home/index.html


crawler: https://www.ku.ac.th/th/newcomer-home
savepath: html/www.ku.ac.th/th/newcomer-home
save filename: index.html
save_abs_path: html/www.ku.ac.th/th/newcomer-home/index.html


crawler: https://www.ku.ac.th/th/partner-home
savepath: html/www.ku.ac.th/th/partner-home
save filename: index.html
save_abs_path: html/www.ku.ac.th/th/partner-home/index.html


crawler: https://www.ku.ac.th/th/related-links


KeyboardInterrupt: 