In [None]:
import re
import argparse
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

import os, codecs
from datetime import datetime 
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin, urlparse, unquote, urlsplit, urlunsplit
from requests.exceptions import HTTPError
from queue import Queue

from bs4 import BeautifulSoup

import ssl
import threading
import warnings
import time

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# insecure-request warnings that might point at real problems.
warnings.filterwarnings('ignore')

# Swap the ssl module's default HTTPS context factory for the unverified one.
# NOTE(review): this presumably disables certificate verification for any
# standard-library HTTPS client relying on that default -- confirm intended.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
import sys


class Transcript(object):
    """File-like object that mirrors everything written to it into a log file.

    On construction it remembers the current ``sys.stdout`` as ``terminal``
    and opens ``filename`` in append mode as ``logfile``. Every ``write`` is
    forwarded to both.
    """

    def __init__(self, filename):
        """Capture the current stdout and open ``filename`` for appending."""
        self.terminal = sys.stdout
        # Explicit UTF-8 so that non-ASCII text (e.g. decoded URLs) can be
        # logged regardless of the platform's default encoding.
        self.logfile = open(filename, "a", encoding="utf-8")

    def write(self, message):
        """Write ``message`` to the original stdout and to the log file."""
        self.terminal.write(message)
        self.logfile.write(message)

    def flush(self):
        """Flush both targets so buffered output reaches screen and disk."""
        self.terminal.flush()
        self.logfile.flush()


def startTC(filename):
    """Start transcript, appending print output to given filename.

    If a transcript is already installed on ``sys.stdout`` (for example
    because an earlier run was interrupted before ``stopTC`` ran), it is
    closed and unwrapped first, so output is never routed through two nested
    transcripts and the earlier log file handle is not leaked.
    """
    active = sys.stdout
    if hasattr(active, 'logfile') and hasattr(active, 'terminal'):
        # Restore the real stream before closing so nothing writes to a
        # closed file.
        sys.stdout = active.terminal
        active.logfile.close()
    sys.stdout = Transcript(filename)


def stopTC():
    """Stop transcript and return print functionality to normal.

    Does nothing when ``sys.stdout`` is not a transcript (it has no
    ``logfile``/``terminal`` attributes), so it is safe to call from a
    ``finally`` block or to call twice.
    """
    active = sys.stdout
    logfile = getattr(active, 'logfile', None)
    terminal = getattr(active, 'terminal', None)
    if logfile is None or terminal is None:
        return
    # Restore stdout first so that no print can hit the file after it closes.
    sys.stdout = terminal
    logfile.close()


In [None]:
def remove_query_from_url(url):
    """Return ``url`` rebuilt from its scheme, network location and path only.

    Anything ``urlparse`` classifies as params, query or fragment is dropped.
    """
    pieces = urlparse(url)
    return f"{pieces.scheme}://{pieces.netloc}{pieces.path}"

In [None]:
def requests_retry_session(
    retries=10,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a requests session whose HTTP and HTTPS adapters retry requests.

    Args:
        retries: Value used for the total, read and connect retry limits.
        backoff_factor: Backoff factor handed to the ``Retry`` policy.
        status_forcelist: HTTP status codes handed to the ``Retry`` policy.
        session: Existing session to configure; a new one is created when
            this is falsy.

    Returns:
        The configured session (the one passed in, when given).
    """
    http = session if session else requests.Session()
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
    )
    # The same adapter instance serves both URL schemes.
    for prefix in ('http://', 'https://'):
        http.mount(prefix, retrying_adapter)
    return http

In [None]:
class Crawler(threading.Thread):
    """Worker thread of a multi-threaded crawler restricted to one domain.

    All workers share the frontier queue, the visited set, the two locks and
    the robots/sitemap domain sets passed to the constructor. Each worker
    repeatedly claims a crawl slot, claims a URL from the frontier, downloads
    and saves the page, and enqueues the filtered links found on it. A worker
    stops when ``max_num`` slots have been handed out or when the frontier is
    exhausted (empty, and no other worker is still processing a page).
    """

    # Crawl slots handed out so far; shared by all workers, guarded by
    # ``num_lock``.
    current_i = 0
    # URLs claimed from the frontier whose links have not been enqueued yet;
    # shared by all workers, guarded by ``url_lock``.
    in_flight = 0
    # Seconds an idle worker waits before looking at an empty frontier again.
    idle_poll_seconds = 0.05

    def __init__(self, sleep_time, seed_url, max_num, whitelist_file_types, user_agent, whitelist_domain, frontier_q, visited, url_lock, parsed_sitemap_domains, parsed_robots_domains, num_lock):
        """Store the shared crawl state and this worker's configuration.

        Args:
            sleep_time: Seconds slept (while holding ``url_lock``) after a
                URL is claimed; spaces out fetches across all workers.
            seed_url: Start URL; only its scheme and host are kept.
            max_num: Total number of crawl slots shared by all workers.
            whitelist_file_types: File extensions (without dot) that may be
                crawled; URLs without an extension are always eligible.
            user_agent: User-Agent header value and robots.txt agent name.
            whitelist_domain: Domain that crawled hosts must equal or be a
                sub-domain of.
            frontier_q: Shared queue of URLs waiting to be crawled.
            visited: Shared set of URLs that have been claimed for crawling.
            url_lock: Lock guarding the frontier, ``visited`` and
                ``in_flight``.
            parsed_sitemap_domains: Shared set of base URLs whose sitemap
                yielded at least one ``<loc>`` entry.
            parsed_robots_domains: Shared set of base URLs whose robots.txt
                was downloaded and accepted.
            num_lock: Lock guarding ``Crawler.current_i``.
        """
        threading.Thread.__init__(self)
        # Report this worker's own thread name (not the constructing thread).
        print(f"Web Crawler worker {self.name} has Started")
        self.sleep_time = sleep_time
        self.seed_url = urljoin(seed_url, '/')[:-1]
        self.visited = visited
        self.url_lock = url_lock
        self.num_lock = num_lock
        self.frontier_q = frontier_q
        self.max_num = max_num
        self.whitelist_file_types = whitelist_file_types
        self.user_agent = user_agent
        self.whitelist_domain = whitelist_domain
        self.parsed_robots_domains = parsed_robots_domains
        self.parsed_sitemap_domains = parsed_sitemap_domains
        self.headers = {
            'User-Agent':  user_agent,
            'From': 'thitiwat.tha@ku.th'
        }
        # Per-worker caches: HTTP session (created lazily), parsed robots.txt
        # per base URL, and base URLs whose sitemap was already requested.
        self._http_session = None
        self._robots_cache = {}
        self._sitemap_checked = set()

    @staticmethod
    def _path_extension(url_path):
        """Return the lower-cased extension of the last path segment.

        Returns ``None`` when the last segment contains no dot, so that a dot
        in a directory name (``/v1.2/page``) is not mistaken for a file type.
        """
        last_segment = url_path.rsplit('/', 1)[-1]
        if '.' not in last_segment:
            return None
        return last_segment.rsplit('.', 1)[-1].lower()

    def _get_session(self):
        """Return this worker's retrying HTTP session, creating it on first use."""
        session = getattr(self, '_http_session', None)
        if session is None:
            session = requests_retry_session()
            self._http_session = session
        return session

    def link_parser(self, raw_html):
        """Return the ``href`` value of every ``<a href=...>`` in ``raw_html``."""
        soup = BeautifulSoup(raw_html, 'html.parser')
        return [anchor.get('href') for anchor in soup.find_all('a', href=True)]

    def de_duplicate_urls(self, urls):
        """Return ``urls`` without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(urls))

    def get_base_url(self, url):
        """Return the ``scheme://host[:port]`` part of ``url`` (no trailing slash)."""
        return urljoin(url, '/')[:-1]

    def save_to_disk(self, url, raw_html):
        """Write ``raw_html`` below ``html/<hostname>/`` mirroring the URL path.

        A URL whose last path segment has an extension is stored under that
        file name; any other URL is stored as ``index.html`` inside a folder
        named after its path. Failures are reported and swallowed so that a
        single unwritable page does not stop the crawl.
        """
        try:
            parsed = urlparse(url)
            hostname = parsed.hostname
            if not hostname or hostname in ('.', '..'):
                print(f'Error in save_to_disk: no usable hostname in {url!r}')
                return

            # Drop empty, '.' and '..' segments so that a crafted path can
            # never climb out of the html/ output folder.
            segments = [seg for seg in parsed.path.split('/') if seg not in ('', '.', '..')]

            if segments and self._path_extension(segments[-1]) is not None:
                folder_segments, save_filename = segments[:-1], segments[-1]
            else:
                folder_segments, save_filename = segments, 'index.html'

            save_folder_path = '/'.join(['html', hostname] + folder_segments)
            save_abs_path = save_folder_path + '/' + save_filename

            print(f'savepath: {save_folder_path}')
            print(f'save filename: {save_filename}')
            print(f'save_abs_path: {save_abs_path}')

            os.makedirs(save_folder_path, 0o755, exist_ok=True)
            # newline='' writes the text exactly as downloaded.
            with open(save_abs_path, 'w', encoding='utf-8', newline='') as out_file:
                out_file.write(raw_html)
        except Exception as err:
            print(f'Error in save_to_disk: {err}')

    def normalization_urls(self, urls, base_url):
        """Return absolute, fragment-free, query-free versions of ``urls``.

        Each URL is resolved against ``base_url``, has its fragment removed,
        is percent-decoded, loses its query string and finally its trailing
        slashes. A URL that cannot be parsed is reported and skipped; the
        result is always a list.
        """
        normalized = []
        for url in urls:
            try:
                absolute = urljoin(base_url, url)
                without_fragment = absolute.split('#', 1)[0]
                decoded = unquote(without_fragment)
                # Strip the slash only after the query is gone, otherwise
                # 'a/b/?q=1' would keep its trailing slash.
                cleaned = remove_query_from_url(decoded).rstrip('/')
            except Exception as err:
                print(f'Error in normalization_urls: skipping {url!r} ({err})')
                continue
            normalized.append(cleaned)
        return normalized

    def get_raw_html(self, url):
        """Download ``url`` and return the response text, or ``''`` on failure.

        The request is sent with ``verify=False`` and a 30 second timeout.
        HTTP error statuses and any other request error are printed and
        reported to the caller as an empty string.
        """
        try:
            response = self._get_session().get(url, headers=self.headers, timeout=30, verify=False)
            # If the response was successful, no Exception will be raised
            response.raise_for_status()
        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
            return ''
        except Exception as err:
            print(f'Other error occurred: {err}')
            return ''
        return response.text

    def _fetch_valid_robots(self, robot_url):
        """Download robots.txt; return its text if it looks valid, else ``''``.

        The file counts as valid when it contains a ``User-agent`` field in
        any letter case. Valid files are also saved to disk.
        """
        raw = self.get_raw_html(robot_url)
        if 'user-agent' not in raw.lower():
            return ''
        self.save_to_disk(robot_url, raw)
        return raw

    def check_and_save_robots(self, robot_url):
        """Return True (and save the file) when ``robot_url`` holds a robots.txt."""
        try:
            return bool(self._fetch_valid_robots(robot_url))
        except Exception as err:
            print(f'Error in check_and_save_robots: {err}')
            return False

    def get_parsed_robots(self, base_url):
        """Return a ``RobotFileParser`` holding the rules of ``base_url``.

        robots.txt is downloaded once per base URL and worker and parsed from
        the downloaded text. When no valid robots.txt is available the parser
        is fed an empty rule set, which allows every URL. Base URLs with a
        valid robots.txt are recorded in ``parsed_robots_domains``.
        """
        cached = self._robots_cache.get(base_url)
        if cached is not None:
            return cached

        rp = RobotFileParser()
        robots_url = base_url + '/robots.txt'
        rp.set_url(robots_url)
        try:
            raw = self._fetch_valid_robots(robots_url)
        except Exception as err:
            print(f'Error in get_parsed_robots: {err}')
            raw = ''

        if raw:
            self.parsed_robots_domains.add(base_url)
            rp.parse(raw.splitlines())
        else:
            # allow all: parsing no rules marks the parser as loaded with
            # nothing disallowed.
            rp.parse([])

        self._robots_cache[base_url] = rp
        return rp

    def filters_urls(self, urls, base_url):
        """Return the members of ``urls`` that this crawler may fetch.

        A URL is kept when it uses http(s), its host equals
        ``whitelist_domain`` or is a sub-domain of it, its last path segment
        has no extension or a whitelisted one, and the robots.txt of the
        URL's own host allows ``user_agent`` to fetch it.

        ``base_url`` is accepted for interface compatibility; robots rules
        are looked up per URL host.
        """
        allowed_types = {file_type.lower() for file_type in self.whitelist_file_types}
        domain = self.whitelist_domain.lower()
        filtered_urls = []
        for url in urls:
            parsed = urlparse(url)
            if parsed.scheme not in ('http', 'https'):
                continue

            # Exact host or true sub-domain only; a substring test would also
            # accept hosts such as 'notku.ac.th' or 'ku.ac.th.evil.example'.
            hostname = parsed.hostname
            if not hostname or not (hostname == domain or hostname.endswith('.' + domain)):
                continue

            # check filetype (before robots, which may need a download)
            extension = self._path_extension(parsed.path)
            if extension is not None and extension not in allowed_types:
                continue

            # check can fetch from robots.txt of the URL's own host
            rp = self.get_parsed_robots(self.get_base_url(url))
            if not rp.can_fetch(self.user_agent, url):
                continue

            filtered_urls.append(url)

        return filtered_urls

    def include_urls(self, urls, base_url):
        """Prepend the sitemap URLs of ``base_url`` to ``urls`` (in place).

        ``<base_url>/sitemap.xml`` is requested at most once per worker, and
        not at all when the base URL is already in
        ``parsed_sitemap_domains``. The base URL is added to that set only
        when the sitemap contained at least one ``<loc>`` entry.
        """
        if base_url in self.parsed_sitemap_domains or base_url in self._sitemap_checked:
            return urls
        self._sitemap_checked.add(base_url)

        try:
            xml = self.get_raw_html(base_url + '/sitemap.xml')
            soup = BeautifulSoup(xml, 'html.parser')
            sitemap_urls = [loc.getText() for loc in soup.find_all("loc")]
        except Exception as err:
            print(f'Error in include_urls: {err}')
            return urls

        if sitemap_urls:
            self.parsed_sitemap_domains.add(base_url)
            urls[0:0] = sitemap_urls
        return urls

    def crawler_url(self, url, q_size):
        """Download, analyse and save ``url``; return the links to enqueue.

        Returns an empty list when the page is empty or any step fails, so
        that half-processed (unfiltered) links are never enqueued.
        ``q_size`` is accepted for interface compatibility and not used.
        """
        print(f'crawler: {url}')
        try:
            base_url = self.get_base_url(url)

            # Downloader
            raw_html = self.get_raw_html(url)
            if raw_html == '':
                raise Exception("Empty page")

            # Analyzer
            urls = self.link_parser(raw_html)
            urls = self.include_urls(urls, base_url)
            # Resolve relative links against the page itself, not the site
            # root, so 'b.html' on '/x/a.html' becomes '/x/b.html'.
            urls = self.normalization_urls(urls, url)
            urls = self.filters_urls(urls, base_url)
            urls = self.de_duplicate_urls(urls)

            # store to disk
            self.save_to_disk(url, raw_html)
        except Exception as err:
            print(f'Error in crawler_url: {err}')
            return []
        return urls

    def get_current_i(self):
        """Atomically hand out the next crawl slot number."""
        with self.num_lock:
            current_i = Crawler.current_i
            Crawler.current_i += 1
        return current_i

    def _claim_url(self):
        """Claim the next unvisited URL from the frontier.

        Returns ``(url, queue_size)``, or ``None`` once the frontier is empty
        and no worker is still processing a page (nothing can arrive any
        more). The URL is marked visited while the lock is held, so no two
        workers can claim the same URL.
        """
        while True:
            with self.url_lock:
                # Only this method removes items and it always holds
                # url_lock, so a non-empty queue means get() cannot block.
                if not self.frontier_q.empty():
                    q_size = self.frontier_q.qsize()
                    candidate = self.frontier_q.get()
                    if candidate is None or candidate in self.visited:
                        continue
                    print(f"\nQueue Size: {q_size}")
                    self.visited.add(candidate)
                    Crawler.in_flight += 1
                    # Sleeping inside the lock spaces out fetches globally.
                    time.sleep(self.sleep_time)
                    return candidate, q_size
                if Crawler.in_flight == 0:
                    return None
            # Frontier empty but another worker may still add links: wait
            # without holding the lock.
            time.sleep(self.idle_poll_seconds)

    def _release_url(self, urls):
        """Enqueue not-yet-visited ``urls`` and mark one claimed URL as done."""
        with self.url_lock:
            for url in urls:
                if url not in self.visited:
                    self.frontier_q.put(url)
            # Decrement only after the links are queued, so an idle worker
            # never sees "empty and nothing in flight" too early.
            Crawler.in_flight -= 1

    def run(self):
        """Crawl URLs until the slot budget or the frontier is exhausted."""
        #--- main process ---#
        current_i = self.get_current_i()

        while current_i < self.max_num:
            claimed = self._claim_url()
            if claimed is None:
                break
            current_url, q_size = claimed

            print(f'i: {current_i}')

            urls = []
            try:
                urls = self.crawler_url(current_url, q_size)
                print('\n')
            except Exception as err:
                print(f'Error while crawling {current_url}: {err}')
            finally:
                self._release_url(urls)

            current_i = self.get_current_i()


In [None]:
# --- crawl configuration ---
seed_url = 'https://ku.ac.th/th'

number_of_threads = 64
max_pages = 10000
frontier_q = Queue()
frontier_q.put(seed_url)
visited = set()

url_lock = threading.Lock()
num_lock = threading.Lock()

crawler_threads = []
parsed_sitemap_domains = set()
parsed_robots_domains = set()
sleep_time = 0.1

# The slot counter lives on the class, so without this reset a second run of
# this cell would continue from the previous run's count and stop at once.
Crawler.current_i = 0

start_time = datetime.now()
startTC('output.txt')
try:
    for _ in range(number_of_threads):
        crawler = Crawler(
            sleep_time=sleep_time,
            max_num=max_pages,
            seed_url=seed_url,
            whitelist_file_types=['html', 'htm'],
            whitelist_domain='ku.ac.th',
            user_agent="Thitiwat_Bot",
            frontier_q=frontier_q,
            visited=visited,
            url_lock=url_lock,
            num_lock=num_lock,
            parsed_sitemap_domains=parsed_sitemap_domains,
            parsed_robots_domains=parsed_robots_domains,
        )

        crawler.start()
        crawler_threads.append(crawler)

    for crawler in crawler_threads:
        crawler.join()
finally:
    # Runs even when the cell is interrupted, so the elapsed time is always
    # defined and stdout is never left redirected into the log file.
    time_elapsed = datetime.now() - start_time
    print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
    stopTC()


In [None]:
# Show the duration of the crawl again, outside the transcript.
print(f'Time elapsed (hh:mm:ss.ms) {time_elapsed}')

In [None]:
def saveDomain(filename, urls):
    """Write the hostname of every URL in ``urls`` to ``filename``.

    One hostname per line, in input order, with a leading ``www.`` removed.
    Empty entries and entries without a hostname are skipped.

    Args:
        filename: Path of the text file to (over)write.
        urls: Iterable of absolute URLs.
    """
    domains = []
    for url in urls:
        if not url:
            continue
        hostname = urlparse(url).hostname
        if not hostname:
            continue
        # Remove only a literal leading 'www.'; a regex like 'www.' would
        # also eat 'wwwx' or matches in the middle of the name.
        if hostname.startswith('www.'):
            hostname = hostname[len('www.'):]
        domains.append(hostname)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(domains))

In [None]:
# Write the hostnames collected in each shared set to its own text file.
saveDomain('list_robots.txt', list(parsed_robots_domains))
saveDomain('list_sitemap.txt', list(parsed_sitemap_domains))

In [None]:
# Display the base URLs recorded by the crawler for robots.txt.
parsed_robots_domains

In [None]:
# Display the base URLs recorded by the crawler for sitemap.xml.
parsed_sitemap_domains