Hand on 5 (1 of 3)

In [1]:
# Imports
import os
import json
import pickle
import requests
import multiprocessing

from bs4 import BeautifulSoup
from queue import Empty
from queue import Queue
from pathlib import Path
from bs4.element import Comment
from nltk.corpus import stopwords 
from urllib.parse import urljoin
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor

---
`Page 29 - 33 of Handout #5`

A simple multithreaded web crawler

In [2]:
# Page 29
class MultiThreadCrawler:
    def __init__(self, base_url, depth):
        self.base_url = base_url
        extracted_url = urlparse(base_url)
        parent = extracted_url.path[:extracted_url.path.rfind("/") + 1]
        self.root_url = '{}://{}{}'.format(extracted_url.scheme, extracted_url.netloc, parent)
        self.pool = ThreadPoolExecutor(max_workers=multiprocessing.cpu_count() - 1)
        self.to_crawl = Queue()
        self.to_crawl.put({self.base_url: depth})
        self.stored_folder = Path(os.path.abspath('')).parent / 'crawled/'

        if not Path(self.stored_folder).exists():
            Path.mkdir(self.stored_folder)

        if Path(self.stored_folder / 'url_list.pickle').exists():
            with open(self.stored_folder / 'url_list.pickle', 'rb') as f:
                self.crawled_pages = pickle.load(f)
            print(self.crawled_pages)
        else:
            self.crawled_pages = set([])
    
    # Page 30
    def extract_page(self, obj):
        if obj.result():
            result, url, depth = obj.result()
        if result and result.status_code == 200:
            url_lists = self.parse_links(result.text, depth)
            self.parse_contents(url, result.text, url_lists)
    def get_page(self, url, depth):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res, url, depth
        except requests.RequestException:
            return
        
    # Page 31
    def parse_links(self, html, depth):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        url_lists = []
        for link in links:
            url = link['href']
            url = urljoin(self.root_url, url)
            if depth >= 0 and '..' not in url and url not in self.crawled_pages:
                print("Adding {}".format(url))
                self.to_crawl.put({url: depth})
            url_lists.append(url)
        return url_lists
    
    def parse_contents(self, url, html, url_lists):
        def tag_visible(element):
            if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
                return False
            if isinstance(element, Comment):
                return False
            return True
        
        try:
            soup = BeautifulSoup(html, 'html.parser')
            texts = soup.findAll(string=True)
            visible_texts = filter(tag_visible, texts)
            
            title = soup.find('title').string.strip()
            text = u" ".join(t.strip() for t in visible_texts).strip()
            
            with open(self.stored_folder / (str(hash(url)) + '.txt'), 'w', encoding='utf-8') as f:
                json.dump({'url': url, 'title': title, 'text': text, 'url_lists': url_lists}, f, ensure_ascii=False)
        except:
            pass

        
    # Page 33
    def run_scraper(self):
        while True:
            try:
                target = self.to_crawl.get(timeout=10)
                url, depth = [(k, target[k]) for k in target][0]
                if url not in self.crawled_pages:
                    self.crawled_pages.add(url)
                    job = self.pool.submit(self.get_page, url, depth - 1)
                    job.add_done_callback(self.extract_page)
            except Empty:
                with open(self.stored_folder / 'url_list.pickle', 'wb') as f:
                    pickle.dump(self.crawled_pages, f, pickle.HIGHEST_PROTOCOL)
                with open(self.stored_folder / 'url_list.pickle', 'rb') as f:
                    print(pickle.load(f))
                break
            except Exception as e:
                print(e)
                continue


In [3]:
# Page 33
if __name__ == '__main__':
    s = MultiThreadCrawler("https://www.cmu.ac.th/en/home", 2)
    s.run_scraper()


Adding https://www.cmu.ac.th/Controls/ShareContent/
Adding https://www.cmu.ac.th/en/faculty/course
Adding https://mis.cmu.ac.th/TQF/TQF2/CurriculumPublicList.aspx
Adding https://www.cmu.ac.th/en/faculty/level/bachelor
Adding https://www.cmu.ac.th/en/faculty/level/master_bachelor
Adding https://www.cmu.ac.th/en/faculty/level/phd
Adding https://www.cmu.ac.th/en/faculty/Level/other
Adding https://www.cmu.ac.th/Controls/ShareContent/
Adding https://www.cmu.ac.th/en/content/F0917C78-2125-4FFD-AF68-BE9E4F5E0D36
Adding https://www.cmu.ac.th/en/content/E13BF8C0-7C77-46BA-BEA1-56392A967AEF
Adding https://www.cmu.ac.th/en/content/425815A0-90AB-4F39-9DA9-8D5DEA9CDBED
Adding https://www.cmu.ac.th/en/content/D4F81742-3F7A-4197-8286-4B568129DCF1
Adding https://www.cmu.ac.th/Controls/ShareContent/
Adding https://www.cmu.ac.th/en/faculty/aboutus
Adding https://www.cmu.ac.th/en/organization
Adding https://www.cmu.ac.th/en/content/7A7616BC-C917-407E-8ED9-9F544D3416BA
Adding https://www.cmu.ac.th/th/home

exception calling callback for <Future at 0x28f4eca1b20 state=finished returned NoneType>
Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\SE-IR\Lib\concurrent\futures\_base.py", line 340, in _invoke_callbacks
    callback(self)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_21416\136507877.py", line 27, in extract_page
    if result and result.status_code == 200:
       ^^^^^^
UnboundLocalError: cannot access local variable 'result' where it is not associated with a value
exception calling callback for <Future at 0x28f4eca2180 state=finished returned NoneType>
Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\SE-IR\Lib\concurrent\futures\_base.py", line 340, in _invoke_callbacks
    callback(self)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_21416\136507877.py", line 27, in extract_page
    if result and result.status_code == 200:
       ^^^^^^
UnboundLocalError: cannot access local variable 'result' where it is not associated

Adding https://www.cmu.ac.th/th/faculty/level/bachelorAdding https://www.cmu.ac.th/en/course
Adding https://www.cmu.ac.th/en/level/bachelor
Adding https://www.cmu.ac.th/en/level/master_bachelor
Adding https://www.cmu.ac.th/en/level/phd
Adding https://www.cmu.ac.th/en/Level/other
Adding https://www.cmu.ac.th/en/aboutus
Adding https://www.cmu.ac.th/th/faculty/aboutus
Adding https://www.cmu.ac.th/en/aboutus
Adding https://www.cmu.ac.th/cn/faculty/aboutus
Adding https://www.cmu.ac.th/th/faculty/aboutus
Adding https://www.cmu.ac.th/en/aboutus
Adding https://www.cmu.ac.th/cn/faculty/aboutus
Adding https://www.cmu.ac.th/en/course
Adding https://www.cmu.ac.th/en/level/bachelor
Adding https://www.cmu.ac.th/en/level/master_bachelor
Adding https://www.cmu.ac.th/en/level/phd
Adding https://www.cmu.ac.th/en/Level/other
Adding https://www.cmu.ac.th/en/aboutus
Adding https://www.cmu.ac.th/faculty
Adding https://www.cmu.ac.th/en/mass_communication/aboutus/head
Adding https://www.cmu.ac.th/en/mass_comm

exception calling callback for <Future at 0x28f4eca3440 state=finished returned NoneType>
Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\SE-IR\Lib\concurrent\futures\_base.py", line 340, in _invoke_callbacks
    callback(self)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_21416\136507877.py", line 27, in extract_page
    if result and result.status_code == 200:
       ^^^^^^
UnboundLocalError: cannot access local variable 'result' where it is not associated with a value


Adding https://www.cmu.ac.th/th/content/4EEE10D2-E793-4E55-8D67-400C510F98FE
Adding https://www.cmu.ac.th/cn/content/4EEE10D2-E793-4E55-8D67-400C510F98FE
Adding https://www.cmu.ac.th/th/content/4EEE10D2-E793-4E55-8D67-400C510F98FE
Adding https://www.cmu.ac.th/cn/content/4EEE10D2-E793-4E55-8D67-400C510F98FE
Adding https://www.cmu.ac.th/th/content/D65CB215-9918-4D97-9F54-DCBB749E7159
Adding https://www.cmu.ac.th/cn/content/D65CB215-9918-4D97-9F54-DCBB749E7159
Adding https://www.cmu.ac.th/th/content/D65CB215-9918-4D97-9F54-DCBB749E7159
Adding https://www.cmu.ac.th/cn/content/D65CB215-9918-4D97-9F54-DCBB749E7159
Adding https://www.cmu.ac.th/th/cmu/aboutus
Adding https://www.cmu.ac.th/cn/cmu/aboutus
Adding https://www.cmu.ac.th/th/cmu/aboutus
Adding https://www.cmu.ac.th/cn/cmu/aboutus
Adding https://policy13.cmu.ac.th
Adding https://www.cmu.ac.th/th/university/servicesgroup/c91c4ec5-4fa5-4813-936c-7b250c008e01
Adding https://www.cmu.ac.th/th/university/servicesgroup/2b036c23-3250-4c22-a11c

exception calling callback for <Future at 0x28f4eca3e00 state=finished returned NoneType>
Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\SE-IR\Lib\concurrent\futures\_base.py", line 340, in _invoke_callbacks
    callback(self)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_21416\136507877.py", line 27, in extract_page
    if result and result.status_code == 200:
       ^^^^^^
UnboundLocalError: cannot access local variable 'result' where it is not associated with a value


Adding http://www.soc.cmu.ac.th
Adding https://www.cmu.ac.th/en/article/d99ee86e-4f83-46a1-8cd6-e45d86534d45
Adding https://www.cmu.ac.th/en/article/d99ee86e-4f83-46a1-8cd6-e45d86534d45
Adding https://www.cmu.ac.th/en/article/da9d5c63-6216-4fda-a109-85faee678202
Adding https://www.cmu.ac.th/en/article/da9d5c63-6216-4fda-a109-85faee678202
Adding https://www.cmu.ac.th/en/article/7d5c7897-0c94-4fa0-9d00-daa490c164d5
Adding https://www.cmu.ac.th/en/article/7d5c7897-0c94-4fa0-9d00-daa490c164d5
Adding https://www.cmu.ac.th/en/article/1f61b538-85aa-4456-8c51-a16038ffbb4f
Adding https://www.cmu.ac.th/en/article/1f61b538-85aa-4456-8c51-a16038ffbb4f
Adding https://www.cmu.ac.th/en/article/e17e941d-ca58-4c5c-aa57-46965b248f67
Adding https://www.cmu.ac.th/en/article/e17e941d-ca58-4c5c-aa57-46965b248f67
Adding https://www.cmu.ac.th/en/article/73a9030e-c748-4bf4-b64f-0d99279ee83d
Adding https://www.cmu.ac.th/en/article/73a9030e-c748-4bf4-b64f-0d99279ee83d
Adding https://www.cmu.ac.th/en/article/54a7

exception calling callback for <Future at 0x28f4ece1400 state=finished returned NoneType>
Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\SE-IR\Lib\concurrent\futures\_base.py", line 340, in _invoke_callbacks
    callback(self)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_21416\136507877.py", line 27, in extract_page
    if result and result.status_code == 200:
       ^^^^^^
UnboundLocalError: cannot access local variable 'result' where it is not associated with a value
exception calling callback for <Future at 0x28f4ece1b80 state=finished returned NoneType>
Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\SE-IR\Lib\concurrent\futures\_base.py", line 340, in _invoke_callbacks
    callback(self)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_21416\136507877.py", line 27, in extract_page
    if result and result.status_code == 200:
       ^^^^^^
UnboundLocalError: cannot access local variable 'result' where it is not associated

{'https://www.cmu.ac.th/en/article/7658e2c0-c55a-4547-af27-76554711ef2c', 'https://www.cmu.ac.th/en/article/94b53a22-79e4-4bf3-bb06-73efa5039b0f', 'https://www.cmu.ac.th/content/Article/2024/c1f7151f-e075-4981-8b64-1c7bbe1d7137/6f6e1234-927a-4b98-ab04-eded09cec58c.jpg', 'https://www.cmu.ac.th/en/article/b76cbaeb-7f4b-438b-b0a8-768ef0881a3f', 'https://www.cmu.ac.th/en/article/c7012c99-4cbf-49d7-ba07-cf692a4242bc', 'https://www.cmu.ac.th/en/article/598fae0b-994c-44d3-90ff-8c3ebc235e96', 'https://www.cmu.ac.th/en/c9c3f70f-5e9c-4f55-ad54-11051c434714', 'https://www.cmu.ac.th/content/Article/2024/15b4ed97-29b0-4411-b40b-58522ef2660a/f45a4896-138c-4415-8ad5-0a827871c049.jpg', 'https://www.cmu.ac.th/content/Article/2024/15b4ed97-29b0-4411-b40b-58522ef2660a/9c192792-5586-48cf-a9ef-b1c3693aca0a.jpg', 'https://www.cmu.ac.th/content/Article/2024/c1f7151f-e075-4981-8b64-1c7bbe1d7137/60c60297-ab66-42e1-9ece-42bb91c3859c.jpg', 'https://www.cmu.ac.th/en/article/5b2725b0-7b90-46b8-8c6e-fb1265a8be5a', 

exception calling callback for <Future at 0x28f4e669490 state=finished returned NoneType>
Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\SE-IR\Lib\concurrent\futures\_base.py", line 340, in _invoke_callbacks
    callback(self)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_21416\136507877.py", line 27, in extract_page
    if result and result.status_code == 200:
       ^^^^^^
UnboundLocalError: cannot access local variable 'result' where it is not associated with a value
exception calling callback for <Future at 0x28f4eee57f0 state=finished returned NoneType>
Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\SE-IR\Lib\concurrent\futures\_base.py", line 340, in _invoke_callbacks
    callback(self)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_21416\136507877.py", line 27, in extract_page
    if result and result.status_code == 200:
       ^^^^^^
UnboundLocalError: cannot access local variable 'result' where it is not associated