In [None]:
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse


class MultiThreadScraper:
    """Crawl the pages of a single site, fetching them on a thread pool.

    URLs waiting to be fetched are held in ``to_crawl``.  Every URL that has
    been handed to the pool is recorded in ``scraped_pages`` so that it is
    requested at most once.
    """

    def __init__(self, base_url, max_workers=20, queue_timeout=60):
        """Prepare a crawl that starts at ``base_url``.

        Args:
            base_url: Absolute URL of the first page to fetch.  Its scheme
                and host define which links count as internal.
            max_workers: Size of the thread pool used for page downloads.
            queue_timeout: Seconds ``run_scraper`` waits on an empty queue
                before deciding the crawl is finished.
        """
        self.base_url = base_url
        parsed_base = urlparse(self.base_url)
        self.root_url = '{}://{}'.format(parsed_base.scheme, parsed_base.netloc)
        self.queue_timeout = queue_timeout
        self.pool = ThreadPoolExecutor(max_workers=max_workers)
        self.scraped_pages = set()
        self.to_crawl = Queue()
        self.to_crawl.put(self.base_url)

    def _resolve_internal_link(self, href):
        """Return the absolute, fragment-free URL for ``href``, or ``None``.

        Only root-relative links (``/path``) and links that begin with the
        site root are considered.  After joining, the host must equal the
        root host: this rejects protocol-relative links (``//other.host/x``)
        and look-alike hosts that merely share the root as a string prefix.
        """
        if not href.startswith(('/', self.root_url)):
            return None
        absolute = urlparse(urljoin(self.root_url, href))
        if absolute.netloc != urlparse(self.root_url).netloc:
            return None
        # A fragment only addresses a position inside a page, so URLs that
        # differ by fragment alone would download the same document again.
        return absolute._replace(fragment='').geturl()

    def parse_links(self, html):
        """Queue every not-yet-scraped internal link found in ``html``."""
        soup = BeautifulSoup(html, 'html.parser')
        for anchor in soup.find_all('a', href=True):
            url = self._resolve_internal_link(anchor['href'])
            if url is not None and url not in self.scraped_pages:
                self.to_crawl.put(url)

    def scrape_info(self, html):
        """Hook for extracting data from a fetched page; does nothing here."""
        return

    def post_scrape_callback(self, res):
        """Process a finished download job.

        Args:
            res: The future returned by the pool for one ``scrape_page``
                call.  Its result is a response object, or ``None`` when
                the request failed.
        """
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        """Download ``url``; return the response, or ``None`` on a request error."""
        try:
            # (connect timeout, read timeout) in seconds.
            return requests.get(url, timeout=(3, 30))
        except requests.RequestException:
            return None

    def run_scraper(self):
        """Submit queued URLs to the pool until the queue stays empty.

        Returns once no URL has arrived for ``queue_timeout`` seconds.  Any
        other error raised while dispatching a URL is printed and the loop
        moves on to the next one.
        """
        while True:
            try:
                target_url = self.to_crawl.get(timeout=self.queue_timeout)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue


In [None]:
import time

In [None]:

if __name__ == '__main__':
    # Time a complete crawl and print the elapsed seconds.
    # perf_counter() is monotonic, so the measured duration cannot be
    # distorted by system clock adjustments while the crawl is running.
    start_time = time.perf_counter()
    s = MultiThreadScraper("https://www.w3schools.com/")
    s.run_scraper()
    end_time = time.perf_counter()
    print(end_time-start_time)
    

In [None]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import requests
from bs4 import BeautifulSoup

class ConcurrentListCrawler(object):
    """Fetch a list of listing URLs concurrently and collect link texts.

    Each start URL is downloaded on its own worker thread; the text of every
    ``<a class="link_display_like_text">`` element found on that page and on
    the pages reached through its pager is appended to ``self.company``.
    """

    def __init__(self, domain, url_list, threads):
        """Store the crawl configuration.

        Args:
            domain: Prefix prepended to the pager's ``href`` values to build
                the URL of the next page.
            url_list: Start URLs; one job is submitted per entry.
            threads: Upper bound on the number of worker threads.
        """
        self.domain = domain
        self.urls = url_list
        self.company = list()
        self.max_threads = threads

    def __make_request(self, url):
        """Download ``url`` and return ``(final_url, body_text)``.

        A timeout or connection error is retried once with a longer timeout.

        Raises:
            requests.exceptions.RequestException: if the retry also fails or
                the response carries an HTTP error status.
        """
        try:
            r = requests.get(url=url, timeout=20)
        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError):
            r = requests.get(url=url, timeout=60)
        # Check the status of whichever attempt succeeded, so an error page
        # returned by the retry is not parsed as if it were a listing.
        r.raise_for_status()
        return r.url, r.text

    def __parse_results(self, shala, html):
        """Collect link texts from ``html`` and from the pages that follow it.

        Args:
            shala: URL of the page whose markup is in ``html``.
            html: Markup of that page.

        The second anchor inside the first ``<ul class="pager">`` is treated
        as the next-page link; an ``href`` of ``"#"`` ends the walk.  Any
        error while parsing or fetching is printed and stops the walk for
        this start URL, keeping what was collected so far.
        """
        visited = set()
        try:
            # The visited set stops the walk if the pager ever points back
            # to a page that has already been processed.
            while shala is not None and shala not in visited:
                visited.add(shala)
                html_soup = BeautifulSoup(html, 'html.parser')
                company_containers = html_soup.find_all('a', class_="link_display_like_text")
                for comp in company_containers:
                    self.company.append(comp.contents[0].strip())
                next_page_sec = html_soup.find_all('ul', class_="pager")
                next_page = next_page_sec[0].find_all('a')[1]['href']
                if next_page == "#":
                    shala = None
                else:
                    # Download the next page; without this the same markup
                    # would be parsed again on every iteration.
                    shala, html = self.__make_request(self.domain + next_page)
        except Exception as e:
            print(e)

    def wrapper(self, url):
        """Fetch one start URL and parse it together with its follow-up pages."""
        url, html = self.__make_request(url)
        self.__parse_results(url, html)

    def run_script(self):
        """Run ``wrapper`` for every start URL on a thread pool and wait.

        Does nothing when there are no URLs.  A job that raised has its
        error printed instead of being dropped silently.
        """
        if not self.urls:
            return
        # ThreadPoolExecutor rejects max_workers < 1.
        worker_count = max(1, min(len(self.urls), self.max_threads))
        with ThreadPoolExecutor(max_workers=worker_count) as Executor:
            jobs = [(u, Executor.submit(self.wrapper, u)) for u in self.urls]
        # Leaving the ``with`` block waits for all jobs, so every future is done.
        for url, job in jobs:
            error = job.exception()
            if error is not None:
                print("Failed to crawl {}: {}".format(url, error))


if __name__ == '__main__':
    # Crawl two internship listings with at most five worker threads, then
    # print every collected entry.
    example = ConcurrentListCrawler(
        domain="https://internshala.com",
        url_list=[
            'https://internshala.com/internships/computer%20science-internship-in-delhi,greater%20noida,gurgaon,gurugram,new%20delhi,noida,south%20west%20delhi,north%20delhi',
            'https://internshala.com/internships/marketing-internship-in-delhi,greater%20noida,gurgaon,noida,south%20west%20delhi,gurugram,new%20delhi,north%20delhi',
        ],
        threads=5,
    )
    example.run_script()
    print(example.company)