## Scraping data with multi-threading

Created by: [tanyongsheng.net](https://tanyongsheng.net)

----
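Introduction:
- Multi-threading lets several HTTP requests wait on the network at the same time, so an I/O-bound scraping job finishes sooner than when the requests are sent one after another. The cells below time the same ten requests with 10 threads, without multi-threading, and with only 2 threads.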


References:
1. MultiThreading in Python | Python Concurrent futures | ThreadPoolExecutor https://www.youtube.com/watch?v=i0Tey6Gprnc&t=495s
2. How to Make Web Scraping Faster - Python Tutorial https://oxylabs.io/blog/how-to-make-web-scraping-faster
3. Comcrawl script which uses multi-threading for scraping: https://github.com/michaelharms/comcrawl/blob/a89236080c5e7f4ce6a2e0d39c5f59671f22181e/comcrawl/utils/search.py#L11


In [1]:
%%time

import concurrent.futures # for multi-threading
import requests # for downloading data
from tqdm import tqdm # for displaying a smart progress meter in loops

# Ten copies of the same httpbin endpoint. NOTE: /delay/10 is expected to hold
# each response for about 10 seconds (confirm against the httpbin docs), which
# makes the difference between threaded and sequential runs easy to see.
urls = ["https://httpbin.org/delay/10" for _ in range(10)]

# A single Session object, used by every call to scrape().
session = requests.Session()

def scrape(url, timeout=30):
    """Fetch *url* through the module-level ``session`` and return its JSON body.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so one stalled server
            could block a worker thread forever.

    Returns:
        The decoded JSON payload of the response (a dict or a list,
        depending on what the server sends).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the response status is 4xx or 5xx.
    """
    response = session.get(url, timeout=timeout)
    # Fail on error statuses here instead of trying to decode an error page as JSON.
    response.raise_for_status()
    return response.json()

# Submit every URL to a pool of 10 worker threads so up to 10 requests are in flight at once.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as ex:
    # Keep a future -> URL mapping for every submitted download.
    results = {ex.submit(scrape, url): url for url in urls}

    data_list = []

    # as_completed() returns an iterator with no length, so tqdm needs an
    # explicit total to show a percentage and an ETA instead of a bare counter.
    for result in tqdm(concurrent.futures.as_completed(results), total=len(results)):
        data = result.result()  # re-raises here if scrape() failed in its worker thread

        # A JSON array contributes each of its items; any other payload is stored as one record.
        if isinstance(data, list):
            data_list.extend(data)
        else:
            data_list.append(data)
    print(data_list)

[{'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed72b0-279e7d3c4aae2ebc0482b6df'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed72b0-2e354c41613011517831ed2b'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed72b0-23c2353e72cf78b77fd2c191'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept

In [2]:
import pandas
# Build a table with one row per collected response; as the last expression
# in the cell, the resulting DataFrame is what the notebook displays.
pandas.DataFrame(data_list)

Unnamed: 0,args,data,files,form,headers,origin,url
0,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
1,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
2,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
3,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
4,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
5,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
6,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
7,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
8,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
9,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10


### Comparison 1: Scraping without multi-threading

In [3]:
%%time
import requests
from tqdm import tqdm

# The same ten httpbin requests as in the multi-threaded run, so the timings are comparable.
urls = ["https://httpbin.org/delay/10" for _ in range(10)]

# A single Session object, used by every call to scrape().
session = requests.Session()

def scrape(url, timeout=30):
    """Fetch *url* through the module-level ``session`` and return its JSON body.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so one stalled server
            could block the whole sequential run.

    Returns:
        The decoded JSON payload of the response (a dict or a list,
        depending on what the server sends).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the response status is 4xx or 5xx.
    """
    response = session.get(url, timeout=timeout)
    # Fail on error statuses here instead of trying to decode an error page as JSON.
    response.raise_for_status()
    return response.json()

# Baseline run: download the URLs one after another on the main thread.
data_list = []

# map() is lazy, so scrape() is still called once per URL, in order.
for payload in map(scrape, urls):
    # A JSON array contributes each of its items; anything else is stored as one record.
    data_list += payload if isinstance(payload, list) else [payload]

print(data_list)

[{'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed72bc-4ff50f345486d3cc3b60676c'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed72c6-2a5f25a6229ac5a70a93194b'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed72d0-7a022d52236e2f746a9b30b0'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept

### Comparison 2: Scraping with fewer threads (e.g., 2 threads)

In [4]:
%%time

import concurrent.futures
import requests

# The same ten httpbin requests as in the other runs, so the timings are comparable.
urls = ["https://httpbin.org/delay/10" for _ in range(10)]
# A single Session object, used by every call to scrape().
session = requests.Session()

def scrape(url, timeout=30):
    """Fetch *url* through the module-level ``session`` and return its JSON body.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so one stalled server
            could block a worker thread forever.

    Returns:
        The decoded JSON payload of the response (a dict or a list,
        depending on what the server sends).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the response status is 4xx or 5xx.
    """
    response = session.get(url, timeout=timeout)
    # Fail on error statuses here instead of trying to decode an error page as JSON.
    response.raise_for_status()
    return response.json()

# Same fan-out as the 10-thread run, but the pool is capped at 2 worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
    # Keep a future -> URL mapping for every submitted download.
    results = {pool.submit(scrape, url): url for url in urls}
    data_list = []

    # Collect payloads in completion order rather than submission order.
    for finished in concurrent.futures.as_completed(results):
        payload = finished.result()
        # A JSON array contributes each of its items; anything else is stored as one record.
        data_list += payload if isinstance(payload, list) else [payload]

# Every future has already completed by the time the pool shuts down, so the output is unchanged.
print(data_list)

## Conclusion