## Scraping data with multi-threading

Created by: [tanyongsheng.net](https://tanyongsheng.net)

----

References:
1. [MultiThreading in Python | Python Concurrent futures | ThreadPoolExecutor](https://www.youtube.com/watch?v=i0Tey6Gprnc&t=495s)
2. [How to Make Web Scraping Faster - Python Tutorial](https://oxylabs.io/blog/how-to-make-web-scraping-faster)


In [27]:
%%time

import concurrent.futures
import requests

# Ten requests to the same httpbin endpoint; the "/delay/10" path makes the
# server hold each response for about ten seconds, so the run time is
# dominated by waiting on the network.
urls = ["https://httpbin.org/delay/10" for _ in range(10)]
# A single Session created once and used by every scrape() call in this cell.
session = requests.Session()

def scrape(url, timeout=30):
    """Fetch ``url`` through the shared ``session`` and return the decoded JSON body.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. The default
            leaves headroom above the 10-second delay of the demo endpoint.

    Returns:
        The value produced by decoding the response body as JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout`` seconds.
    """
    # requests waits forever by default; an explicit timeout keeps one stalled
    # connection from blocking its worker thread indefinitely.
    response = session.get(url, timeout=timeout)
    # Surface HTTP errors directly instead of failing later while trying to
    # JSON-decode an error page.
    response.raise_for_status()
    return response.json()

# Fan the requests out over ten worker threads so their waits overlap.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit every URL up front. The dict is keyed by future; the URL values
    # are kept alongside but are not read below.
    future_to_url = {executor.submit(scrape, target): target for target in urls}

    data_list = []

    # as_completed yields each future as soon as it finishes, so data_list is
    # built in completion order rather than submission order.
    for future in concurrent.futures.as_completed(future_to_url):
        payload = future.result()  # re-raises here if scrape() failed

        # A JSON array contributes its items; any other payload is one record.
        if not isinstance(payload, list):
            data_list.append(payload)
        else:
            data_list.extend(payload)
    print(data_list)

[{'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed66c5-79da79af3899e33175dcf51b'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed66c5-18c4da854c2adfff581edb1f'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed66c5-3a56e259791c52ba2a517372'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept

In [30]:
import pandas
# Tabulate the collected records; as the last expression in the cell the
# resulting frame is rendered as the cell output.
pandas.DataFrame(data=data_list)

Unnamed: 0,args,data,files,form,headers,origin,url
0,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
1,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
2,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
3,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
4,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
5,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
6,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
7,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
8,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
9,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10


## Comparison: Scraping without multi-threading

In [32]:
%%time
import requests

# Same workload as the threaded run: ten requests to httpbin's "/delay/10"
# endpoint, which holds each response for about ten seconds.
urls = ["https://httpbin.org/delay/10" for _ in range(10)]

# A fresh Session for this cell, used by every scrape() call below.
session = requests.Session()

def scrape(url, timeout=30):
    """Fetch ``url`` through the shared ``session`` and return the decoded JSON body.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. The default
            leaves headroom above the 10-second delay of the demo endpoint.

    Returns:
        The value produced by decoding the response body as JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout`` seconds.
    """
    # requests waits forever by default; an explicit timeout keeps one stalled
    # connection from hanging the whole sequential run.
    response = session.get(url, timeout=timeout)
    # Surface HTTP errors directly instead of failing later while trying to
    # JSON-decode an error page.
    response.raise_for_status()
    return response.json()

# Baseline: issue the requests one after another, each waiting for the
# previous response before starting.
data_list = []

for target in urls:
    payload = scrape(target)
    # A JSON array contributes its items; any other payload is one record.
    if not isinstance(payload, list):
        data_list.append(payload)
    else:
        data_list.extend(payload)

print(data_list)

[{'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed67bf-2906a14f5652106b16526a3b'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed67c9-6149c2e57aec7bc754290cd8'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.31.0', 'X-Amzn-Trace-Id': 'Root=1-65ed67d3-6168530e4e0497d91c4f7e6f'}, 'origin': '34.127.100.114', 'url': 'https://httpbin.org/delay/10'}, {'args': {}, 'data': '', 'files': {}, 'form': {}, 'headers': {'Accept

In [33]:
import pandas
# Tabulate the sequentially collected records; as the last expression in the
# cell the resulting frame is rendered as the cell output.
pandas.DataFrame(data=data_list)

Unnamed: 0,args,data,files,form,headers,origin,url
0,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
1,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
2,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
3,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
4,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
5,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
6,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
7,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
8,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10
9,{},,{},{},"{'Accept': '*/*', 'Accept-Encoding': 'gzip, de...",34.127.100.114,https://httpbin.org/delay/10


## Conclusion