## How long does web-scraping take?

Author: Tiangeng Lu

Date: July 29, 2023

- This notebook shows the processing time of scraping-cleaning web tables to dataframes. 
- I use U.S. Department of Homeland Security/SEVIS STEM enrollment data.
- There are 23 webpages with tables.
- Each table has between 20k to 23k rows and about 6 columns.

In [1]:
%%time
import requests
from scrapy import Selector
import numpy as np
import pandas as pd
import os
from datetime import datetime
from pandas.tseries.offsets import MonthEnd
import datetime as dt
os.chdir('/Users/tiangeng/Documents/Python Files')

CPU times: user 618 ms, sys: 127 ms, total: 746 ms
Wall time: 1.31 s


### Get URLS

In [2]:
%%time
# urls
main_url = 'https://studyinthestates.dhs.gov/sevis-by-the-numbers/sevis-by-the-numbers-data'
main_html = requests.get(main_url).content
main_selector = Selector(text = main_html)
# how many elements? 266 as of 07/27/2023
len(main_selector.xpath('//*'))

# <a data-entity-substitution="canonical" data-entity-type="node" data-entity-uuid="98b24128-4146-433f-9288-77fad251a802" href="/sevis-data-mapping-tool/march-2023-stem-sevis-data-mapping-tool-data">March 2023 STEM SEVIS Data Mapping Tool Data</a>
# extract from selector
all_links = main_selector.xpath('//*[contains(@href,"sevis-data-mapping-tool-data")]/@href').extract()
# urls to stem students
stem_links = [link for link in all_links if '-stem-' in link]
# https://studyinthestates.dhs.gov/sevis-data-mapping-tool/march-2023-stem-sevis-data-mapping-tool-data
prefix = "https://studyinthestates.dhs.gov"
stem_links = [prefix + link for link in stem_links if link.startswith('/sevis')]

CPU times: user 74.8 ms, sys: 23.5 ms, total: 98.3 ms
Wall time: 1.34 s


In [3]:
%%time
stem_catalog = pd.DataFrame(data = {
    'url': stem_links,
    'year':[link.split('/')[4].split('-')[1].upper() for link in stem_links],
    'month': [link.split('/')[4].split('-')[0].upper() for link in stem_links]})
stem_catalog['stamp'] = (stem_catalog['month'].str.cat(stem_catalog['year'], sep = '-'))
stem_catalog['stamp'] = [datetime.strptime(stamp, "%B-%Y").strftime("%Y-%m-%d") for stamp in stem_catalog['stamp']]
stem_catalog['stamp'] = (pd.to_datetime(stem_catalog['stamp'], errors = 'ignore') + MonthEnd()).dt.date
stem_catalog = stem_catalog.sort_values('stamp', ascending = True)

CPU times: user 11.9 ms, sys: 4.84 ms, total: 16.7 ms
Wall time: 38.8 ms


In [4]:
from collections import Counter
stem_counter = Counter(stem_catalog['year'])
print(sorted(stem_counter.items()))

[('2015', 2), ('2016', 2), ('2017', 3), ('2018', 4), ('2019', 1), ('2020', 3), ('2021', 3), ('2022', 3), ('2023', 3)]


### Extracting Elements from URLs

In [6]:
%%time
now = datetime.now()
elements_start = now.strftime("%H:%M:%S")
print("elements_start =", elements_start)
stem_elements = [None] * len(stem_catalog)

for i in range(len(stem_catalog)):
    html = requests.get(stem_catalog['url'][i]).content
    sel = Selector(text = html)
    stem_elements[i] = sel.xpath('//table').extract() 
print("Are there any STEM webpages have more than 1 tables?\n",[len(element) for element in stem_elements if len(element) > 1])
# The webpage of STEM July 2018 is blank as of 07/27/2023. I found this out after seeing an "out of range" error message.
[len(element) for element in stem_elements]
# which webpage doesn't have tables?
blank_page_id = [i for i, element in enumerate(stem_elements) if len(element) < 1]
if len(blank_page_id) == 0:
    print("Looks good. Go ahead clean the tables!")
else:
    print("ALERT! Skip the following because it's blank:\n", str(blank_page_id))
    print("Also, remember to update the catalog!")
# select data row(s) to remove
stem_catalog = stem_catalog.drop(blank_page_id, axis = 0)
stem_catalog = stem_catalog.reset_index()
# update the elements in stem_elements by removing the blank one
stem_elements = [element for element in stem_elements if len(element) > 0]
print(len(stem_elements))
now = datetime.now()
elements_finish = now.strftime("%H:%M:%S")
print("elements_finish =", elements_finish)

elements_start = 11:59:23
Are there any STEM webpages have more than 1 tables?
 []
ALERT! Skip the following because it's blank:
 [8]
Also, remember to update the catalog!
23
elements_finish = 12:00:02
CPU times: user 6.95 s, sys: 272 ms, total: 7.22 s
Wall time: 38.9 s


### Converting Elements to Dataframe

In [11]:
%%time
process_begin = [None] * len(stem_catalog)
process_end = [None] * len(stem_catalog)
STEM_dfs_copy = [None] * len(stem_catalog)
for i, element in enumerate(stem_elements):
    process_begin[i] = datetime.now().strftime("%H:%M:%S")
    print("Element",str(i), "begins:",process_begin[i])
    STEM_dfs_copy = [pd.read_html(element[0], header = 0)[0] for element in stem_elements]
    process_end[i] = datetime.now().strftime("%H:%M:%S")
    print("Element",str(i),"ends:",process_end[i])

Element 0 begins: 13:37:09
Element 0 ends: 13:37:56
Element 1 begins: 13:37:56
Element 1 ends: 13:38:43
Element 2 begins: 13:38:43
Element 2 ends: 13:39:30
Element 3 begins: 13:39:30
Element 3 ends: 13:40:18
Element 4 begins: 13:40:18
Element 4 ends: 13:41:05
Element 5 begins: 13:41:05
Element 5 ends: 13:41:53
Element 6 begins: 13:41:53
Element 6 ends: 13:42:40
Element 7 begins: 13:42:40
Element 7 ends: 13:43:27
Element 8 begins: 13:43:27
Element 8 ends: 13:44:14
Element 9 begins: 13:44:14
Element 9 ends: 13:45:02
Element 10 begins: 13:45:02
Element 10 ends: 13:45:49
Element 11 begins: 13:45:49
Element 11 ends: 13:46:37
Element 12 begins: 13:46:37
Element 12 ends: 13:47:24
Element 13 begins: 13:47:24
Element 13 ends: 13:48:12
Element 14 begins: 13:48:12
Element 14 ends: 13:48:59
Element 15 begins: 13:48:59
Element 15 ends: 13:49:47
Element 16 begins: 13:49:47
Element 16 ends: 13:50:34
Element 17 begins: 13:50:34
Element 17 ends: 13:51:22
Element 18 begins: 13:51:22
Element 18 ends: 13:

In [24]:
pd.DataFrame(data = {
    'name': ['stem_' + str(stamp) for stamp in stem_catalog['stamp']],
    'KB': [round(getsizeof(df)*0.001,1) for df in STEM_dfs_copy],
    'row': [df.shape[0] for df in STEM_dfs_copy],
    'begin': process_begin,
    'end': process_end
})

Unnamed: 0,name,KB,row,begin,end
0,stem_2015-09-30,7175.1,22045,13:37:09,13:37:56
1,stem_2015-12-31,7472.3,22956,13:37:56,13:38:43
2,stem_2016-03-31,7481.1,22982,13:38:43,13:39:30
3,stem_2016-11-30,7636.5,23459,13:39:30,13:40:18
4,stem_2017-03-31,7598.8,23345,13:40:18,13:41:05
5,stem_2017-05-31,7540.9,23167,13:41:05,13:41:53
6,stem_2017-12-31,7850.7,23540,13:41:53,13:42:40
7,stem_2018-03-31,7709.7,23685,13:42:40,13:43:27
8,stem_2018-08-31,7867.0,24147,13:43:27,13:44:14
9,stem_2018-12-31,7792.3,23916,13:44:14,13:45:02


In [25]:
stem_process_time = pd.DataFrame(data = {
    'name': ['stem_' + str(stamp) for stamp in stem_catalog['stamp']],
    'KB': [round(getsizeof(df)*0.001,1) for df in STEM_dfs_copy],
    'row': [df.shape[0] for df in STEM_dfs_copy],
    'begin': process_begin,
    'end': process_end
})
stem_process_time.to_csv('stem_process_time.csv', index = False)