
Commit 1f1e909

added html table extractor tutorial
1 parent 03d1179 commit 1f1e909

File tree

- README.md
- web-scraping/html-table-extractor/README.md
- web-scraping/html-table-extractor/html_table_extractor.py
- web-scraping/html-table-extractor/requirements.txt

4 files changed, +99 -0 lines changed

README.md (+1)

@@ -64,6 +64,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy

  - [How to Download All Images from a Web Page in Python](https://www.thepythoncode.com/article/download-web-page-images-python). ([code](web-scraping/download-images))
  - [How to Extract All Website Links in Python](https://www.thepythoncode.com/article/extract-all-website-links-python). ([code](web-scraping/link-extractor))
  - [How to Make an Email Extractor in Python](https://www.thepythoncode.com/article/extracting-email-addresses-from-web-pages-using-python). ([code](web-scraping/email-extractor))
+ - [How to Convert HTML Tables into CSV Files in Python](https://www.thepythoncode.com/article/convert-html-tables-into-csv-files-in-python). ([code](web-scraping/html-table-extractor))

  - ### [Python Standard Library](https://www.thepythoncode.com/topic/python-standard-library)
  - [How to Transfer Files in the Network using Sockets in Python](https://www.thepythoncode.com/article/send-receive-files-using-sockets-python). ([code](general/transfer-files/))
web-scraping/html-table-extractor/README.md (+8)

@@ -0,0 +1,8 @@

# [How to Convert HTML Tables into CSV Files in Python](https://www.thepythoncode.com/article/convert-html-tables-into-csv-files-in-python)

To run this:
- `pip3 install -r requirements.txt`
- To extract all tables from this [Wikipedia page](https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population):
```
python html_table_extractor.py https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population
```

This will download all HTML tables and save them as CSV files in your current directory.
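
To sanity-check a run, you can load one of the generated files back with pandas. This is a minimal sketch, assuming the command above produced `table-1.csv` in the current directory (the exact numbering depends on the order in which tables appear on the page):

```python
import pandas as pd

# "table-1.csv" is an example name; the extractor writes table-1.csv, table-2.csv, ...
# the first CSV column is the DataFrame index that save_as_csv() wrote out
df = pd.read_csv("table-1.csv", index_col=0)
print(df.shape)
print(df.head())
```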
web-scraping/html-table-extractor/html_table_extractor.py (+87)

@@ -0,0 +1,87 @@

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# US English
LANGUAGE = "en-US,en;q=0.5"


def get_soup(url):
    """Constructs and returns a soup using the HTML content of `url` passed"""
    # initialize a session
    session = requests.Session()
    # set the User-Agent as a regular browser
    session.headers['User-Agent'] = USER_AGENT
    # request for English content (optional)
    session.headers['Accept-Language'] = LANGUAGE
    session.headers['Content-Language'] = LANGUAGE
    # make the request
    html = session.get(url)
    # return the soup
    return bs(html.content, "html.parser")


def get_all_tables(soup):
    """Extracts and returns all tables in a soup object"""
    return soup.find_all("table")


def get_table_headers(table):
    """Given a table soup, returns all the headers"""
    headers = []
    for th in table.find("tr").find_all("th"):
        headers.append(th.text.strip())
    return headers


def get_table_rows(table):
    """Given a table, returns all its rows"""
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # if there are no td tags, search for th tags instead
            # (some rows, e.g. footer rows in Wikipedia tables, use th cells)
            ths = tr.find_all("th")
            for th in ths:
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for td in tds:
                cells.append(td.text.strip())
        rows.append(cells)
    return rows


def save_as_csv(table_name, headers, rows):
    """Saves the extracted rows as `<table_name>.csv` using pandas"""
    pd.DataFrame(rows, columns=headers).to_csv(f"{table_name}.csv")


def main(url):
    # get the soup
    soup = get_soup(url)
    # extract all the tables from the web page
    tables = get_all_tables(soup)
    print(f"[+] Found a total of {len(tables)} tables.")
    # iterate over all tables
    for i, table in enumerate(tables, start=1):
        # get the table headers
        headers = get_table_headers(table)
        # get all the rows of the table
        rows = get_table_rows(table)
        # save table as csv file
        table_name = f"table-{i}"
        print(f"[+] Saving {table_name}")
        save_as_csv(table_name, headers, rows)


if __name__ == "__main__":
    import sys
    try:
        url = sys.argv[1]
    except IndexError:
        print("Please specify a URL.\nUsage: python html_table_extractor.py [URL]")
        sys.exit(1)
    main(url)
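
To see what the individual helpers return, here is a small sketch that is not part of the commit: it feeds an inline HTML snippet to `get_table_headers()` and `get_table_rows()`, assuming the script above is saved as `html_table_extractor.py` next to it:

```python
from bs4 import BeautifulSoup as bs

# assumes html_table_extractor.py (the script above) sits in the same directory
from html_table_extractor import get_table_headers, get_table_rows

# a tiny made-up table, mimicking the header/data structure the script expects
html = """
<table>
  <tr><th>Rank</th><th>Country</th></tr>
  <tr><td>1</td><td>China</td></tr>
  <tr><td>2</td><td>India</td></tr>
</table>
"""

table = bs(html, "html.parser").find("table")
print(get_table_headers(table))  # ['Rank', 'Country']
print(get_table_rows(table))     # [['1', 'China'], ['2', 'India']]
```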
web-scraping/html-table-extractor/requirements.txt (+3)

@@ -0,0 +1,3 @@

requests
bs4
pandas
