import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# US English
LANGUAGE = "en-US,en;q=0.5"

def get_soup(url):
    """Constructs and returns a BeautifulSoup object from the HTML content of `url`"""
    # initialize a session
    session = requests.Session()
    # set the User-Agent to look like a regular browser
    session.headers['User-Agent'] = USER_AGENT
    # request English content (optional)
    session.headers['Accept-Language'] = LANGUAGE
    session.headers['Content-Language'] = LANGUAGE
    # make the request
    response = session.get(url)
    # parse the HTML content and return the soup
    return bs(response.content, "html.parser")


def get_all_tables(soup):
    """Extracts and returns all tables in a soup object"""
    return soup.find_all("table")


def get_table_headers(table):
    """Given a table soup, returns all the header cell texts"""
    headers = []
    first_row = table.find("tr")
    # guard against tables that have no rows at all
    if first_row is None:
        return headers
    for th in first_row.find_all("th"):
        headers.append(th.text.strip())
    return headers


def get_table_rows(table):
    """Given a table, returns all its rows as lists of cell texts"""
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # no td tags, so fall back to th tags
            # (Wikipedia tables in particular use th cells inside body rows)
            ths = tr.find_all("th")
            for th in ths:
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for td in tds:
                cells.append(td.text.strip())
        rows.append(cells)
    return rows


def save_as_csv(table_name, headers, rows):
    """Saves the extracted rows as `table_name`.csv"""
    # only use the extracted headers as column names when they match the row width
    columns = headers if headers and all(len(row) == len(headers) for row in rows) else None
    pd.DataFrame(rows, columns=columns).to_csv(f"{table_name}.csv", index=False)


def main(url):
    # get the soup
    soup = get_soup(url)
    # extract all the tables from the web page
    tables = get_all_tables(soup)
    print(f"[+] Found a total of {len(tables)} tables.")
    # iterate over all tables
    for i, table in enumerate(tables, start=1):
        # get the table headers
        headers = get_table_headers(table)
        # get all the rows of the table
        rows = get_table_rows(table)
        # save table as csv file
        table_name = f"table-{i}"
        print(f"[+] Saving {table_name}")
        save_as_csv(table_name, headers, rows)


if __name__ == "__main__":
    import sys
    try:
        url = sys.argv[1]
    except IndexError:
        print("Please specify a URL.\nUsage: python html_table_extractor.py [URL]")
        sys.exit(1)
    main(url)
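
# Example invocation (the URL below is only an illustration; any page containing
# <table> elements works):
#   python html_table_extractor.py "https://en.wikipedia.org/wiki/List_of_sovereign_states"
# Each table found on the page is written to table-1.csv, table-2.csv, and so on
# in the current working directory.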