import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# US English
LANGUAGE = "en-US,en;q=0.5"

def get_soup(url):
    """Constructs and returns a BeautifulSoup object from the HTML content of `url`"""
    # initialize a session
    session = requests.Session()
    # set the User-Agent to look like a regular browser
    session.headers['User-Agent'] = USER_AGENT
    # request English content (optional)
    session.headers['Accept-Language'] = LANGUAGE
    session.headers['Content-Language'] = LANGUAGE
    # make the request
    response = session.get(url)
    # parse the HTML content and return the soup
    return bs(response.content, "html.parser")


def get_all_tables(soup):
    """Extracts and returns all tables in a soup object"""
    return soup.find_all("table")


def get_table_headers(table):
    """Given a table soup, returns all the header cell texts"""
    headers = []
    first_row = table.find("tr")
    # guard against tables that have no rows at all
    if first_row is None:
        return headers
    for th in first_row.find_all("th"):
        headers.append(th.text.strip())
    return headers


def get_table_rows(table):
    """Given a table, returns all its rows as lists of cell texts"""
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # no td tags, so fall back to th tags
            # (Wikipedia tables in particular use th cells inside body rows)
            ths = tr.find_all("th")
            for th in ths:
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for td in tds:
                cells.append(td.text.strip())
        rows.append(cells)
    return rows


def save_as_csv(table_name, headers, rows):
    """Saves the extracted rows as `table_name`.csv"""
    # only use the extracted headers as column names when they match the row width
    columns = headers if headers and all(len(row) == len(headers) for row in rows) else None
    pd.DataFrame(rows, columns=columns).to_csv(f"{table_name}.csv", index=False)


def main(url):
    # get the soup
    soup = get_soup(url)
    # extract all the tables from the web page
    tables = get_all_tables(soup)
    print(f"[+] Found a total of {len(tables)} tables.")
    # iterate over all tables
    for i, table in enumerate(tables, start=1):
        # get the table headers
        headers = get_table_headers(table)
        # get all the rows of the table
        rows = get_table_rows(table)
        # save table as csv file
        table_name = f"table-{i}"
        print(f"[+] Saving {table_name}")
        save_as_csv(table_name, headers, rows)


if __name__ == "__main__":
    import sys
    try:
        url = sys.argv[1]
    except IndexError:
        print("Please specify a URL.\nUsage: python html_table_extractor.py [URL]")
        sys.exit(1)
    main(url)
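
# Example invocation (the URL below is only an illustration; any page containing
# <table> elements works):
#   python html_table_extractor.py "https://en.wikipedia.org/wiki/List_of_sovereign_states"
# Each table found on the page is written to table-1.csv, table-2.csv, and so on
# in the current working directory.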