Reference: https://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/

In [17]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

class HTMLTableParser:
    """Convert HTML ``<table>`` elements into pandas DataFrames.

    Tables are BeautifulSoup tags; only ``find_all`` on the table and its
    rows, and ``get_text`` on the cells, are used.
    """

    def parse_url(self, url):
        """Download *url* and parse every ``<table>`` in the page.

        Returns a list of ``(table_id, DataFrame)`` tuples in document
        order. ``table_id`` is ``None`` for a table without an ``id``
        attribute.
        """
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        # Tag.get() rather than Tag['id']: a single table without an id
        # attribute would otherwise abort the whole page with a KeyError.
        return [(table.get('id'), self.parse_html_table(table))
                for table in soup.find_all('table')]

    def parse_html_table(self, table):
        """Build a DataFrame from the ``<td>`` cells of one table.

        Column labels are the texts of the ``<th>`` cells of the first row
        that has any; without such a row the columns are numbered from 0.
        If the header is shorter than the first data row, a ``"c0"`` label
        is prepended (this assumes the unlabeled column is the leading one).

        Rows are kept in document order, one per ``<tr>`` that contains at
        least one ``<td>``. Cell text is stripped. Short rows are padded
        with NaN, and columns that still have no label get a positional
        ``"c<index>"`` name, so ragged tables no longer raise IndexError.

        A column is converted to float when every value in it converts;
        otherwise it keeps its string values.
        """
        column_names = []
        body = []

        # Single pass: collect the data rows and the first header row seen.
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if cells:
                body.append([cell.get_text().strip() for cell in cells])
            if not column_names:
                column_names = [th.get_text() for th in row.find_all('th')]

        first_width = len(body[0]) if body else 0
        widest = max((len(cells) for cells in body), default=0)

        if column_names:
            # Safeguard on column titles: a header one cell short of the
            # first data row is taken to be missing its leading label.
            if len(column_names) < first_width:
                column_names.insert(0, "c0")
            # Label any column that is still unnamed by its position.
            missing = range(len(column_names), widest)
            column_names.extend([f"c{position}" for position in missing])
            columns = column_names
            n_columns = len(column_names)
        else:
            columns = range(widest)
            n_columns = widest

        # Pad short rows with NaN so every row spans all columns.
        nan = float("nan")
        rows = [cells + [nan] * (n_columns - len(cells)) for cells in body]

        # dtype=object keeps the cells as plain strings until the float
        # conversion below decides column by column.
        df = pd.DataFrame(rows, columns=columns,
                          index=range(len(rows)), dtype=object)

        # Convert to float if possible
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                # Deliberate: a non-numeric column keeps its strings.
                pass

        return df

# Parser instance reused by the per-sector scraping loop further down.
hp = HTMLTableParser()

In [2]:
# Common URL prefix; a sector slug and a trailing slash are appended to it.
base_url = "https://www.stockmonitor.com/sector/"
    
# Sector slugs, one listing page is fetched for each.
sectors = [
    "basic-materials",
    "communication-services",
    "consumer-cyclical",
    "consumer-defensive",
    "energy",
    "financial-services",
    "healthcare",
    "industrials",
    "technology",
    "utilities"
]

In [35]:
# Map each sector slug to the URL of its listing page.
sector_url = {sector: f"{base_url}{sector}/" for sector in sectors}

In [36]:
sector_url

{'basic-materials': 'https://www.stockmonitor.com/sector/basic-materials/',
 'communication-services': 'https://www.stockmonitor.com/sector/communication-services/',
 'consumer-cyclical': 'https://www.stockmonitor.com/sector/consumer-cyclical/',
 'consumer-defensive': 'https://www.stockmonitor.com/sector/consumer-defensive/',
 'energy': 'https://www.stockmonitor.com/sector/energy/',
 'financial-services': 'https://www.stockmonitor.com/sector/financial-services/',
 'healthcare': 'https://www.stockmonitor.com/sector/healthcare/',
 'industrials': 'https://www.stockmonitor.com/sector/industrials/',
 'technology': 'https://www.stockmonitor.com/sector/technology/',
 'utilities': 'https://www.stockmonitor.com/sector/utilities/'}

In [37]:
# Scrape the first table of every sector page and keep, per row, the
# ticker, the sector slug and the company name.
sector_df = {}
for sector, url in sector_url.items():
    # A timeout keeps one unresponsive request from hanging the whole run.
    response = requests.get(url, timeout=30)
    # Stop on an HTTP error instead of parsing the error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find_all('table')[0]
    df = hp.parse_html_table(table)
    # The ticker symbols are read from the column labelled "Change%".
    # NOTE(review): presumably the page's header cells are offset from its
    # data cells; confirm against the live markup.
    df["Ticker"] = df["Change%"]
    df["Sector"] = sector
    sector_df[sector] = df[["Ticker", "Sector", "Company"]]

In [43]:
# sector_df

In [39]:
# Stack the per-sector frames in the order of `sectors` with one concat
# call; each frame keeps its own row index, so index values repeat
# across sectors.
df = pd.concat([sector_df[sector] for sector in sectors])

In [40]:
df

Unnamed: 0,Ticker,Sector,Company
0,ASIX,basic-materials,AdvanSix Inc
1,AEM,basic-materials,Agnico Eagle Mines Limited
2,APD,basic-materials,"Air Products and Chemicals, Inc"
3,AGI,basic-materials,Alamos Gold Inc
4,ALB,basic-materials,Albemarle Corporation
...,...,...,...
95,UGI,utilities,UGI Corporation
96,UTL,utilities,Unitil Corporation
97,VST,utilities,Vistra Energy Corp
98,WEC,utilities,"WEC Energy Group, Inc"


In [41]:
df.shape

(4990, 3)

In [42]:
# Save the combined table; index=False leaves the row index out of the file.
df.to_csv("sector_stocks.csv", index=False)