## Example straight from bls.gov website

In [4]:
import requests
import json
import prettytable
# Verbatim BLS sample: POST a JSON payload naming two series and a year range
# to the public API (v2), then write one text table per returned series.
headers = {'Content-type': 'application/json'}
data = json.dumps({"seriesid": ['CUUR0000SA0','SUUR0000SA0'],"startyear":"2011", "endyear":"2014"})
p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=data, headers=headers)
json_data = json.loads(p.text)
# Each entry under Results -> series gets its own table and its own output file.
for series in json_data['Results']['series']:
    x=prettytable.PrettyTable(["series id","year","period","value","footnotes"])
    seriesId = series['seriesID']
    for item in series['data']:
        year = item['year']
        period = item['period']
        value = item['value']
        # Build a comma-separated string of footnote texts, skipping falsy entries.
        footnotes=""
        for footnote in item['footnotes']:
            if footnote:
                footnotes = footnotes + footnote['text'] + ','
        # String comparison: keep only period codes between 'M01' and 'M12'.
        if 'M01' <= period <= 'M12':
            # footnotes[0:-1] drops the trailing comma added in the loop above.
            x.add_row([seriesId,year,period,value,footnotes[0:-1]])
    # The file is named after the series ID and is overwritten on each run.
    output = open(seriesId + '.txt','w')
    output.write (x.get_string())
    output.close()

## Modified Example

In [7]:
import requests
import json
import prettytable

# Headers for BLS API
headers = {'Content-type': 'application/json'}

# Request payload: "seriesid" lists the BLS series to fetch.
# NOTE(review): "OEWS000000419020" is a sample ID meant to stand for Real Estate
# Brokers and Sales Agents (41-9020). Confirm it is a real BLS series ID --
# the recorded run of this cell produced an empty table.
data = json.dumps({
    "seriesid": ["OEWS000000419020"],
    "startyear": "2021",  # Adjust start and end year as needed
    "endyear": "2023"
})

# API request. Fail loudly on HTTP errors instead of parsing an error page as JSON,
# and never hang forever on a stalled connection.
response = requests.post(
    'https://api.bls.gov/publicAPI/v2/timeseries/data/',
    data=data,
    headers=headers,
    timeout=30,
)
response.raise_for_status()
json_data = response.json()

# Surface the API's own status and messages: they explain rejected requests
# and unknown series, which otherwise just show up as an empty table.
status = json_data.get('status')
if status != 'REQUEST_SUCCEEDED':
    print(f"BLS API status: {status}")
for message in json_data.get('message') or []:
    print(f"BLS API message: {message}")

# Tolerate a response without Results/series instead of raising KeyError.
series_list = (json_data.get('Results') or {}).get('series') or []
if not series_list:
    print("No series were returned by the BLS API.")

# One PrettyTable (and one output file) per returned series
for series in series_list:
    table = prettytable.PrettyTable(["series id", "year", "period", "value", "footnotes"])
    series_id = series['seriesID']
    rows_added = 0

    for item in series.get('data', []):
        period = item['period']

        # Only add monthly data (M01 - M12).
        # NOTE(review): occupational wage series may be reported annually rather
        # than monthly; if so this filter drops every row -- confirm against the
        # period codes the API actually returns for the chosen series.
        if not ('M01' <= period <= 'M12'):
            continue

        footnotes = ", ".join(
            f['text'] for f in item.get('footnotes', []) if f and f.get('text')
        )
        table.add_row([series_id, item['year'], period, item['value'], footnotes])
        rows_added += 1

    if rows_added == 0:
        print(f"No monthly (M01-M12) observations found for series {series_id}.")

    # Write to file, then report per series so every file written is announced
    # (and nothing is announced when no series came back).
    output_file = f"{series_id}.txt"
    with open(output_file, 'w', encoding='utf-8') as output:
        output.write(table.get_string())
    print(table)
    print(f"Data saved to {output_file}")


+-----------+------+--------+-------+-----------+
| Series ID | Year | Period | Value | Footnotes |
+-----------+------+--------+-------+-----------+
+-----------+------+--------+-------+-----------+


## Status code 403 means forbidden: the server understood the request but refuses to serve it

In [9]:
import requests
from bs4 import BeautifulSoup

# URL of the page to scrape
url = "https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm#tab-1"

# Set a user-agent to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}

# Send a GET request to the webpage with headers.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the Quick Facts table
    quickfacts_table = soup.find('table', {'id': 'quickfacts'})

    # Check if the table exists
    if quickfacts_table:
        # Get the HTML content of the table
        quickfacts_html = str(quickfacts_table)

        # Save the table HTML to a file. An explicit UTF-8 encoding avoids
        # platform-dependent encode errors on non-ASCII characters in the page.
        with open('quickfacts.html', 'w', encoding='utf-8') as file:
            file.write(quickfacts_html)

        print("Quick Facts table has been successfully copied to quickfacts.html")
    else:
        print("Quick Facts table not found on the page.")
else:
    print("Error: Unable to access the webpage. Status code:", response.status_code)


Error: Unable to access the webpage. Status code: 403


## Let's see why we're not authorized:

In [12]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import time

# Set up the Edge WebDriver
options = webdriver.EdgeOptions()
options.add_argument('--headless')  # Run headlessly (without opening a browser window)
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Initialize the Edge WebDriver
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

# URL of the page to scrape
url = "https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm#tab-1"

try:
    # Open the URL
    driver.get(url)

    # Allow time for the page to load
    time.sleep(2)  # Adjust the sleep time as necessary

    # Get the full HTML content of the page
    page_html = driver.page_source
finally:
    # Always close the WebDriver, even if navigation fails, so no
    # headless browser process is left running.
    driver.quit()

# Print the HTML to the terminal
print(page_html)


<html lang="en-us"><head> 
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Access Denied</title>
<style type="text/css">
    .centerDiv
    {
      width: 60%;
      height:200px;
      margin: 0 auto;
      background-color:#FFFFFF ;
    }
  </style></head>
  
<body>
<div class="centerDiv">
               <h1><a href="https://www.bls.gov"><!--img src="/apology_objects/images/01.jpg" border="0"--></a><span style="font-family: Times, serif; color: #990000; font-size: 38px;">Bureau of Labor Statistics</span></h1>
    <h2>Access Denied</h2>
              
               <p>The BLS is committed to providing data promptly and according to established schedules. Automated retrieval programs (commonly called "robots" or "bots") can cause delays and interfere with other customers' timely access to information. Therefore, bot activity that doesn't conform to BLS usage policy is prohibited.</p>
              
 
               <p>We apologize for any inconvenience. If y

Please note: the site tells us we can't do this because automated retrieval is not allowed. We can work around that; however, we will have to:

# Impersonate a User for Full Access

## Read the HTML; they have YouTube videos we could grab and put on our website automatically

In [13]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import time

# Set up the Edge WebDriver.
# Headless mode is deliberately NOT enabled here: the page is loaded in a
# visible browser window.
options = webdriver.EdgeOptions()

# Initialize the Edge WebDriver
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

# URL of the page to scrape
url = "https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm#tab-1"

try:
    # Open the URL
    driver.get(url)

    # Allow time for the page to load
    time.sleep(5)  # Increase if necessary to allow the page to fully load

    # Get the full HTML content of the page
    page_html = driver.page_source
finally:
    # The HTML is already captured, so close the browser; otherwise every
    # run of this cell leaves another Edge window and driver process behind.
    driver.quit()

# Print the HTML to the terminal
print(page_html)


<html lang="en"><head>
<!-- P2 -->

			                			                        	    		        
    			<title>    Real Estate Brokers and Sales Agents :     Occupational Outlook Handbook: :     U.S. Bureau of Labor Statistics</title>
							<meta charset="UTF-8">
		
		
				
        <meta property="og:title" content="Real Estate Brokers and Sales Agents">
				<meta property="og:url" href="https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm">
        <link rel="canonical" href="https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm">
		<meta property="og:type" content="website">
					<meta name="description" content="Real estate brokers and sales agents help clients buy, sell, and rent properties.">
			<meta property="og:description" content="Real estate brokers and sales agents help clients buy, sell, and rent properties.">
		
				

								<meta name="date" content="2024-08-29">
				
						
		    				
		    				
		    		
		
				
				        	<meta n

## Extract just the table

In [14]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import time

# Set up the Edge WebDriver.
# Headless mode is deliberately NOT enabled here: the page is loaded in a
# visible browser window.
options = webdriver.EdgeOptions()

# Initialize the Edge WebDriver
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

# URL of the page to scrape
url = "https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm#tab-1"

try:
    # Open the URL
    driver.get(url)

    # Allow time for the page to load
    time.sleep(5)  # Increase if necessary to allow the page to fully load

    # Try to find the Quick Facts table
    try:
        # Locate the Quick Facts table by its ID
        quickfacts_table = driver.find_element(By.ID, 'quickfacts')
        quickfacts_html = quickfacts_table.get_attribute('outerHTML')

        # Print the Quick Facts table HTML to the terminal
        print(quickfacts_html)
    except Exception as e:
        print("Error: Unable to find the Quick Facts table. Exception:", e)
finally:
    # Always close the browser so repeated runs do not pile up Edge windows
    # and driver processes.
    driver.quit()


<table class="regular-text tooltips" id="quickfacts">
                            <thead>
                                <tr>
                                                                        <th colspan="2" id="quickfacts-0-0" role="columnheader">    Quick Facts:     Real Estate Brokers and Sales Agents</th>
                                </tr>
                            </thead>
                            <tbody>
                                <tr>
                                    <th id="quickfacts-1-0" headers="quickfacts-0-0" role="rowheader"><a id="TB_inline?height=325&amp;width=325&amp;inlineId=qf-wage" href="#TB_inline?height=325&amp;width=325&amp;inlineId=qf-wage" class="thickbox about-qf-section noprint" aria-label="										2023     Median Pay										 										2023     Median Pay										 more information on 										2023 Median Pay										">										2023     Median Pay										</a>									</th>
                                    <td headers="quickf

## This gets everything but the table formatting

In [17]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import time

# Set up the Edge WebDriver.
# Headless mode is deliberately NOT enabled here: the page is loaded in a
# visible browser window.
options = webdriver.EdgeOptions()

# Initialize the Edge WebDriver
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

# URL of the page to scrape
url = "https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm#tab-1"

# Stays None when the table cannot be located
quickfacts_html = None
try:
    # Open the URL
    driver.get(url)

    # Allow time for the page to load
    time.sleep(5)  # Increase if necessary to allow the page to fully load

    # Only the element lookup is guarded by this handler, so the
    # "table not found" message is never printed for unrelated failures
    # such as a file-write error.
    try:
        # Locate the Quick Facts table by its ID
        quickfacts_table = driver.find_element(By.ID, 'quickfacts')
        quickfacts_html = quickfacts_table.get_attribute('outerHTML')
    except Exception as e:
        print("Error: Unable to find the Quick Facts table. Exception:", e)
finally:
    # Always close the browser so repeated runs do not pile up Edge windows
    # and driver processes.
    driver.quit()

if quickfacts_html:
    # Create the complete HTML structure with external CSS
    complete_html = f"""
    <html>
        <head>
            <link rel="stylesheet" type="text/css" href="jobs.css">
            <title>Quick Facts Table</title>
        </head>
        <body>
            {quickfacts_html}
        </body>
    </html>
    """

    # Save the complete HTML to a file
    with open("job tables.html", "w", encoding="utf-8") as file:
        file.write(complete_html)

    print("Quick Facts table with styles has been successfully saved to 'job tables.html'")


Quick Facts table with styles has been successfully saved to 'job tables.html'


## One more change: try to make the copied links functional

In [18]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import time
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Set up the Edge WebDriver.
# Headless mode is deliberately NOT enabled here: the page is loaded in a
# visible browser window.
options = webdriver.EdgeOptions()

# Initialize the Edge WebDriver
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

# URL of the page to scrape
url = "https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm#tab-1"

# Stays None when the table cannot be located
quickfacts_html = None
try:
    # Open the URL
    driver.get(url)

    # Allow time for the page to load
    time.sleep(5)  # Increase if necessary to allow the page to fully load

    # Only the element lookup is guarded by this handler, so the
    # "table not found" message is never printed for unrelated failures.
    try:
        # Locate the Quick Facts table by its ID and grab its outer HTML
        quickfacts_table = driver.find_element(By.ID, 'quickfacts')
        quickfacts_html = quickfacts_table.get_attribute('outerHTML')
    except Exception as e:
        print("Error: Unable to find the Quick Facts table. Exception:", e)
finally:
    # Always close the browser so repeated runs do not pile up Edge windows
    # and driver processes.
    driver.quit()

if quickfacts_html:
    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(quickfacts_html, 'html.parser')

    # Convert relative links to absolute links.
    # Links must be resolved against the PAGE url, not the site root:
    # fragment-only hrefs ("#...") and document-relative hrefs would
    # otherwise point at the wrong location. urljoin discards the base
    # URL's own "#tab-1" fragment when resolving.
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            link['href'] = urljoin(url, href)

    # Get the updated HTML with absolute links
    updated_quickfacts_html = str(soup)

    # Create the complete HTML structure with external CSS
    complete_html = f"""
    <html>
        <head>
            <link rel="stylesheet" type="text/css" href="jobs.css">
            <title>Quick Facts Table</title>
        </head>
        <body>
            {updated_quickfacts_html}
        </body>
    </html>
    """

    # Save the complete HTML to a file
    with open("job tables.html", "w", encoding="utf-8") as file:
        file.write(complete_html)

    print("Quick Facts table with absolute links has been successfully saved to 'job tables.html'")


Quick Facts table with absolute links has been successfully saved to 'job tables.html'


In [19]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import time
from bs4 import BeautifulSoup

# Set up the Edge WebDriver.
# Headless mode is deliberately NOT enabled here: the page is loaded in a
# visible browser window.
options = webdriver.EdgeOptions()

# Initialize the Edge WebDriver
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

# URL of the page to scrape
url = "https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm#tab-1"

# Stays None when the table cannot be located
quickfacts_html = None
try:
    # Open the URL
    driver.get(url)

    # Allow time for the page to load
    time.sleep(5)  # Increase if necessary to allow the page to fully load

    # Only the element lookup is guarded by this handler, so the
    # "table not found" message is never printed for unrelated failures.
    try:
        # Locate the Quick Facts table by its ID and grab its outer HTML
        quickfacts_table = driver.find_element(By.ID, 'quickfacts')
        quickfacts_html = quickfacts_table.get_attribute('outerHTML')
    except Exception as e:
        print("Error: Unable to find the Quick Facts table. Exception:", e)
finally:
    # Always close the browser so repeated runs do not pile up Edge windows
    # and driver processes.
    driver.quit()

if quickfacts_html:
    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(quickfacts_html, 'html.parser')

    # Remove all links
    for link in soup.find_all('a'):
        link.unwrap()  # Remove the link but keep the text

    # Remove specific rows.
    # Header text in the scraped table contains tabs and runs of spaces, so
    # whitespace is collapsed before comparing against the labels below.
    rows_to_remove = ["Work Experience in a Related Occupation", "On-the-job Training"]
    for row in soup.find_all('tr'):
        header = row.find('th')
        if header is None:
            continue
        header_text = " ".join(header.get_text(" ").split())
        if header_text in rows_to_remove:
            row.decompose()  # Remove the row entirely

    # Get the updated HTML without links and excluded rows
    updated_quickfacts_html = str(soup)

    # Create the complete HTML structure with external CSS
    complete_html = f"""
    <html>
        <head>
            <link rel="stylesheet" type="text/css" href="jobs.css">
            <title>Quick Facts Table</title>
        </head>
        <body>
            {updated_quickfacts_html}
        </body>
    </html>
    """

    # Save the complete HTML to a file
    with open("job tables.html", "w", encoding="utf-8") as file:
        file.write(complete_html)

    print("Quick Facts table without links and specified rows has been successfully saved to 'job tables.html'")


Quick Facts table without links and specified rows has been successfully saved to 'job tables.html'


## Include source

In [21]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import time
from bs4 import BeautifulSoup

# Set up the Edge WebDriver.
# Headless mode is deliberately NOT enabled here: the page is loaded in a
# visible browser window.
options = webdriver.EdgeOptions()

# Initialize the Edge WebDriver
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

# URL of the page to scrape
url = "https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm#tab-1"

# Stays None when the table cannot be located
quickfacts_html = None
try:
    # Open the URL
    driver.get(url)

    # Allow time for the page to load
    time.sleep(5)  # Increase if necessary to allow the page to fully load

    # Only the element lookup is guarded by this handler, so the
    # "table not found" message is never printed for unrelated failures.
    try:
        # Locate the Quick Facts table by its ID and grab its outer HTML
        quickfacts_table = driver.find_element(By.ID, 'quickfacts')
        quickfacts_html = quickfacts_table.get_attribute('outerHTML')
    except Exception as e:
        print("Error: Unable to find the Quick Facts table. Exception:", e)
finally:
    # Always close the browser so repeated runs do not pile up Edge windows
    # and driver processes.
    driver.quit()

if quickfacts_html:
    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(quickfacts_html, 'html.parser')

    # Remove all links
    for link in soup.find_all('a'):
        link.unwrap()  # Remove the link but keep the text

    # Remove specific rows.
    # Header text in the scraped table contains tabs and runs of spaces, so
    # whitespace is collapsed before comparing against the labels below.
    rows_to_remove = ["Work Experience in a Related Occupation", "On-the-job Training"]
    for row in soup.find_all('tr'):
        header = row.find('th')
        if header is None:
            continue
        header_text = " ".join(header.get_text(" ").split())
        if header_text in rows_to_remove:
            row.decompose()  # Remove the row entirely

    # Add the source link at the bottom of the table
    source_row = soup.new_tag('tr')
    source_header = soup.new_tag('th')
    source_header['colspan'] = '2'  # Span across two columns
    source_header.string = 'Source: '

    # Create the source link
    source_link = soup.new_tag('a', href=url)
    source_link.string = 'www.bls.gov'
    source_header.append(source_link)
    source_row.append(source_header)

    # Prefer the table body; fall back to the table itself (or the parsed
    # fragment) so a missing <tbody> does not raise on None.
    container = soup.find('tbody')
    if container is None:
        container = soup.find('table')
    if container is None:
        container = soup
    container.append(source_row)

    # Get the updated HTML without links and excluded rows
    updated_quickfacts_html = str(soup)

    # Create the complete HTML structure with external CSS
    complete_html = f"""
    <html>
        <head>
            <link rel="stylesheet" type="text/css" href="jobs.css">
            <title>Quick Facts Table</title>
        </head>
        <body>
            {updated_quickfacts_html}
        </body>
    </html>
    """

    # Save the complete HTML to a file
    with open("job tables.html", "w", encoding="utf-8") as file:
        file.write(complete_html)

    print("Quick Facts table with source link has been successfully saved to 'job tables.html'")


Quick Facts table with source link has been successfully saved to 'job tables.html'


## Try and reformat table

In [22]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import time
from bs4 import BeautifulSoup

# Set up the Edge WebDriver.
# Headless mode is deliberately NOT enabled here: the page is loaded in a
# visible browser window.
options = webdriver.EdgeOptions()

# Initialize the Edge WebDriver
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

# URL of the page to scrape
url = "https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm#tab-1"

# Stays None when the table cannot be located
quickfacts_html = None
try:
    # Open the URL
    driver.get(url)

    # Allow time for the page to load
    time.sleep(5)  # Increase if necessary to allow the page to fully load

    # Only the element lookup is guarded by this handler, so the
    # "table not found" message is never printed for unrelated failures.
    try:
        # Locate the Quick Facts table by its ID and grab its outer HTML
        quickfacts_table = driver.find_element(By.ID, 'quickfacts')
        quickfacts_html = quickfacts_table.get_attribute('outerHTML')
    except Exception as e:
        print("Error: Unable to find the Quick Facts table. Exception:", e)
finally:
    # Always close the browser so repeated runs do not pile up Edge windows
    # and driver processes.
    driver.quit()

if quickfacts_html:
    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(quickfacts_html, 'html.parser')

    # Remove all links
    for link in soup.find_all('a'):
        link.unwrap()  # Remove the link but keep the text

    # Remove specific rows.
    # Header text in the scraped table contains tabs and runs of spaces, so
    # whitespace is collapsed before comparing against the labels below.
    rows_to_remove = ["Work Experience in a Related Occupation", "On-the-job Training"]
    for row in soup.find_all('tr'):
        header = row.find('th')
        if header is None:
            continue
        header_text = " ".join(header.get_text(" ").split())
        if header_text in rows_to_remove:
            row.decompose()  # Remove the row entirely

    # Replace the contents of the first row with a single title cell that
    # spans both columns (skipped if the fragment has no rows at all).
    first_row = soup.find('tr')
    if first_row is not None:
        first_cell = soup.new_tag('th')
        first_cell['colspan'] = '2'  # Set to span all columns
        first_cell.string = 'Quick Facts: Real Estate Brokers and Sales Agents'  # Change this to your desired title
        first_row.clear()  # Clear existing cells in the first row
        first_row.append(first_cell)  # Add the new cell

    # Add the source link at the bottom of the table
    source_row = soup.new_tag('tr')
    source_header = soup.new_tag('th')
    source_header['colspan'] = '2'  # Span across two columns
    source_header.string = 'Source: '

    # Create the source link
    source_link = soup.new_tag('a', href=url)
    source_link.string = 'www.bls.gov'
    source_header.append(source_link)
    source_row.append(source_header)

    # Prefer the table body; fall back to the table itself (or the parsed
    # fragment) so a missing <tbody> does not raise on None.
    container = soup.find('tbody')
    if container is None:
        container = soup.find('table')
    if container is None:
        container = soup
    container.append(source_row)

    # Get the updated HTML without links and excluded rows
    updated_quickfacts_html = str(soup)

    # Create the complete HTML structure with external CSS
    complete_html = f"""
    <html>
        <head>
            <link rel="stylesheet" type="text/css" href="jobs.css">
            <title>Quick Facts Table</title>
        </head>
        <body>
            {updated_quickfacts_html}
        </body>
    </html>
    """

    # Save the complete HTML to a file
    with open("job tables.html", "w", encoding="utf-8") as file:
        file.write(complete_html)

    print("Quick Facts table with source link has been successfully saved to 'job tables.html'")


Quick Facts table with source link has been successfully saved to 'job tables.html'
