# Web-scrape Forbes business data

In [19]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import requests
import json
from typing import Dict

# --- Scraper configuration -------------------------------------------------
# Landing page whose article links are harvested.
base_url = 'https://www.forbes.com/business/?sh=74da965e535f'
# Location of the chromedriver binary handed to Selenium's Service.
chrome_driver_path = '/opt/homebrew/bin/chromedriver'
# Seconds to wait after each scroll before reading the page source.
SCROLL_PAUSE_TIME = 2
# Accumulates one dict per successfully scraped article.
articles_info = []

# --- Browser start-up --------------------------------------------------------
chrome_options = Options()
for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(flag)

service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Load the landing page so the scroll loop has something to work on.
driver.get(base_url)

def scrape_article(url: str, timeout: float = 10.0) -> Dict:
    """
    Scrapes article information from a given URL.

    Every key listed below is always present in the result, so callers can
    index the dictionary without checking whether the download succeeded.

    Args:
    - url (str): The URL of the article to scrape.
    - timeout (float): Seconds to wait for the HTTP response before giving
      up. Defaults to 10 seconds.

    Returns:
    - Dict: A dictionary containing the following keys:
        - 'url': The input URL.
        - 'title': The title of the article, or 'N/A' if not found.
        - 'date': The date of the article, or 'N/A' if not found.
        - 'author': The author of the article, or 'N/A' if not found.
        - 'content': The content of the article, or 'N/A' if not found.
      If the request fails (network error, timeout, or a non-200 status)
      all fields other than 'url' are 'N/A'.
    """
    # Pre-fill every field so a failed request still honours the contract.
    article_data = {
        'url': url,
        'title': 'N/A',
        'date': 'N/A',
        'author': 'N/A',
        'content': 'N/A',
    }

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # One unreachable article should not abort the whole scrape.
        return article_data

    if response.status_code != 200:
        return article_data

    article_soup = BeautifulSoup(response.content, 'html.parser')

    title = article_soup.find('h1')
    if title:
        article_data['title'] = title.text.strip()

    date = article_soup.find('div', {'class': 'content-data'})
    if date:
        article_data['date'] = date.text.strip()

    # The author name is the first link inside the author wrapper.
    author_wrapper = article_soup.find('div', {'class': 'fs-author-wrapper'})
    author = author_wrapper.find('a') if author_wrapper else None
    if author:
        article_data['author'] = author.text.strip()

    content_div = article_soup.find('div', {'class': 'article-body'})
    if content_div:
        paragraphs = content_div.find_all('p')
        if paragraphs:
            article_data['content'] = ' '.join(
                p.text.strip() for p in paragraphs
            )
        else:
            # No <p> tags: fall back to the raw text of the body container.
            article_data['content'] = content_div.text.strip()

    return article_data

# Scroll and scrape until 200 articles are collected
TARGET_ARTICLE_COUNT = 200
# Give up after this many consecutive scrolls that reveal no unseen link,
# otherwise the loop would spin forever once the page stops loading more.
MAX_IDLE_SCROLLS = 5
# An article is kept only when every one of these fields was found.
REQUIRED_FIELDS = ('title', 'date', 'author', 'content')

seen_urls = set()  # URLs already requested, so no article is fetched twice
idle_scrolls = 0

try:
    while (len(articles_info) < TARGET_ARTICLE_COUNT
           and idle_scrolls < MAX_IDLE_SCROLLS):
        # Jump to the bottom of the page, then wait before re-reading it.
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        time.sleep(SCROLL_PAUSE_TIME)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        article_links = set()
        for link in soup.find_all('a', href=True):
            url = link['href']
            # to check if it's actually an article and not author page
            if url.startswith('https://www.forbes.com/sites/') and len(url.split('/')) > 6:
                article_links.add(url)

        # Only links not handled on a previous pass need to be fetched.
        new_links = article_links - seen_urls
        idle_scrolls = 0 if new_links else idle_scrolls + 1

        for url in new_links:
            if len(articles_info) >= TARGET_ARTICLE_COUNT:
                break
            seen_urls.add(url)
            article_info = scrape_article(url)
            # only append if all info is available for the article;
            # .get() tolerates records that lack a field entirely
            if all(article_info.get(field, 'N/A') != 'N/A'
                   for field in REQUIRED_FIELDS):
                articles_info.append(article_info)
            print(f"Scraped article: {article_info.get('title', 'N/A')}")
finally:
    # Always shut the browser down, even if scraping raised.
    driver.quit()



print(f"Total number of articles scraped: {len(articles_info)}")

for article in articles_info:
    print(article)





Scraped article: ‘I’m On The Horse’: Biden Defends Himself From Post-Debate Criticisms In NBC Interview
Scraped article: 9 Consequential Energy Predictions - Midyear Review
Scraped article: Republican National Convention: Trump Makes Appearance Days After Assassination Attempt
Scraped article: How November Election Results Could Slow Down The US Energy Transition
Scraped article: The Rolling Stones’ Greatest Hits Help Them Reach A Significant Milestone
Scraped article: Time To End The Political Name-Calling And Actually Debate The Issues
Scraped article: The Secret Sauce Behind This $350 Million Japanese Barbecue Sauce? Grandma.
Scraped article: Nostalgia For Manufacturing Jobs Is Misplaced
Scraped article: WWE Raw Results, Winners And Grades With Rhea Ripley’s Intense Return
Scraped article: Atlanta’s Three All-Star Starters Recall Maddux, Glavine, Smoltz
Scraped article: AC/DC Scores Their Second Career Hit On A Billboard Chart
Scraped article: The Most Important Packers: No. 6 — Jai

In [27]:
# Destination for the scraped articles, relative to the working directory.
output_file = '../data/json/forbes.json'

# Write the list as indented JSON; ensure_ascii=False keeps non-ASCII
# characters as-is instead of \u escapes.
with open(output_file, mode='w', encoding='utf-8') as f:
    f.write(json.dumps(articles_info, ensure_ascii=False, indent=4))

In [29]:
# Sanity check: print every stored article that still has a placeholder
# 'N/A' in any field, followed by how many such articles there are.
checked_fields = ('title', 'date', 'author', 'content')

count = 0
for article in articles_info:
    has_placeholder = any(article[field] == 'N/A' for field in checked_fields)
    if not has_placeholder:
        continue
    count += 1
    print(article)

print(count)

0
