# Scraping Reuters Business News Articles for Select Companies

In [1]:
import requests
import random

from bs4 import BeautifulSoup as bs
import pandas as pd

import urllib.request
from datetime import datetime
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException

In [22]:
# open up a selenium browser and log in to Reuters - free account to access all articles
username = "username" #
password = "password" #

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome('chromedriver', options = chrome_options)
wait = WebDriverWait(driver, 30)
driver.get("https://www.reuters.com/account/sign-in")
wait.until(EC.element_to_be_clickable((By.NAME, "email"))).send_keys(username)
wait.until(EC.element_to_be_clickable((By.NAME, "password"))).send_keys(password)
wait.until(EC.element_to_be_clickable((By.XPATH, '//span[contains(text(), "Sign in")]'))).click()
print("Logged in")

Logged in


In [5]:
# set begin and end dates of articles to scrape - in this case we are doing 3 months prior and 3 months after ChatGPT launched on Nov 30, 2022
enddate = datetime(2023,2,28)
startdate = datetime(2022,9,1)

In [6]:
# selected some popular companies since we wanted to get a large number of articles
search_companies = ['Netflix', 'Microsoft', 'Apple', 'Twitter', 'Google', 'Tesla']

In [14]:
links = []
dates = []
companies = []

root = 'https://www.reuters.com' # used for updating the link to the articles in the dataframe

for s in search_companies:

    driver.get('https://www.reuters.com/site-search/?query='+s+'&section=business&offset=0&date=past_year')
    
    time.sleep(random.uniform(2,5))
    html = driver.page_source
    soup = bs(html, 'html.parser')
    results = int(soup.find('div', class_="search-results__subtitle__3k4lv").text.split()[0])
    offsets = [x for x in range(0, results, 20)] #20 articles per page, need to iterate over 


    for o in offsets:    
        url2 = 'https://www.reuters.com/site-search/?query='+s+'&section=business&offset='+str(o)+'&date=past_year'

        driver.get(url2)

        #print(driver.title) # used this in testing that the code was working
        time.sleep(random.uniform(2,4))
        html = driver.page_source # Get the html of the page

        soup = bs(html, 'html.parser')
            
        b = soup.find_all('div', class_='media-story-card__body__3tRWy')


        for article in b:
            date = article.find('time').attrs['datetime']
            date = date.split("T")[0]
            date = datetime.strptime(date, '%Y-%m-%d')
            if enddate > date > startdate:
                link = root+article.find('a', href=True)['href']
                # fill in the lists
                links.append(link)
                dates.append(date)
                companies.append(s)
                
                
    print(s, 'links saved')
    
    
print('All Article Links Saved!')

#driver.quit() # Close the browser if just getting the links, but need it open to get the artilce text below

Netflix links saved
Microsoft links saved
Apple links saved
Twitter links saved
Google links saved
Tesla links saved
All Article Links Saved!


In [23]:
print(len(links))

2718


In [24]:
articles = []

for link in links:
    driver.get(link)
    html = driver.page_source
    
    time.sleep(random.uniform(3,5)) # let the page load
    soup = bs(html, 'html.parser')
    
    para = soup.find_all('p', class_="text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__large__nEccO body__full_width__ekUdw body__large_body__FV5_X article-body__element__2p5pI")
    
    article_text = ''
    
    for p in para:
        article_text += p.text
    articles.append(article_text)
    

driver.quit() # Close the browser

In [25]:
data = pd.DataFrame({'company':companies,'date':dates,'article':articles,'link':links})
data.head()

Unnamed: 0,company,date,article,link
0,Netflix,2023-02-24,"SINGAPORE, Feb 24 (Reuters) - U.S streaming gi...",https://www.reuters.com/technology/netflix-mak...
1,Netflix,2023-02-24,Feb 23 (Reuters) - Wall Street ended a topsy-t...,https://www.reuters.com/markets/us/futures-ris...
2,Netflix,2023-02-23,"MEXICO CITY, Feb 23 (Reuters) - Grupo Televisa...",https://www.reuters.com/business/media-telecom...
3,Netflix,2023-02-23,Feb 23 (Reuters) - Netflix Inc (NFLX.O) said o...,https://www.reuters.com/business/media-telecom...
4,Netflix,2023-02-23,"BRUSSELS, Feb 23 (Reuters) - The European Comm...",https://www.reuters.com/technology/eu-eyes-big...


In [26]:
data.to_csv('news_articles.csv', index=False)