In [2]:
import re
import csv
from time import sleep
from bs4 import BeautifulSoup
import requests

# Browser-like HTTP headers passed to requests.get for every page fetched.
headers = {
    'accept': '*/*',
    # Advertise only encodings that requests can always decode. Brotli ('br')
    # is decoded only when an optional brotli package is installed; without
    # it a Brotli-compressed reply cannot be turned into text and the parser
    # finds no articles.
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}

def get_article(card):
    """Extract article information from the raw html of one result card.

    Args:
        card: A parsed element exposing ``find(name, css_class)``; the
            caller passes each ``div.NewsArticle`` element.

    Returns:
        A tuple ``(headline, source, posted, description, link)`` of strings.
        A field whose element is missing from the card is returned as ``''``
        instead of raising. ``link`` is the target extracted from the
        ``RU=.../RK`` redirect wrapper; if the href has no such wrapper the
        percent-decoded href itself is returned.
    """
    # Same function that requests.utils re-exports; imported locally so the
    # parsing logic does not depend on the HTTP library.
    from urllib.parse import unquote

    def _text(tag_name, css_class):
        """Return the text of the first matching element, or '' if absent."""
        node = card.find(tag_name, css_class)
        return node.text if node is not None else ''

    headline = _text('h4', 's-title')
    source = _text('span', 's-source')
    # The timestamp text carries a leading middle-dot separator; drop it.
    posted = _text('span', 's-time').replace('·', '').strip()
    description = _text('p', 's-desc').strip()

    anchor = card.find('a')
    raw_link = anchor.get('href') if anchor is not None else None
    unquoted_link = unquote(raw_link) if raw_link else ''

    # The real article URL sits between 'RU=' and '/RK' in the redirect link.
    match = re.search(r'RU=(.+)\/RK', unquoted_link)
    clean_link = match.group(1) if match else unquoted_link

    article = (headline, source, posted, description, clean_link)
    return article

def get_the_news(search, filename='results.csv', timeout=30):
    """Scrape Yahoo News search results for a term and save them as CSV.

    Follows the "next" link from page to page, collecting one record per
    unique article link, then writes all records to ``filename``.

    Args:
        search: Plain-text search term. It is URL-encoded here, so pass the
            raw text (e.g. ``'iphone 16'``), not a pre-encoded string.
        filename: Path of the CSV file to write. Defaults to 'results.csv'.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``(headline, source, posted, description, link)`` tuples
        in the order they were found.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched
            (including a timeout).
    """
    from urllib.parse import quote_plus, urljoin

    template = 'https://news.search.yahoo.com/search?p={}'
    # Encode the term so spaces, '&', '#', etc. cannot corrupt the query.
    url = template.format(quote_plus(search))
    articles = []
    links = set()
    visited = set()

    # Stop if pagination ever points back at a page already fetched.
    while url not in visited:
        visited.add(url)
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'NewsArticle')

        # extract articles from page
        for card in cards:
            try:
                article = get_article(card)
            except (AttributeError, TypeError):
                # A card that cannot be parsed (e.g. an expected element is
                # missing) is skipped so one bad card does not discard the
                # articles already collected.
                continue
            link = article[-1]
            if link:
                if link in links:
                    continue
                links.add(link)
            # Records without a link cannot be de-duplicated; keep them all.
            articles.append(article)

        # find the next page
        next_anchor = soup.find('a', 'next')
        next_href = next_anchor.get('href') if next_anchor is not None else None
        if not next_href:
            break
        # urljoin leaves absolute hrefs untouched and resolves relative ones.
        url = urljoin(url, next_href)
        sleep(1)  # be polite between page requests

    # save article data
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Headline', 'Source', 'Posted', 'Description', 'Link'])
        writer.writerows(articles)

    return articles

In [3]:
# scrape the news search results for "iphone" (also writes the CSV file)
articles = get_the_news(search='iphone')

In [4]:
# preview the first four scraped records
articles[:4]

[('Wall Street is eager to see some sign that AI is driving iPhone demand in...',
  'Business Insider ·  via Yahoo Finance',
  '54 minutes ago',
  'Wall Street anticipates $94.36 billion in revenue and adjusted earnings per share of $1.60. Analysts...',
  'https://finance.yahoo.com/news/wall-street-eager-see-sign-031943613.html?fr=sycsrp_catchall'),
 ('5 phones you should buy instead of the iPhone 16',
  'Digital Trends ·  via Yahoo News',
  '14 hours ago',
  'Apple’s iPhone 16 has arrived, and it’s quite an impressive offering this year. Not only does it come in some of the best colors we’ve seen in a while,...',
  'https://www.yahoo.com/tech/5-phones-buy-instead-iphone-143032836.html'),
 ('Two Boston men arrested for stealing iPhone package from porch of Billerica...',
  'The Boston Globe',
  '2 hours ago',
  'Two Boston men were arrested for allegedly stealing an Apple iPhone package from the porch of a home in Billerica, police said. Daneuri Diaz Romero, 24, and Yonaykin Guerrero P