In [7]:
import requests

In [8]:
# Text-only NPR front page: plain HTML with no JavaScript, so it is easy to parse.
url = "https://text.npr.org/"

# A timeout keeps the cell from hanging forever if the server never answers.
resp = requests.get(url, timeout=30)

# Fail loudly on 4xx/5xx responses instead of silently parsing an error page.
resp.raise_for_status()

In [6]:
# This is the HTML of the web page. Only the first 2,000 characters are shown:
# printing the whole document floods the notebook with output.
print(resp.text[:2000])

<!DOCTYPE html>
<html lang="en">
<head>
    <title>NPR : National Public Radio</title>
    <meta http-equiv="Content-Type" content="text/html;charset=utf-8">
    <meta name="viewport" content="width=device-width">
    <link id="favicon" rel="shortcut icon" type="image/png" href="">
    <style>
        body {
    display: block;
    padding: 0px 20px;
    max-width: 550px;
    margin: 0 auto;
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";
}

.full-version-link {
    margin-left: 15px;
}

.slug-line {
    font-size: 1.1rem;
    margin-bottom: 15px;
}

.hr-line {
    position: relative;
    height: 4px;
}

## BeautifulSoup

In [9]:
from bs4 import BeautifulSoup

# Read the webpage as bs4. The parser is named explicitly: without it bs4 picks
# whichever parser happens to be installed (emitting GuessedAtParserWarning),
# so the same notebook could parse differently on another machine.
soup = BeautifulSoup(resp.text, "html.parser")

# Select all the "a" tags with the specified class. `href=True` skips anchors
# that have no link, which would otherwise become "https://npr.orgNone".
articles = soup.find_all("a", class_="topic-title", href=True)

# Grab the title and link of each story. The hrefs on the page are
# site-relative, so they are prefixed with the main NPR domain.
data = [
    {
        'link': f"https://npr.org{elm['href']}",
        'title': elm.get_text().strip(),  # drop stray leading/trailing whitespace
    }
    for elm in articles
]

data[:5]

[{'link': 'https://npr.org/nx-s1-5459871',
  'title': 'Texas is relying on FEMA. State leaders said it should be cut'},
 {'link': 'https://npr.org/nx-s1-5460995',
  'title': 'In Texas, a major search effort is being led by the father of a flood victim'},
 {'link': 'https://npr.org/nx-s1-5461000',
  'title': 'The United Cajun Navy helps Texans deal with flood aftermath'},
 {'link': 'https://npr.org/g-s1-76471',
  'title': 'Graphics: Where the Texas floods happened and how high the waters rose'},
 {'link': 'https://npr.org/nx-s1-5460018',
  'title': 'After quitting antidepressants, some people suffer surprising, lingering symptoms'}]

For a far more elaborate, real-world example of HTML parsing, see the parsers from The Markup's Google search audit: <br>
https://github.com/the-markup/investigation-google-search-audit/blob/master/utils/parsers.py

## XPath

In [None]:
# Absolute XPath to the <li> elements under /html/body/main/div/ul.
# A bare XPath is not valid Python (running the cell raises SyntaxError),
# so it is kept as a string that can be passed to an XPath evaluator.
xpath_list_items = "/html/body/main/div/ul/li"
xpath_list_items

In [4]:
from lxml import etree

# Read the webpage as an lxml element tree.
tree = etree.HTML(resp.text)

# Select all the "a" tags with the specified class.
xpath_article = './/a[@class="topic-title"]'
elements = tree.findall(xpath_article)

# Grab the title and link of each story.
# - Anchors without an href are skipped; otherwise the link would be
#   "https://npr.orgNone".
# - lxml's `.text` holds only the text *before* the first child element (and is
#   None when there is none), so itertext() is used to collect all of the
#   anchor's text, matching what BeautifulSoup's `.text` returns.
data = [
    {
        'link': f"https://npr.org{elm.get('href')}",
        'title': "".join(elm.itertext()).strip(),
    }
    for elm in elements
    if elm.get('href')
]

data[:5]

[{'link': 'https://npr.org/nx-s1-5454669',
  'title': 'DOJ launches unusual lawsuit against entire federal district court in Maryland'},
 {'link': 'https://npr.org/nx-s1-5429615',
  'title': 'Supreme Court allows Trump to resume mass federal layoffs for now'},
 {'link': 'https://npr.org/nx-s1-5460898',
  'title': 'Texas flood death toll tops 100 and more than 160 people are missing, Gov. Abbott says '},
 {'link': 'https://npr.org/g-s1-76471',
  'title': 'Graphics: Where the Texas floods happened and how high the waters rose'},
 {'link': 'https://npr.org/nx-s1-5460025',
  'title': "Haiti's iconic Hotel Oloffson, long a cultural beacon, destroyed by gang violence"}]

In [None]:
# Evaluate the absolute XPath against the parsed page; a bare XPath is not
# valid Python. `a[3]` selects the third <a> child within each <li>.
tree.xpath("/html/body/main/div/ul/li/a[3]")

In [None]:
# Headline anchors whose text contains "Texas". This uses `and` and
# `contains()`, which need full XPath 1.0 via .xpath(); .findall() only
# supports the limited ElementPath subset.
tree.xpath('//a[@class="topic-title" and contains(text(), "Texas")]')

In [None]:
# Every <a> element anywhere in the document; only the first few are shown to
# keep the output short.
tree.xpath("//a")[:5]