# Mars News

In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup

# Import JSON
import json 

In [2]:
# Start a Splinter-driven Chrome session; every later cell reuses this `browser`
browser = Browser(driver_name='chrome')

## Step 1: Visit the Website

In [3]:
# Address of the static Mars News page that this notebook scrapes
url = "https://static.bc-edx.com/data/web/mars_news/index.html"

# Load that page in the automated browser so its HTML can be read afterwards
browser.visit(url)

In [4]:
# Check that an article container (div.list_text) is present before scraping.
# wait_time=1 bounds how long the check waits; the boolean result is displayed
# as the cell output but is not stored or acted on.
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

## Step 2: Scrape the Website

In [None]:
# Grab the page source from the browser session
html = browser.html

# Parse the source with Python's built-in HTML parser so it can be searched
soup = BeautifulSoup(markup=html, features='html.parser')

In [6]:
# Render the parsed document as indented text and print it for inspection
pretty_html = soup.prettify()
print(pretty_html)

<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="css/bootstrap.min.5.2.2.css" rel="stylesheet" type="text/css"/>
  <link href="css/font.css" rel="stylesheet" type="text/css"/>
  <link href="css/app.css" rel="stylesheet" type="text/css"/>
  <title>
   News - Mars Exploration Program
  </title>
 </head>
 <body>
  <div class="col-md-12">
   <div class="row">
    <nav class="navbar navbar-expand-lg navbar-light fixed-top">
     <div class="container-fluid">
      <a class="navbar-brand" href="#">
       <img src="images/logo.png" width="80"/>
       <span class="logo">
        MARS Planet Science
       </span>
       <span class="logo1">
        Exploration Program
       </span>
      </a>
      <button aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation" class="navbar-toggler" data-bs-target="#navbarNav" data-bs-toggle="collapse" type="button">
       <span class="navbar-toggler-icon"

In [7]:
# List the tag name of every element in the parsed document
# (tag names such as 'div' or 'img', not the text inside the tags)
all_elements = [element.name for element in soup.find_all()]
all_elements

['html',
 'head',
 'meta',
 'meta',
 'link',
 'link',
 'link',
 'title',
 'body',
 'div',
 'div',
 'nav',
 'div',
 'a',
 'img',
 'span',
 'span',
 'button',
 'span',
 'div',
 'ul',
 'li',
 'a',
 'li',
 'a',
 'li',
 'a',
 'li',
 'a',
 'li',
 'a',
 'li',
 'a',
 'li',
 'a',
 'section',
 'div',
 'h1',
 'div',
 'div',
 'div',
 'input',
 'input',
 'div',
 'select',
 'option',
 'div',
 'select',
 'option',
 'div',
 'section',
 'div',
 'div',
 'hr',
 'div',
 'div',
 'div',
 'img',
 'div',
 'div',
 'div',
 'div',
 'div',
 'div',
 'hr',
 'div',
 'div',
 'div',
 'img',
 'div',
 'div',
 'div',
 'div',
 'div',
 'div',
 'hr',
 'div',
 'div',
 'div',
 'img',
 'div',
 'div',
 'div',
 'div',
 'div',
 'div',
 'hr',
 'div',
 'div',
 'div',
 'img',
 'div',
 'div',
 'div',
 'div',
 'div',
 'div',
 'hr',
 'div',
 'div',
 'div',
 'img',
 'div',
 'div',
 'div',
 'div',
 'div',
 'div',
 'hr',
 'div',
 'div',
 'div',
 'img',
 'div',
 'div',
 'div',
 'div',
 'div',
 'div',
 'hr',
 'div',
 'div',
 'div',
 'img',


## Step 3: Store the Results

In [8]:
# Collect the title and preview text of every news article and keep the
# results in a list of dictionaries.

# Each article on the Mars News page sits in a div with class "list_text"
extracted_articles = soup.find_all('div', class_='list_text')

# Build one {"title", "preview"} record per article container
extracted_article_list = [
    {
        "title": article.find('div', class_='content_title').text,
        "preview": article.find('div', class_='article_teaser_body').text,
    }
    for article in extracted_articles
]

In [9]:
# Display the scraped articles: a list of dictionaries, each holding the
# "title" and "preview" text of one news article
extracted_article_list

[{'title': "NASA's MAVEN Observes Martian Light Show Caused by Major Solar Storm",
  'preview': 'For the first time in its eight years orbiting Mars, NASA’s MAVEN mission witnessed two different types of ultraviolet aurorae simultaneously, the result of solar storms that began on Aug. 27.'},
 {'title': "NASA Prepares to Say 'Farewell' to InSight Spacecraft",
  'preview': 'A closer look at what goes into wrapping up the mission as the spacecraft’s power supply continues to dwindle.'},
 {'title': 'NASA and ESA Agree on Next Steps to Return Mars Samples to Earth',
  'preview': 'The agency’s Perseverance rover will establish the first sample depot on Mars.'},
 {'title': "NASA's InSight Lander Detects Stunning Meteoroid Impact on Mars",
  'preview': 'The agency’s lander felt the ground shake during the impact while cameras aboard the Mars Reconnaissance Orbiter spotted the yawning new crater from space.'},
 {'title': 'NASA To Host Briefing on InSight, Mars Reconnaissance Orbiter Findings',


In [10]:
# Serialize extracted_article_list to a JSON string and save it to disk
with open('extracted_news_articles.json', 'w') as out_file:
    out_file.write(json.dumps(extracted_article_list))

### Alternative approach: building the article list with `zip`

In [11]:
# Gather the title element of every Mars News article
extracted_titles = soup.find_all('div', class_='content_title')

# Reduce each title element to its plain text
clean_titles = [title_tag.get_text() for title_tag in extracted_titles]
clean_titles

["NASA's MAVEN Observes Martian Light Show Caused by Major Solar Storm",
 "NASA Prepares to Say 'Farewell' to InSight Spacecraft",
 'NASA and ESA Agree on Next Steps to Return Mars Samples to Earth',
 "NASA's InSight Lander Detects Stunning Meteoroid Impact on Mars",
 'NASA To Host Briefing on InSight, Mars Reconnaissance Orbiter Findings',
 'Why NASA Is Trying To Crash Land on Mars',
 'Curiosity Mars Rover Reaches Long-Awaited Salty Region',
 'Mars Mission Shields Up for Tests',
 "NASA's InSight Waits Out Dust Storm",
 "NASA's InSight 'Hears' Its First Meteoroid Impacts on Mars",
 "NASA's Perseverance Rover Investigates Geologically Rich Mars Terrain",
 'NASA to Host Briefing on Perseverance Mars Rover Mission Operations',
 "NASA's Perseverance Makes New Discoveries in Mars' Jezero Crater",
 "10 Years Since Landing, NASA's Curiosity Mars Rover Still Has Drive",
 "SAM's Top 5 Discoveries Aboard NASA's Curiosity Rover at Mars"]

In [12]:
# Gather the teaser (preview) element of every Mars News article
extracted_preview = soup.find_all('div', class_='article_teaser_body')

# Reduce each teaser element to its plain text
clean_preview = [teaser_tag.get_text() for teaser_tag in extracted_preview]
clean_preview

['For the first time in its eight years orbiting Mars, NASA’s MAVEN mission witnessed two different types of ultraviolet aurorae simultaneously, the result of solar storms that began on Aug. 27.',
 'A closer look at what goes into wrapping up the mission as the spacecraft’s power supply continues to dwindle.',
 'The agency’s Perseverance rover will establish the first sample depot on Mars.',
 'The agency’s lander felt the ground shake during the impact while cameras aboard the Mars Reconnaissance Orbiter spotted the yawning new crater from space.',
 'Scientists from two Mars missions will discuss how they combined images and data for a major finding on the Red Planet.',
 'Like a car’s crumple zone, the experimental SHIELD lander is designed to absorb a hard impact.',
 'After years of climbing, the Mars rover has arrived at a special region believed to have formed as Mars’ climate was drying.',
 'Protecting Mars Sample Return spacecraft from micrometeorites requires high-caliber work.',

In [13]:
# Alternate method:
# Pair each title element with its preview element by position, then store the
# scraping results in a list of dictionaries.

# The loop variables have their own names (title_tag, preview_tag) so the
# extracted_titles / extracted_preview result sets are NOT overwritten by the
# iteration; this keeps the cell safe to re-run and leaves both collections
# intact for any later cell.
# Note: zip() stops at the shorter input, so if the two collections ever
# differ in length the trailing items are dropped without an error.
alt_extracted_article_list = [
    {
        'title': title_tag.get_text(),
        'preview': preview_tag.get_text()
    }
    for title_tag, preview_tag in zip(extracted_titles, extracted_preview)
]

In [14]:
# Display the list built by the zip-based method: one dictionary per article
# with its "title" and "preview" text
alt_extracted_article_list

[{'title': "NASA's MAVEN Observes Martian Light Show Caused by Major Solar Storm",
  'preview': 'For the first time in its eight years orbiting Mars, NASA’s MAVEN mission witnessed two different types of ultraviolet aurorae simultaneously, the result of solar storms that began on Aug. 27.'},
 {'title': "NASA Prepares to Say 'Farewell' to InSight Spacecraft",
  'preview': 'A closer look at what goes into wrapping up the mission as the spacecraft’s power supply continues to dwindle.'},
 {'title': 'NASA and ESA Agree on Next Steps to Return Mars Samples to Earth',
  'preview': 'The agency’s Perseverance rover will establish the first sample depot on Mars.'},
 {'title': "NASA's InSight Lander Detects Stunning Meteoroid Impact on Mars",
  'preview': 'The agency’s lander felt the ground shake during the impact while cameras aboard the Mars Reconnaissance Orbiter spotted the yawning new crater from space.'},
 {'title': 'NASA To Host Briefing on InSight, Mars Reconnaissance Orbiter Findings',


In [16]:
# Serialize alt_extracted_article_list to a JSON string and save it to disk
with open('alt_extracted_article_list.json', 'w') as out_file:
    out_file.write(json.dumps(alt_extracted_article_list))

In [15]:
# Shut down the automated browser session now that scraping is finished
browser.quit()