# Mars Scraper

### Dependencies

In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as Soup
import pandas as pd

### Set up browser

In [2]:
# Locate the chromedriver executable on PATH; the printed path is the value
# passed to splinter as `executable_path` when the browser is started
! which chromedriver

/usr/local/bin/chromedriver


In [3]:
# Launch a visible (non-headless) Chrome session driven by the local chromedriver
exe_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **exe_path)

### Scrape the most recent Mars article from NASA

In [4]:
# Load the NASA Mars news listing in the browser
news_url = 'https://mars.nasa.gov/news/'
browser.visit(news_url)

# Check (waiting up to `wait_time`) that the first article slide has rendered;
# the boolean result is shown as the cell output
article_selector = 'ul.item_list li.slide'
browser.is_element_present_by_css(article_selector, wait_time=1)

True

In [5]:
# Parse the page source as currently rendered in the browser
news_soup = Soup(browser.html, 'html.parser')

# Get the first article's title and summary. The raw element text carries
# trailing whitespace, so strip it before the values are used downstream.
first_article = news_soup.select_one('ul.item_list li.slide')
title = first_article.find('div', class_='content_title').get_text().strip()
summary = first_article.find('div', class_='article_teaser_body').get_text().strip()
title, summary

("NASA's Mars Helicopter Reports In ",
 'The technology demonstration has phoned home from where it is attached to the belly of NASA’s Perseverance rover. ')

### Scrape the Mars hemisphere images from USGS

In [6]:
# Visit the USGS Astrogeology search results page; the query string searches
# for "hemisphere enhanced" and filters on target = Mars
hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemi_url)

In [7]:
# Wait for the result links to render, then open the 1st hemisphere page.
# click() is called only for its navigation side effect, so nothing is stored.
print(browser.is_element_present_by_css('div.description a.product-item', wait_time=1))
browser.links.find_by_partial_text('Hemisphere Enhanced')[0].click()  # 1st page

True


In [8]:
# Wait for the download links, then parse the rendered page source
browser.is_element_present_by_css('div.downloads a', wait_time=1)
hemi1_soup = Soup(browser.html, 'html.parser')

# Get the hemisphere name: the page title without its trailing " Enhanced".
# The suffix is removed only when it is actually present, so a differently
# worded title is kept whole instead of silently losing its last 9 characters.
name_suffix = ' Enhanced'
hemi1_title = hemi1_soup.select_one('section.metadata h2.title').text.strip()
if hemi1_title.lower().endswith(name_suffix.lower()):
    hemi1_name = hemi1_title[:-len(name_suffix)]
else:
    hemi1_name = hemi1_title

# Get the image link: href of the first anchor inside the downloads box
hemi1_img_url = hemi1_soup.select_one('div.downloads a').attrs['href']
hemi1_name, hemi1_img_url

('Cerberus Hemisphere',
 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg')

In [9]:
# Click back to search results
browser.back()

# Wait for the result links to render again, then open the 2nd hemisphere page.
# click() is called only for its navigation side effect, so nothing is stored.
print(browser.is_element_present_by_css('div.description a.product-item', wait_time=1))
browser.links.find_by_partial_text('Hemisphere Enhanced')[1].click()  # 2nd page

True


In [10]:
# Wait for the download links, then parse the rendered page source
browser.is_element_present_by_css('div.downloads a', wait_time=1)
hemi2_soup = Soup(browser.html, 'html.parser')

# Get the hemisphere name: the page title without its trailing " Enhanced".
# The suffix is removed only when it is actually present, so a differently
# worded title is kept whole instead of silently losing its last 9 characters.
name_suffix = ' Enhanced'
hemi2_title = hemi2_soup.select_one('section.metadata h2.title').text.strip()
if hemi2_title.lower().endswith(name_suffix.lower()):
    hemi2_name = hemi2_title[:-len(name_suffix)]
else:
    hemi2_name = hemi2_title

# Get the image link: href of the first anchor inside the downloads box
hemi2_img_url = hemi2_soup.select_one('div.downloads a').attrs['href']
hemi2_name, hemi2_img_url

('Schiaparelli Hemisphere',
 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg')

### Scrape the featured Mars image from NASA's JPL

In [11]:
# # Visit the Mars images page
# img_url = 'https://www.jpl.nasa.gov/images?search=&category=Mars'
# browser.visit(img_url)

In [12]:
# # Click full image button on featured image
# print(browser.is_element_present_by_css('a#full_image', wait_time=1))
# browser.links.find_by_partial_text('FULL IMAGE').click()

# # Click more info button in slide show
# print(browser.is_element_present_by_css('div.buttons a.button', wait_time=1))
# browser.links.find_by_partial_text('more info').click()

In [13]:
# # Parse the HTML
# img_soup = Soup(browser.html, 'html.parser')

# # Get the main image URL on the page
# main_img_url = img_soup.select_one('figure.lede a img').get('src')
# main_img_url = 'https://www.jpl.nasa.gov' + main_img_url
# main_img_url

### Scrape the most recent Mars image from NASA's JPL

In [14]:
# Visit the JPL image search page; the query string leaves the search term
# empty and filters on category = Mars
img_url = 'https://www.jpl.nasa.gov/images?search=&category=Mars'
browser.visit(img_url)

In [15]:
# Report whether the result cards have rendered, then follow the link whose
# href contains 'images/' (the most recent image in the search results)
results_ready = browser.is_element_present_by_css('section a.group', wait_time=1)
print(results_ready)
browser.links.find_by_partial_href('images/').click()

True


In [16]:
# Wait for the image element before reading the page source, so the page
# opened by the preceding click has a chance to render (the hemisphere
# cells guard their parsing the same way)
browser.is_element_present_by_css('img.BaseImage', wait_time=1)

# Parse the HTML
img_soup = Soup(browser.html, 'html.parser')

# Get the image URL from the first <img> carrying the BaseImage class
first_img = img_soup.find('img', class_='BaseImage').attrs['src']
first_img

'https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA23726.2e16d0ba.fill-400x400-c50.jpg'

### Scrape Mars facts from space-facts

In [17]:
# Mars facts page url
facts_url = 'https://space-facts.com/mars/'

# Extract the first table on the page. pandas downloads and parses the URL
# itself, so the browser does not need to visit the page beforehand.
facts = pd.read_html(facts_url)[0].set_index(0)
facts.index.name = None     # drop the index label so no header is shown for it
facts.columns = ['value']
facts

Unnamed: 0,value
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [18]:
# Convert the facts DataFrame to an HTML <table> string and show it as the
# cell output
facts_html = facts.to_html()
facts_html

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

In [19]:
# Quit the browser session started at the top of the notebook
browser.quit()

### Transfer to script

In [20]:
from splinter import Browser
from bs4 import BeautifulSoup as Soup
import pandas as pd
from IPython.display import display


# Start browser: a visible (non-headless) Chrome window driven by the local
# chromedriver executable
exe_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **exe_path, headless=False)


""" Scrape the most recent Mars article from NASA """

# Visit the Mars news page
news_url = 'https://mars.nasa.gov/news/'
browser.visit(news_url)

# Report whether the article list has rendered, then parse the HTML
print(browser.is_element_present_by_css('ul.item_list li.slide', wait_time=1))
news_soup = Soup(browser.html, 'html.parser')

# Get the first article's title and summary. The raw element text carries
# trailing whitespace, so strip it before the values are printed or reused.
first_article = news_soup.select_one('ul.item_list li.slide')
title = first_article.find('div', class_='content_title').get_text().strip()
summary = first_article.find('div', class_='article_teaser_body').get_text().strip()
print(title)
print(summary)
print()


""" Scrape the Mars hemisphere images from USGS """

# Visit the Mars hemisphere search results page
hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemi_url)

# Page titles end in this suffix; it is dropped to get the hemisphere name
name_suffix = ' Enhanced'

# Scrape the name and image URL for all 4 hemispheres
for i in range(4):
    # Wait for the result links to render, then open the i-th hemisphere page.
    # click() is called only for its navigation side effect.
    print(browser.is_element_present_by_css('div.description a.product-item', wait_time=1))
    browser.links.find_by_partial_text('Hemisphere Enhanced')[i].click()

    # Wait for the download links, then parse the rendered page source
    browser.is_element_present_by_css('div.downloads a', wait_time=1)
    hemi_soup = Soup(browser.html, 'html.parser')

    # Get the hemisphere name. The suffix is removed only when it is actually
    # present, so a differently worded title is kept whole instead of
    # silently losing its last 9 characters.
    hemi_title = hemi_soup.select_one('section.metadata h2.title').text.strip()
    if hemi_title.lower().endswith(name_suffix.lower()):
        hemi_name = hemi_title[:-len(name_suffix)]
    else:
        hemi_name = hemi_title

    # Get the image link: href of the first anchor inside the downloads box
    hemi_img_url = hemi_soup.select_one('div.downloads a').attrs['href']
    print(hemi_name, hemi_img_url)

    # Back to search results, ready for the next iteration
    browser.back()

print()


# """ Scrape the featured Mars image from NASA's JPL """

# # Visit the Mars images page
# img_url = 'https://www.jpl.nasa.gov/images?search=&category=Mars'
# browser.visit(img_url)

# # Click the full image button on the featured image
# print(browser.is_element_present_by_css('a#full_image', wait_time=1))
# browser.links.find_by_partial_text('FULL IMAGE').click()

# # Click the more info button in the slide show
# print(browser.is_element_present_by_css('div.buttons a.button', wait_time=1))
# browser.links.find_by_partial_text('more info').click()

# # Get the main image url on the page
# imgs_soup = Soup(browser.html, 'html.parser')
# img_url = imgs_soup.select_one('figure.lede a img').get('src')
# img_url = 'https://www.jpl.nasa.gov' + img_url
# print(img_url)
# print()


""" Scrape the most recent Mars image from NASA's JPL """

# Visit the Mars images page
img_url = 'https://www.jpl.nasa.gov/images?search=&category=Mars'
browser.visit(img_url)

# Click the most recent image in the search results
print(browser.is_element_present_by_css('section a.group', wait_time=1))
browser.links.find_by_partial_href('images/').click()

# Wait for the image element before reading the page source, so the page
# opened by the click has a chance to render (the hemisphere section guards
# its parsing the same way)
browser.is_element_present_by_css('img.BaseImage', wait_time=1)

# Get the image URL from the first <img> carrying the BaseImage class
img_soup = Soup(browser.html, 'html.parser')
first_img = img_soup.find('img', class_='BaseImage').attrs['src']
print(first_img)
print()


""" Scrape Mars facts from space-facts """

# Source page for the Mars facts table
facts_url = 'https://space-facts.com/mars/'

# Read the tables found on the page and keep the first one, using its
# first column (the fact labels) as the index
page_tables = pd.read_html(facts_url)
facts = page_tables[0].set_index(0)
facts.columns = ['value']
facts.index.name = None
display(facts)
print()

# Render the facts table as an HTML string
facts_html = facts.to_html()
print(facts_html)


# Quit the browser session started at the top of the script
browser.quit()

True
NASA's Mars Helicopter Reports In 
The technology demonstration has phoned home from where it is attached to the belly of NASA’s Perseverance rover. 

True
Cerberus Hemisphere https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
True
Schiaparelli Hemisphere https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg
True
Syrtis Major Hemisphere https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg
True
Valles Marineris Hemisphere https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg

True
https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA23726.2e16d0ba.fill-400x400-c50.jpg



Unnamed: 0,value
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers



<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>value</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Equatorial Diameter:</th>
      <td>6,792 km</td>
    </tr>
    <tr>
      <th>Polar Diameter:</th>
      <td>6,752 km</td>
    </tr>
    <tr>
      <th>Mass:</th>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <th>Moons:</th>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <th>Orbit Distance:</th>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <th>Orbit Period:</th>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <th>Surface Temperature:</th>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <th>First Record:</th>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <th>Recorded By:</th>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>
