In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
import time

In [3]:
# Set the executable path and initialize the chrome browser in splinter
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path)

In [4]:
# Visit the mars nasa news site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
# Optional delay for loading the page in case the page is resource heavy and takes a while to load
# Look for ul and li HTML tags, specifically the item_list and slide attributes
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

In [5]:
# Scraping NASA website ------------------
# Create our scraping environment and parse our text to news_soup
html = browser.html
news_soup = soup(html, 'html.parser')
# search our parsed html for lists with class = slide, THEN unordered lists with class = item_list. CSS works RIGHT TO LEFT IN ORDER
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [6]:
# Search for each article title 
slide_elem.find("div", class_='content_title')


<div class="content_title"><a href="/news/8880/nasa-awards-mars-ascent-propulsion-system-contract-for-sample-return/" target="_self">NASA Awards Mars Ascent Propulsion System Contract for Sample Return</a></div>

In [7]:
# Use the parent element to find the first `a` tag and save it as `news_title`
# Use the get_text() method to ONLY RETURN TEXT FROM THE OVERALL HTML OUTPUT OF OUR SCRAPE
news_title = slide_elem.find("div", class_='content_title').get_text()
news_title

'NASA Awards Mars Ascent Propulsion System Contract for Sample Return'

## Difference between find() and find_all()

find() finds the first instance of a class and attribute

find_all() returns all tags and attributes

In [8]:
# Use the parent element to find the paragraph text
news_p = slide_elem.find('div', class_="article_teaser_body").get_text()
news_p

'The award moves NASA and ESA a step closer to realizing Mars Sample Return, an ambitious planetary exploration program that will build upon decades of science, knowledge, and experience.'

### Featured Images

In [9]:
# Visit URL
url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url)

In [10]:
# Find and click the full image button
# Index [1] indicates that we want to store the SECOND RESULT of the find_by_tag('button') search. IE the second button that we find
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

In [11]:
# Parse the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')

In [12]:
# Find the relative image url
# <img class="fancybox-image" src="image/featured/mars3.jpg" alt="">
# In this case, we cannot pull the image url directly from inspecting elements, as that would return the same image every time we ran our program
# Instead we want a RELATIVE LINK/SOURCE that will always point to the image held in that position
# In this case, we find the <img> tag, the fancybox-image class, and then use the get() method to pull the link from the 'src'
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

'image/featured/mars3.jpg'

In [13]:
# Use the base URL to create an absolute URL
img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'
img_url

'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars3.jpg'

In [14]:
# Scraping facts about Mars
# We can use pandas' read_html() function

# read_html() reads the given url for any tables and returns a DataFrame object if possible. Chaining index 0 means we save the FIRST table to variable df
df = pd.read_html('http://space-facts.com/mars/')[0]
df.columns = ['description','value']
# Set the index column to = the description column, inplace means it happens to the existing dataframe instead of creating a new object
df.set_index('description', inplace=True)
df


Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [15]:
# use pandas' to_html() function to convert the dataframe to HTML, which can be passed into a website for display
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

In [16]:
# End the automated browser session (stops the scraper)
browser.quit()

# D1: Scrape High-Resolution Mars’ Hemisphere Images and Titles

### Hemispheres

In [2]:
# 1. Use browser to visit the URL 
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
# Resize the browser to 1080p
browser.driver.set_window_size(1920,1080)
browser.visit(url)

time.sleep(1)

## Testing space


In [3]:
browser.find_by_css('img[class="thumb"]')[0].click()

time.sleep(1)

In [7]:
html = browser.html
branch_soup = soup(html, 'html.parser')

In [8]:
# Find all links, access the 5th link, and access the link
branch_soup.find_all('a')[5].get('href')

'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'

In [9]:
title = branch_soup.find_all('h2')[0].text

In [10]:
title

'Cerberus Hemisphere Enhanced'

In [8]:
html = browser.html
home_soup = soup(html, 'html.parser')

In [20]:
# Count the number of instances of the item class
item_count = len(home_soup.find_all('div', class_='item'))

In [21]:
item_count

4

In [22]:
# Loop through each item in the container
for i in range(item_count):
    # initialize a dictionary
    hemisphere_dict = {}
    # for each item, click on link (image or text)
    browser.find_by_css('img[class="thumb"]')[i].click()
    # Access img class "wide-image" and store url to dictionary with key "img_url"
    branch_soup = soup(html, 'html.parser')
    hemisphere_dict['img_url'] = branch_soup.find_all('a')[4].get('href')
    # Access first h2 class "title" and store text to dictionary with key "title"
    hemisphere_dict['title'] = branch_soup.find_all('h2', 'title')[0].text
    # Append the dictionary to hemisphere_image_urls list
    hemisphere_image_urls.append(hemisphere_dict)

IndexError: list index out of range

## Test function for export

In [5]:
# 2. Create a list to hold the images and titles.
hemisphere_image_urls = []

# 3. Write code to retrieve the image urls and titles for each hemisphere.

#---------------------

# Parse HTML with soup

html = browser.html
home_soup = soup(html, 'html.parser')

# Count the number of instances of the item class
item_count = len(home_soup.find_all('div', class_='item'))

for i in range(item_count):
    # initialize a dictionary
    hemisphere_dict = {}
    # for each item, click on link (image or text)
    browser.find_by_css('img[class="thumb"]')[i].click()
    # Access img class "wide-image" and store url to dictionary with key "img_url"
    html = browser.html
    hemisphere_soup = soup(html, 'html.parser')
    # Find the background image source by indexing img and get the link
    hemisphere_dict['img_url'] = hemisphere_soup.find_all('img')[5].get('src')
    # Access first h2 class "title" and store text to dictionary with key "title"
    hemisphere_dict['title'] = hemisphere_soup.find_all('h2')[0].text
    # Append the dictionary to hemisphere_image_urls list
    hemisphere_image_urls.append(hemisphere_dict)
    # Return back
    browser.back()

In [6]:
# 4. Print the list that holds the dictionary of each image url and title.
hemisphere_image_urls

[{'img_url': '/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': '/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': '/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': '/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]

In [27]:
# 5. Quit the browser
browser.quit()