In [6]:
# Import the standard-library URL helper, pandas, BeautifulSoup, and Splinter
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser

In [7]:
# Build the driver keyword arguments, then start a Chrome session controlled by Splinter
executable_path = dict(executable_path='chromedriver')
browser = Browser('chrome', **executable_path)

In [8]:
# Open the NASA Mars news page in the automated browser
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
# Check that the article list has rendered before scraping: look for an <li class="slide">
# nested inside a <ul class="item_list">, allowing a wait_time of 1 for a slow page.
# The call evaluates to a boolean, which the notebook displays as the cell output.
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

In [9]:
# Scraping NASA website ------------------
# Snapshot the HTML currently loaded in the browser and parse it with BeautifulSoup
html = browser.html
news_soup = soup(markup=html, features='html.parser')
# select_one() returns the FIRST element matching the CSS selector: here, the first
# <li class="slide"> that is a descendant of a <ul class="item_list">
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [10]:
# Preview the first <div class="content_title"> inside the selected slide
# (find() returns only the first match; the raw tag is displayed as the cell output)
slide_elem.find("div", class_='content_title')


<div class="content_title"><a href="/news/8875/testing-proves-its-worth-with-successful-mars-parachute-deployment/" target="_self">Testing Proves Its Worth With Successful Mars Parachute Deployment</a></div>

In [11]:
# Locate the title <div> within the slide, then keep only its text content
# (get_text() drops the surrounding HTML tags and returns the text alone)
title_div = slide_elem.find("div", class_='content_title')
news_title = title_div.get_text()
news_title

'Testing Proves Its Worth With Successful Mars Parachute Deployment'

## Difference between find() and find_all()

find() returns only the first tag that matches the given tag name and attributes (or None if nothing matches)

find_all() returns a list of every tag that matches the given tag name and attributes

In [12]:
# Locate the teaser <div> within the same slide and keep only its text (the article summary)
teaser_div = slide_elem.find('div', class_="article_teaser_body")
news_p = teaser_div.get_text()
news_p

'The giant canopy that helped land Perseverance on Mars was tested here on Earth at NASA’s Wallops Flight Facility in Virginia.'

### Featured Images

In [13]:
# Point the browser at the JPL Space page that hosts the featured image
# (note: this rebinds `url`, which previously held the news-site address)
url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url)

In [14]:
# Collect every <button> on the page; index [1] is the SECOND button found,
# which is the one that opens the full-size image
buttons = browser.find_by_tag('button')
full_image_elem = buttons[1]
full_image_elem.click()

In [15]:
# Re-read the browser's HTML after the click and parse it with BeautifulSoup
html = browser.html
img_soup = soup(markup=html, features='html.parser')

In [16]:
# Find the relative image url
# The full-size image is rendered as, e.g.:
#   <img class="fancybox-image" src="image/featured/mars3.jpg" alt="">
# The featured image can change between runs, so the link is not hard-coded. Instead, locate
# the <img> tag by its fancybox-image class and read whatever its 'src' attribute holds.
img_tag = img_soup.find('img', class_='fancybox-image')
img_url_rel = img_tag.get('src')
img_url_rel

'image/featured/mars2.jpg'

In [17]:
# Resolve the scraped `src` against the site's base URL to get an absolute URL.
# urljoin is used instead of string concatenation so that the result is still a valid
# link if `src` ever comes back root-relative ('/image/...') or already absolute.
img_url = urljoin('https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/', img_url_rel)
img_url

'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars2.jpg'

In [21]:
# Scraping facts about Mars
# pd.read_html() parses the tables found at the URL into a list of DataFrames;
# index 0 keeps the FIRST table on the page
tables = pd.read_html('http://space-facts.com/mars/')
df = tables[0]
df.columns = ['description', 'value']
# Use the description column as the row index; set_index returns a new frame, rebound to df
df = df.set_index('description')
df


Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [23]:
# Render the facts DataFrame as an HTML <table> string with pandas' to_html(),
# so it can be embedded in a web page; the string is displayed as the cell output
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

In [24]:
# End the automated browser session (stops the scraper).
# After this, `browser` can no longer be used; re-run the browser setup cell to scrape again.
browser.quit()