In [1]:
# Import Splinter and BeautifulSoup
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Install (or reuse the cached copy of) ChromeDriver and keep its path in the
# keyword dictionary that Splinter expects.
executable_path = dict(executable_path=ChromeDriverManager().install())
# Start the Chrome session used by every scraping cell below; headless=False is
# passed so the browser is not run in headless mode.
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome
Driver [/Users/wonheeyun/.wdm/drivers/chromedriver/mac64/97.0.4692.71/chromedriver] found in cache


In [3]:
# Assign the Mars news URL and instruct the browser to visit it.
url = 'https://redplanetscience.com'
browser.visit(url)

# Optional delay for loading the page
browser.is_element_present_by_css('div.list_text', wait_time=1)
# This line of code serves two purposes:
# First, it checks for an element matching the CSS selector 'div.list_text', i.e. a
# <div> tag whose class is "list_text" (likewise, ul.item_list matches <ul class="item_list">).
# Second, wait_time=1 sets how long (in seconds) Splinter searches for that element.
# The optional delay is useful because sometimes dynamic pages take a little while to load,
# especially if they are image-heavy.
# As the last expression in the cell, the call's boolean result is what the cell displays.

True

In [4]:
# Set up the HTML parser: take the browser's current page source and parse it
# with the 'html.parser' backend.
html = browser.html
news_soup = soup(html, features='html.parser')
# select_one() returns the first element matching the CSS selector, so slide_elem
# holds the first <div class="list_text"> together with its descendants
# (the other tags nested inside that <div>).
slide_elem = news_soup.select_one('div.list_text')

## Scraping Mars Data: The News

In [5]:
# slide_elem holds a lot of markup, so search inside it for one specific piece:
# the <div> whose class is 'content_title'.
# As the cell's last expression, the matching tag is displayed with its HTML intact.
slide_elem.find(name='div', class_='content_title')

<div class="content_title">Heat and Dust Help Launch Martian Water Into Space, Scientists Find</div>

In [6]:
# Only the text is needed, not the surrounding HTML, so locate the title tag
# first and then pull out its text content.
title_elem = slide_elem.find('div', class_='content_title')
news_title = title_elem.get_text()
news_title

'Heat and Dust Help Launch Martian Water Into Space, Scientists Find'

In [7]:
# Same approach for the article teaser: find the <div> with class
# 'article_teaser_body', then keep only its text.
teaser_elem = slide_elem.find('div', class_='article_teaser_body')
news_p = teaser_elem.get_text()
news_p

'Scientists using an instrument aboard NASA’s Mars Atmosphere and Volatile EvolutioN, or MAVEN, spacecraft have discovered that water vapor near the surface of the Red Planet is lofted higher into the atmosphere than anyone expected was possible. '

## Scraping Mars Data: Featured Images

In [8]:
# Visit the space-images site. `url` is reassigned here, so from this cell on it
# no longer holds the address used by the earlier cells.
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [9]:
# Find and click the full image button.
# find_by_tag collects the page's <button> elements; index 1 selects the second one.
buttons = browser.find_by_tag('button')
full_image_elem = buttons[1]
full_image_elem.click()

In [10]:
# Re-read the page source as it stands after the click and parse it with soup,
# so the image lookup below works on the updated markup.
html = browser.html
img_soup = soup(html, features='html.parser')

In [11]:
# Find the relative image url: locate the <img> with class 'headerimage',
# then read its src attribute (.get yields None if the attribute is absent).
header_img = img_soup.find('img', class_='headerimage')
img_url_rel = header_img.get('src')
img_url_rel

'image/featured/mars3.jpg'

In [12]:
# Use the base URL to create an absolute URL.
# urljoin resolves img_url_rel against the site root, so the result is well-formed
# whether the src is relative ('image/...'), root-relative ('/image/...', which plain
# string concatenation would turn into a double slash), or already an absolute URL.
img_url = urljoin('https://spaceimages-mars.com/', img_url_rel)
img_url

'https://spaceimages-mars.com/image/featured/mars3.jpg'

## Scraping Mars Data: Mars Facts

In [13]:
# Visit the Mars facts site. `url` is reassigned again, replacing the address
# assigned in the earlier cells.
url = 'https://galaxyfacts-mars.com/'
browser.visit(url)

In [14]:
# In the page's HTML, <tbody> is the body of the table, <tr> marks each table row,
# and <td> holds each cell of table data.
# Rather than scraping row by row or cell by cell, let Pandas parse the whole table:
#   1. read_html() returns a list of DataFrames, one per table it finds in the HTML;
#      [0] keeps the first table.
#   2. set_axis() gives the three columns readable names for additional clarity.
#   3. set_index() turns the 'description' column into the DataFrame's index.
# Each step returns a new object, so df is built without any in-place mutation.
# NOTE(review): in the saved output the site's own heading row
# ('Mars - Earth Comparison', 'Mars', 'Earth') appears as the first data row —
# confirm whether it should be treated as a header instead.
df = (
    pd.read_html("https://galaxyfacts-mars.com/")[0]
    .set_axis(['description', 'Mars', 'Earth'], axis='columns')
    .set_index('description')
)

df

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [22]:
# Pandas also has a way to easily convert our DataFrame back into HTML-ready code
# using the .to_html() function. Called with no arguments it returns the <table>
# markup as a string, which is what this cell displays.
df.to_html()
# After adding this exact block of code to Robin's web app,
# the data it's storing will be presented in an easy-to-read tabular format.

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [15]:
# End the automated browsing session that was started with Browser(...).
browser.quit()

## Export to Python