## Scrape: NASA Mars News Site
Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/) and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.


In [1]:
# Dependencies
import pandas as pd
from bs4 import BeautifulSoup as bs    # Beautiful Soup is a Python library for pulling data out of HTML and XML files
from splinter import Browser           # Splinter is python tool for testing web applications
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Standard Splinter setup: webdriver_manager installs (or reuses its cached copy of) ChromeDriver
# and returns the driver path; Browser then opens a visible (non-headless) Chrome window
# that the rest of the notebook drives.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [C:\Users\findv\.wdm\drivers\chromedriver\win32\87.0.4280.20\chromedriver.exe] found in cache


 


In [3]:
# NASA Mars News URL to be scraped
nasa_url = 'https://mars.nasa.gov/news/'

# Visit the website
browser.visit(nasa_url)

# Wait (up to wait_time seconds) for the first news item to be present before the page
# source is read from browser.html, in case the list is filled in after the initial load.
# Without this, a Restart & Run All can snapshot the page before the
# 'ul.item_list li.slide' element exists, and the later select_one() would return None.
# The trailing semicolon keeps the boolean result out of the cell output.
browser.is_element_present_by_css('ul.item_list li.slide', wait_time=1);

In [4]:
# HTML Object: the source of the page currently loaded in the Splinter browser
nasa_html = browser.html

In [5]:
# Build a BeautifulSoup tree from the captured NASA page source, using the 'html.parser' parser
soup = bs(markup=nasa_html, features='html.parser')

# Pretty-print the <body> so the markup holding the latest news title and paragraph text can be inspected
print(soup.body.prettify())

<body id="news" style="">
 <svg display="none" height="0" width="0">
  <symbol height="30" id="circle_plus" viewbox="0 0 30 30" width="30">
   <g fill-rule="evenodd" transform="translate(1 1)">
    <circle cx="14" cy="14" fill="#fff" fill-opacity=".1" fill-rule="nonzero" r="14" stroke="inherit" stroke-width="1">
    </circle>
    <path class="the_plus" d="m18.856 12.96v1.738h-4.004v3.938h-1.848v-3.938h-4.004v-1.738h4.004v-3.96h1.848v3.96z" fill="inherit" stroke-width="0">
    </path>
   </g>
  </symbol>
  <symbol height="30" id="circle_arrow" viewbox="0 0 30 30" width="30" xmlns="http://www.w3.org/2000/svg">
   <g transform="translate(1 1)">
    <circle cx="14" cy="14" fill="#fff" fill-opacity=".1" r="14" stroke="inherit" stroke-width="1">
    </circle>
    <path class="the_arrow" d="m8.5 15.00025h7.984l-2.342 2.42c-.189.197-.189.518 0 .715l.684.717c.188.197.494.197.684 0l4.35-4.506c.188-.199.188-.52 0-.717l-4.322-4.48c-.189-.199-.496-.199-.684 0l-.684.716c-.189.197-.189.519 0 .716l2.3

In [6]:
# Beautiful Soup select_one() method will find only the first tag that matches a selector:
# here, the first <li class="slide"> nested under <ul class="item_list"> (the first news item in the list).
latest_section = soup.select_one('ul.item_list li.slide')
latest_section

<li class="slide"><div class="image_and_description_container"><a href="/news/8805/moxie-could-help-future-rockets-launch-off-mars/" target="_self"><div class="rollover_description"><div class="rollover_description_inner">NASA's Perseverance rover carries a device to convert Martian air into oxygen that, if produced on a larger scale, could be used not just for breathing, but also for fuel.</div><div class="overlay_arrow"><img alt="More" src="/assets/overlay-arrow.png"/></div></div><div class="list_image"><img alt="Engineers lower MOXIE into the belly of NASA's Perseverance rover." src="/system/news_items/list_view_images/8805_1-MOXIE-PIA24176-320.gif"/></div><div class="bottom_gradient"><div><h3>MOXIE Could Help Future Rockets Launch Off Mars</h3></div></div></a><div class="list_text"><div class="list_date">November 24, 2020</div><div class="content_title"><a href="/news/8805/moxie-could-help-future-rockets-launch-off-mars/" target="_self">MOXIE Could Help Future Rockets Launch Off Ma

In [7]:
# Latest news title: text of the first <div class="content_title"> inside the selected list item
latest_news_title = latest_section.find('div', attrs={'class': 'content_title'}).text
latest_news_title

'MOXIE Could Help Future Rockets Launch Off Mars'

In [8]:
# Latest news paragraph: text of the first <div class="article_teaser_body"> inside the selected list item
latest_ptext = latest_section.find('div', attrs={'class': 'article_teaser_body'}).text
latest_ptext

"NASA's Perseverance rover carries a device to convert Martian air into oxygen that, if produced on a larger scale, could be used not just for breathing, but also for fuel."

## Scrape: JPL Featured Space Image

Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called `featured_image_url`. Make sure to find the image url to the full size `.jpg` image.

In [9]:
# JPL URL to be scraped (space images search page, filtered to the Mars category)
jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

In [12]:
# Open the JPL space images page in the Splinter browser
browser.visit(jpl_url)

In [13]:
# Look up the "Full Image" button on the JPL page by its element id ('full_image'),
# then click the first (expected to be the only) match
full_image_button = browser.find_by_id('full_image')
full_image_button.first.click()

In [14]:
# Find the "more info" link and click it to open the image's detail page.
# Give the text up to wait_time seconds to appear after the previous click before looking it up.
browser.is_element_present_by_text('more info', wait_time=1)
# browser.links.find_by_partial_text is the current Splinter API; it replaces the
# deprecated browser.find_link_by_partial_text.
more_info_element = browser.links.find_by_partial_text('more info')
more_info_element.click()



In [15]:
# Create a Beautiful Soup object to parse html
jpl_html = browser.html
image_soup = bs(jpl_html, 'html.parser')


In [16]:
# Examine the parsed page body to locate the markup that holds the full-size featured image
print(image_soup.body.prettify())

<body class="light_background logged_out mobile_menu" id="image_detail" style="">
 <!--[if lt IE 9]>
      <div class='browsehappy' style='font-size: 30px; color: white; position:absolute; top: 0; margin: 0; height: 3000px; width: 100%; background: #000; z-index: 10000; padding: 5%;'>
        You are using an
        <strong>outdated</strong>
        browser. Please
        <a href='http://browsehappy.com/'>click here</a>
        to upgrade or change your browser.
      </div>
    <![endif]-->
 <!-- Google Tag Manager (noscript) -->
 <noscript>
  <iframe height="0" src="https://www.googletagmanager.com/ns.html?id=GTM-NLDQZ25" style="display:none;visibility:hidden" width="0">
  </iframe>
 </noscript>
 <!-- End Google Tag Manager (noscript) -->
 <div id="main_container">
  <div id="site_body">
   <div class="site_header_area">
    <header class="site_header">
     <div class="brand_area">
      <div class="brand1">
       <a class="nasa_logo" href="http://www.nasa.gov" title="NASA">
    

In [None]:
# Beautiful Soup select_one() method will find only the first tag that matches a selector:
# NOTE(review): this cell repeats the NASA news selection from the first section (it queries
# `soup`, not `image_soup`) and has no execution count; none of the visible JPL cells read
# `latest_section`, so this looks like a leftover cell - confirm and remove.
latest_section = soup.select_one('ul.item_list li.slide')
latest_section

In [33]:
# Read the src attribute of the <img> nested in the link inside <figure class="lede"> (the large JPL image)
jpl_image_src = image_soup.select_one('figure.lede a img').get('src')

# The src is a site-relative path, so prefix the JPL host to get the complete featured_image_url
featured_image_url = 'https://jpl.nasa.gov{}'.format(jpl_image_src)
featured_image_url

'https://jpl.nasa.gov/spaceimages/images/largesize/PIA15256_hires.jpg'