In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import pymongo
import time
import numpy as np
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

# Step 1 - Scraping
<hr>

### NASA Mars News Site
<p> Scrape NASA Mars news site for latest headlines and preview text

In [2]:
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324






[WDM] - Driver [C:\Users\WelanR_01\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


In [3]:
# Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. 
# Assign the text to variables for later reference.

# URL of page to be scraped
url = 'https://mars.nasa.gov/news/'

# Retrieve page in splinter browser
browser.visit(url)

In [4]:
# HTML object
html = browser.html

# Retrieve the parent divs for all articles
soup = BeautifulSoup(html, 'lxml')

#### Minor gotcha: website needs to be run through Splinter before scraping with BeautifulSoup, otherwise it won't appear in BeautifulSoup's scrape

In [5]:
# Retrieve the parent divs for all headlines and preview text
results = soup.find_all("div", class_="list_text")

In [7]:
# Create empty lists to append with headlines and preview text data
headers = list()
preview_texts = list()

# loop over results to get headlines and preview text data
for result in results:
    # scrape the article header 
    header = result.find("div", class_="content_title").text
    
    # scrape the article subheader
    preview_text = result.find("div", class_="article_teaser_body").text
    
    # print article data
    print('-----------------')
    print(header)
    print(preview_text)

    # Append lists with headlines and preview texts
    headers.append(header)
    preview_texts.append(preview_text)

-----------------
Testing Proves Its Worth With Successful Mars Parachute Deployment
The giant canopy that helped land Perseverance on Mars was tested here on Earth at NASA’s Wallops Flight Facility in Virginia.
-----------------
NASA's Perseverance Rover Gives High-Definition Panoramic View of Landing Site
A 360-degree panorama taken by the rover’s Mastcam-Z instrument will be discussed during a public video chat this Thursday.
-----------------
Nearly 11 Million Names of Earthlings are on Mars Perseverance
When the Perseverance rover safely touched down on the Martian surface, inside Jezero Crater, on Feb. 18, 2021, it was also a safe landing for the nearly 11 million names on board.
-----------------
NASA's Mars Perseverance Rover Provides Front-Row Seat to Landing, First Audio Recording of Red Planet 
The agency’s newest rover captured first-of-its kind footage of its Feb. 18 touchdown and has recorded audio of Martian wind.


-----------------
NASA to Reveal New Video, Images From

### NASA JPL Images
<p> Scrape NASA Mars news site for latest headlines and preview text

In [8]:
# URL of JPL page to be scraped
jpl_url = "https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html"

# Retrieve page in splinter browser
browser.visit(jpl_url)

In [9]:
# HTML object
jpl_html = browser.html

# Retrieve the parent divs for all articles
jpl_soup = BeautifulSoup(jpl_html, 'lxml')

In [10]:
# Retrieve featured image relative URL 
featured_image = jpl_soup.find("img", class_="headerimage")["src"]

#### Minor gotcha: JPL url needs to have the "index.html" suffix removed before appending the image's relative URL

In [11]:
# Clean up the webpage URL to combine with 
# featured image relative URL to create full URL
clean_url = jpl_url.split("index.html")

In [12]:
# Concatenate URLs to create full featured image URL
featured_image_url = clean_url[0] + featured_image

In [13]:
featured_image_url

'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars2.jpg'

### Mars Facts
<p> Scrape Space Facts website for space facts

In [14]:
# URL of Space Facts page to be scraped
facts_url = "https://space-facts.com/mars/"

In [15]:
# Get all tables in URL via pandas
tables = pd.read_html(facts_url)
tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers,
   Mars - Earth Comparison             Mars            Earth
 0               Diameter:         6,779 km        12,742 km
 1                   Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 2                  Moons:                2                1
 3      Distance from Sun:   227,943,824 km   149,598,262 km
 4         Length of Year:   687 Earth days      365.24 days
 5            Temperature:     -87 to -5 °C      -88 to 58°C,
           

In [16]:
# Save relevant table as DataFrame
df = tables[0]

In [17]:
# Rename column headers to appropriate names
df = df.rename(columns={0:"Description",1:"Mars"})

In [18]:
# Export table as HTML string, remove superfluous index
df.to_html('table.html', index=False)

### Mars Hemispheres
<p> Scrape USGS Astrogeology website for Mars hemispheres facts

In [19]:
hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

# Retrieve page in splinter browser
browser.visit(hemi_url)


hemisphere_image_urls = list()

for image in images:
    click on links with partial match "Enhanced"
    title = soup.find("h2", class_="title").text
    img_url = soup.find("div", class_="downloads").find("a")[i]["href"]
    
    title_strip = title.strip(" ")
    title_clean = f"{title_strip[0]} {title_strip[1]}"
    
    hemisphere_image_urls.append({"title":title_clean, "img_url":img_url})
    click back
    


In [20]:
# HTML object
hemi_html = browser.html

# Retrieve the parent divs for all articles
hemi_soup = BeautifulSoup(hemi_html, 'lxml')

In [21]:
hemi_results = hemi_soup.find_all("div", class_="item")

4

In [28]:
hemispheres = list()

for r in hemi_results:
    hemisphere = r.find("h3").text
    hemi_split = hemisphere.split(" ")
    hemispheres.append(hemi_split[0])
    
hemispheres

['Cerberus', 'Schiaparelli', 'Syrtis', 'Valles']

In [38]:
hemisphere_image_urls = list()

for hemi in hemispheres:
    try:
        # click on links with partial match "Enhanced"
        browser.click_link_by_partial_text(f"{hemi}")
        
        # get full image webpage URL and save as BeautifulSoup
        hemi_image_html = browser.html
        image_soup = BeautifulSoup(hemi_image_html, 'lxml')
        
        # find full image name and URL
        title = image_soup.find("h2", class_="title").text
        img_url = image_soup.find("div", class_="downloads").find("a")["href"]

        # remove word "Enhanced" from hemisphere name
        title_split = title.split(" ")
        title_clean = f"{title_split[0]} {title_split[1]}"

        # print to check the code works
        print(title_clean)
        print(img_url)
        
        # append list with dictionary
        hemisphere_image_urls.append({"title":title_clean, "img_url":img_url})
        
        time.sleep(2)
        
        # click back
        browser.back()
        
    # stop code if all images scraped    
    except ElementDoesNotExist:
        print("Scraping Complete")

Cerberus Hemisphere
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
Schiaparelli Hemisphere
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg
Syrtis Major
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg
Valles Marineris
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg


In [39]:
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

# Step 2 - MongoDB and Flask Application
<hr>