In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd
from splinter import Browser
from selenium import webdriver

In [2]:
# Launch a visible (non-headless) Chrome window through Splinter, driven by
# the chromedriver.exe binary named below.
executable_path = {"executable_path": "chromedriver.exe"}
browser = Browser("chrome", headless=False, **executable_path)

In [3]:
# Connect PyMongo to the MongoDB server listening on localhost, port 27017
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(host=conn)

In [4]:
# Select the "mars_db" database and the "items" collection inside it
db = client["mars_db"]
collection = db["items"]

## NASA Mars News
* Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/) and collect the latest News Title and Paragraph Text. 
* Assign the text to variables that you can reference later.

In [5]:
# NASA Mars News listing page to scrape.
# Requires the Splinter `browser` created in the setup cell; this cell raises
# NameError if that cell has not been run first.
url = "https://mars.nasa.gov/news/"
browser.visit(url)
# Capture the source of the page the browser currently has loaded
html = browser.html

In [6]:
# Parse the captured page source with the built-in "html.parser" backend so
# the cells below can search it by tag and class.
soup = BeautifulSoup(html, "html.parser")

In [7]:
# Collect every <div class="content_title"> on the page and use the text of
# the second match (index 1) as the latest headline.
# NOTE(review): presumably index 0 is a non-article element on this page —
# confirm against the live markup. An IndexError is raised if fewer than two
# matches are found.
results = soup.find_all('div', class_="content_title")
latest_news_title = results[1].text
print(f"Title: {latest_news_title}")

Title: New Study Challenges Long-Held Theory of Fate of Mars' Water


In [8]:
# Use the text of the first <div class="article_teaser_body"> as the teaser
# paragraph. find() returns None when nothing matches, in which case the
# .text access raises AttributeError.
latest_news_detail = soup.find("div", class_="article_teaser_body").text
latest_news_detail

'The new science results indicate that a large quantity of the Red Planet’s water is trapped in its crust rather than having escaped into space.'

In [9]:
# Show the scraped headline and teaser paragraph under a banner line
print(
    "----Latest News----",
    f"Title: {latest_news_title}",
    f"Paragraph: {latest_news_detail}",
    sep="\n",
)

----Latest News----
Title: New Study Challenges Long-Held Theory of Fate of Mars' Water
Paragraph: The new science results indicate that a large quantity of the Red Planet’s water is trapped in its crust rather than having escaped into space.


In [None]:
# Note: browser.quit() will hang .py export and won't enable it to run correctly. So rem the line out on export.
# browser.quit()

## JPL Mars Space Images - Featured Image
* Visit the url for JPL Featured Space Image [here](https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars).
* Use splinter to navigate the site and find the image url for the current Featured Mars Image and assign the url string to a variable called `featured_image_url`.
* Make sure to find the image url to the full size `.jpg` image.
* Make sure to save a complete url string for this image.


In [None]:
# Reminder to run first row of code if browser.quit() is run above

In [17]:
# Navigate the Splinter browser to the JPL space-images page, filtered to the
# Mars category via the query string.
url_spaceimage = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url_spaceimage)

In [18]:
# Re-capture the page source now that the browser is on the JPL page and
# re-parse it. `html` and `soup` are rebound here, so from this point on they
# no longer refer to the NASA news page.
html = browser.html
soup = BeautifulSoup(html, "html.parser")

In [22]:
# Featured image, step 1: find the first <div> matched by the class string
# "sm:object-cover object-cover" and print it for inspection.
# `results` is None if the page has no such <div>.
results = soup.find("div", class_ = "sm:object-cover object-cover")
print(results)

<div class="sm:object-cover object-cover" data-v-a6031820=""><img alt="" class="BaseImage object-contain" data-src="https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA24466.2e16d0ba.fill-400x400-c50.jpg" loading="lazy" src="https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA24466.2e16d0ba.fill-400x400-c50.jpg"/></div>


In [23]:
# Featured image, step 2: narrow the wrapper <div> down to the first <img>
# tag inside it and print that tag for inspection.
results2 = results.find("img")
print(results2)

<img alt="" class="BaseImage object-contain" data-src="https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA24466.2e16d0ba.fill-400x400-c50.jpg" loading="lazy" src="https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA24466.2e16d0ba.fill-400x400-c50.jpg"/>


In [24]:
# Featured image, step 3: print the `src` URL of every <img> inside the
# wrapper <div>. Uses find_all(), the current Beautiful Soup 4 method name
# (findAll is the legacy alias), matching the other cells in this notebook.
#
# Note: the loop rebinds `results` to each <img> tag, so after this cell
# `results` is no longer the wrapper <div>. Re-run the cell that assigns
# `results` before re-running this one.
image = results.find_all("img")
for results in image:
    print(results["src"])

https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA24466.2e16d0ba.fill-400x400-c50.jpg


In [25]:
# Store the featured image URL as a plain string and display it.
# `results2` is the <img> tag pulled out of the featured-image wrapper; its
# `src` attribute holds the image URL. (The previous code interpolated
# `img_results`, a name this notebook never defines, so the cell raised
# NameError on a fresh kernel.)
# NOTE(review): the URL captured in this notebook's output contains
# "fill-400x400", which looks like a resized rendition rather than the
# full-size image — confirm against the page.
featured_image_url = results2["src"]
print("--JPL Featured Space Image----")
print(featured_image_url)

--JPL Featured Space Image----
https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA24466.2e16d0ba.fill-400x400-c50.jpg


In [None]:
# Note: browser.quit() will hang .py export and won't enable it to run correctly. So rem the line out on export.
# browser.quit()

## Mars Facts
* Visit the Mars Facts webpage [here](https://space-facts.com/mars/) and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
* Use Pandas to convert the data to a HTML table string.

In [None]:
# Reminder to run first row of code if browser.quit() is run above

In [None]:
# Open browser using Chromedriver through splinter module
#executable_path = {"executable_path": "chromedriver.exe"}
#browser = Browser("chrome", **executable_path, headless=False)

In [26]:
# Load the Mars facts page in the browser and capture its page source
url = "https://space-facts.com/mars/"
browser.visit(url)
html = browser.html

In [27]:
# Let pandas fetch the page at `url` and parse the tables it finds into a
# list of DataFrames
facts = pd.read_html(url)

In [28]:
# Keep the first parsed table as the Mars facts DataFrame and display it
mars_facts_df = facts[0]
mars_facts_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [29]:
# Preview the DataFrame rendered as an HTML <table> string. The return value
# is only displayed here; it is not stored in a variable.
mars_facts_df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    <

In [30]:
# Render the facts DataFrame as an HTML table string
html_table = mars_facts_df.to_html()

# Remove the newlines from the table markup. str.replace returns a new string
# (Python strings are immutable), so the result has to be assigned back;
# calling it without assignment leaves html_table unchanged.
html_table = html_table.replace("\n", "")

# Also write the table to mars_facts_data.html (relative to the working directory)
mars_facts_df.to_html("mars_facts_data.html")

In [None]:
# Note: browser.quit() will hang .py export and won't enable it to run correctly. So rem the line out on export.
# browser.quit()

## Mars Hemispheres
* Visit the USGS Astrogeology site [here](https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) to obtain high-resolution images for each of Mars's hemispheres.
* Click each of the links to the hemispheres in order to find the image url to the full resolution image.
* Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys `img_url` and `title`.
* Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [None]:
# Reminder to run first row of code if browser.quit() is run above

In [None]:
# Open browser using Chromedriver through splinter module
#executable_path = {"executable_path": "chromedriver.exe"}
#browser = Browser("chrome", **executable_path, headless=False)

In [31]:
# Load the USGS Astrogeology search results for the enhanced Mars hemisphere
# products and capture the page source
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(url)
html = browser.html

In [32]:
# Parse the captured search-results page with the built-in "html.parser" backend
soup = BeautifulSoup(html, "html.parser")

# Print only the start of the prettified document as a sanity check; dumping
# the entire page floods the notebook output and bloats the saved file.
print(soup.prettify()[:1000])

<html lang="en">
 <head>
  <link href="//ajax.googleapis.com/ajax/libs/jqueryui/1.11.3/themes/smoothness/jquery-ui.css" rel="stylesheet" type="text/css"/>
  <title>
   Astropedia Search Results | USGS Astrogeology Science Center
  </title>
  <meta content="USGS Astrogeology Science Center Astropedia search results." name="description"/>
  <meta content="USGS,Astrogeology Science Center,Cartography,Geology,Space,Geological Survey,Mapping" name="keywords"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <meta content="x61hXXVj7wtfBSNOPnTftajMsZ5yB2W-qRoyr7GtOKM" name="google-site-verification"/>
  <!--<link rel="stylesheet" href="http://fonts.googleapis.com/css?family=Open+Sans:400italic,400,bold"/>-->
  <link href="/css/main.css" media="screen" rel="stylesheet"/>
  <link href="/css/print.css" media="print" rel="styles

In [33]:
# Collect every <div class="item"> on the search-results page; each one links
# to a hemisphere detail page.
result = soup.find_all("div", class_="item")

# Accumulates one {"title": ..., "image_url": ...} dictionary per hemisphere
image_urls = []

# The item links are site-relative, so they are joined onto this base
base_url = "https://astrogeology.usgs.gov"

# Trailing word removed from each item heading
title_suffix = "Enhanced"

for item in result:

    # Hemisphere title without the trailing "Enhanced".
    # str.strip("Enhanced") is not used because it strips any of the
    # characters E/n/h/a/c/e/d from BOTH ends (not the suffix) and leaves the
    # separating space behind; the suffix is removed explicitly instead.
    titles = item.find("h3").text.strip()
    if titles.endswith(title_suffix):
        titles = titles[:-len(title_suffix)].rstrip()

    # Relative link to the hemisphere's detail page
    link = item.find("a", class_="itemLink product-item")["href"]

    # Absolute link to the detail page
    image_link = base_url + link

    # Navigate to the detail page and parse it under its own name, so the
    # search-results `soup` and `html` are not overwritten and this cell can
    # be re-run without first re-running the cells above it.
    browser.visit(image_link)
    detail_soup = BeautifulSoup(browser.html, "html.parser")
    records = detail_soup.find("div", class_="downloads")

    # The first link inside the downloads block is used as the image URL
    image_url = records.find("a")["href"]

    # One dictionary per hemisphere.
    # NOTE(review): the task text asks for the key `img_url`; `image_url` is
    # kept here so existing consumers keep working — confirm which is wanted.
    image_urls.append({"title": titles, "image_url": image_url})

    # Progress output: title and image URL for each hemisphere
    print(titles)
    print(image_url)
    print("-----------")


Cerberus Hemisphere 
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
-----------
Schiaparelli Hemisphere 
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg
-----------
Syrtis Major Hemisphere 
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg
-----------
Valles Marineris Hemisphere 
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg
-----------


In [None]:
# Note: browser.quit() will hang .py export and won't enable it to run correctly. So rem the line out on export.
# browser.quit()