In [1]:
# Dependencies
from pathlib import Path
from pprint import pprint
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Factory for the Chrome-backed splinter browser used throughout the notebook
def init_browser():
    """Create and return a visible (non-headless) Chrome ``Browser``.

    The chromedriver executable path comes from
    ``ChromeDriverManager().install()`` and a fixed desktop Chrome
    user-agent string is sent with every request.
    """
    driver_path = ChromeDriverManager().install()
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"
    return Browser(
        "chrome",
        executable_path=driver_path,
        headless=False,
        user_agent=user_agent,
    )

# Name of the Beautiful Soup parser backend; passed to every bs(...) call in this notebook
parser = "html.parser"

In [3]:
# Launch the Chrome session that all later browser.visit(...) calls share
browser = init_browser()



Current google-chrome version is 101.0.4951
Get LATEST chromedriver version for 101.0.4951 google-chrome
Driver [C:\Users\INTEL\.wdm\drivers\chromedriver\win32\101.0.4951.41\chromedriver.exe] found in cache


# Connection to NASA Mars News

In [4]:
# Visit the Mars News site
mars_news_url = "http://redplanetscience.com"
browser.visit(mars_news_url)

# Give the page a few seconds to render the element the next cells look up,
# so the HTML snapshot below is not taken before the article list exists.
# The result is deliberately ignored: if the element never appears, the
# lookup in the following cell fails exactly as it did before.
browser.is_element_present_by_css("div.content_title", wait_time=5)

# Snapshot the rendered HTML
html = browser.html

# Parse HTML with Beautiful Soup
news_soup = bs(html, parser)

## Latest Mars News

In [5]:
# Retrieve the latest article's title.
# find() returns the first matching tag, or None when nothing matches; keep
# the tag under its own name so news_title is only ever a string.
news_title_tag = news_soup.find("div", class_="content_title")
if news_title_tag is None:
    raise ValueError('No <div class="content_title"> found on the Mars news page')
news_title = news_title_tag.text.strip()
print(news_title)

The Extraordinary Sample-Gathering System of NASA's Perseverance Mars Rover


In [6]:
# Retrieve the latest article's teaser paragraph.
# find() returns the first matching tag, or None when nothing matches; keep
# the tag under its own name so news_paragraph is only ever a string.
news_paragraph_tag = news_soup.find("div", class_="article_teaser_body")
if news_paragraph_tag is None:
    raise ValueError('No <div class="article_teaser_body"> found on the Mars news page')
news_paragraph = news_paragraph_tag.text.strip()
print(news_paragraph)

Two astronauts collected Moon rocks on Apollo 11. It will take three robotic systems working together to gather up the first Mars rock samples for return to Earth.


# Connection to JPL Mars Space Images - Featured Image

In [7]:
# Open the featured space image page
mars_featured_image_url = "http://spaceimages-mars.com/"
browser.visit(mars_featured_image_url)

# Take the rendered markup and hand it to Beautiful Soup
html = browser.html
image_soup = bs(html, parser)

## JPL Mars Space Images - Featured Image

In [8]:
# Build the featured image's full URL.
# urljoin resolves the <img> src against the site URL, so relative,
# root-relative and already-absolute src values all produce a valid URL
# (plain string concatenation only works for the relative case).
featured_image = image_soup.find("img", class_="headerimage fade-in")
featured_image_url = urljoin(mars_featured_image_url, featured_image["src"])
print(featured_image_url)

http://spaceimages-mars.com/image/featured/mars3.jpg


# Connection to Mars Facts

In [9]:
# Page whose HTML tables are loaded with pd.read_html in the next cell
mars_facts_url = "http://space-facts.com/mars/"

## Mars Facts

In [10]:
# pd.read_html parses the tables found on the page into a list of DataFrames
mars_facts_tables = pd.read_html(mars_facts_url)

# The notebook works with the table at index 1 of that list
# (the Mars - Earth comparison shown in the output below).
mars_facts_df = mars_facts_tables[1]

mars_facts_df

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-87 to -5 °C,-88 to 58°C


In [11]:
# Save DataFrame to an HTML file.
# Create the target folder first so the write does not fail on a fresh checkout
# where "Resources/" does not exist yet.
facts_html_path = "Resources/mars_facts.html"
Path(facts_html_path).parent.mkdir(parents=True, exist_ok=True)
mars_facts_df.to_html(facts_html_path)

In [12]:
# Render the facts table as an HTML string, keeping the column headers and row index
mars_facts = mars_facts_df.to_html(index=True, header=True)
pprint(mars_facts)

('<table border="1" class="dataframe">\n'
 '  <thead>\n'
 '    <tr style="text-align: right;">\n'
 '      <th></th>\n'
 '      <th>Mars - Earth Comparison</th>\n'
 '      <th>Mars</th>\n'
 '      <th>Earth</th>\n'
 '    </tr>\n'
 '  </thead>\n'
 '  <tbody>\n'
 '    <tr>\n'
 '      <th>0</th>\n'
 '      <td>Diameter:</td>\n'
 '      <td>6,779 km</td>\n'
 '      <td>12,742 km</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>1</th>\n'
 '      <td>Mass:</td>\n'
 '      <td>6.39 × 10^23 kg</td>\n'
 '      <td>5.97 × 10^24 kg</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>2</th>\n'
 '      <td>Moons:</td>\n'
 '      <td>2</td>\n'
 '      <td>1</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>3</th>\n'
 '      <td>Distance from Sun:</td>\n'
 '      <td>227,943,824 km</td>\n'
 '      <td>149,598,262 km</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 '      <th>4</th>\n'
 '      <td>Length of Year:</td>\n'
 '      <td>687 Earth days</td>\n'
 '      <td>365.24 days</td>\n'
 '    </tr>\n'
 '    <tr>\n'
 

# Connection to Mars Hemispheres site

In [13]:
# Open the Mars hemispheres landing page
mars_hemispheres_url = "http://marshemispheres.com/"
browser.visit(mars_hemispheres_url)

# Take the rendered markup and parse it with Beautiful Soup
html = browser.html
hemispheres_soup = bs(html, parser)

# Collect every <div class="description"> element; each one wraps a hemisphere link
hems_url = hemispheres_soup.find_all("div", class_="description")

## Mars Hemispheres

In [14]:
# Pull the href of the first <a> inside each description block
hemis_url = [description.find("a")["href"] for description in hems_url]

In [15]:
# Build hemisphere_image_urls: one {"title", "img_url"} dictionary per hemisphere page
hemisphere_image_urls = []

# The browser is closed in `finally` so a failed page visit or a missing
# element does not leave a Chrome window/driver process running.
try:
    for hemi in hemis_url:

        # urljoin copes with relative, root-relative and absolute hrefs alike
        mars_hem_url = urljoin(mars_hemispheres_url, hemi)
        print(mars_hem_url)

        # Visit the hemisphere page and parse its rendered HTML
        browser.visit(mars_hem_url)
        html = browser.html
        hemi_soup = bs(html, parser)

        # Page title, with the trailing " Enhanced" text removed
        raw_title = hemi_soup.find("h2", class_="title").text
        title = raw_title.split(" Enhanced")[0]

        # src of the full-resolution image on this hemisphere page
        img_url = hemi_soup.find("img", class_="wide-image")["src"]

        hemisphere_image_urls.append(
            {"title": title, "img_url": urljoin(mars_hemispheres_url, img_url)}
        )
finally:
    # Exit Browser
    browser.quit()

http://marshemispheres.com/cerberus.html
http://marshemispheres.com/schiaparelli.html
http://marshemispheres.com/syrtis.html
http://marshemispheres.com/valles.html


In [16]:
# Show the collected title / image-URL dictionaries
pprint(hemisphere_image_urls)

[{'img_url': 'http://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg',
  'title': 'Cerberus Hemisphere'},
 {'img_url': 'http://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg',
  'title': 'Schiaparelli Hemisphere'},
 {'img_url': 'http://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg',
  'title': 'Syrtis Major Hemisphere'},
 {'img_url': 'http://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg',
  'title': 'Valles Marineris Hemisphere'}]


# Prepare a Python dictionary to store in Mongo

In [17]:
# Start the combined Mars dictionary with the latest news title and paragraph
mars_data = {
    "news_title": news_title,
    "news_paragraph": news_paragraph,
}

In [18]:
# Store the featured image URL under its own key in mars_data
mars_data.update({"featured_image_url": featured_image_url})

In [19]:
# Store the facts table HTML string under its own key in mars_data
mars_data.update({"mars_facts": mars_facts})

In [20]:
# Store the list of hemisphere title / image-URL dictionaries in mars_data
mars_data.update({"hemisphere_image_urls": hemisphere_image_urls})

In [21]:
# Show the assembled dictionary holding all scraped Mars data
pprint(mars_data)

{'featured_image_url': 'http://spaceimages-mars.com/image/featured/mars3.jpg',
 'hemisphere_image_urls': [{'img_url': 'http://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg',
                            'title': 'Cerberus Hemisphere'},
                           {'img_url': 'http://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg',
                            'title': 'Schiaparelli Hemisphere'},
                           {'img_url': 'http://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg',
                            'title': 'Syrtis Major Hemisphere'},
                           {'img_url': 'http://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg',
                            'title': 'Valles Marineris Hemisphere'}],
 'mars_facts': '<table border="1" class="dataframe">\n'
               '  <thead>\n'