## Imports

In [39]:
#Imports
from bs4 import BeautifulSoup
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
import pandas as pd
import re
import requests
from IPython.display import Image

## Mechanism to Open Webpage on Chrome

In [40]:
#https://splinter.readthedocs.io/en/latest/drivers/chrome.html
#Installed selenium and chromedriver using Honeybrew
#Located chromedriver path
!which chromedriver

/usr/local/bin/chromedriver


In [41]:
#Passed the executable path as a dictionary to the **kwargs argument
executable_path = {'executable_path':'/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

## Red Planet Science Web Scraping 

In [42]:
#Created variable for webpage
science_url = "https://redplanetscience.com"
#Visited webpage
browser.visit(science_url)

In [43]:
#Ensured that request succeeded
response = requests.get(science_url)
response

<Response [200]>

In [44]:
#Created variable for automated web testing
science_html = browser.html
#Used Beautiful Soup to parse through HTML
science_bs = BeautifulSoup(science_html, "html.parser")
#Used Prettify to enhance HTML readability
print(science_bs.prettify())

<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link crossorigin="anonymous" href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta1/dist/css/bootstrap.min.css" integrity="sha384-giJF6kkoqNQ00vy+HMDP7azOuL0xtbfIcaT9wjKHr8RbDVddVHyTfAAsrekwKmP1" rel="stylesheet"/>
  <link href="css/font.css" rel="stylesheet" type="text/css"/>
  <link href="css/app.css" rel="stylesheet" type="text/css"/>
  <link crossorigin="anonymous" href="https://pro.fontawesome.com/releases/v5.10.0/css/all.css" integrity="sha384-AYmEC3Yw5cVb3ZcuHtOA93w35dYTsvhLPVnYs9eStHfGJvOvKxVfELGroGkvsg+p" rel="stylesheet"/>
  <title>
   News - Mars Exploration Program
  </title>
 </head>
 <body>
  <div class="col-md-12">
   <div class="row">
    <nav class="navbar navbar-expand-lg navbar-light fixed-top">
     <div class="container-fluid">
      <a class="navbar-brand" href="#">
       <img src="image/nasa.png" width="80"/>
       <span class="logo">
        MA

In [45]:
#Scraped the Mars news site and collected the latest news title and paragraph text
#Assigned the text to variables for later reference
science_title = science_bs.find("div", class_="content_title").text
science_paragraph = science_bs.find("div", class_="article_teaser_body").text
print(f"1. Latest News Title: {science_title}")
print(f"2. Corresponding Paragraph Text: {science_paragraph}")

1. Latest News Title: NASA's Mars 2020 Heads Into the Test Chamber
2. Corresponding Paragraph Text: In this time-lapse video taken at JPL, engineers move the Mars 2020 rover into a large vacuum chamber for testing in Mars-like environmental conditions.


## Mars Space Images Web Scraping

In [46]:
#Created variable for webpage
space_url = "https://spaceimages-mars.com"
#Visited webpage
browser.visit(space_url)

In [47]:
#Ensured that request succeeded
response = requests.get(science_url)
response

<Response [200]>

In [48]:
#Created variable for automated web testing
space_html = browser.html
#Used Beautiful Soup to parse through HTML
space_bs = BeautifulSoup(space_html, "html.parser")
#Used Prettify to enhance HTML readability
print(space_bs.prettify())

<html class="">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet"/>
  <!-- <link rel="stylesheet" type="text/css" href="css/font.css"> -->
  <link href="css/app.css" rel="stylesheet" type="text/css"/>
  <link href="https://stackpath.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet" type="text/css"/>
  <title>
   Space Image
  </title>
  <style type="text/css">
   .fancybox-margin{margin-right:0px;}
  </style>
 </head>
 <body>
  <div class="header">
   <nav class="navbar navbar-expand-lg">
    <a class="navbar-brand" href="#">
     <img id="logo" src="image/nasa.png"/>
     <span class="logo">
      Jet Propulsion Laboratory
     </span>
     <span class="logo1">
      California Institute of Technology
     </span>
    </a>
    <button aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle 

In [49]:
#Scraped the space images site and collected the all JPEGs
jpeg_list = [item['src'] for item in space_bs.select("[src$='.jpg']")]
print(jpeg_list)

['image/featured/mars3.jpg', 'image/mars/Icaria Fossae7.jpg', 'image/mars/Proctor Crater Dunes 7.jpg', 'image/mars/Icaria Fossae7.jpg', 'image/mars/Proctor Crater Dunes 7.jpg', 'image/mars/Proctor Crater Dunes 7.jpg', 'image/mars/Icaria Fossae7.jpg', 'image/mars/Icaria Fossae.jpg', 'image/mars/Ariadnes Colles4.jpg', 'image/mars/Niger Vallis.jpg', 'image/mars/Proctor Crater Dunes.jpg', 'image/mars/Niger Vallis.jpg', 'image/mars/Daedalia Planum.jpg', 'image/mars/Sirenum Fossae.jpg', 'image/mars/Ariadnes Colles4.jpg', 'image/mars/South Polar Cap.jpg', 'image/mars/Daedalia Planum.jpg', 'image/mars/Ariadnes Colles3.jpg', 'image/mars/Atlantis Chaos.jpg', 'image/mars/Daedalia Planum.jpg', 'image/mars/Icaria Fossae.jpg', 'image/mars/Niger Vallis.jpg', 'image/mars/Proctor Crater Dunes.jpg', 'image/mars/Reull Vallis.jpg', 'image/mars/Ariadnes Colles3.jpg', 'image/mars/Sirenum Fossae.jpg', 'image/mars/South Polar Cap.jpg', 'image/mars/Niger Vallis.jpg', 'image/mars/Daedalia Planum.jpg', 'image/ma

In [50]:
#Selected the first JPEG in the list using indexing
#Assigned the JPEG to variable for later reference
featured_image = [item['src'] for item in space_bs.select("[src$='.jpg']")][0]
print(featured_image)

image/featured/mars3.jpg


In [51]:
#Printed full url of first JPEG
featured_image_url = f"https://spaceimages-mars.com/{featured_image}"
print(f"Complete URL String:\n{featured_image_url}")

Complete URL String:
https://spaceimages-mars.com/image/featured/mars3.jpg


## Galaxy FACT Web Scraping

In [52]:
#Created variable for webpage
facts_url = "https://galaxyfacts-mars.com"
#Visited webpage
browser.visit(facts_url)

In [53]:
#Ensured that request succeeded
response = requests.get(facts_url)
response

<Response [200]>

In [54]:
#Created variable for automated web testing
facts_html = browser.html
#Used Beautiful Soup to parse through HTML
facts_bs = BeautifulSoup(space_html, "html.parser")
#Used Prettify to enhance HTML readability
print(space_bs.prettify())

<html class="">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet"/>
  <!-- <link rel="stylesheet" type="text/css" href="css/font.css"> -->
  <link href="css/app.css" rel="stylesheet" type="text/css"/>
  <link href="https://stackpath.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet" type="text/css"/>
  <title>
   Space Image
  </title>
  <style type="text/css">
   .fancybox-margin{margin-right:0px;}
  </style>
 </head>
 <body>
  <div class="header">
   <nav class="navbar navbar-expand-lg">
    <a class="navbar-brand" href="#">
     <img id="logo" src="image/nasa.png"/>
     <span class="logo">
      Jet Propulsion Laboratory
     </span>
     <span class="logo1">
      California Institute of Technology
     </span>
    </a>
    <button aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle 

In [55]:
#Converted site into dataframe
facts_df = pd.read_html(facts_html)
facts_df

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [56]:
#Created new dataframe for relevant table
facts_df2 = facts_df[1]
facts_df2

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 ( Phobos & Deimos )
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [57]:
#Inspected columns
facts_df2.columns

Int64Index([0, 1], dtype='int64')

In [58]:
#Renamed columns
facts_df2.columns=("Category", "Information")
facts_df2

Unnamed: 0,Category,Information
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 ( Phobos & Deimos )
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [59]:
#Converted table to HTML
html_table = facts_df2.to_html(index=False)
print(html_table)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>Category</th>
      <th>Information</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Equatorial Diameter:</td>
      <td>6,792 km</td>
    </tr>
    <tr>
      <td>Polar Diameter:</td>
      <td>6,752 km</td>
    </tr>
    <tr>
      <td>Mass:</td>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <td>Moons:</td>
      <td>2 ( Phobos &amp; Deimos )</td>
    </tr>
    <tr>
      <td>Orbit Distance:</td>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <td>Orbit Period:</td>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <td>Surface Temperature:</td>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <td>First Record:</td>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <td>Recorded By:</td>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>


In [60]:
#Created HTML file and dropped Index
facts_df2.to_html("table.html",index=False)

## GUSS Science Center Webscraping

In [61]:
#Created variable for webpage
astropedia_url = "https://marshemispheres.com/"
#Visited webpage
browser.visit(astropedia_url)

In [62]:
#Ensured that request succeeded
response = requests.get(astropedia_url)
response

<Response [200]>

In [63]:
#Created variable for automated web testing
astropedia_html = browser.html
#Used Beautiful Soup to parse through HTML
astropedia_bs = BeautifulSoup(astropedia_html, "html.parser")
#Used Prettify to enhance HTML readability
print(astropedia_bs.prettify())

<html lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <link href="css/jquery-ui.css" rel="stylesheet" type="text/css"/>
  <title>
   Astropedia Search Results | GUSS Astrogeology Science Center
  </title>
  <meta content="GUSS Astrogeology Science Center Astropedia search results." name="description"/>
  <meta content="GUSS,Astrogeology Science Center,Cartography,Geology,Space,Geological Survey,Mapping" name="keywords"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <link href="css/main.css" media="screen" rel="stylesheet"/>
  <link href="css/print.css" media="print" rel="stylesheet"/>
  <link href="#" rel="icon" type="image/x-ico"/>
 </head>
 <body id="results">
  <header>
   <a href="#" style="float:right;margin-top:10px;" target="_blank">
    <img alt="USGS: Science for a Changing World" class="logo" height="60" src="images/usgs_logo_main_

In [64]:
#Discovered preliminary image links
html_list = [item['href'] for item in astropedia_bs.select("[href$='.html']")]
print(html_list)

['cerberus.html', 'cerberus.html', 'schiaparelli.html', 'schiaparelli.html', 'syrtis.html', 'syrtis.html', 'valles.html', 'valles.html']


In [65]:
#Eliminated duplicates
html_list = list(set(html_list))
html_list

['schiaparelli.html', 'valles.html', 'syrtis.html', 'cerberus.html']

In [66]:
#Added webpage index to the links of final images
preliminary_url_list = []

for item in html_list:
    complete_url = astropedia_url + item
    preliminary_url_list.append(complete_url)

print(preliminary_url_list)

['https://marshemispheres.com/schiaparelli.html', 'https://marshemispheres.com/valles.html', 'https://marshemispheres.com/syrtis.html', 'https://marshemispheres.com/cerberus.html']


In [67]:
#Accessed links to final images urls
draft_url_list = []

for item in preliminary_url_list:
    browser.visit(item)
    image_html = browser.html
    image_bs = BeautifulSoup(image_html, "html.parser")
    image_path = image_bs.select_one("ul")
    image_url = image_path.a["href"]
    draft_url_list.append(image_url)
    
draft_url_list    

['images/schiaparelli_enhanced-full.jpg',
 'images/valles_marineris_enhanced-full.jpg',
 'images/syrtis_major_enhanced-full.jpg',
 'images/full.jpg']

In [68]:
#Added webpage index to the final image url
final_url_list = []

for item in draft_url_list:
    #print(item)
    final_complete_url = astropedia_url + item
    final_url_list.append(final_complete_url)

print(final_url_list)

['https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg', 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg', 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg', 'https://marshemispheres.com/images/full.jpg']


In [69]:
#Truncated HTML to search for subtitles
subtitle_html = astropedia_bs.find_all("div", class_="description")
subtitle_html

[<div class="description">
 <a class="itemLink product-item" href="cerberus.html">
 <h3>Cerberus Hemisphere Enhanced</h3>
 </a>
 <span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/>
 <p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p>
 </div>,
 <div class="description">
 <a class="itemLink product-item" href="schiaparelli.html">
 <h3>Schiaparelli Hemisphere Enhanced</h3>
 </a>
 <span class="subtitle" style="float:left">image/tiff 35 MB</span><span class="pubDate" style="float:right"></span><br/>
 <p>Mosaic of the Schiaparelli hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. The images were acquired in 1980 during early northern…</p>
 </div>,
 <div class="description">
 <a class="itemLink product-item" 

In [70]:
#Created subtitle list
subtitle_list = []
    
for item in subtitle_html:
    subtitle = item.find("h3").text
    subtitle_list.append(subtitle)
        
subtitle_list

['Cerberus Hemisphere Enhanced',
 'Schiaparelli Hemisphere Enhanced',
 'Syrtis Major Hemisphere Enhanced',
 'Valles Marineris Hemisphere Enhanced']

In [71]:
#Created dictionary consisting of subtitle and final image URL
hemisphere_image_urls = []

for url, title in zip(final_url_list, subtitle_list):
    hemisphere_image_dictionary = {}
    hemisphere_image_dictionary["Title"] = title
    hemisphere_image_dictionary["Image_URL"] = url
    hemisphere_image_urls.append(hemisphere_image_dictionary)
    
hemisphere_image_urls 

[{'Title': 'Cerberus Hemisphere Enhanced',
  'Image_URL': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg'},
 {'Title': 'Schiaparelli Hemisphere Enhanced',
  'Image_URL': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg'},
 {'Title': 'Syrtis Major Hemisphere Enhanced',
  'Image_URL': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg'},
 {'Title': 'Valles Marineris Hemisphere Enhanced',
  'Image_URL': 'https://marshemispheres.com/images/full.jpg'}]

## Images 

In [84]:
Image(url='https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg')

In [85]:
Image(url='https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg')

In [86]:
Image(url='https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg')

In [87]:
Image(url='https://marshemispheres.com/images/full.jpg')