In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager


In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

##  Scrape Mars News website
### Parse html code from the website to collect latest news titles and paragraph texts.

In [3]:
# URL of page to be scraped
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')
#print(soup.prettify())

In [4]:
# Identify and return title of article
title = soup.find('div', class_='content_title')
news_title = title.text
news_title

"\n\nTeam Behind NASA's Newest Mars Rover to Honor Persevering Students\n\n"

In [5]:
 # Identify and return description of article
description = soup.find('div', class_='rollover_description_inner')
news_p = description.text
news_p

'\nMiddle schoolers who have pushed past obstacles to reach their academic goals will be celebrated by the mission team with a personal message beamed down from NASA’s Perseverance rover.\n'

## Find url of the featured image using Splinter

In [6]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 96.0.4664
Get LATEST chromedriver version for 96.0.4664 google-chrome
Driver [/Users/whitneywong/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


In [7]:
images_url = 'https://spaceimages-mars.com/'
browser.visit(images_url)

In [8]:
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# print(soup.prettify())

In [9]:
# from the home page, use Splinter to click on "FULL IMAGE"
browser.links.find_by_partial_text('FULL IMAGE').click()

In [10]:
soup.find('a', class_='showimg fancybox-thumbs')['href']

'image/featured/mars1.jpg'

In [11]:
feat_img_url = f"{images_url}{soup.find('a', class_='showimg fancybox-thumbs')['href']}"
feat_img_url

'https://spaceimages-mars.com/image/featured/mars1.jpg'

## Scrape Mars Facts tables using Pandas

In [12]:
mars_facts_url = 'https://galaxyfacts-mars.com/'
browser.visit(mars_facts_url)

In [13]:
# return a list of dataframes for any tabular data that Pandas finds
tables = pd.read_html(mars_facts_url)
tables

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [14]:
mars_facts_df = tables[0]
mars_facts_df

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [15]:
# reset index
mars_facts_df = mars_facts_df.reset_index(drop=True)
mars_facts_df

#change headers and index values

mars_facts_df.columns=['Description', 'Mars', 'Earth']
mars_facts_df.set_index('Description', inplace=True)
mars_facts_df

Unnamed: 0_level_0,Mars,Earth
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [16]:
# to_html method generates HTML tables from DataFrames
html_table = mars_facts_df.to_html()
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [17]:
# clean - get rid of new lines 
html_table.replace('\n', '')

# save the new table to a file, open the file
mars_facts_df.to_html('table.html')
!open table.html

## Scrape Mars Hemispheres site
### create a list that contains one dictionary (including titles and image urls) for each hemisphere

In [18]:
mars_hem_url = 'https://marshemispheres.com/'
browser.visit(mars_hem_url)

### Hemisphere 1: Cerberus

In [19]:
cerberus_url = 'https://marshemispheres.com/cerberus.html'
browser.visit(cerberus_url)

# from the Cerberus home page, use Splinter to click on "Open" image
browser.links.find_by_text('Sample').click()

# direct browser to second tab with the full image to scrape
browser.windows.current = browser.windows[1]

html = browser.html
soup = BeautifulSoup(html, 'html.parser')
# print(soup.prettify())

In [20]:
# bring the window back to the first page and close all other windows
window = browser.windows[0]
window.close_others()

browser.windows.current = browser.windows[0]

In [21]:
# get img source
cerberus_img = soup.find('img')['src']
cerberus_img

'https://marshemispheres.com/images/full.jpg'

In [22]:
# get Cerberus title
cerberus_title = 'Cerberus Hemisphere Enhanced'
cerberus_title

'Cerberus Hemisphere Enhanced'

### Hemisphere 2: Schiaparelli

In [23]:
schia_url = 'https://marshemispheres.com/schiaparelli.html'
browser.visit(schia_url)

# from the Schiaparelli home page, use Splinter to click on "Open" image
browser.links.find_by_text('Sample').click()

# direct browser to second tab with the full image to scrape
browser.windows.current = browser.windows[1]

html = browser.html
soup = BeautifulSoup(html, 'html.parser')
# print(soup.prettify())

In [24]:
# bring the window back to the first page and close all other windows
window = browser.windows[0]
window.close_others()

browser.windows.current = browser.windows[0]

In [25]:
# get img source
schia_img = soup.find('img')['src']
schia_img

'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg'

In [26]:
# get Schiaparelli title
schia_title = 'Schiaparelli Hemisphere Enhanced'
schia_title

'Schiaparelli Hemisphere Enhanced'

### Hemisphere 3: Syrtis

In [27]:
syrtis_url = 'https://marshemispheres.com/syrtis.html'
browser.visit(syrtis_url)

# from the Syrtis home page, use Splinter to click on "Open" image
browser.links.find_by_text('Sample').click()

# direct browser to second tab with the full image to scrape
browser.windows.current = browser.windows[1]

html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# print(soup.prettify())

In [28]:
# bring the window back to the first page and close all other windows
window = browser.windows[0]
window.close_others()

browser.windows.current = browser.windows[0]

In [29]:
# get img source
syrtis_img = soup.find('img')['src']
syrtis_img

'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg'

In [30]:
# get Syrtis title
syrtis_title = 'Syrtis Major Hemisphere Enhanced'
syrtis_title

'Syrtis Major Hemisphere Enhanced'

### Hemisphere 4: Valles Marineris

In [31]:
valles_url = 'https://marshemispheres.com/valles.html'
browser.visit(valles_url)

# from the Valles Marineris home page, use Splinter to click on "Open" image
browser.links.find_by_text('Sample').click()

# direct browser to second tab with the full image to scrape
browser.windows.current = browser.windows[1]

html = browser.html
soup = BeautifulSoup(html, 'html.parser')
# print(soup.prettify())

In [32]:
# bring the window back to the first page and close all other windows
window = browser.windows[0]
window.close_others()

browser.windows.current = browser.windows[0]

In [33]:
# get img source
valles_img = soup.find('img')['src']
valles_img

'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg'

In [34]:
# get Valles title
valles_title = 'Valles Marineris Hemisphere Enhanced'
valles_title

'Valles Marineris Hemisphere Enhanced'

### Testing: Create a for loop that gathers the Mars hemispheres images and urls and appends them into a dictionary.

In [35]:
# connect to website
url = 'https://marshemispheres.com/'

browser.visit(url)

In [38]:
# make list of urls
hemis_urls = []

#loop through page to get image urls and titles 
for hemis in range(4):
    # articles
    browser.links.find_by_partial_text('Hemisphere')[hemis].click()
    
    # parse HTML and scrape
    html = browser.html
    hemi_soup = BeautifulSoup(html, 'html.parser')
    
    title = hemi_soup.find('h2', class_='title').text
    img_url = hemi_soup.find('li').a.get('href')
    
    # store dictionary and append to list
    hemispheres = {}
    hemispheres['img_url'] = f'https://marshemispheres.com/{img_url}'
    hemispheres['title'] = title
    hemis_urls.append(hemispheres)
    
    # repeat
    browser.back()

# quit browser
browser.quit()

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=55521): Max retries exceeded with url: /session/69f12a4d8952f0916988330e9bf0ccdb/elements (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fdeb4b048e0>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [37]:
# print image urls abd titles
hemis_urls

[{'img_url': 'https://marshemispheres.com/images/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]