# Step 1 - Scraping

In [1]:
# Import Dependencies
import pprint
import time
from urllib.parse import urljoin

import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup as bs
from flask import Flask, render_template, redirect
from flask_pymongo import PyMongo
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

## *1. Scraping NASA Mars News - Splinter & BeautifulSoup*

In [2]:
# Install (or reuse the cached copy of) chromedriver, then open a visible
# Chrome window that Splinter drives for the rest of the notebook.
driver_path = ChromeDriverManager().install()
executable_path = {'executable_path': driver_path}
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [C:\Users\yu_ka\.wdm\drivers\chromedriver\win32\91.0.4472.101\chromedriver.exe] found in cache


In [3]:
# URL of page to be scraped
url = 'https://redplanetscience.com/'
browser.visit(url)

# Wait (up to 10 s) for the first article container to be present rather than
# sleeping for a fixed interval: this returns as soon as the element exists and
# tolerates slower page loads.
# NOTE(review): presumably the article list is filled in by JavaScript, which
# is why the original paused here - confirm.
browser.is_element_present_by_css('div.list_text', wait_time=10)
html = browser.html

# Create BeautifulSoup object, parsed with Python's built-in 'html.parser'
soup = bs(html, 'html.parser')

In [4]:
# Retrieve the divs that wrap each article's date, title and teaser paragraph
results = soup.find_all('div', class_='list_text')

# Every scraped article is collected here. The per-article names below
# (news_date, news_title, news_p, news) are rebound on each pass, so without
# this list only the final article would survive the loop.
news_items = []

# Loop over the results to get the article dates, titles and paragraph texts
for result in results:

    # Scrape the article date
    news_date = result.find('div', class_='list_date').text
    # Scrape the article title
    news_title = result.find('div', class_='content_title').text

    # Scrape the article paragraph
    news_p = result.find('div', class_='article_teaser_body').text

    # Print all the data
    print('-------------------------------')
    print(news_date)
    print(news_title)
    print(news_p)

    # Dictionary to be inserted into MongoDB
    news = {
        'news_title': news_title,
        'news_p': news_p,
        'news_date': news_date,
    }
    news_items.append(news)

# After the loop `news` still refers to the last article processed;
# `news_items` holds all of them in page order.

-------------------------------
June 29, 2021
NASA Invites Students to Name Mars 2020 Rover
Through Nov. 1, K-12 students in the U.S. are encouraged to enter an essay contest to name NASA's next Mars rover.
-------------------------------
June 28, 2021
8 Martian Postcards to Celebrate Curiosity's Landing Anniversary
The NASA rover touched down eight years ago, on Aug. 5, 2012, and will soon be joined by a second rover, Perseverance.
-------------------------------
June 25, 2021
NASA to Broadcast Mars 2020 Perseverance Launch, Prelaunch Activities
Starting July 27, news activities will cover everything from mission engineering and science to returning samples from Mars to, of course, the launch itself.
-------------------------------
June 23, 2021
Alabama High School Student Names NASA's Mars Helicopter
Vaneeza Rupani's essay was chosen as the name for the small spacecraft, which will mark NASA's first attempt at powered flight on another planet.
-------------------------------
June 17,

## *2. Scraping the Featured Image - Splinter*

In [5]:
# Point the automated browser at the space-images site
url = 'https://spaceimages-mars.com/'
browser.visit(url)

In [6]:
# Grab the rendered page source from the browser
html = browser.html

# Parse with Beautiful Soup
soup = bs(html, 'html.parser')

# Retrieve the image's relative URL.
# NOTE(review): this picks the second <img> on the page purely by position and
# assumes it is the featured image - confirm, or select it by class/id.
image_url = soup.find_all('img')[1]['src']

# Resolve the src against the page URL. urljoin copes with relative paths,
# leading slashes and already-absolute URLs, where plain concatenation
# would produce a malformed address.
featured_image_url = urljoin(url, image_url)

# Print image url
print(f'featured_image_url = {featured_image_url}')

featured_image_url = https://spaceimages-mars.com/image/featured/mars3.jpg


## *3. Scraping Mars Facts - Pandas*

In [7]:
# URL of the Mars facts page whose HTML tables are read with pandas below
url = 'https://galaxyfacts-mars.com/'

In [8]:
# Parse every HTML table found at the URL into a list of DataFrames
tables = pd.read_html(url)
tables

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [9]:
# Select the table of interest: the first one in the list
# (the Mars - Earth comparison)
df = tables[0]
df

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [10]:
# Make the first row the table header

# Grab the first row; its values become the column labels
new_header = df.iloc[0]

# NOTE: df[0:] selects every row, so the header row is NOT removed here.
# It is still present as row 0 and is dropped in the following cell.
df = df[0:] 

# Set the header row as the df header
df.columns = new_header

df

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [11]:
# Drop the first row (index 0), which only repeats the column labels
df = df.iloc[1:]
df

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [12]:
# Label the first column "Description" and promote it to the row index
df = (
    df
    .rename(columns={"Mars - Earth Comparison": "Description"})
    .set_index('Description')
)
df

Unnamed: 0_level_0,Mars,Earth
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [13]:
# Render the table as an HTML string and strip the newline characters
# so the markup is a single line
mars_facts = df.to_html().replace('\n', '')
mars_facts

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Mars</th>      <th>Earth</th>    </tr>    <tr>      <th>Description</th>      <th></th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Diameter:</th>      <td>6,779 km</td>      <td>12,742 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg</td>      <td>5.97 × 10^24 kg</td>    </tr>    <tr>      <th>Moons:</th>      <td>2</td>      <td>1</td>    </tr>    <tr>      <th>Distance from Sun:</th>      <td>227,943,824 km</td>      <td>149,598,262 km</td>    </tr>    <tr>      <th>Length of Year:</th>      <td>687 Earth days</td>      <td>365.24 days</td>    </tr>    <tr>      <th>Temperature:</th>      <td>-87 to -5 °C</td>      <td>-88 to 58°C</td>    </tr>  </tbody></table>'

In [14]:
# Print the HTML string as plain text (without the quotes of its repr)
print(mars_facts)

<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Mars</th>      <th>Earth</th>    </tr>    <tr>      <th>Description</th>      <th></th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Diameter:</th>      <td>6,779 km</td>      <td>12,742 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg</td>      <td>5.97 × 10^24 kg</td>    </tr>    <tr>      <th>Moons:</th>      <td>2</td>      <td>1</td>    </tr>    <tr>      <th>Distance from Sun:</th>      <td>227,943,824 km</td>      <td>149,598,262 km</td>    </tr>    <tr>      <th>Length of Year:</th>      <td>687 Earth days</td>      <td>365.24 days</td>    </tr>    <tr>      <th>Temperature:</th>      <td>-87 to -5 °C</td>      <td>-88 to 58°C</td>    </tr>  </tbody></table>


## *4. Scraping Mars Hemispheres*

In [15]:
# URL of the Mars hemispheres site; open it in the automated browser
mars_url = 'https://marshemispheres.com/'
browser.visit(mars_url)

In [16]:
# Page source of the hemispheres index page as currently loaded in the browser
html = browser.html

# Parse with Beautiful Soup using the built-in 'html.parser'
mars_soup = bs(html, 'html.parser')

In [17]:
# Locate the results container and the per-hemisphere items inside it
mars_hemispheres = mars_soup.find('div', class_='collapsible results')
hemisphere_info = mars_hemispheres.find_all('div', class_='item')

# Collected {'title': ..., 'img_url': ...} dictionaries, one per hemisphere
image_urls = []

# Iterate through hemisphere info
for data in hemisphere_info:
    # Get image title
    hemisphere = data.find('div', class_="description")
    title = hemisphere.h3.text

    # Browse to the hemisphere's own page. urljoin resolves the href against
    # the site URL, so relative, root-relative and absolute links all work,
    # where plain concatenation would not.
    hemisphere_image_link = hemisphere.a["href"]
    browser.visit(urljoin(mars_url, hemisphere_image_link))

    hemisphere_html = browser.html
    hemisphere_soup = bs(hemisphere_html, 'html.parser')

    # The first <li> inside the 'downloads' div carries the image link
    hemisphere_link = hemisphere_soup.find('div', class_='downloads')
    hemisphere_url = hemisphere_link.find('li').a['href']

    # Store title and absolute image URL for this hemisphere
    image_dict = {
        'title': title,
        'img_url': urljoin(mars_url, hemisphere_url),
    }

    image_urls.append(image_dict)

pprint.pprint(image_urls)

[{'img_url': 'https://marshemispheres.com/images/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]


In [18]:
 # Shut down the automated browser now that all scraping is finished
 browser.quit()