# Step 1. Scraping

### 1. Scrape the latest News Title and Paragraph Text

In [1]:
# Dependencies
from bs4 import BeautifulSoup

# import the splinter to browse continuous pages
from splinter import Browser

# Driver setup reference: https://splinter.readthedocs.io/en/latest/drivers/chrome.html
import shutil

# Look chromedriver up on PATH (the pure-Python equivalent of the shell's
# `which chromedriver`); if it is not on PATH, fall back to the location this
# notebook originally hard-coded.
chromedriver_path = shutil.which('chromedriver') or '/usr/local/bin/chromedriver'

# Launch a visible (non-headless) Chrome window driven by Splinter.
executable_path = {'executable_path': chromedriver_path}
browser = Browser('chrome', **executable_path, headless=False)

In [2]:
# Open the NASA Mars news listing in the Splinter-controlled browser.
url_news = 'https://mars.nasa.gov/news/'
browser.visit(url_news)

# `browser.html` is a snapshot of the DOM at the moment it is read. Wait (up to
# 10 seconds) until at least one `div.list_date` element exists so that the
# lookups in the next cell do not come back as None if the news list is added
# to the page after the initial load. The return value is ignored on purpose:
# on timeout the cell proceeds exactly as it did before this wait was added.
browser.is_element_present_by_css('div.list_date', wait_time=10)

# Parse the rendered page with BeautifulSoup's built-in html.parser.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [3]:
# Take the first date, headline and teaser <div> found in the parsed page and
# keep only their text content.
date_tag = soup.find('div', class_='list_date')
time = date_tag.get_text()

headline_tag = soup.find('div', class_='content_title')
title = headline_tag.get_text()

teaser_tag = soup.find('div', class_='article_teaser_body')
paragraph = teaser_tag.get_text()

In [4]:
# Report the scraped date, headline and teaser, separated by divider lines.
divider = "--------------------------------------"

print(f"The lastest news is published at {time}")
print(divider)
print(f"Its title: {title}")
print(divider)
print(f"It talks about {paragraph}")

The lastest news is published at February 28, 2019
--------------------------------------
Its title: After a Reset, Curiosity Is Operating Normally
--------------------------------------
It talks about Curiosity has returned to science operations and is once again exploring the clay unit. 


### 2. Scrape the Featured Image

In [5]:
# Dependencies
from bs4 import BeautifulSoup

# import the splinter to browse continuous pages
from splinter import Browser

# Driver setup reference: https://splinter.readthedocs.io/en/latest/drivers/chrome.html
import shutil

# Shut down the Chrome session opened by an earlier section before starting a
# new one; otherwise each section leaves an orphaned browser window and
# chromedriver process behind.
try:
    browser.quit()
except NameError:
    pass  # fresh kernel: no earlier session exists yet

# Look chromedriver up on PATH (the pure-Python equivalent of the shell's
# `which chromedriver`); if it is not on PATH, fall back to the location this
# notebook originally hard-coded.
chromedriver_path = shutil.which('chromedriver') or '/usr/local/bin/chromedriver'

# Launch a visible (non-headless) Chrome window driven by Splinter.
executable_path = {'executable_path': chromedriver_path}
browser = Browser('chrome', **executable_path, headless=False)

In [6]:
# Site root and the path of the Mars-filtered space-images listing; the root is
# kept separately because later cells prepend it to relative links.
url_main = 'https://www.jpl.nasa.gov'
url_mars = "/spaceimages/?search=&category=Mars"
listing_url = url_main + url_mars
browser.visit(listing_url)

# Snapshot the rendered listing page and parse it with html.parser.
html = browser.html
soup = BeautifulSoup(html, features='html.parser')

In [7]:
# Collect every <a class="button"> on the listing page.
url_middle = soup.find_all('a', attrs={'class': 'button'})

# The first button's `data-link` attribute is taken as the relative path of the
# featured image's detail page.
first_button = url_middle[0]
middle_img_url = first_button.get('data-link')
middle_img_url

'/spaceimages/details.php?id=PIA16884'

In [8]:
# Turn the relative detail-page path into an absolute URL and open it.
detail_page_url = url_main + middle_img_url
browser.visit(detail_page_url)

# Re-parse: `soup` now describes the image detail page, not the listing.
html = browser.html
soup = BeautifulSoup(html, features='html.parser')

In [9]:
# The full-size image is the href of the first link inside <figure class="lede">.
lede_figure = soup.find('figure', class_='lede')
lede_link = lede_figure.find('a')
img_url = lede_link.get('href')

# The href is site-relative, so prefix the site root to get the final URL.
featured_image_url = url_main + img_url
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16884_hires.jpg'

### 3. Scrape the weather

In [10]:
# Dependencies
from bs4 import BeautifulSoup

# import the splinter to browse continuous pages
from splinter import Browser

# Driver setup reference: https://splinter.readthedocs.io/en/latest/drivers/chrome.html
import shutil

# Shut down the Chrome session opened by an earlier section before starting a
# new one; otherwise each section leaves an orphaned browser window and
# chromedriver process behind.
try:
    browser.quit()
except NameError:
    pass  # fresh kernel: no earlier session exists yet

# Look chromedriver up on PATH (the pure-Python equivalent of the shell's
# `which chromedriver`); if it is not on PATH, fall back to the location this
# notebook originally hard-coded.
chromedriver_path = shutil.which('chromedriver') or '/usr/local/bin/chromedriver'

# Launch a visible (non-headless) Chrome window driven by Splinter.
executable_path = {'executable_path': chromedriver_path}
browser = Browser('chrome', **executable_path, headless=False)

In [11]:
# Open the Mars weather report Twitter timeline in the browser.
url_tweets = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url_tweets)

# Snapshot the rendered timeline and parse it with html.parser.
html = browser.html
soup = BeautifulSoup(html, features='html.parser')

In [12]:
# Locate the first <p> that matches the tweet-text class string.
weather_content = soup.find('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')

# Keep only the first text node that is a direct child of that <p>
# (`recursive=False`), so text inside nested tags such as an embedded <a> is
# left out. `string=` is the current name of the filter that used to be called
# `text=`; the old keyword is deprecated in Beautiful Soup.
mars_weather = weather_content.find(string=True, recursive=False)
mars_weather

'InSight sol 92 (2019-03-01) low -94.4ºC (-137.9ºF) high -12.9ºC (8.8ºF)\nwinds from the SW at 4.6 m/s (10.2 mph) gusting to 10.4 m/s (23.2 mph)\npressure at 7.20 hPa'

### 4. Scrape Mars facts - table with pandas

In [13]:
# import the splinter to browse continuous pages
from splinter import Browser

# Driver setup reference: https://splinter.readthedocs.io/en/latest/drivers/chrome.html
import shutil

# Shut down the Chrome session opened by an earlier section before starting a
# new one; otherwise each section leaves an orphaned browser window and
# chromedriver process behind.
try:
    browser.quit()
except NameError:
    pass  # fresh kernel: no earlier session exists yet

# Look chromedriver up on PATH (the pure-Python equivalent of the shell's
# `which chromedriver`); if it is not on PATH, fall back to the location this
# notebook originally hard-coded.
chromedriver_path = shutil.which('chromedriver') or '/usr/local/bin/chromedriver'

# Launch a visible (non-headless) Chrome window driven by Splinter.
executable_path = {'executable_path': chromedriver_path}
browser = Browser('chrome', **executable_path, headless=False)

# use pandas to scrape the table
import pandas as pd

In [14]:
# Open the Mars facts page in the Splinter-controlled browser.
# BeautifulSoup is not involved in this section: `mars_facts` holds the page
# URL, which the following cell passes to pandas.
mars_facts = 'http://space-facts.com/mars/'
browser.visit(mars_facts)

In [15]:
# Let pandas fetch the page and parse every <table> on it into a DataFrame;
# the result is a list with one frame per table.
tables = pd.read_html(io=mars_facts)

# The first table on the page is the one of interest.
mars_extract = tables[0]

# Render that DataFrame as an HTML table and write it to "mars_table.html".
mars_extract.to_html(buf="mars_table.html")

### 5. Scrape the hemisphere image URLs and titles

In [16]:
# Dependencies
from bs4 import BeautifulSoup

# import the splinter to browse continuous pages
from splinter import Browser

# Driver setup reference: https://splinter.readthedocs.io/en/latest/drivers/chrome.html
import shutil

# Shut down the Chrome session opened by an earlier section before starting a
# new one; otherwise each section leaves an orphaned browser window and
# chromedriver process behind.
try:
    browser.quit()
except NameError:
    pass  # fresh kernel: no earlier session exists yet

# Look chromedriver up on PATH (the pure-Python equivalent of the shell's
# `which chromedriver`); if it is not on PATH, fall back to the location this
# notebook originally hard-coded.
chromedriver_path = shutil.which('chromedriver') or '/usr/local/bin/chromedriver'

# Launch a visible (non-headless) Chrome window driven by Splinter.
executable_path = {'executable_path': chromedriver_path}
browser = Browser('chrome', **executable_path, headless=False)

In [17]:
# USGS Astrogeology site root and the search path for enhanced Mars hemisphere
# products; the root is kept separately because later cells prepend it to
# relative links.
url_astro_main = 'https://astrogeology.usgs.gov'
url_astro_each = '/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
search_url = url_astro_main + url_astro_each
browser.visit(search_url)

# Snapshot the rendered search-results page and parse it with html.parser.
html = browser.html
soup = BeautifulSoup(html, features='html.parser')

In [18]:
# Visit each hemisphere's detail page and record its title and full-size image
# URL as a {"title": ..., "img_url": ...} dict.
hemisphere_image_urls = []

# Read the result items from the search-results soup up front. Detail pages are
# parsed into their own variable (`detail_soup`) so that `soup` still holds the
# search-results page when the loop finishes; rebinding `soup` inside the loop
# would make a second run of this cell fail, because the detail page has no
# 'collapsible results' container.
results_panel = soup.find('div', class_='collapsible results')
hemisphere_items = results_panel.find_all('div', class_='item')

for item in hemisphere_items:
    # Relative link from the result item to that hemisphere's detail page.
    detail_path = item.a.get('href')

    # Navigate to the detail page and parse it separately from the results page.
    browser.visit(url_astro_main + detail_path)
    detail_soup = BeautifulSoup(browser.html, 'html.parser')

    # The 'wide-image' <img> carries a site-relative src for the full image.
    img_url_half = detail_soup.find('img', class_='wide-image').get('src')

    hemisphere_image_urls.append({
        "title": detail_soup.find('h2', class_='title').text,
        "img_url": url_astro_main + img_url_half,
    })

In [19]:
from pprint import pprint

# Pretty-print the collected list of {'title', 'img_url'} dicts, one per hemisphere detail page visited.
pprint(hemisphere_image_urls)

[{'img_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]
