In [30]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
import json
import datetime

In [2]:
# Launch a visible (non-headless) Chrome session; the driver location is
# passed through to splinter as the `executable_path` keyword.
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

#### NASA Website Scraping (Article Title + Teaser)

In [3]:
# Open the NASA Mars news page in the shared browser session.
url_nasa = 'https://mars.nasa.gov/news/'
browser.visit(url_nasa)

In [7]:
# Snapshot the browser's current HTML and parse it with BeautifulSoup.
html = browser.html
soup = bs(html, "html.parser")

In [8]:
# Use the second 'content_title' div on the page and trim the whitespace
# around its text.
title_divs = soup.find_all('div', attrs={'class': 'content_title'})
article_title = title_divs[1].text.strip()
article_title

"How NASA's Mars Helicopter Will Reach the Red Planet's Surface"

In [9]:
# Use the second 'article_teaser_body' div on the page; its text is kept
# as-is (no stripping).
teaser_divs = soup.find_all('div', attrs={'class': 'article_teaser_body'})
article_teaser = teaser_divs[1].text
article_teaser

"The Red Planet's surface has been visited by eight NASA spacecraft. The ninth will be the first that includes a roundtrip ticket in its flight plan. "

#### JPL Featured Image

In [10]:
# Open the JPL space-images page filtered to the Mars category, then click
# the element whose id is "full_image".
# NOTE(review): presumably the click opens the full-size image overlay that
# the following cells parse — confirm against the live page.
url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url_jpl)
browser.click_link_by_id("full_image")

In [13]:
# Parse the browser's current HTML (taken after the click in the previous cell).
html_jpl = browser.html
soup_jpl = bs(html_jpl, "html.parser")

In [14]:
# Read the `src` attribute of the first <img> carrying the 'fancybox-image' class.
fancybox_img = soup_jpl.find('img', attrs={'class': 'fancybox-image'})
temp = fancybox_img['src']
temp

'/spaceimages/images/mediumsize/PIA08003_ip.jpg'

In [15]:
# Prefix the image path with the JPL host to get an absolute URL.
figure_url = f'https://www.jpl.nasa.gov{temp}'
figure_url

'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA08003_ip.jpg'

#### Mars Twitter

In [16]:
# Open the @marswxreport Twitter profile in the shared browser session.
url_twitter = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url_twitter)

In [19]:
# Snapshot the browser's current HTML for the Twitter page and parse it.
html_twitter = browser.html
soup_twitter = bs(html_twitter, "html.parser")

In [25]:
# Locate the first tweet container and collect every <span> inside it.
# The spans are stored in `tweet_spans` because the next cell iterates over
# that name; previously it was only displayed, never assigned, so the
# notebook failed with NameError on a fresh kernel.
tweet_divs = soup_twitter.find('div',{'data-testid':'tweet'})
tweet_spans = tweet_divs.find_all('span')
tweet_spans

[<span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0"><span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">Mars Weather</span></span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">Mars Weather</span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">@MarsWxReport</span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">·</span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">InSight sol 564 (2020-06-28) low -89.3ºC (-128.8ºF) high -4.3ºC (24.2ºF)
 winds from the SSW at 5.0 m/s (11.3 mph) gusting to 15.4 m/s (34.4 mph)
 pressure at 7.70 hPa</span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-n6v787 r-1sf4r6n r-1n0xq6e r-bcqeeo r-d3hbe1 r-1wgg2b2 r-axxi2z r-qvutc0"><span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">1</span></span>,
 <span class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0">1</sp

In [None]:
# Keep the text of the first span mentioning "InSight sol". Falling back to
# None guarantees `tweet` is always defined, even when no span matches.
tweet = next(
    (span.text for span in tweet_spans if "InSight sol" in span.text),
    None,
)

In [None]:
# Display the tweet text selected in the previous cell.
tweet

#### Mars Facts

In [43]:
# Read the first HTML table found on the Mars facts page and relabel its
# first two columns.
facts_tables = pd.read_html('https://space-facts.com/mars/')
mars_facts = facts_tables[0]
column_labels = {
    mars_facts.columns[0]: 'Attribute',
    mars_facts.columns[1]: 'Value',
}
mars_facts = mars_facts.rename(columns=column_labels)
mars_facts

Unnamed: 0,Attribute,Value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


#### Hemisphere Images

In [None]:
# Open the USGS Astrogeology search results for "hemisphere enhanced" with
# the target set to Mars.
url_hemisphere = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url_hemisphere)

In [None]:
# Re-parse the browser's current HTML; this rebinds `html` and `soup`, which
# earlier cells used for the NASA news page.
html = browser.html
soup = bs(html, "html.parser")

In [None]:
# Collect the result anchors (class "itemLink product-item"); their hrefs
# are read in the next cell. The former second tag name 'img alt' was
# dropped: it is not a tag name and could never match anything.
images = soup.find_all('a', {'class':'itemLink product-item'})
images

In [None]:
# Turn each anchor's relative href into an absolute USGS URL.
hemi_links = [
    'https://astrogeology.usgs.gov' + anchor['href']
    for anchor in images
]

In [None]:
# Drop duplicate links while keeping first-seen (page) order. A plain set()
# also de-duplicates, but its iteration order is arbitrary, which made the
# order of the scraped hemispheres vary between runs.
hemi_links = list(dict.fromkeys(hemi_links))
hemi_links

In [None]:
# Trial run on one hard-coded detail page (the Cerberus link) before looping
# over every collected link further down.
temp_link = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'
browser.visit(temp_link)
html = browser.html
soup = bs(html, "html.parser")

In [None]:
# Read the `src` of the 'wide-image' <img> and prefix it with the USGS host.
wide_image = soup.find(['img'], attrs={'class': 'wide-image'})
img_part = wide_image['src']
img_links = f'https://astrogeology.usgs.gov{img_part}'
img_links

In [None]:
# Text of the first <h2 class="title"> element on the currently parsed page.
soup.find(['h2'], {'class':'title'}).text

In [None]:
# Visit every hemisphere detail page and record its heading text together
# with the absolute URL of its 'wide-image' picture.
hemi_list = []

for link in hemi_links:
    browser.visit(link)
    html = browser.html
    soup = bs(html, "html.parser")

    image_src = soup.find(['img'], attrs={'class': 'wide-image'})['src']
    heading = soup.find(['h2'], attrs={'class': 'title'}).text

    hemi_list.append({
        'title': heading,
        'img_url': 'https://astrogeology.usgs.gov' + image_src,
    })

In [None]:
# Display the list of {'title', 'img_url'} dicts built by the loop above.
hemi_list

### Put It All Together

In [44]:
def scrape():
    """Scrape several Mars-related sites and bundle the results in one dict.

    Drives a headless Chrome session through the NASA Mars news page, the
    JPL space-images page, the @marswxreport Twitter profile and the USGS
    hemisphere search results, and reads the Mars facts table with pandas.

    Returns:
        dict: with the keys
            "ArticleTitle"     - stripped text of the selected title div,
            "ArticleTeaser"    - text of the selected teaser div,
            "featureImageURL"  - absolute URL of the JPL featured image,
            "tweetWeatherText" - text of the first span containing
                                 "InSight sol", or None when the tweet
                                 container or a matching span is missing,
            "marsStats"        - the facts table as a list of row records,
            "HemisphereImages" - list of {"title", "img_url"} dicts, in the
                                 order the links appear on the results page,
            "dateScraped"      - datetime.datetime.now() at return time.

    Raises:
        Any exception raised while scraping propagates to the caller; the
        browser is always shut down first so no Chrome process is leaked.
    """
    # Seconds to pause after each navigation before reading the page HTML.
    page_wait = 1

    # Executable path + browser
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)

    try:
        # --- NASA news: article title + teaser ---
        browser.visit('https://mars.nasa.gov/news/')
        time.sleep(page_wait)
        soup = bs(browser.html, "html.parser")

        # The second match (index 1) is used for both fields.
        # NOTE(review): presumably the first match is not the latest
        # article — confirm against the live page.
        article_title = soup.find_all('div', {'class': 'content_title'})[1].text.strip()
        article_teaser = soup.find_all('div', {'class': 'article_teaser_body'})[1].text

        # --- JPL featured image ---
        browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
        browser.click_link_by_id("full_image")
        time.sleep(page_wait)
        soup_jpl = bs(browser.html, "html.parser")

        image_src = soup_jpl.find('img', {'class': 'fancybox-image'})['src']
        figure_url = 'https://www.jpl.nasa.gov' + image_src

        # --- Mars weather tweet ---
        browser.visit('https://twitter.com/marswxreport?lang=en')
        time.sleep(page_wait)
        soup_twitter = bs(browser.html, "html.parser")

        # Both the tweet container and the matching span may be absent;
        # fall back to None instead of failing with AttributeError or an
        # unbound `tweet` when the result dict is built.
        tweet_div = soup_twitter.find('div', {'data-testid': 'tweet'})
        tweet_spans = tweet_div.find_all('span') if tweet_div is not None else []
        tweet = next(
            (span.text for span in tweet_spans if "InSight sol" in span.text),
            None,
        )

        # --- Mars facts table ---
        mars_facts = pd.read_html('https://space-facts.com/mars/')[0]
        mars_facts = mars_facts.rename(columns={
            mars_facts.columns[0]: 'Attribute',
            mars_facts.columns[1]: 'Value',
        })
        # Round-trip through JSON to get plain Python records.
        mars_json = json.loads(mars_facts.to_json(orient="records"))

        # --- Hemisphere images ---
        browser.visit('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')
        time.sleep(page_wait)
        soup = bs(browser.html, "html.parser")

        anchors = soup.find_all('a', {'class': 'itemLink product-item'})
        # De-duplicate while keeping page order (a set would visit the
        # detail pages in arbitrary order).
        hemi_links = list(dict.fromkeys(
            'https://astrogeology.usgs.gov' + anchor['href'] for anchor in anchors
        ))

        hemi_list = []
        for link in hemi_links:
            browser.visit(link)
            time.sleep(page_wait)
            soup = bs(browser.html, "html.parser")

            partial = soup.find(['img'], {'class': 'wide-image'})['src']
            title = soup.find(['h2'], {'class': 'title'}).text
            hemi_list.append({
                'title': title,
                'img_url': 'https://astrogeology.usgs.gov' + partial,
            })
    finally:
        # Quit the browser whether or not scraping succeeded.
        browser.quit()

    # Make a dictionary
    mars_dict = {
        "ArticleTitle": article_title,
        "ArticleTeaser": article_teaser,
        "featureImageURL": figure_url,
        "tweetWeatherText": tweet,
        "marsStats": mars_json,
        "HemisphereImages": hemi_list,
        "dateScraped": datetime.datetime.now()
    }

    return mars_dict

In [45]:
# Run the full scrape once and display the resulting dictionary.
marsNews = scrape()
marsNews

{'ArticleTitle': "How NASA's Mars Helicopter Will Reach the Red Planet's Surface",
 'ArticleTeaser': "The Red Planet's surface has been visited by eight NASA spacecraft. The ninth will be the first that includes a roundtrip ticket in its flight plan. ",
 'featureImageURL': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA18846_ip.jpg',
 'tweetWeatherText': 'InSight sol 564 (2020-06-28) low -89.3ºC (-128.8ºF) high -4.3ºC (24.2ºF)\nwinds from the SSW at 5.0 m/s (11.3 mph) gusting to 15.4 m/s (34.4 mph)\npressure at 7.70 hPa',
 'marsStats': [{'Attribute': 'Equatorial Diameter:', 'Value': '6,792 km'},
  {'Attribute': 'Polar Diameter:', 'Value': '6,752 km'},
  {'Attribute': 'Mass:', 'Value': '6.39 × 10^23 kg (0.11 Earths)'},
  {'Attribute': 'Moons:', 'Value': '2 (Phobos & Deimos)'},
  {'Attribute': 'Orbit Distance:', 'Value': '227,943,824 km (1.38 AU)'},
  {'Attribute': 'Orbit Period:', 'Value': '687 days (1.9 years)'},
  {'Attribute': 'Surface Temperature:', 'Value': '-87 to -5 °