In [2]:
import pymongo
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
import pandas as pd
import numpy as np
import time
import datetime

In [2]:
# Create BeautifulSoup object for NASA website
nasa_url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
# Bound the wait and fail fast on an HTTP error status, so a stalled or failed
# request is reported here instead of as an empty result in a later cell.
nasa_response = requests.get(nasa_url, timeout=30)
nasa_response.raise_for_status()
nasa_html = nasa_response.text
nasa_soup = bs(nasa_html, 'lxml')

In [3]:
# Get first title
title_results = nasa_soup.find_all('div', class_="content_title")
title_list = []
for result in title_results:
    # find() returns None when the div holds no link; skip those explicitly
    # instead of relying on a broad except to absorb the AttributeError.
    anchor = result.find('a')
    if anchor is None:
        continue
    title = anchor.text.strip()
    if title:
        title_list.append(title)
# Same exception type as the bare [0] lookup, but with an actionable message.
if not title_list:
    raise IndexError("No news titles found on the NASA page; the markup may have changed")
news_title = title_list[0]
news_title

"NASA's InSight Places First Instrument on Mars"

In [4]:
# Get first paragraph
p_results = nasa_soup.find_all('div', class_="rollover_description_inner")
# Strip each teaser once and keep only the non-empty ones
stripped_teasers = (p.text.strip() for p in p_results)
p_list = [par for par in stripped_teasers if par]
# Same exception type as the bare [0] lookup, but with an actionable message.
if not p_list:
    raise IndexError("No news paragraphs found on the NASA page; the markup may have changed")
news_p = p_list[0]
news_p

'In deploying its first instrument onto the surface of Mars, the lander completes a major mission milestone.'

In [5]:
# Create splinter browser instance
# Raw string: the backslashes in this Windows path (\P, \G, \C, \A, \c) are not
# valid escape sequences, so a normal string literal triggers invalid-escape warnings.
executable_path = {'executable_path': r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'}
browser = Browser('chrome', **executable_path)

In [6]:
# Scrape featured image from NASA
nasa_images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
# Quit the browser even when a step fails, so no Chrome/chromedriver process is left running.
try:
    browser.visit(nasa_images_url)
    # Click button for full image
    browser.find_by_css('.button').first.click()
    # Pause to let load
    time.sleep(3)
    # Click more info button
    browser.find_by_css('.button').last.click()
    # Get image name from "Image Details" and use to navigate to image URL
    partial_link = browser.find_by_css('.download_tiff').last.value.split(" ")[2]
    browser.click_link_by_partial_href(partial_link)
    featured_image_url = browser.url
finally:
    browser.quit()
featured_image_url

'https://photojournal.jpl.nasa.gov/jpeg/PIA19913.jpg'

In [7]:
# Create BeautifulSoup object for Twitter site
twitter_url = "https://twitter.com/marswxreport?lang=en"
# Bound the wait and fail fast on an HTTP error status, so a stalled or failed
# request is reported here instead of as an empty tweet list in the next cell.
twitter_response = requests.get(twitter_url, timeout=30)
twitter_response.raise_for_status()
twitter_html = twitter_response.text
twitter_soup = bs(twitter_html, 'lxml')

In [8]:
# Get tweets
tweets = twitter_soup.find_all('div', class_ = "content")
weather_only_tweets = []

# Loop through tweets to find weather report tweets
for tweet in tweets:
    # Eliminate retweets (and any block without a username span, where find() returns None)
    username = tweet.find('span', class_ = "username u-dir u-textTruncate")
    if username is None or username.text != "@MarsWxReport":
        continue
    text_tag = tweet.find('p', class_ = "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    if text_tag is None:
        continue
    tweet_content = text_tag.text.strip()
    # Eliminate non-weather tweets: reports start with the word "Sol"
    if tweet_content.split(" ")[0] == "Sol":
        weather_only_tweets.append(tweet_content)
# Same exception type as the bare [0] lookup, but with an actionable message.
if not weather_only_tweets:
    raise IndexError("No weather report tweets found; the Twitter markup may have changed")
# NOTE(review): the tweet text can end with an appended pic.twitter.com link;
# strip it downstream if only the report itself is wanted.
mars_weather = weather_only_tweets[0]
mars_weather

'Sol 2305 (2019-01-30), high -4C/24F, low -73C/-99F, pressure at 8.14 hPa, daylight 06:47-18:54pic.twitter.com/OTkUTDyRpu'

In [17]:
# Scrape the Mars facts table and render it as a one-line HTML string
facts_url = "https://space-facts.com/mars/"
facts_table = pd.read_html(facts_url)
# Take the first table on the page and use column 0 as the row index
facts_df = facts_table[0].set_index(0)

# Convert facts dataframe to HTML, then drop the newlines
# NOTE(review): the recorded output still shows a <thead> row carrying the index name 0.
facts_html = facts_df.to_html(header=False, index_names=False)
facts_html = facts_html.replace("\n", "")
facts_html

'<table border="1" class="dataframe">  <thead>    <tr>      <th>0</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.42 x 10^23 kg (10.7% Earth)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.52 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-153 to 20 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

In [10]:
# Create splinter browser instance
# Raw string: the backslashes in this Windows path (\P, \G, \C, \A, \c) are not
# valid escape sequences, so a normal string literal triggers invalid-escape warnings.
executable_path = {'executable_path': r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'}
browser = Browser('chrome', **executable_path)
# Open browser & visit website
usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(usgs_url)

In [11]:
# Make list of hemispheres
link_objects = browser.find_by_css('h3')
# Build the list directly; the heading text minus " Enhanced" is the hemisphere name.
# (A set comprehension used only for its append() side effect builds a throwaway set.)
hemisphere_list = [link.value.replace(" Enhanced", "") for link in link_objects]
hemisphere_list

['Cerberus Hemisphere',
 'Schiaparelli Hemisphere',
 'Syrtis Major Hemisphere',
 'Valles Marineris Hemisphere']

In [12]:
# Make list of urls
url_list = []
# Quit the browser even when a page fails to load or an element is missing,
# so no Chrome/chromedriver process is left running.
try:
    for hemisphere in hemisphere_list:
        # Open the hemisphere's detail page, read the full-size image src, then go back
        browser.click_link_by_partial_text(hemisphere)
        image_object = browser.find_by_css('img.wide-image')
        img_url = image_object['src']
        url_list.append(img_url)
        browser.back()
finally:
    browser.quit()
url_list

['https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg',
 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg',
 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg',
 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg']

In [13]:
# Pair each hemisphere name with its image URL, one dictionary per hemisphere
hemisphere_image_urls = [
    {"title": name, "url": image_url}
    for name, image_url in zip(hemisphere_list, url_list)
]
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere',
  'url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}]

In [4]:
def scrape(chromedriver_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'):
    """Scrape Mars data from several websites into a single dictionary.

    Args:
        chromedriver_path: Path to the ChromeDriver executable passed to
            splinter. Defaults to the location that used to be hard-coded.

    Returns:
        dict with these keys, in insertion order: "scrape_time" (timestamp
        string), "mars_news_title", "mars_news_p", "featured_image" (URL),
        "hemisphere_images" (list of {"title", "url"} dicts), "mars_weather"
        and "mars_facts" (HTML table string).

    Raises:
        IndexError: if no news title, news paragraph or weather tweet is found.
        requests.exceptions.RequestException: if a page request times out or
            returns an HTTP error status.
    """
    scrape_dict = {}

    # Update dictionary with scrape time
    scrape_dict["scrape_time"] = str(datetime.datetime.now())

    # Get most current news story from NASA's mars site
    news_title, news_p = _scrape_news()
    scrape_dict["mars_news_title"] = news_title
    scrape_dict["mars_news_p"] = news_p

    # One browser session serves both splinter scrapes; the finally clause makes
    # sure it is closed even when a step fails, so no Chrome process is leaked.
    browser = Browser('chrome', executable_path=chromedriver_path)
    try:
        scrape_dict["featured_image"] = _scrape_featured_image(browser)
        scrape_dict["hemisphere_images"] = _scrape_hemispheres(browser)
    finally:
        browser.quit()

    scrape_dict["mars_weather"] = _scrape_weather()
    scrape_dict["mars_facts"] = _scrape_facts()

    return scrape_dict


def _fetch_soup(url, timeout=30):
    """Download url and parse it with lxml; raises on timeout or HTTP error status."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return bs(response.text, 'lxml')


def _first(items, description):
    """Return items[0], or raise an IndexError that names what was missing."""
    if not items:
        raise IndexError(f"No {description} found; the page markup may have changed")
    return items[0]


def _scrape_news():
    """Return the first non-empty title and first non-empty teaser from NASA's Mars news page."""
    nasa_news_url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    nasa_soup = _fetch_soup(nasa_news_url)

    title_list = []
    for result in nasa_soup.find_all('div', class_="content_title"):
        # find() returns None when the div holds no link; skip it rather than
        # aborting the whole scrape.
        anchor = result.find('a')
        if anchor is None:
            continue
        title = anchor.text.strip()
        if title:
            title_list.append(title)

    stripped_teasers = (
        p.text.strip()
        for p in nasa_soup.find_all('div', class_="rollover_description_inner")
    )
    p_list = [par for par in stripped_teasers if par]

    return _first(title_list, "news title"), _first(p_list, "news paragraph")


def _scrape_featured_image(browser):
    """Click through the JPL space-images page and return the URL the browser ends on."""
    nasa_images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(nasa_images_url)
    # Click button for full image
    browser.find_by_css('.button').first.click()
    # Pause to let load
    time.sleep(3)
    # Click more info button
    browser.find_by_css('.button').last.click()
    # The third space-separated token of the last .download_tiff element's text
    # is used as the partial href of the link to follow.
    partial_link = browser.find_by_css('.download_tiff').last.value.split(" ")[2]
    browser.click_link_by_partial_href(partial_link)
    return browser.url


def _scrape_hemispheres(browser):
    """Return a list of {"title", "url"} dicts, one per hemisphere listed on the USGS page."""
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(usgs_url)
    link_objects = browser.find_by_css('h3')
    hemisphere_list = [link.value.replace(" Enhanced", "") for link in link_objects]

    hemisphere_image_urls = []
    for hemisphere in hemisphere_list:
        # Open the hemisphere's detail page, read the full-size image src, then go back
        browser.click_link_by_partial_text(hemisphere)
        image_object = browser.find_by_css('img.wide-image')
        hemisphere_image_urls.append({"title": hemisphere, "url": image_object['src']})
        browser.back()
    return hemisphere_image_urls


def _scrape_weather():
    """Return the text of the first @MarsWxReport tweet whose first word is "Sol"."""
    twitter_url = "https://twitter.com/marswxreport?lang=en"
    twitter_soup = _fetch_soup(twitter_url)

    weather_only_tweets = []
    for tweet in twitter_soup.find_all('div', class_="content"):
        # Eliminate retweets (and any block without a username span)
        username = tweet.find('span', class_="username u-dir u-textTruncate")
        if username is None or username.text != "@MarsWxReport":
            continue
        text_tag = tweet.find('p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
        if text_tag is None:
            continue
        tweet_content = text_tag.text.strip()
        # Eliminate non-weather tweets
        if tweet_content.split(" ")[0] == "Sol":
            weather_only_tweets.append(tweet_content)
    return _first(weather_only_tweets, "weather tweet")


def _scrape_facts():
    """Return the first table on the space-facts Mars page as a one-line HTML string."""
    facts_url = "https://space-facts.com/mars/"
    facts_df = pd.read_html(facts_url)[0]
    return facts_df.to_html(header=False, index=False).replace("\n", "")