In [1]:
import numpy as np
import pandas as pd
from splinter import Browser
from selenium import webdriver
from bs4 import BeautifulSoup
import requests as req

Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/) and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.
# Example:
news_title = "NASA's Next Mars Mission to Investigate Interior of Red Planet"
news_p = "Preparation of NASA's next spacecraft to Mars, InSight, has ramped up this summer, on course for launch next May from Vandenberg Air Force Base in central California -- the first interplanetary launch in history from America's West Coast."

In [50]:
news_data = {}                  # initializes empty dictionary for news data
paragraph_text = []             # initializes empty list for news article paragraph

In [51]:
browser.visit("https://mars.nasa.gov/news/")            # visits search URL with automated browser
nasa_soup = BeautifulSoup(browser.html, 'html.parser')  # sends response to beautiful soup

WebDriverException: Message: Failed to decode response from marionette


In [4]:
soup_div = nasa_soup.find(class_="slide")                   # finds class
soup_news = soup_div.find_all('a')                          # finds all anchors
news_title = soup_news[1].get_text().strip()                # extracts and cleans title

In [5]:
soup_p = soup_div.find_all('a', href=True)          # finds paragraphs
soup_p_url = soup_p[0]['href']                      # gets paragraphs URL
paragraph_url = base_url + soup_p_url               # concatenates URL for paragraph
response_2 = req.get(paragraph_url)                 # acquires second response from URL
para_soup = BeautifulSoup(response_2.text, "html.parser") # sends response to beautiful soup
ww_paragraphs = para_soup.find(class_='wysiwyg_content')# finds class
paragraphs = ww_paragraphs.find_all('p')                # finds paragraphs

In [6]:
for paragraph in paragraphs:                            # iterates through paragraphs
    clean_paragraph = paragraph.get_text().strip()      # extracts and cleans paragraphs    
    paragraph_text.append(clean_paragraph)              # appends to list


In [7]:
news_data["news_title"] = news_title                    # adds title to dictionary

In [8]:
news_data["paragraph_text_1"] = paragraph_text[0]       # adds paragraph summary to dictionary

In [9]:
news_data["paragraph_text_2"] = paragraph_text[1]   # adds paragraph detail to dictionary

In [10]:
news_data          # displays dictionary

{'news_title': "NASA's Opportunity Rover Mission on Mars Comes to End",
 'paragraph_text_1': "One of the most successful and enduring feats of interplanetary exploration, NASA's Opportunity rover mission is at an end after almost 15 years exploring the surface of Mars and helping lay the groundwork for NASA's return to the Red Planet.",
 'paragraph_text_2': "The Opportunity rover stopped communicating with Earth when a severe Mars-wide dust storm blanketed its location in June 2018. After more than a thousand commands to restore contact, engineers in the Space Flight Operations Facility at NASA's Jet Propulsion Laboratory (JPL) made their last attempt to revive Opportunity Tuesday, to no avail. The solar-powered rover's final communication was received June 10."}

In [11]:
#This code tests to make sure the firefox driver has been installed properly. It took a lot of troubleshooting
#Firefox doesn't allow direct communciations between firefox and selenium, the geckodrier is the proxy
#Because this change was made with version 48 of Firefox, prior instructions are incorrect
#It required me to find the correct $PATH to place the geckodriver that proxies between firefox/selenium

from selenium import webdriver;
browser = webdriver.Firefox();
browser.get('http://www.seleniumhq.org');

In [12]:
#browser.visit seems to be incompatible with the lastest version of firefox/selenium
#My workaround was to use requests.get instead of browser.visit to acquire responses from the url

jpl_fullsize_url = 'https://photojournal.jpl.nasa.gov/jpeg/'    # defines base URL for fullsize images
jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"      # defines search URL
mars_url='https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' # sets url to alias "mars_url"
response5 = req.get(mars_url)                  # acquires first response from URL
jpl_html = response5  # acquires reesponse from URL
jpl_soup = BeautifulSoup(jpl_html.text, 'html.parser')  # sends response to beautiful soup

In [13]:
featured_image_list = []                                                     # initializes empty list

for image in jpl_soup.find_all('div',class_="img"):                          # extracts all images
    featured_image_list.append(image.find('img').get('src'))                 # appends URL to list

In [14]:
feature_image = featured_image_list[0]                      # extracts first image found
temp_list_1 = feature_image.split('-')                      # splits on '-' (removes size limiters)
temp_list_2 = temp_list_1[0].split('/')                             # splits on '/' (parses out base filename)
featured_image_url = jpl_fullsize_url + temp_list_2[-1] + '.jpg'    # concatenates fullsize image URL

In [15]:
featured_image_url                                                           # displays URL

'https://photojournal.jpl.nasa.gov/jpeg/PIA22928.jpg'

In [16]:
browser.quit()                                                               # closes automated browser

### Mars Weather

* Visit the Mars Weather twitter account [here](https://twitter.com/marswxreport?lang=en) and scrape the latest Mars weather tweet from the page. Save the tweet text for the weather report as a variable called `mars_weather`.

In [26]:
#browser.visit seems to be incompatible with the lastest version of firefox/selenium
#My workaround was to use requests.get instead of browser.visit to acquire responses from the url
browser = Browser('firefox', headless=False)                    # defines browser
tweet_url = 'https://twitter.com/marswxreport?lang=en'          # defines search URL
response6 = requests.get(tweet_url)                             # acquires response from URL
tweet_html = response6                                            # acquires reesponse from URL
tweet_soup = BeautifulSoup(tweet_html.text, 'html.parser')          # sends response to beautiful soup

In [27]:
weather_info_list = []                                                       # initializes empty list
# extracts all tweets from soup
for weather_info in tweet_soup.find_all('p',class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"):
    weather_info_list.append(weather_info.text.strip())                      # appends cleaned tweet to list

In [28]:
for value in reversed(weather_info_list):                                    # loops through list backwards
    if value[:3]=='Sol':                                                     # isolates weather tweet
        mars_weather = value                                                 # assigns to variable

In [29]:
mars_weather         

'Sol 2319 (2019-02-13), high -17C/1F, low -72C/-97F, pressure at 8.12 hPa, daylight 06:46-18:52pic.twitter.com/anlHR95BMs'

In [30]:
browser.quit()                                                               # closes automated browser

### Mars Facts

* Visit the Mars Facts webpage [here](http://space-facts.com/mars/) and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.

* Use Pandas to convert the data to a HTML table string.

In [2]:
facts_url = 'https://space-facts.com/mars/'                      # defines search URL
fact_list = pd.read_html(facts_url)                              # extracts data from URL using pandas
facts_df = fact_list[0]                                          # converts list to dataframe
facts_table = facts_df.to_html(header=False, index=False)        # converts dataframe to html table
print(facts_table)  

<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td>Equatorial Diameter:</td>
      <td>6,792 km</td>
    </tr>
    <tr>
      <td>Polar Diameter:</td>
      <td>6,752 km</td>
    </tr>
    <tr>
      <td>Mass:</td>
      <td>6.42 x 10^23 kg (10.7% Earth)</td>
    </tr>
    <tr>
      <td>Moons:</td>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <td>Orbit Distance:</td>
      <td>227,943,824 km (1.52 AU)</td>
    </tr>
    <tr>
      <td>Orbit Period:</td>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <td>Surface Temperature:</td>
      <td>-153 to 20 °C</td>
    </tr>
    <tr>
      <td>First Record:</td>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <td>Recorded By:</td>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>


### Mars Hemispheres

* Visit the USGS Astrogeology site [here](https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) to obtain high resolution images for each of Mar's hemispheres.

In [46]:
#browser.visit seems to be incompatible with the lastest version of firefox/selenium
#My workaround was to use requests.get instead of browser.visit to acquire responses from the url

browser = Browser('firefox', headless=False)                                  # defines browser                                 
browser.visit('https://astrogeology.usgs.gov/maps/mars-viking-hemisphere-point-perspectives')

# defines search URL
#usgs_url = 'https://astrogeology.usgs.gov/maps/mars-viking-hemisphere-point-perspectives'
#response7 = req.get(usgs_url)                             # acquires response from URL
usgs_html = response7                                          # acquires reesponse from URL
#usgs_soup = BeautifulSoup(usgs_html.text, 'html.parser')       # sends response to beautiful soup
hemis_titles = browser.find_by_css('a.item')  #get all titles to the hemispheres on the page

In [None]:
browser.visit

In [47]:
print(hemis_titles)

[<splinter.driver.webdriver.WebDriverElement object at 0x1226b79e8>, <splinter.driver.webdriver.WebDriverElement object at 0x1226b7320>, <splinter.driver.webdriver.WebDriverElement object at 0x1226b79b0>, <splinter.driver.webdriver.WebDriverElement object at 0x1226b7b70>, <splinter.driver.webdriver.WebDriverElement object at 0x1226b7ba8>, <splinter.driver.webdriver.WebDriverElement object at 0x1226b7be0>, <splinter.driver.webdriver.WebDriverElement object at 0x1226b7c18>, <splinter.driver.webdriver.WebDriverElement object at 0x1226b7c50>]


* You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.

* Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys `img_url` and `title`.

* Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [38]:
#usgs_html = browser.html
#usgs_soup = BeautifulSoup(usgs_html, 'html.parser')

hemisphere_image_urls = []     #create empty list to put titles

hemis_titles = usgs_soup.find_all("a.item")  #get all titles to the hemispheres on the page
#print(hemis_title)
print(hemis_titles)

[]


In [18]:


for i in range(len(hemis_titles)):          #read in all 4 hemisphere titles
    hemis_title = hemis_titles[i].text
    print(hemis_title)
    
    hemis_images = browser.find_by_tag(a.item)  #click the headers 
    hemis_images[i].click()
    
    usgs_html = browser.html
    usgs_soup = BeautifulSoup(html, 'html.parser')
    
    for m in range(len(img_url)):
        img_url = usgs_soup.find('img', class_='wide-image')['src']
        img_url = "https://astrogeology.usgs.gov" + img_url
        print(img_url)
    
    hemis_dict = {"title": hemis_title, "img_url":img_url}
    hemisphere_image_urls.append(hemis_dict)
    
    browser.back()

NameError: name 'hemis_title' is not defined

In [19]:
hemis_dict                                                     # displays list of dictionaries

NameError: name 'hemis_dict' is not defined

In [None]:
browser.quit()                                                               # closes automated browser