In [1]:
# Import dependencies
import pandas as pd
import os
from bs4 import BeautifulSoup
import requests
from pprint import pprint
import json


In [2]:
# URLs of the pages to be scraped
# The human-facing NASA news page is "https://mars.nasa.gov/news/"; the URL below is the
# JSON API endpoint, asking for 40 items ordered by publish date, newest first.
nasa_news_url = "https://mars.nasa.gov/api/v1/news_items/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&blank_scope=Latest"
jpl_images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
# Site root that is prepended to the featured image's path further down
jpl_base = "https://www.jpl.nasa.gov"
mars_weather_url = "https://twitter.com/marswxreport?lang=en"
mars_facts_url = "http://space-facts.com/mars/"
usgs_images_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"


In [3]:
# Thin wrapper around requests.get, used for every page fetch in this notebook
def response(url, timeout=30):
    """Send an HTTP GET to ``url`` and return the ``requests.Response``.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without a timeout a stalled server would
            block the cell indefinitely.

    Returns:
        The ``requests.Response`` object. The status code is not inspected
        here, so 4xx/5xx responses are returned like any other.

    Raises:
        requests.exceptions.Timeout: if the server does not answer within
            ``timeout`` seconds.
    """
    # No local named ``response``: that would shadow this function inside its own body.
    return requests.get(url, timeout=timeout)
# Function to write response information to a file as JSON
def open_file(file_name, response):
    """Serialise ``response`` as JSON into ``file_name``.

    The file is created if missing and truncated if it already exists.

    Args:
        file_name: Path of the file to write.
        response: JSON-serialisable object to store.
    """
    with open(file_name, "w+") as handle:
        json.dump(response, handle)

In [4]:
# Mars news scrape - we will use the JSON response from the news API: "https://mars.nasa.gov/api/v1/news_items/"
# This is an API and returns a nested JSON with all the information we need (title, short description, etc.)

In [5]:
# Query the NASA news API, then decode the JSON body
news_response = response(nasa_news_url)
mars_news_json = news_response.json()

In [6]:
# Persist the decoded news JSON to disk
open_file(file_name="mars_news_data_file.json", response=mars_news_json)

In [7]:
# pprint(mars_news_json)

In [8]:
# Adapted from https://hackersandslackers.com/extract-data-from-complex-json-python/
# Collects the values stored under a given key anywhere inside nested JSON data
def extract_values(obj, key):
    """Return a list of the non-container values found under ``key`` in ``obj``.

    ``obj`` is walked depth-first, in order. Dicts and lists are descended
    into; any other value is collected when its dict key equals ``key``.
    A value that is itself a dict or list is searched rather than collected,
    even when it sits under ``key``.
    """

    def walk(node):
        """Yield matching values from ``node`` in traversal order."""
        if isinstance(node, dict):
            for name, value in node.items():
                if isinstance(value, (dict, list)):
                    yield from walk(value)
                elif name == key:
                    yield value
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)

    return list(walk(obj))

In [9]:
# Collect every "title" value found anywhere in the news JSON
titles = extract_values(obj=mars_news_json, key="title")
# print(titles)

In [10]:
# Extract the most recent title. The API URL asks for newest-first ordering
# (order=publish_date+desc), so the first collected title is presumably the latest article's.
# Raises IndexError if no titles were collected.
news_title = titles[0]

In [11]:
# print (news_title)

In [12]:
# Bonus: collect the short paragraph text stored under "description"
descriptions = extract_values(obj=mars_news_json, key="description")
# print (descriptions[0])

In [13]:
# Download the JPL Mars images page; its text is searched for the featured image further down
jpl_page = response(jpl_images_url)
mars_image_text = jpl_page.text
# Store the page text on disk (written through json.dump, i.e. as one JSON string)
open_file(file_name="jpl_image_file.json", response=mars_image_text)
    
# print(mars_image_text)

In [93]:
# Parse the JPL page and take the attribute dict of the first <a> inside its <footer>
soup = BeautifulSoup(mars_image_text, features='lxml')
footer_link = soup.footer.a
footer_tag = footer_link.attrs

In [98]:
# Build the URL of the featured image on the JPL Mars page:
# site root + the path stored in the link's data-fancybox-href attribute
img_url = footer_tag['data-fancybox-href']
featured_image_url = "".join((jpl_base, img_url))
# print(featured_image_url)

In [95]:
# Get the Mars weather account's Twitter page
twitter_page = response(mars_weather_url)
mars_twitter_text = twitter_page.text
# Store the page text on disk (written through json.dump, i.e. as one JSON string)
open_file(file_name="mars_weather_file.json", response=mars_twitter_text)

In [17]:
# Parse the Twitter page; the text of the first element with class "tweet-text"
# is taken as the weather report
soup = BeautifulSoup(mars_twitter_text, features='lxml')
latest_tweet = soup.find(class_='tweet-text')
mars_weather = latest_tweet.get_text()

In [97]:
# print(mars_weather)

In [19]:
# get mars facts from space-facts.com using pandas

In [20]:
# Read the HTML tables on the facts page, keep the first one and label its two columns
mars_fact_tables = pd.read_html(mars_facts_url, header=None)
dfs = mars_fact_tables[0].rename(columns={0: 'description', 1: 'value'})


In [99]:
# print(dfs)

In [22]:
# dfs is not a string: it is the renamed facts table produced by the read_html cell.
# Here it is passed to the DataFrame constructor and the result is bound to mars_facts_df.
mars_facts_df = pd.DataFrame(dfs)

In [100]:
# mars_facts_df.head()

In [88]:
# Get the USGS search-results page for the Mars hemisphere images
usgs_page = response(usgs_images_url)
mars_usgs_text = usgs_page.text
# Site root, prepended to the hrefs found on that page
astro_usgs_base_url = "https://astrogeology.usgs.gov"

In [85]:
# print(mars_usgs_text)

In [86]:
# Parse the search results and select the elements whose class is 'itemLink product-item'
soup = BeautifulSoup(mars_usgs_text, features='lxml')
mars_usgs_tags = soup.find_all(class_='itemLink product-item')
# pprint(mars_usgs_tags)

In [84]:
# Hemisphere names: the text of every <h3> heading on the results page
h3_tags = soup.find_all('h3')
hemi_titles = [heading.get_text() for heading in h3_tags]
print(hemi_titles)

['Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced', 'Syrtis Major Hemisphere Enhanced', 'Valles Marineris Hemisphere Enhanced']


In [89]:
# Absolute URL of each hemisphere's page: site root + the link's href
hrefs = [astro_usgs_base_url + link['href'] for link in mars_usgs_tags]

In [90]:
# Show the hemisphere page URLs held in hrefs
print(hrefs)

['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']


In [101]:
# Give each of the first four hemisphere page URLs its own variable.
# These will be used by Requests to get the high res image.
image1_url, image2_url, image3_url, image4_url = hrefs[0], hrefs[1], hrefs[2], hrefs[3]

In [102]:
# Fetch the page behind the first hemisphere URL and print its HTML
first_hemisphere_page = response(image1_url)
high_res_images = first_hemisphere_page.text
print(high_res_images)

<!DOCTYPE html>
<html lang="en">
	<head>
		<link rel="stylesheet" type="text/css" href="//ajax.googleapis.com/ajax/libs/jqueryui/1.11.4/themes/smoothness/jquery-ui.css" />
<title>Cerberus Hemisphere Enhanced | USGS Astrogeology Science Center</title>
		<meta name="description" content="Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from&hellip;"/>
		<meta name="keywords" content="USGS,Astrogeology Science Center,Cartography,Geology,Space,Geological Survey,Mapping"/>
		<meta http-equiv="X-UA-Compatible" content="IE=edge"/>
		<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
		<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1"/>
		<meta name="google-site-verification" content="x61hXXVj7wtfBSNOPnTftajMsZ5yB2W-qRoyr7GtOKM"/>
		<!--<link rel="stylesheet" href="http://fonts.googleapis.com/css?family=Open+Sans:400italic,400,bold"/>-->
		<link rel="stylesheet"

In [108]:
# Of the elements with target='_blank', the second one's href is taken as the
# full-resolution image URL
soup = BeautifulSoup(high_res_images, features='lxml')
new_tab_links = soup.find_all(target='_blank')
high_res_image = new_tab_links[1]['href']
print(high_res_image)

http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif
