## Step 1 - Scraping
_______________________________________________________________________________
Reminder: start the MongoDB server (`mongod`) before running this notebook.
_______________________________________________________________________________

In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd
from pprint import pprint

In [2]:
# Initialize PyMongo to work with MongoDB
# Connection URI: a MongoDB server on localhost, port 27017
conn = 'mongodb://localhost:27017'
# Client handle; the next cell uses it to select the database and collection
client = pymongo.MongoClient(conn)

In [3]:
# Select the database and the collection that hold the scraped Mars data
# (subscript access is equivalent to attribute access in PyMongo)
db = client['mars_db']
collection = db['marsdata']

In [4]:
# URLs of the pages to be scraped, one per section of this notebook
# NASA Mars News: latest news title and paragraph text
url1 = 'https://mars.nasa.gov/news/'
# JPL Mars Space Images: featured image
url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
# Mars Weather twitter account: latest weather tweet
url3 = 'https://twitter.com/marswxreport?lang=en'
# Mars Facts page: facts table read with pandas
url4 = 'http://space-facts.com/mars/'
# USGS Astrogeology search results: hemisphere images
url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

In [5]:
# Scrape Format:

##### Get request of url to be scraped
# response = requests.get(url)

##### Create Beautiful Soup Object and parse with 'lxml'
# soup = BeautifulSoup(response.text, 'lxml')

##### Print object to determine element that contains sought info
# print(soup.prettify())

### NASA Mars News
Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/) and collect the latest News Title and Paragraph Text.

In [6]:
# Retrieve the NASA Mars News page with the requests module.
# requests has no default timeout, so pass one explicitly to keep a stalled
# connection from hanging the notebook indefinitely (30 seconds here).
response1 = requests.get(url1, timeout=30)
# Stop here on an HTTP error status instead of silently parsing an error page
response1.raise_for_status()
# Create BeautifulSoup object; parse with 'lxml'
soup1 = BeautifulSoup(response1.text, 'lxml')

In [7]:
# Search for article location within soup object: every <div class="slide">
# NOTE: the name `results` is reassigned by the hemisphere search later in this
# notebook, so the next cell must run before that one overwrites it
results = soup1.find_all('div', class_='slide')

In [8]:
# Find latest article on Mars: the first matched slide is taken as the latest
latest_slide = results[0]
# Title text with line feeds removed
title_div = latest_slide.find("div", class_="content_title")
news_title = title_div.text.replace('\n', "")
# Teaser paragraph text with line feeds removed
para_div = latest_slide.find("div", class_="rollover_description_inner")
news_para = para_div.text.replace('\n', "")
print(f"{news_title} - {news_para}")

NASA Invests in Visionary Technology  - NASA is investing in technology concepts, including several from JPL, that may one day be used for future space exploration missions.


In [9]:
# Method for finding all news articles
# Loop through returned results
# for result in results:
    # Error handling
#    try:
        # Identify and return title of news article
#        news_title = result.find("div", class_="content_title").text
        # Identify and return paragraph of news article
#        news_para = result.find("div", class_="rollover_description_inner").text
#    except Exception as e:
#        print(e)
    # Dictionary to be inserted into MongoDB
#    post = {
#        'title': news_title,
#        'para': news_para,
#    }
    # Insert dictionary into MongoDB as a document
#    collection.insert_one(post)

### JPL Mars Space Images - Featured Image
Visit the url for JPL's Featured Space Image [here](https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars)

In [10]:
# Retrieve the JPL Mars Space Images page with the requests module.
# requests has no default timeout, so pass one explicitly to keep a stalled
# connection from hanging the notebook indefinitely (30 seconds here).
response2 = requests.get(url2, timeout=30)
# Stop here on an HTTP error status instead of silently parsing an error page
response2.raise_for_status()
# Create BeautifulSoup object; parse with 'lxml'
soup2 = BeautifulSoup(response2.text, 'lxml')
# print(soup2.prettify())

In [11]:
# First <li class="slide"> element on the page
slide = soup2.find('li', attrs={'class': 'slide'})
# First <a class="fancybox"> inside it; its data-* attributes are read in the next cell
pic_url = slide.find('a', attrs={'class': 'fancybox'})
# pic_url

In [12]:
# Site root prepended to the relative image path to form the full link to the image file
featured_image_base = 'https://www.jpl.nasa.gov'
# Relative path to the image file, read from the anchor's data attribute
featured_image_path = pic_url['data-fancybox-href']
featured_image_url = featured_image_base + featured_image_path
# Caption of the featured image
featured_image_title = pic_url['data-title']
print(f"{featured_image_title} - {featured_image_url}")

Lobo Vallis - https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA22374_hires.jpg


In [13]:
# Dictionary to be inserted into MongoDB
# post = {
#    'title': featured_image_title,
#    'img_url': featured_image_url,
# }
# Insert dictionary into MongoDB as a document
# collection.insert_one(post)

### Mars Weather
Visit the Mars Weather twitter account [here](https://twitter.com/marswxreport?lang=en) and scrape the latest Mars weather tweet from the page.

In [14]:
# Retrieve the Mars Weather twitter page with the requests module.
# requests has no default timeout, so pass one explicitly to keep a stalled
# connection from hanging the notebook indefinitely (30 seconds here).
response3 = requests.get(url3, timeout=30)
# Stop here on an HTTP error status instead of silently parsing an error page
response3.raise_for_status()
# Create BeautifulSoup object; parse with 'lxml'
soup3 = BeautifulSoup(response3.text, 'lxml')
# print(soup3.prettify())

In [15]:
# The first tweet-text container on the page is taken as the latest tweet
tweet_container = soup3.find('div', class_='js-tweet-text-container')
# Keep its text with line feeds removed
mars_weather = tweet_container.text.replace('\n', "")
mars_weather

'Sol 2026 (April 18, 2018), Sunny, high -6C/21F, low -73C/-99F, pressure at 7.19 hPa, daylight 05:26-17:21'

In [16]:
# Dictionary to be inserted into MongoDB
# post = {
#    'title': "Current Weather on Mars",
#    'para': mars_weather,
# }
# Insert dictionary into MongoDB as a document
# collection.insert_one(post)

### Mars Facts
Visit the Mars Facts webpage [here](http://space-facts.com/mars/) and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc. Use Pandas to convert the data to an HTML table string.

In [17]:
# Read the HTML tables found at the Mars Facts URL; pd.read_html returns a list of DataFrames
tables = pd.read_html(url4)
# tables

In [18]:
# Wrap the first scraped table and give its two columns descriptive names
mars_df = pd.DataFrame(tables[0])
mars_df.columns = ['Measurement', "Value"]
# Use the measurement names as the index (reassign rather than mutate in place)
mars_df = mars_df.set_index('Measurement')
mars_df.head()

Unnamed: 0_level_0,Value
Measurement,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"


In [19]:
# Convert the facts table to a nested dict: {column name: {index label: value}}
marsfactsdict = mars_df.to_dict()
marsfactsdict

{'Value': {'Equatorial Diameter:': '6,792 km',
  'First Record:': '2nd millennium BC',
  'Mass:': '6.42 x 10^23 kg (10.7% Earth)',
  'Moons:': '2 (Phobos & Deimos)',
  'Orbit Distance:': '227,943,824 km (1.52 AU)',
  'Orbit Period:': '687 days (1.9 years)',
  'Polar Diameter:': '6,752 km',
  'Recorded By:': 'Egyptian astronomers',
  'Surface Temperature:': '-153 to 20 °C'}}

In [20]:
# Method to convert table into html format and remove extra new line feed
# mars_html = mars_df.to_html()
# mars_html.replace('\n',"")
# mars_html

# Output dataframe to html file
# mars_df.to_html('mars_table.html')

### Mars Hemispheres
Visit the USGS Astrogeology site [here](https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) to obtain high resolution images for each of Mars's hemispheres. Save the full resolution hemisphere image URL and the hemisphere title (which contains the hemisphere name), and store them in a Python dictionary with keys `img_url` and `title`. Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [21]:
# Retrieve the USGS Astrogeology search results page with the requests module.
# requests has no default timeout, so pass one explicitly to keep a stalled
# connection from hanging the notebook indefinitely (30 seconds here).
response5 = requests.get(url5, timeout=30)
# Stop here on an HTTP error status instead of silently parsing an error page
response5.raise_for_status()
# Create BeautifulSoup object; parse with 'lxml'
soup5 = BeautifulSoup(response5.text, 'lxml')

In [22]:
# Search for hemispheres within soup object: every <div class="item">
# NOTE: this reuses the name `results` from the news section, replacing the news slides
results = soup5.find_all('div', class_='item')

In [23]:
# Site root prepended to relative links when building full hemisphere page URLs
img_dl_base = 'https://astrogeology.usgs.gov'
# Empty containers: the list collects one dictionary per hemisphere,
# the dictionary holds a single hemisphere's title and image url
hemilist = list()
hemidict = dict()

In [24]:
# Start from an empty list so re-running this cell does not append duplicates
hemilist = []
# Loop through returned results
for result in results:
    # Error handling: a hemisphere whose pages cannot be fetched or parsed is
    # reported and skipped
    try:
        # Identify and return hemisphere title
        hemi_title = result.find("h3").text
        # Relative link to the hemisphere's image download page
        img_dl_url = result.a['href']
        # construct link to the download page
        url = img_dl_base + img_dl_url
        # Fetch the download page; time out rather than hang, and treat an
        # HTTP error status as a failure instead of parsing an error page
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        # Separate name so the loop variable `result` is not overwritten
        downloads = soup.find("div", class_="downloads")
        # save hemisphere image url (first link inside the downloads section)
        hemi_url = downloads.a['href']
    except Exception as e:
        print(e)
        # Do not fall through: hemi_title / hemi_url would be unbound on the
        # first iteration or left over from the previous hemisphere
        continue
    # Python Dictionary of hemisphere title and image url
    hemidict = {
        'title': hemi_title,
        'img_url': hemi_url,
    }
    # Append hemisphere dictionary to list
    hemilist.append(hemidict)

In [25]:
# Pretty-print the collected hemisphere dictionaries (one per hemisphere)
pprint(hemilist)

[{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]


In [26]:
# List of Dictionaries to be inserted into MongoDB
# post = {
#    'hemi': hemilist,
# }
# Insert dictionary into MongoDB as a document
# collection.insert_one(post)

In [27]:
# Display the MongoDB records stored from the above scraping
# contents = db.marsdata.find()
# for content in contents:
#    pprint(content)

In [28]:
# USE FOR CLEANUP ONLY
# collection.drop()

#### Combined News, Featured Image, Weather, Facts, and Hemispheres Scraped Data into a Python Dictionary

In [30]:
# Combine every scraped result into one nested Python dictionary
# example on querying data: mars_data['news']['title']
# example on querying hemispheres data: mars_data['hemispheres'][0]['title']

# Latest news headline and teaser paragraph
news_section = {'title': news_title, 'para': news_para}
# Featured image caption and full image link
featured_section = {'title': featured_image_title, 'img_url': featured_image_url}
# Latest weather tweet under a fixed heading
weather_section = {'title': "Current Weather on Mars", 'para': mars_weather}

mars_data = {
    'news': news_section,
    'featured': featured_section,
    'weather': weather_section,
    'facts': marsfactsdict,
    'hemispheres': hemilist,
}
pprint(mars_data)

{'facts': {'Value': {'Equatorial Diameter:': '6,792 km',
                     'First Record:': '2nd millennium BC',
                     'Mass:': '6.42 x 10^23 kg (10.7% Earth)',
                     'Moons:': '2 (Phobos & Deimos)',
                     'Orbit Distance:': '227,943,824 km (1.52 AU)',
                     'Orbit Period:': '687 days (1.9 years)',
                     'Polar Diameter:': '6,752 km',
                     'Recorded By:': 'Egyptian astronomers',
                     'Surface Temperature:': '-153 to 20 °C'}},
 'featured': {'img_url': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA22374_hires.jpg',
              'title': 'Lobo Vallis'},
 'hemispheres': [{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
                  'title': 'Cerberus Hemisphere Enhanced'},
                 {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
        