# Module 12 Challenge
## Deliverable 1: Scrape Titles and Preview Text from Mars News

In [1]:
# Import Splinter, BeautifulSoup and other dependencies 
from splinter import Browser
from bs4 import BeautifulSoup as bs
from webdriver_manager.chrome import ChromeDriverManager
import requests
import pymongo
import json

In [2]:
# Open a PyMongo client against the MongoDB server at localhost:27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [3]:
# Pick the `mars_db` database and its `articles` collection
db = client['mars_db']
collection = db['articles']

In [4]:
# Start a visible (non-headless) Chrome session through Splinter, pointing it
# at the driver location returned by ChromeDriverManager().install()
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

### Step 1: Visit the Website

1. Use automated browsing to visit the [Mars NASA news site](https://redplanetscience.com). Inspect the page to identify which elements to scrape.

      > **Hint** To identify which elements to scrape, you might want to inspect the page by using Chrome DevTools.

In [5]:
# Visit the Mars NASA news site with the automated browser
url = "https://redplanetscience.com"
browser.visit(url)

# Also fetch the page over plain HTTP so the response status can be inspected.
# A timeout is supplied because requests otherwise waits indefinitely on an
# unresponsive server, which would hang the notebook.
response = requests.get(url, timeout=30)

In [6]:
# Show the response object (its repr carries the status code; 200 is success)
# and then the raw bytes of the body, each on its own line
print(response, response.content, sep="\n")

<Response [200]>
b'<!DOCTYPE html>\r\n<html>\r\n   <head>\r\n      <meta charset="utf-8">\r\n      <meta name="viewport" content="width=device-width, initial-scale=1">\r\n      <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-giJF6kkoqNQ00vy+HMDP7azOuL0xtbfIcaT9wjKHr8RbDVddVHyTfAAsrekwKmP1" crossorigin="anonymous">\r\n      <link rel="stylesheet" type="text/css" href="css/font.css">\r\n      <link rel="stylesheet" type="text/css" href="css/app.css">\r\n      <link rel="stylesheet" href="https://pro.fontawesome.com/releases/v5.10.0/css/all.css" integrity="sha384-AYmEC3Yw5cVb3ZcuHtOA93w35dYTsvhLPVnYs9eStHfGJvOvKxVfELGroGkvsg+p" crossorigin="anonymous"/>\r\n      <title>News - Mars Exploration Program</title>\r\n   </head>\r\n   <body>\r\n      <div class="col-md-12">\r\n      <div class="row">\r\n      <nav class="navbar navbar-expand-lg navbar-light fixed-top">\r\n         <div class="container-fluid">\r\n     

### Step 2: Scrape the Website

Create a Beautiful Soup object and use it to extract text elements from the website.

In [7]:
# Build a Beautiful Soup object from the HTML text fetched with requests,
# using the 'html.parser' parser, then show the object's type
soup = bs(markup=response.text, features='html.parser')
type(soup)

bs4.BeautifulSoup

In [8]:
# Parse the HTML currently held by the Splinter browser with the lxml parser;
# this rebinds `soup`, so later cells work on the browser's copy of the page
html = browser.html
soup = bs(markup=html, features='lxml')

In [9]:
# Look the <title> tag up once and reuse it for the three prints below
title_tag = soup.title

# The tag itself
print(title_tag)

# The tag's name
print(title_tag.name)

# The name of the tag that contains it
print(title_tag.parent.name)

<title>News - Mars Exploration Program</title>
title
head


In [10]:
# Render the soup in its indented, human-readable form and print it
pretty_html = soup.prettify()
print(pretty_html)

<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link crossorigin="anonymous" href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta1/dist/css/bootstrap.min.css" integrity="sha384-giJF6kkoqNQ00vy+HMDP7azOuL0xtbfIcaT9wjKHr8RbDVddVHyTfAAsrekwKmP1" rel="stylesheet"/>
  <link href="css/font.css" rel="stylesheet" type="text/css"/>
  <link href="css/app.css" rel="stylesheet" type="text/css"/>
  <link crossorigin="anonymous" href="https://pro.fontawesome.com/releases/v5.10.0/css/all.css" integrity="sha384-AYmEC3Yw5cVb3ZcuHtOA93w35dYTsvhLPVnYs9eStHfGJvOvKxVfELGroGkvsg+p" rel="stylesheet"/>
  <title>
   News - Mars Exploration Program
  </title>
 </head>
 <body>
  <div class="col-md-12">
   <div class="row">
    <nav class="navbar navbar-expand-lg navbar-light fixed-top">
     <div class="container-fluid">
      <a class="navbar-brand" href="#">
       <img src="image/nasa.png" width="80"/>
       <span class="logo">
        MA

In [11]:
# Display all of the text contained in the document
soup.get_text()

"\n\n\n\n\n\n\nNews - Mars Exploration Program\n\n\n\n\n\n\n\nMARS Planet Science\nExploration Program\n\n\n\n\n\n\n\nThe Red Planet\n\n\nThe Program\n\n\nNews & Events\n\n\nMultimedia\n\n\nMissions\n\n\nMore\n\n\n\n\n\n\n\n\n\n\nNews\n\n\n\n\n\n\n\n\nLatest\n20202019\n\n\n\nAll Categories\nFeature StoriesPress ReleasesStatus Reports\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJanuary 15, 2023\nFollow NASA's Perseverance Rover in Real Time on Its Way to Mars\nA crisply rendered web application can show you where the agency's Mars 2020 mission is right now as it makes its way to the Red Planet for a Feb. 18, 2021, landing.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJanuary 14, 2023\nNASA's Mars Rover Drivers Need Your Help\nUsing an online tool to label Martian terrain types, you can train an artificial intelligence algorithm that could improve the way engineers guide the Curiosity rover.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJanuary 11, 2023\nA Martian Roundtrip: NASA's Perseverance Rover Sample Tubes\nMarvels of

In [12]:

# Collect every <div> whose class is "col-md-4" and print each one
results = soup.find_all('div', attrs={'class': 'col-md-4'})
for column_div in results:
    print(column_div)

<div class="col-md-4">
<div class="list_image">
<img src="https://mars.nasa.gov/system/news_items/list_view_images/8742_Mars2020-Earth-226.jpg"/>
</div>
</div>
<div class="col-md-4">
<div class="list_image">
<img src="https://mars.nasa.gov/system/news_items/list_view_images/8689_PIA23897-320x240.jpg"/>
</div>
</div>
<div class="col-md-4">
<div class="list_image">
<img src="https://mars.nasa.gov/system/news_items/list_view_images/8822_1-PIA24304-CatScanMars-320x240.gif"/>
</div>
</div>
<div class="col-md-4">
<div class="list_image">
<img src="https://mars.nasa.gov/system/news_items/list_view_images/8654_MAIN-IMAGE-PIA23881-320x240.jpg"/>
</div>
</div>
<div class="col-md-4">
<div class="list_image">
<img src="https://mars.nasa.gov/system/news_items/list_view_images/8606_list_image.jpg"/>
</div>
</div>
<div class="col-md-4">
<div class="list_image">
<img src="https://mars.nasa.gov/system/news_items/list_view_images/8705_PIA23896-226.jpg"/>
</div>
</div>
<div class="col-md-4">
<div class="

In [13]:
# Collect every <div class="article_teaser_body"> and print each one
results = soup.find_all('div', attrs={'class': 'article_teaser_body'})
for teaser_div in results:
    print(teaser_div)

<div class="article_teaser_body">A crisply rendered web application can show you where the agency's Mars 2020 mission is right now as it makes its way to the Red Planet for a Feb. 18, 2021, landing.</div>
<div class="article_teaser_body">Using an online tool to label Martian terrain types, you can train an artificial intelligence algorithm that could improve the way engineers guide the Curiosity rover.</div>
<div class="article_teaser_body">Marvels of engineering, the rover's sample tubes must be tough enough to safely bring Red Planet samples on the long journey back to Earth in immaculate condition. </div>
<div class="article_teaser_body">Like much of the rest of the world, the Mars rover team is pushing forward with its mission-critical work while putting the health and safety of their colleagues and community first.</div>
<div class="article_teaser_body">Robotic spacecraft will be able to communicate with the dish using radio waves and lasers.</div>
<div class="article_teaser_body"

### Step 3: Store the Results

Extract the titles and preview text of the news articles that you scraped. Store the scraping results in Python data structures as follows:

* Store each title-and-preview pair in a Python dictionary, and give each dictionary two keys: `title` and `preview`. For example:

  ```python
  {'title': "Mars Rover Begins Mission!", 
        'preview': "NASA's Mars Rover begins a multiyear mission to collect data about the little-explored planet."}
  ```

* Store all the dictionaries in a Python list.

* Print the list in your notebook.

In [14]:
# Retrieve the parent divs for all articles
results = soup.find_all('div', class_="list_text")

# Create an empty list to store the dictionaries
articles = []

# Walk each article container and pull out its title and preview text
for result in results:
    title_div = result.find('div', class_='content_title')
    preview_div = result.find('div', class_='article_teaser_body')

    # find() returns None when nothing matches; skip incomplete entries
    # instead of failing with AttributeError on `.text`
    if title_div is None or preview_div is None:
        continue

    # Strip surrounding whitespace from both fields so they are consistent
    title = title_div.text.strip()
    preview = preview_div.text.strip()

    # Store each title and preview pair in a dictionary, then append it to the list
    articles.append({'title': title,
                     'preview': preview})

# Insert all articles into MongoDB with a single call after the loop.
# Copies are inserted because PyMongo adds an `_id` ObjectId to each dict it
# inserts, which would make `articles` non-serializable with json.dumps.
# insert_many rejects an empty list, hence the guard.
# Note: re-running this cell inserts the same documents again.
if articles:
    collection.insert_many([dict(article) for article in articles])

In [16]:
# Serialize the list to indented JSON (keys kept in insertion order) and
# print it to confirm success
articles_json = json.dumps(articles, indent=4, sort_keys=False)
print(articles_json)

[
    {
        "title": "Follow NASA's Perseverance Rover in Real Time on Its Way to Mars",
        "preview": "A crisply rendered web application can show you where the agency's Mars 2020 mission is right now as it makes its way to the Red Planet for a Feb. 18, 2021, landing."
    },
    {
        "title": "NASA's Mars Rover Drivers Need Your Help",
        "preview": "Using an online tool to label Martian terrain types, you can train an artificial intelligence algorithm that could improve the way engineers guide the Curiosity rover."
    },
    {
        "title": "A Martian Roundtrip: NASA's Perseverance Rover Sample Tubes",
        "preview": "Marvels of engineering, the rover's sample tubes must be tough enough to safely bring Red Planet samples on the long journey back to Earth in immaculate condition. "
    },
    {
        "title": "How NASA's Perseverance Mars Team Adjusted to Work in the Time of Coronavirus",
        "preview": "Like much of the rest of the world, the Mars ro

### (Optional) Step 4: Export the Data

Optionally, store the scraped data in a file or database (to ease sharing the data with others). To do so, export the scraped data to either a JSON file or a MongoDB database.

In [17]:
# Step 4 (export to MongoDB) was already completed in the Step 3 code cell above

In [18]:
# No export happens in this cell; it only shuts down the automated
# browser session now that scraping is finished.
browser.quit()