# HTML Parsing with Beautiful Soup
1. Fundamentals
2. Traversing
3. Finding and other utilities

In [34]:
from bs4 import BeautifulSoup

# Hacker News front page: the document fetched and parsed in the first part of this notebook.
url = 'https://news.ycombinator.com/'

In [36]:
import requests

# Download the raw page bytes. Always pass a timeout: without one, requests waits
# indefinitely on an unresponsive server and the cell never finishes.
html_doc = requests.get(url, timeout=10).content

In [37]:
# Build the parse tree from the downloaded bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html_doc, features='html.parser')

In [7]:
# soup

In [38]:
# prettify() re-indents the parsed tree, one tag per line. Print only the beginning:
# the full page is long enough to bury the rest of the notebook.
PREVIEW_CHARS = 2000
print(soup.prettify()[:PREVIEW_CHARS])

<html lang="en" op="news">
 <head>
  <meta content="origin" name="referrer"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <link href="news.css?CZk51y0z29TYBzr1LRkl" rel="stylesheet" type="text/css"/>
  <link href="favicon.ico" rel="shortcut icon"/>
  <link href="rss" rel="alternate" title="RSS" type="application/rss+xml"/>
  <title>
   Hacker News
  </title>
 </head>
 <body>
  <center>
   <table bgcolor="#f6f6ef" border="0" cellpadding="0" cellspacing="0" id="hnmain" width="85%">
    <tr>
     <td bgcolor="#ff6600">
      <table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%">
       <tr>
        <td style="width:18px;padding-right:4px">
         <a href="https://news.ycombinator.com">
          <img height="18" src="y18.gif" style="border:1px white solid;" width="18"/>
         </a>
        </td>
        <td style="line-height:12pt; height:10px;">
         <span class="pagetop">
          <b class="hnname">
           <a href

Let's investigate.

In [40]:
# Attribute-style access by tag name returns the first tag with that name, here <title>.
title = soup.title

Introspect and navigate the DOM.

In [42]:
title.text

'Hacker News'

In [43]:
soup.span

<span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
<a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a href="show">show</a> | <a href="jobs">jobs</a> | <a href="submit">submit</a> </span>

Get the `class` attribute. Beautiful Soup returns it as a list because `class` can hold multiple values.

In [44]:
soup.span["class"]

['pagetop']

Let's get all the links.

In [45]:
soup.find_all('a')

[<a href="https://news.ycombinator.com"><img height="18" src="y18.gif" style="border:1px white solid;" width="18"/></a>,
 <a href="news">Hacker News</a>,
 <a href="newest">new</a>,
 <a href="front">past</a>,
 <a href="newcomments">comments</a>,
 <a href="ask">ask</a>,
 <a href="show">show</a>,
 <a href="jobs">jobs</a>,
 <a href="submit">submit</a>,
 <a href="login?goto=news">login</a>,
 <a href="vote?id=26906817&amp;how=up&amp;goto=news" id="up_26906817"><div class="votearrow" title="upvote"></div></a>,
 <a class="storylink" href="https://www.bbc.com/news/world-middle-east-56842506">Stranded sailor allowed to leave abandoned ship after four years</a>,
 <a href="from?site=bbc.com"><span class="sitestr">bbc.com</span></a>,
 <a class="hnuser" href="user?id=alphachloride">alphachloride</a>,
 <a href="item?id=26906817">1 hour ago</a>,
 <a href="hide?id=26906817&amp;goto=news">hide</a>,
 <a href="item?id=26906817">96 comments</a>,
 <a href="vote?id=26907176&amp;how=up&amp;goto=news" id="up_2

## Tags
- name
- attrs
- modifiable


In [49]:
# Pick one link to inspect. Index 3 is purely positional and depends on the page's
# current layout (in the captured run it is the "past" link).
anchor = soup.find_all('a')[3]

In [51]:
anchor

<a href="front">past</a>

In [53]:
anchor.name

'a'

In [54]:
anchor.attrs

{'href': 'front'}

In [55]:
anchor.get_text()

'past'

## Traversing
Let's navigate more formally with...

- `.next_element` and `.previous_element`
- `.next_siblings` and `.previous_siblings`
- `.children` (the direct children of a tag, used in the cell below)

In [68]:
# Materialize the direct children of the first <table>, including the whitespace
# text nodes that sit between the tags.
list(soup.table.children)

['\n',
 <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.gif" style="border:1px white solid;" width="18"/></a></td>
 <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
 <a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a href="show">show</a> | <a href="jobs">jobs</a> | <a href="submit">submit</a> </span></td><td style="text-align:right;padding-right:4px;"><span class="pagetop">
 <a href="login?goto=news">login</a>
 </span></td>
 </tr></table></td></tr>,
 '\n',
 <tr id="pagespace" style="height:10px" title=""></tr>,
 <tr><td><table border="0" cellpadding="0" cellspacing="0" class="itemlist">
 <tr class="athing" id="26906817">
 <td align="right" class="title" valign="top"><span class="rank"

## Finding by `id` and `class_`

In [69]:
soup.find_all("span", class_="score")

[<span class="score" id="score_26906817">224 points</span>,
 <span class="score" id="score_26907176">28 points</span>,
 <span class="score" id="score_26904951">198 points</span>,
 <span class="score" id="score_26904136">192 points</span>,
 <span class="score" id="score_26905948">70 points</span>,
 <span class="score" id="score_26898651">457 points</span>,
 <span class="score" id="score_26904637">103 points</span>,
 <span class="score" id="score_26902430">279 points</span>,
 <span class="score" id="score_26900749">391 points</span>,
 <span class="score" id="score_26892180">58 points</span>,
 <span class="score" id="score_26907603">7 points</span>,
 <span class="score" id="score_26902821">253 points</span>,
 <span class="score" id="score_26901461">104 points</span>,
 <span class="score" id="score_26899860">33 points</span>,
 <span class="score" id="score_26905203">199 points</span>,
 <span class="score" id="score_26901757">209 points</span>,
 <span class="score" id="score_26906474">12 po

In [70]:
# Story ids change every time the front page is fetched, so a hard-coded id goes stale
# on the next run and the lookup comes back empty. Take the id from a story row that is
# actually on the page (the second one when available, as in the captured example) and
# fall back to the original literal only if the page has no story rows at all.
story_rows = soup.find_all("tr", class_="athing", limit=2)
story_id = story_rows[-1]["id"] if story_rows else "26907176"
soup.find_all(id=story_id)

[<tr class="athing" id="26907176">
 <td align="right" class="title" valign="top"><span class="rank">2.</span></td> <td class="votelinks" valign="top"><center><a href="vote?id=26907176&amp;how=up&amp;goto=news" id="up_26907176"><div class="votearrow" title="upvote"></div></a></center></td><td class="title"><a class="storylink" href="https://spectrum.ieee.org/tech-talk/semiconductors/processors/cerebras-giant-ai-chip-now-has-a-trillions-more-transistors">Cerebras’ New Monster AI Chip Adds 1.4T Transistors</a><span class="sitebit comhead"> (<a href="from?site=ieee.org"><span class="sitestr">ieee.org</span></a>)</span></td></tr>]

## `get_text()`

In [71]:
url = 'https://www.macys.com/shop/mens-clothing/mens-underwear?id=57&cm_sp=c2_1111US_catsplash_men-_-row1-_-image_underwear-and-socks&edge=hybrid'

In [72]:
# Plain request with the library's default headers; in the captured run the site
# answers it with an "Access Denied" page. The timeout keeps the cell from hanging
# if the server never responds.
html_doc = requests.get(url, timeout=10).content

In [73]:
# Parse the new response body; this rebinds `soup` to the document just downloaded.
soup = BeautifulSoup(markup=html_doc, features='html.parser')

In [74]:
print(soup.prettify())

<html>
 <head>
  <title>
   Access Denied
  </title>
 </head>
 <body>
  <h1>
   Access Denied
  </h1>
  You don't have permission to access the requested URL on this server.
  <p>
   Reference: 18.c7794668.1619124598.24d6366f
  </p>
 </body>
</html>


# Scraping Techniques
- Chrome Inspector
- https://curl.trillworks.com/

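1. Open Chrome's inspector (developer tools) and find the page request in the Network tab.
2. Copy the request as cURL.
3. Paste it into the trillworks converter (https://curl.trillworks.com/).
4. Copy the generated Python code, paste it here, and clean it up.
5. Try the request again.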

In [78]:
import os
import requests

# Browser-like request headers, copied from Chrome ("Copy as cURL") and converted to
# Python. They make the request look like the one a real browser sends.
headers = {
    'authority': 'www.macys.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-US,en;q=0.9',
    # The copied 'if-none-match' header (an ETag cached by the browser session) is
    # deliberately left out: it turns this into a conditional request, and a matching
    # ETag would produce "304 Not Modified" with an empty body, leaving nothing to parse.
}

# Query-string parameters; requests URL-encodes them and appends them to the URL.
params = (
    ('id', '57'),
    ('cm_sp', 'c2_1111US_catsplash_men-_-row1-_-image_underwear-and-socks'),
    ('edge', 'hybrid'),
)

# The timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(
    'https://www.macys.com/shop/mens-clothing/mens-underwear',
    headers=headers,
    params=params,
    timeout=30,
)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page afterwards.
response.raise_for_status()

In [79]:
# Parse the product listing returned by the browser-like request.
soup = BeautifulSoup(response.content, 'html.parser')
# Print only the beginning as a sanity check that real page content came back;
# dumping the whole document would flood the notebook output.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   Mens Underwear - Boxers, Briefs &amp; Jockstraps - Macy's
  </title>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="telephone=no" name="format-detection">
   <meta content="Shop the Latest Collection of Underwear for Men Online at Macys.com. FREE SHIPPING AVAILABLE!" name="description"/>
   <meta content="width=device-width, initial-scale=1" name="viewport"/>
   <meta content="Mens Underwear - Boxers, Briefs &amp; Jockstraps - Macy's" property="og:title" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
    <meta content="website" property="og:type" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
     <meta content="https://www.macys.com/img/nav/co_macysLogo3.gif" property="og:image" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
      <meta content="https:/

## Mini-Lab
1. Get all products and prices and save to CSV.
2. Create a Bottle-powered page that shows a random product with its image, name, and price, plus a link to that product's page on Macy's.
3. Bonus: allow the Bottle-powered page to filter on minimum and maximum price.

In [80]:
# Collect the <div class="productBrand"> elements; `class_` is the keyword form of
# the {"class": ...} attribute filter.
productNames = soup.find_all("div", class_="productBrand")

In [82]:
# Collect the <div class="priceInfo"> elements; `class_` is the keyword form of
# the {"class": ...} attribute filter.
productPrices = soup.find_all("div", class_="priceInfo")

In [84]:
len(productNames)

66

In [85]:
len(productPrices)

66

In [87]:
import csv

# Write one "name, price" row per product to prices.csv.
# newline='' is what the csv module requires of files it writes to (otherwise extra
# blank rows appear on Windows). An explicit encoding keeps non-ASCII characters in
# product names from depending on the platform's default encoding.
with open('prices.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # The two lists are paired by position. zip stops at the shorter list, so a count
    # mismatch cannot raise IndexError part-way through writing the file.
    for name_tag, price_tag in zip(productNames, productPrices):
        writer.writerow([name_tag.text.strip(), price_tag.text.strip()])