# Load the necessary libraries

In [2]:
import requests
from bs4 import BeautifulSoup as bs

# Load our first page

In [12]:
# Load the webpage content.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Name the parser explicitly: without it Beautiful Soup guesses one based on
# what is installed, which emits a warning and can differ between machines.
soup = bs(r.content, "html.parser")
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Start using Beautiful Soup to Scrape

`find` and `find_all`

In [15]:
# Accessing a tag name as an attribute returns the first matching tag,
# the same result as calling find() with that name.
first_header = soup.h2

# find_all() collects every match instead of stopping at the first one.
headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [20]:
# A list of tag names matches any one of them.
heading_tags = ["h1", "h2"]

# First tag in the document that is either an <h1> or an <h2>.
first_header = soup.find(heading_tags)

# Every <h1> and <h2>, in document order.
headers = soup.find_all(heading_tags)
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [21]:
# Attribute filters narrow a find/find_all search; here the keyword form
# is used to keep only <p> tags whose id is "paragraph-id".
paragraph = soup.find_all("p", id="paragraph-id")
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [26]:
# Collect every <p>, <h1> and <h2> tag in one call; results come back in document order.
soup.find_all(['p', 'h1', 'h2'])

[<h1>HTML Webpage</h1>,
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [28]:
# Searches can be chained: every tag that is returned can itself be searched.
# Attribute-style access (tag.name) returns the first matching descendant.
body = soup.body
print(body, end="\n\n")

# First <div> inside the body.
div = body.div
print(div, end="\n\n")

# First <h1> inside that div.
header = div.h1
print(header)

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

<h1>HTML Webpage</h1>


In [32]:
# find/find_all can also filter on a tag's text via the `string` argument;
# a compiled regular expression matches anywhere inside that text.
import re

some_pattern = re.compile('Some')
paragraphs = soup.find_all('p', string=some_pattern)
print(paragraphs)

# Accept both capitalisations of "header".
header_pattern = re.compile('(H|h)eader')
headers = soup.find_all('h2', string=header_pattern)
headers

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


[<h2>A Header</h2>, <h2>Another header</h2>]

# **Select (CSS selector)**

In [33]:
# Print the <body> subtree, indented, as a reference for the selectors used below.
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [34]:
# Descendant selector: every <p> located anywhere inside a <div>.
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [35]:
# General sibling selector: <p> tags that come after an <h2> under the same parent.
paragraphs = soup.select('h2 ~ p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [36]:
# <b> tags nested inside the <p> whose id is "paragraph-id".
bold_text = soup.select('p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [43]:
# Child selector: only <p> tags that are direct children of <body>.
paragraphs = soup.select("body > p")
print(paragraphs, end="\n\n")

# select() can be called on each result to search only inside that tag;
# a paragraph without an <i> yields an empty list.
for p_tag in paragraphs:
  print(p_tag.select("i"))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]

[<i>Some italicized text</i>]
[]


In [44]:
# Select elements by attribute value: any tag whose align attribute is "middle".
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

# Get different properties of the HTML

In [48]:
# Use .string to read the text of a tag.
header = soup.find('h2')
# NOTE: this bare expression is not the last line of the cell and is not
# printed, so its value does not appear in the cell output.
header.string

# If a tag has multiple child elements, use get_text() to gather all of their text.
div = soup.find('div')
print(div.prettify())
print()
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [62]:
# Tag attributes are read with dictionary-style indexing.
link = soup.a
print(link["href"])

# select() returns a list, so take the first match before reading its id.
paragraphs = soup.select("p#paragraph-id")
first_paragraph = paragraphs[0]
first_paragraph["id"]

https://keithgalli.github.io/web-scraping/webpage.html


'paragraph-id'

# Code Navigation

In [64]:
# Path syntax: tags can be reached as attributes (soup.body).
# Print the <body> subtree as a reference for the navigation example that follows.
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [65]:
# Starting from the first <div> in the body, list the tags that follow it
# at the same level of the tree.
first_div = soup.body.div
first_div.find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

# Exercises

## Load the webpage

In [67]:
# Load the webpage content.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=10)
r.raise_for_status()

# Convert to a beautiful soup object.
# Name the parser explicitly: without it Beautiful Soup guesses one based on
# what is installed, which emits a warning and can differ between machines.
webpage = bs(r.content, "html.parser")

# Print out our HTML
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

# Grab all the social media links from the webpage


Do this in at least 3 different ways

In [72]:
# Way 1: CSS selector for every <a> inside the <ul class="socials"> list.
links = webpage.select("ul.socials a")

# Keep just the URL stored in each anchor's href attribute.
actual_links = [anchor["href"] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [84]:
# Way 2: locate the <ul class="socials"> element with find(), then collect
# the anchors inside it with find_all().
ulist = webpage.find('ul', attrs={'class': 'socials'})
print(ulist)
print()
lists = ulist.find_all('a')
print(lists)
print()
# Iterate over the anchors found in THIS cell (`lists`). The earlier version
# looped over `links`, a variable left over from a different cell, so this
# approach only appeared to work because of stale kernel state.
actual_links = [link['href'] for link in lists]
actual_links

<ul class="socials">
<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
</ul>

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>, <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>, <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>, <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]



['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [87]:
# Way 3: CSS selector for anchors inside any <li> carrying the "social" class.
links = webpage.select("li.social a")
print(links, end="\n\n")

# Keep just the URL stored in each anchor's href attribute.
actual_links = [anchor["href"] for anchor in links]
actual_links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>, <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>, <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>, <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]



['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [93]:
# Another variant: spell out the path body -> ul -> li.social -> a.
# Only the anchor tags are collected here; the href values are not extracted.
links = webpage.select('body ul li.social a')
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

# Exercise 2: Grab all text on the webpage

Just get stuff above the Photos tag

In [97]:
# Locate the "Photos" heading; the text we want is everything before it.
header = webpage.body.find('h2', string='Photos')
print('------------Header----------------')
print(header)

# Earlier siblings of the heading; they are returned closest-first,
# i.e. in reverse document order.
previous_elements = header.find_previous_siblings()
print('------------Previous Elements----------------')
print(previous_elements)

# Flip the list so the elements read top-to-bottom like the page.
previous_elements_sorted = list(reversed(previous_elements))
print('------------Previous Elements Sorted----------------')
print(previous_elements_sorted)

# Pull the text out of each element.
elements = [element.get_text() for element in previous_elements_sorted]
print('------------Elements----------------')
print(elements)

# Join the pieces with newlines into a single block of text.
text = '\n'.join(elements)
print('----------------Text----------------')
print(text)

------------Header----------------
<h2>Photos</h2>
------------Previous Elements----------------
[<ul class="socials">
<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
</ul>, <br/>, <h2>Social Media</h2>, <ul class="fun-facts">
<li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>
<li>Middle name is Ronald</li>
<li>Never had been on a plane until college</li>
<li>Dunkin Donuts coffee is better than Starbucks</li>
<li>A favorite book series of mine is <i>Ender's Game</i></li>
<li>C

# Scrape the Table

In [102]:
import pandas as pd

# Take the first table matching the "hockey-stats" class.
table = webpage.select('table.hockey-stats')[0]

# Column names come from the <th> cells in the table head.
columns = table.find('thead').find_all('th')
column_names = [th.string for th in columns]

# One list of stripped cell texts per <tr> in the table body.
table_rows = table.find('tbody').find_all('tr')
l = [
  [cell.get_text().strip() for cell in tr.find_all('td')]
  for tr in table_rows
]

df = pd.DataFrame(l, columns=column_names)
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


# Grab all fun facts that use the word 'is'

In [103]:
import re

# Match "is" only as a whole word; a bare 'is' pattern would also accept
# words that merely contain those letters (e.g. "this", "list").
is_word = re.compile(r'\bis\b')

facts = webpage.select('ul.fun-facts li')
print('facts')
print(facts)
print()
# For each <li>, the first piece of text containing the word, or None when
# there is no match. The loop variable has its own name so it no longer
# shadows the `facts` list it iterates over.
facts_with_is = [fact.find(string=is_word) for fact in facts]
print('facts_with_is')
print(facts_with_is)
print()
# Read the full text from the <li> itself rather than from the matched
# string's parent: if the match sat inside a nested tag such as <i>, the
# parent would be that nested tag and only part of the fact would be returned.
facts_with_is1 = [fact.get_text() for fact, match in zip(facts, facts_with_is) if match]
print('facts_with_is1')
print(facts_with_is1)
print()
print()

facts
[<li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>, <li>Middle name is Ronald</li>, <li>Never had been on a plane until college</li>, <li>Dunkin Donuts coffee is better than Starbucks</li>, <li>A favorite book series of mine is <i>Ender's Game</i></li>, <li>Current video game of choice is <i>Rocket League</i></li>, <li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>]

facts_with_is
[None, 'Middle name is Ronald', None, 'Dunkin Donuts coffee is better than Starbucks', 'A favorite book series of mine is ', 'Current video game of choice is ', "The band that I've seen the most times live is the "]

facts_with_is1
['Middle name is Ronald', 'Dunkin Donuts coffee is better than Starbucks', "A favorite book series of mine is Ender's Game", 'Current video game of choice is Rocket League', "The band that I've seen the most times live is the Zac Brown Band"]



# Mystery Challenge

In [119]:
from urllib.parse import urljoin

# Collect the relative links to the challenge files.
files = webpage.select('div.block a')
relative_files = [f['href'] for f in files]
print(relative_files)
url = "https://keithgalli.github.io/web-scraping/"
for f in relative_files:
  # urljoin resolves the href against the base URL and also copes with
  # hrefs that are absolute or start with a slash.
  full_url = urljoin(url, f)
  # Bound the wait and fail loudly on an HTTP error instead of parsing an error page.
  page = requests.get(full_url, timeout=10)
  page.raise_for_status()
  # Name the parser explicitly so parsing does not depend on what is installed.
  bs_page = bs(page.content, 'html.parser')
  secret_word_element = bs_page.find('p', attrs={'id': 'secret-word'})
  # find() returns None when the page has no such element; report it and
  # continue rather than crashing on `None.string`.
  if secret_word_element is None:
    print('No secret word found in', full_url)
    continue
  secret_word = secret_word_element.string
  print(secret_word)

[]
