#### Import necessary libraries

In [152]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### Load first page

In [153]:
# load webpage content
url = 'https://keithgalli.github.io/web-scraping/'
r = requests.get(url + 'example.html')

# convert to beautiful soup object
soup = bs(r.content)

# print out html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



#### Start using beautiful soup to scrape
##### find and find_all

In [154]:
# find and find_all
first_header = soup.find('h2')
first_header

headers = soup.find_all('h2')
headers

<h2>A Header</h2>

[<h2>A Header</h2>, <h2>Another header</h2>]

In [155]:
# pass in a list of elements to look for
first_header = soup.find(['h1', 'h2'])  # order does not matter
first_header

headers = soup.find_all(['h1', 'h2'])
headers

<h1>HTML Webpage</h1>

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [156]:
# pass in attributes
paragraph = soup.find_all('p', attrs={'id': 'paragraph-id'})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [157]:
# nested calls
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [158]:
# search for specific strings
paragraphs = soup.find_all('p', string=re.compile('Some'))
paragraphs

headers = soup.find_all('h2', string=re.compile('(H|h)eader'))
headers

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

[<h2>A Header</h2>, <h2>Another header</h2>]

##### select (CSS selector)

In [159]:
# paragraphs within divs
paragraphs = soup.select('div p')
paragraphs

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [160]:
# paragraphs followings header2s
paragraphs = soup.select('h2 ~ p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [161]:
# bold element following id
bold_text = soup.select('p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [162]:
# nested calls
paragraphs = soup.select('body > p')
print(paragraphs)

for paragraph in paragraphs:
    print(paragraph.select('i'))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [163]:
# grab element with specific property
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

##### Get different properties of the HTML 

In [164]:
# single child element
header = soup.find('h2')
header.string

# multiple child elements
div = soup.find('div')
print(div.get_text())

'A Header'


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [165]:
# get specific property from element
link = soup.find('a')
link['href']

paragraphs = soup.select('p#paragraph-id')
paragraphs[0]['id']

'https://keithgalli.github.io/web-scraping/webpage.html'

'paragraph-id'

#### Code navigation

In [166]:
# path syntax
soup.body.div.h1.string

'HTML Webpage'

In [167]:
# parent, sibling, child
soup.body.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

#### Exercises
##### Load webpage

In [168]:
# load webpage content
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')

# convert to beautiful soup object
webpage = bs(r.content)

print(webpage.prettify)

<bound method Tag.prettify of <html><head>
<title>Keith Galli's Page</title>
<style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
</style>
</head>
<body>
<h1>Welcome to my page!</h1>
<img src="./images/selfie1.jpg" width="300px"/>
<h2>About me</h2>
<p>Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!</p>
<p>Here is a link to my channel: <a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a></

##### Grab all of the social links from the web page, in at least three different ways

In [169]:
# select links from unordered list with socials class
links = webpage.select('ul.socials a')
links = [link['href'] for link in links]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [170]:
# find links from unordered list with socials class
links = webpage.find('ul', attrs={'class': 'socials'}).find_all('a')
links = [link['href'] for link in links]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [171]:
# select links from lists with social class
links = webpage.select('li.social a')
links = [link['href'] for link in links]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

##### Scrape the table on the web page into a pandas data frame

In [172]:
# get table
table = webpage.select('table.hockey-stats')[0]

# get columns
columns = table.find('thead').find_all('th')
# get column names
column_names = [column.string for column in columns]

# get rows
rows = table.find('tbody').find_all('tr')

# put data into a pandas data frame
l = []
for tr in rows:
    td = tr.find_all('td')
    row = [str(tr.get_text()).strip() for tr in td]
    l.append(row)

df = pd.DataFrame(l, columns=column_names)
df.loc[df['Team'] != 'Did not play']

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17,3,9,12,20,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9,1,1,2,2,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12,5,5,10,8,0.0,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8,5,10,15,8,,|,,,,,,,


##### Grab all the fun facts that use the word "is"

In [173]:
# gets fun facts
facts = webpage.select('ul.fun-facts li')
# filters for strings containing "is"
facts = [fact.find(string=re.compile('is')) for fact in facts]
# preserves italic text
facts = [fact.find_parent().get_text() for fact in facts if fact]
facts

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

##### Download an image

In [186]:
# gets images
images = webpage.select('div.row div.column img')
images

# loop through images
for image in images:
    image_url = url + image['src']
    image_url
    image_name = image['alt']
    image_name
    
    # saves image
    image_data = requests.get(image_url).content
    with open(image_name + '.jpg', 'wb') as handler:
        handler.write(image_data)

[<img alt="Lake Como" src="images/italy/lake_como.jpg" style="height:100%"/>,
 <img alt="Pontevecchio, Florence" src="images/italy/pontevecchio.jpg" style="height:100%"/>,
 <img alt="Riomaggiore, Cinque de Terre" src="images/italy/riomaggiore.jpg" style="height:100%"/>]

'https://keithgalli.github.io/web-scraping/images/italy/lake_como.jpg'

'Lake Como'

1369133

'https://keithgalli.github.io/web-scraping/images/italy/pontevecchio.jpg'

'Pontevecchio, Florence'

1308720

'https://keithgalli.github.io/web-scraping/images/italy/riomaggiore.jpg'

'Riomaggiore, Cinque de Terre'

683430

##### Solve the mystery challenge

In [225]:
# gets links
links = webpage.select('div.block a')
links = [link['href'] for link in links]
links

message = ''
for link in links:
    r = requests.get(url + link)
    page = bs(r.content)
    word = page.find('p', attrs={'id': 'secret-word'}).string
    message = message + str(word) + ' '
message = message[:-1]
message

['challenge/file_1.html',
 'challenge/file_2.html',
 'challenge/file_3.html',
 'challenge/file_4.html',
 'challenge/file_5.html',
 'challenge/file_6.html',
 'challenge/file_7.html',
 'challenge/file_8.html',
 'challenge/file_9.html',
 'challenge/file_10.html']

'Make sure to smash that like button and subscribe !!!'