# Beautiful Soup

In [1]:
from bs4 import BeautifulSoup

For learning purposes, we will parse a local (offline) copy of a web page, `website.html`, instead of a live website.

In [2]:
# Read the saved page as UTF-8 explicitly: the HTML declares charset="utf-8" and
# contains non-ASCII characters (❤️). Without `encoding=`, open() falls back to the
# platform's default encoding, which may fail to decode the file or garble it.
with open('website.html', encoding='utf-8') as file:
    content = file.read()
content

'<!DOCTYPE html>\n<html>\n\n<head>\n\t<meta charset="utf-8">\n\t<title>Angela\'s Personal Site</title>\n</head>\n\n<body>\n\t<h1 id="name">Angela Yu</h1>\n\t<p><em>Founder of <strong><a href="https://www.appbrewery.co/">The App Brewery</a></strong>.</em></p>\n\t<p>I am an iOS and Web Developer. I ❤️ coffee and motorcycles.</p>\n\t<hr>\n\t<h3 class="heading">Books and Teaching</h3>\n\t<ul>\n\t\t<li>The Complete iOS App Development Bootcamp</li>\n\t\t<li>The Complete Web Development Bootcamp</li>\n\t\t<li>100 Days of Code - The Complete Python Bootcamp</li>\n\t</ul>\n\t<hr>\n\t<h3 class="heading">Other Pages</h3>\n\t<a href="https://angelabauer.github.io/cv/hobbies.html">My Hobbies</a>\n\t<a href="https://angelabauer.github.io/cv/contact-me.html">Contact Me</a>\n</body>\n\n</html>'

In [3]:
# Build the parse tree from the HTML text.
# `markup` is the document to parse and `features` names the parser to use;
# other parsers (for example lxml) can be used instead of 'html.parser'.
soup = BeautifulSoup(markup=content, features='html.parser')

In [4]:
# Accessing a tag name as an attribute of the soup returns that tag;
# here it prints the whole <title> element, markup included.
print(soup.title)

<title>Angela's Personal Site</title>


In [5]:
# .string gives just the text inside the <title> tag, without the surrounding markup
print(soup.title.string)

Angela's Personal Site


In [6]:
print(soup.prettify())  # prettify() returns the document re-indented, one tag or text node per line

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <title>
   Angela's Personal Site
  </title>
 </head>
 <body>
  <h1 id="name">
   Angela Yu
  </h1>
  <p>
   <em>
    Founder of
    <strong>
     <a href="https://www.appbrewery.co/">
      The App Brewery
     </a>
    </strong>
    .
   </em>
  </p>
  <p>
   I am an iOS and Web Developer. I ❤️ coffee and motorcycles.
  </p>
  <hr/>
  <h3 class="heading">
   Books and Teaching
  </h3>
  <ul>
   <li>
    The Complete iOS App Development Bootcamp
   </li>
   <li>
    The Complete Web Development Bootcamp
   </li>
   <li>
    100 Days of Code - The Complete Python Bootcamp
   </li>
  </ul>
  <hr/>
  <h3 class="heading">
   Other Pages
  </h3>
  <a href="https://angelabauer.github.io/cv/hobbies.html">
   My Hobbies
  </a>
  <a href="https://angelabauer.github.io/cv/contact-me.html">
   Contact Me
  </a>
 </body>
</html>



### find_all

In [9]:
# find_all() searches by tag name (and optionally attributes) and returns
# every match; here it collects all <a> elements in the document.
all_anchor_tag = soup.find_all('a')
print(all_anchor_tag)

[<a href="https://www.appbrewery.co/">The App Brewery</a>, <a href="https://angelabauer.github.io/cv/hobbies.html">My Hobbies</a>, <a href="https://angelabauer.github.io/cv/contact-me.html">Contact Me</a>]


In [10]:
# Print only the visible text of each link, without the <a> markup.
# get_text() is the documented snake_case method; getText() is its camelCase alias.
for tag in all_anchor_tag:
    print(tag.get_text())

The App Brewery
My Hobbies
Contact Me


In [13]:
# Read an attribute of each tag: get('href') returns the link's target URL
for anchor in all_anchor_tag:
    print(anchor.get('href'))

https://www.appbrewery.co/
https://angelabauer.github.io/cv/hobbies.html
https://angelabauer.github.io/cv/contact-me.html


### find
Searches by tag name and attributes, and returns only the first match.

In [17]:
# find() returns the first element matching the given tag name and attributes;
# here that is the <h1> whose id is "name".
heading = soup.find('h1', id='name')
print(heading.string)  # .string is the text contained in the tag
print(heading.name)    # .name is the tag's own name (here 'h1')

Angela Yu


In [19]:
# Filter by CSS class. The keyword is spelled `class_` because `class` is a
# reserved word in Python. find() returns only the first matching <h3>.
section_head = soup.find('h3', class_='heading')
print(section_head.string)

Books and Teaching


###  using selectors
Selectors can target HTML elements by tag name (e.g. `p`) or use CSS id and class syntax (e.g. `#name`, `.heading`).

In [21]:
# '#name' is a CSS id selector; select_one() returns the first element that matches it
name = soup.select_one('#name')
print(name.string)

Angela Yu


In [23]:
# select() returns a list of every element matching the selector, here all <p> tags
print(soup.select('p'))

[<p><em>Founder of <strong><a href="https://www.appbrewery.co/">The App Brewery</a></strong>.</em></p>, <p>I am an iOS and Web Developer. I ❤️ coffee and motorcycles.</p>]


In [24]:
# select_one() returns only the first matching <p> instead of a list
print(soup.select_one('p'))

<p><em>Founder of <strong><a href="https://www.appbrewery.co/">The App Brewery</a></strong>.</em></p>


In [26]:
# Descendant selector, as in CSS: match <a> elements nested inside a <p>
print(soup.select('p a'))

[<a href="https://www.appbrewery.co/">The App Brewery</a>]


In [29]:
heading = soup.select(".heading") # '.heading' is a CSS class selector; select() returns a list of every tag with that class
print(heading)

[<h3 class="heading">Books and Teaching</h3>, <h3 class="heading">Other Pages</h3>]


### Some advice for scraping

You must abide by the site's rules and respect its owner.
You can check the rules by appending `/robots.txt` to the site's root URL (i.e. right after the domain, such as `.com`).
One of the directives there may be `Crawl-delay`: it tells you the number of seconds you should wait between successive requests when crawling.
