# Data Acquisition / Web Scraping Exercises

In [None]:
### Soup Methods



In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 

In [3]:
# Fetch the demo page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of letting an error page be parsed as if it were the people list.
response = requests.get('https://web-scraping-demo.zgulde.net/people', timeout=10)
response.raise_for_status()

In [4]:
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers are installed (and bs4 does not warn about guessing one).
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
# Display the parsed document to inspect the page structure
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Example People Page</title>
<link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" rel="stylesheet"/>
</head>
<body class="mx-auto max-w-screen-lg pb-32">
<h1 class="my-5 text-4xl text-center">People</h1>
<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">
<p>
<i class="bi bi-exclamation-circle text-xl"></i>
        All data on this page is strictly for demonstration purposes and fake.
    </p>
</div>
<div class="grid grid-cols-2 gap-x-12 gap-y-16" id="people">
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Kari Jackson</h

In [38]:
# Each person card carries the "person" class, so a class selector
# returns one <div> per person.
people = soup.select('.person')
people

[<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
 <h2 class="text-2xl text-purple-800 name col-span-full border-b">Kari Jackson</h2>
 <p class="quote col-span-full px-5 py-5 text-center text-gray-500">
             "Programmable non-volatile projection"
         </p>
 <div class="grid grid-cols-9">
 <i class="bi bi-envelope-fill text-purple-800"></i>
 <p class="email col-span-8">qobrien@bullock-johnson.com</p>
 <i class="bi bi-telephone-fill text-purple-800"></i>
 <p class="phone col-span-8">130-053-2581x1308</p>
 </div>
 <div class="address grid grid-cols-9">
 <i class="bi bi-geo-fill text-purple-800"></i>
 <p class="col-span-8">
                 590 Moon Ways Apt. 937 <br/>
                 Castanedafort, IL 23106
             </p>
 </div>
 </div>,
 <div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
 <h2 class="text-2xl text-purple-800 n

In [42]:
# Work with the first card from the list selected above. (`persons` is only
# defined further down the notebook, so it is not available on a fresh run.)
person = people[0]

In [44]:
# Attribute access on a tag returns its first matching descendant, here the <h2>
person.h2

<h2 class="text-2xl text-purple-800 name col-span-full border-b">Kari Jackson</h2>

In [45]:
# name: the text content of the card's <h2>
person.h2.text

'Kari Jackson'

In [46]:
# Attribute access (.p) returns the first <p> tag inside the element;
# in a person card that first <p> is the quote.
# quote
person.p.text.strip()

'"Programmable non-volatile projection"'

In [53]:
# Another way to get the quote: select it by class instead of relying on tag order
person.select('.quote')[0].text.strip()

'"Programmable non-volatile projection"'

In [49]:
# email: text of the first element with class "email"
person.select('.email')[0].text

'qobrien@bullock-johnson.com'

In [48]:
# phone: text of the first element with class "phone"
person.select('.phone')[0].text

'130-053-2581x1308'

In [52]:
# address
# The address text spans two lines in the HTML (split by <br/>), so strip()
# only trims the ends; the newline and indentation between the lines remain.
address = person.select('.address')[0].text.strip()
address

'590 Moon Ways Apt. 937 \n                Castanedafort, IL 23106'

In [58]:
import re

# Collapse every run of two or more whitespace characters into a single space,
# which joins the two address lines into one.
re.sub(r'\s{2,}',' ', address)

'590 Moon Ways Apt. 937 Castanedafort, IL 23106'

#### Now create a function to put it all together

In [61]:
def parse_person(person):
    """Turn one ``.person`` card into a flat record.

    Parameters
    ----------
    person : bs4.element.Tag
        A single person card: an ``<h2>`` with the name plus descendants
        carrying the ``quote``, ``email``, ``phone`` and ``address`` classes.

    Returns
    -------
    dict
        Keys ``'name'``, ``'quote'``, ``'email'``, ``'phone'`` and
        ``'address'``, each mapped to a string.

    Raises
    ------
    IndexError
        If the card has no element for one of the selected classes.
    """
    def first_text(selector):
        # select() returns a list of matches; every field uses the first one.
        return person.select(selector)[0].text

    name = person.h2.text

    # Select the quote by class rather than "first <p> in the card", so the
    # result does not depend on the order of the paragraphs.
    quote = first_text('.quote').strip()

    email = first_text('.email')
    phone = first_text('.phone')

    # The address is split over several lines in the markup. Collapse every
    # whitespace run (including a lone newline or tab) into a single space.
    address = re.sub(r'\s+', ' ', first_text('.address').strip())

    return {'name': name, 'quote': quote, 'email': email, 'phone': phone, 'address': address}

In [62]:
# Parse every card selected above. (`persons` is only defined further down the
# notebook, so it is not available on a fresh run.)
[parse_person(person) for person in people]

[{'name': 'Kari Jackson',
  'quote': '"Programmable non-volatile projection"',
  'email': 'qobrien@bullock-johnson.com',
  'phone': '130-053-2581x1308',
  'address': '590 Moon Ways Apt. 937 Castanedafort, IL 23106'},
 {'name': 'Kathryn Fields',
  'quote': '"Streamlined real-time database"',
  'email': 'candersen@hamilton-harris.com',
  'phone': '851-289-2089',
  'address': '4710 Koch Motorway Suite 548 East Randallburgh, RI 18326'},
 {'name': 'Vanessa Gonzalez',
  'quote': '"Adaptive content-based installation"',
  'email': 'grimesangela@miller-kelly.com',
  'phone': '(829)520-6552',
  'address': '94567 James Cove Tranport, OK 83593'},
 {'name': 'Karl Nelson',
  'quote': '"Synergized multi-state function"',
  'email': 'deanna16@hotmail.com',
  'phone': '001-372-976-6483x068',
  'address': '3128 Henderson Ranch Apt. 670 East Kimberly, SD 70808'},
 {'name': 'Tammy Greer',
  'quote': '"Total cohesive hub"',
  'email': 'ericsims@baker.com',
  'phone': '+1-863-438-6243x67714',
  'address': 

In [63]:
# One row per person; the dict keys returned by parse_person become the columns.
pd.DataFrame([parse_person(person) for person in people])

Unnamed: 0,name,quote,email,phone,address
0,Kari Jackson,"""Programmable non-volatile projection""",qobrien@bullock-johnson.com,130-053-2581x1308,"590 Moon Ways Apt. 937 Castanedafort, IL 23106"
1,Kathryn Fields,"""Streamlined real-time database""",candersen@hamilton-harris.com,851-289-2089,4710 Koch Motorway Suite 548 East Randallburgh...
2,Vanessa Gonzalez,"""Adaptive content-based installation""",grimesangela@miller-kelly.com,(829)520-6552,"94567 James Cove Tranport, OK 83593"
3,Karl Nelson,"""Synergized multi-state function""",deanna16@hotmail.com,001-372-976-6483x068,"3128 Henderson Ranch Apt. 670 East Kimberly, S..."
4,Tammy Greer,"""Total cohesive hub""",ericsims@baker.com,+1-863-438-6243x67714,"70916 Peters Heights Wadehaven, NY 62876"
5,Alan Matthews,"""Ameliorated foreground application""",ladams@gmail.com,+1-351-728-7645,"2781 Ferrell Course New Anthony, SD 55440"
6,Christopher Brown,"""Self-enabling hybrid adapter""",riverasamantha@castillo-charles.info,014.783.1804x534,"371 Mccormick Shore Suite 555 New Elizabeth, S..."
7,John Wright,"""Streamlined systematic alliance""",crystal64@yahoo.com,(748)670-4667x870,"7178 Potter Parks Apt. 211 Port Sharon, RI 11073"
8,Mrs. Kimberly Mercer,"""Grass-roots executive Local Area Network""",ashleygreen@davis.com,902.444.1546,"747 Turner Pine Juliemouth, TX 44615"
9,Samuel Henderson,"""Down-sized zero-defect leverage""",eellis@griffin.info,2444830191,"88891 Aguirre Lake Floydbury, IL 90681"


#### My attempt below

In [39]:
# Chaining class names in a selector requires all of them on the same element.
# NOTE(review): the displayed result looks the same as soup.select('.person'),
# so the shorter selector appears to be sufficient; confirm before simplifying.
persons = soup.select('.person.border.rounded.px-3.py-5.grid.grid-cols-2.gap-x-3.bg-purple-50')

In [40]:
# Show the matched cards
persons

[<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
 <h2 class="text-2xl text-purple-800 name col-span-full border-b">Kari Jackson</h2>
 <p class="quote col-span-full px-5 py-5 text-center text-gray-500">
             "Programmable non-volatile projection"
         </p>
 <div class="grid grid-cols-9">
 <i class="bi bi-envelope-fill text-purple-800"></i>
 <p class="email col-span-8">qobrien@bullock-johnson.com</p>
 <i class="bi bi-telephone-fill text-purple-800"></i>
 <p class="phone col-span-8">130-053-2581x1308</p>
 </div>
 <div class="address grid grid-cols-9">
 <i class="bi bi-geo-fill text-purple-800"></i>
 <p class="col-span-8">
                 590 Moon Ways Apt. 937 <br/>
                 Castanedafort, IL 23106
             </p>
 </div>
 </div>,
 <div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
 <h2 class="text-2xl text-purple-800 n

In [18]:
# `articles` is never defined in this notebook; the cards live in `persons`.
# The address container holds a single <p> with the two-line address.
persons[0].select('.address.grid.grid-cols-9')[0].select('p')

[<p class="col-span-8">
                 590 Moon Ways Apt. 937 <br/>
                 Castanedafort, IL 23106
             </p>]

In [22]:
# `articles` is never defined in this notebook; the cards live in `persons`.
# The first .grid.grid-cols-9 container holds the email and phone <p> tags.
persons[0].select('.grid.grid-cols-9')[0].select('p')

[<p class="email col-span-8">qobrien@bullock-johnson.com</p>,
 <p class="phone col-span-8">130-053-2581x1308</p>]

In [32]:
# `articles` is never defined in this notebook; the cards live in `persons`.
# The <h2> contains only text, so looking for <p> inside it finds nothing.
persons[0].select('h2')[0].select('p')

[]

In [None]:
def process_article(article):
    """Extract the contact details from one person card.

    Parameters
    ----------
    article : bs4.element.Tag
        A single person card. Its first ``.grid.grid-cols-9`` container must
        hold exactly two ``<p>`` tags (email, then phone), and its
        ``.address`` container must hold one ``<p>`` with the address.

    Returns
    -------
    dict
        Keys ``'Name'``, ``'Email'`` and ``'Address'``, each mapped to a string.
    """
    # quote =
    # Read from the card passed in, not from a fixed element of a global list,
    # so each call describes its own person.
    email, phone = article.select('.grid.grid-cols-9')[0].select('p')
    # select() returns a list, so take the single <p> inside the address container.
    address = article.select('.address.grid.grid-cols-9')[0].select('p')[0]
    return {
        'Name': article.h2.text,
        #'Quote': quote.text,
        'Email': email.text,
        #'Phone': phone.text,
        # The address spans two indented lines in the markup; join on whitespace.
        'Address': ' '.join(address.text.split()),
    }

# `articles` is never defined in this notebook; iterate the cards in `persons`.
pd.DataFrame([process_article(article) for article in persons])