# Web Scraping

In [1]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = 'https://web-scraping-demo.zgulde.net/news'
response = get(url)
response

<Response [200]>

In [3]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>News Example Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap


In [4]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
articles = soup.select('div.grid.grid-cols-4')

In [6]:
article = articles[0]
article

<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
<img src="/static/placeholder.png"/>
<div class="col-span-3 space-y-3 py-3">
<h2 class="text-2xl text-green-900">identify prevent religious</h2>
<div class="grid grid-cols-2 italic">
<p> 1997-07-22 </p>
<p class="text-right">By Jeff Hughes </p>
</div>
<p>Challenge relationship set. Detail pattern hundred man.
Weight much police back success soldier start defense. Whatever account for investment mouth by all. Gun national bill trial cut create community each.</p>
</div>
</div>

In [7]:
def parse_news_article(article):
    output = {}
    output['headline'] = article.find('h2').text
    output['date'], output['byline'], output['description'] = [p.text for p in article.find_all('p')]
    return output

In [9]:
pd.DataFrame([parse_news_article(article) for article in articles])

Unnamed: 0,headline,date,byline,description
0,identify prevent religious,1997-07-22,By Jeff Hughes,Challenge relationship set. Detail pattern hun...
1,song able major,1978-05-23,By Catherine Martinez,Protect ready set each already across share. C...
2,case culture only,2018-12-08,By Kevin Moore,Provide report ok would American. Growth matte...
3,use them key,1987-02-28,By Michael Roth,Over contain purpose impact. Remain add financ...
4,high radio produce,2017-01-18,By Jill Cooke,Teacher peace deal marriage than treatment. Pa...
5,change cost condition,2006-01-13,By Thomas Hunt,Sound professor because. Official cold on fish...
6,market marriage western,2020-04-27,By Robert Le,Guess herself our defense pattern page actuall...
7,subject figure save,1971-08-14,By Julie Miranda,Quite store voice letter. Activity home accord...
8,allow state blood,1976-08-30,By Ellen Rodriguez,Oil center blue gun. Door environment during o...
9,build camera five,1994-08-15,By Shannon Richardson,Enter others issue argue still. Customer prese...


In [26]:
url = 'https://web-scraping-demo.zgulde.net/people'
response = get(url)
response

<Response [200]>

In [27]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Example People Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstr


In [28]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text, 'html.parser')

In [29]:
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Example People Page</title>
<link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" rel="stylesheet"/>
</head>
<body class="mx-auto max-w-screen-lg pb-32">
<h1 class="my-5 text-4xl text-center">People</h1>
<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">
<p>
<i class="bi bi-exclamation-circle text-xl"></i>
        All data on this page is strictly for demonstration purposes and fake.
    </p>
</div>
<div class="grid grid-cols-2 gap-x-12 gap-y-16" id="people">
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Carlos Barker<

In [30]:
people = soup.select('div.person.border.rounded.px-3')

In [31]:
people1 = people[0]

In [32]:
people1

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Carlos Barker</h2>
<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
            "Reverse-engineered neutral hub"
        </p>
<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">dawn28@gmail.com</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">(015)721-5872x842</p>
</div>
<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                029 Wallace Harbor <br/>
                Warrenland, WA 79359
            </p>
</div>
</div>

In [38]:
def parse_people(person):
    output = {}
    output['name'] = person.find('h2').text
    output['description'], output['email'], output['phone'], output['address'] = [p.text for p in person.find_all('p')]
    return output

In [39]:
output = parse_people(people1)

In [40]:
output

{'name': 'Carlos Barker',
 'description': '\n            "Reverse-engineered neutral hub"\n        ',
 'email': 'dawn28@gmail.com',
 'phone': '(015)721-5872x842',
 'address': '\n                029 Wallace Harbor \n                Warrenland, WA 79359\n            '}