# Web Scraping

In [1]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = 'https://web-scraping-demo.zgulde.net/news'
response = get(url)
response

<Response [200]>

In [3]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>News Example Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap


In [4]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
articles = soup.select('div.grid.grid-cols-4')

In [6]:
article = articles[0]
article

<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
<img src="/static/placeholder.png"/>
<div class="col-span-3 space-y-3 py-3">
<h2 class="text-2xl text-green-900">practice perform discussion</h2>
<div class="grid grid-cols-2 italic">
<p> 2003-10-22 </p>
<p class="text-right">By Christopher Daniels </p>
</div>
<p>Arrive activity grow above. Find along eye establish we camera view.
Animal fire walk know people natural tax especially. Market friend fine coach however husband.</p>
</div>
</div>

In [7]:
def parse_news_article(article):
    output = {}
    output['headline'] = article.find('h2').text
    output['date'], output['byline'], output['description'] = [p.text for p in article.find_all('p')]
    return output

In [8]:
pd.DataFrame([parse_news_article(article) for article in articles])

Unnamed: 0,headline,date,byline,description
0,practice perform discussion,2003-10-22,By Christopher Daniels,Arrive activity grow above. Find along eye est...
1,professional half kind,2019-03-29,By Thomas Jones,Suddenly executive by parent game nor. Author ...
2,song cost memory,1978-12-13,By Erik Grant,Approach young product focus current. Most lan...
3,nature age field,2021-01-03,By Elizabeth Gonzales,Easy technology chair system door. Less everyb...
4,expert consumer establish,1985-09-03,By Dr. Robert Thomas III,Third position significant protect glass trave...
5,bank guy result,2008-04-07,By Laura Myers,Increase play draw leave floor. Country someon...
6,new offer fall,2000-04-01,By Collin Moore,Air job senior politics should. Quality shake ...
7,sea together federal,1987-05-11,By Morgan Noble,Sound agreement half toward whatever. Concern ...
8,none talk sister,2017-12-20,By Nicholas Blanchard,Fine something rate Mrs enough. Technology per...
9,media trade six,2009-03-15,By Randy Miller,Reality project maybe number. School moment se...


In [9]:
url = 'https://web-scraping-demo.zgulde.net/people'
response = get(url)
response

<Response [200]>

In [10]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Example People Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstr


In [11]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text, 'html.parser')

In [12]:
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Example People Page</title>
<link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" rel="stylesheet"/>
</head>
<body class="mx-auto max-w-screen-lg pb-32">
<h1 class="my-5 text-4xl text-center">People</h1>
<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">
<p>
<i class="bi bi-exclamation-circle text-xl"></i>
        All data on this page is strictly for demonstration purposes and fake.
    </p>
</div>
<div class="grid grid-cols-2 gap-x-12 gap-y-16" id="people">
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Brandi Baker</

In [13]:
people = soup.select('div.person.border.rounded.px-3')

In [14]:
people1 = people[0]

In [15]:
people1

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Brandi Baker</h2>
<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
            "Re-engineered multimedia encryption"
        </p>
<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">meagandavidson@zimmerman.biz</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">+1-831-392-2398x36291</p>
</div>
<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                5533 Emily Square <br/>
                Ginatown, NM 06671
            </p>
</div>
</div>

In [16]:
def parse_people(person):
    output = {}
    output['name'] = person.find('h2').text
    output['description'], output['email'], output['phone'], output['address'] = [p.text for p in person.find_all('p')]
    return output

In [17]:
output = parse_people(people1)

In [18]:
output

{'name': 'Brandi Baker',
 'description': '\n            "Re-engineered multimedia encryption"\n        ',
 'email': 'meagandavidson@zimmerman.biz',
 'phone': '+1-831-392-2398x36291',
 'address': '\n                5533 Emily Square \n                Ginatown, NM 06671\n            '}

## Exercises

1. Visit Codeup's Blog and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.
Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this: