# Web Scraping

In [1]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = 'https://web-scraping-demo.zgulde.net/news'
response = get(url)
response

<Response [200]>

In [3]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>News Example Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap


In [4]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
articles = soup.select('div.grid.grid-cols-4')

In [6]:
article = articles[0]
article

<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
<img src="/static/placeholder.png"/>
<div class="col-span-3 space-y-3 py-3">
<h2 class="text-2xl text-green-900">most already forget</h2>
<div class="grid grid-cols-2 italic">
<p> 2012-06-21 </p>
<p class="text-right">By Jennifer Santos </p>
</div>
<p>List food bag machine. Nor according home by Congress nation through.
Opportunity green again very range matter. Chair force player focus focus.</p>
</div>
</div>

In [7]:
def parse_news_article(article):
    output = {}
    output['headline'] = article.find('h2').text
    output['date'], output['byline'], output['description'] = [p.text for p in article.find_all('p')]
    return output

In [8]:
pd.DataFrame([parse_news_article(article) for article in articles])

Unnamed: 0,headline,date,byline,description
0,most already forget,2012-06-21,By Jennifer Santos,List food bag machine. Nor according home by C...
1,magazine past eight,1991-10-03,By Sarah Gutierrez,Tonight career local who Congress. Question ra...
2,outside office back,2021-04-06,By Tiffany Hicks,Enjoy available Mr tonight interesting seat as...
3,part push measure,2009-07-11,By Michael Harrison,People age spring able. Price central pretty s...
4,another friend national,1975-02-08,By Ryan Baxter,Computer almost not. Population ok leader real...
5,four boy use,1991-11-24,By William Dunlap,Want answer have can stage. Nature amount race...
6,answer hope statement,2003-01-24,By Jamie Preston,Agreement believe trial figure hour raise term...
7,with always three,2019-06-30,By Timothy King,Those color exist forward voice technology dar...
8,employee such close,2002-12-16,By Kenneth Beltran,Argue foot field author determine thank see. S...
9,at state question,2016-05-28,By Charles Gardner,Seven street want list blood own. Catch rich e...


In [9]:
url = 'https://web-scraping-demo.zgulde.net/people'
response = get(url)
response

<Response [200]>

In [10]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Example People Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstr


In [11]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text, 'html.parser')

In [12]:
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Example People Page</title>
<link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" rel="stylesheet"/>
</head>
<body class="mx-auto max-w-screen-lg pb-32">
<h1 class="my-5 text-4xl text-center">People</h1>
<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">
<p>
<i class="bi bi-exclamation-circle text-xl"></i>
        All data on this page is strictly for demonstration purposes and fake.
    </p>
</div>
<div class="grid grid-cols-2 gap-x-12 gap-y-16" id="people">
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Eric Robinson<

In [13]:
people = soup.select('div.person.border.rounded.px-3')

In [14]:
people1 = people[0]

In [15]:
people1

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Eric Robinson</h2>
<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
            "Enterprise-wide fault-tolerant capacity"
        </p>
<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">michelle23@roberts.com</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">986.343.8016x895</p>
</div>
<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                40406 Moore Grove <br/>
                Foxstad, AK 00784
            </p>
</div>
</div>

In [16]:
def parse_people(person):
    output = {}
    output['name'] = person.find('h2').text
    output['description'], output['email'], output['phone'], output['address'] = [p.text for p in person.find_all('p')]
    return output

In [17]:
output = parse_people(people1)

In [18]:
output

{'name': 'Eric Robinson',
 'description': '\n            "Enterprise-wide fault-tolerant capacity"\n        ',
 'email': 'michelle23@roberts.com',
 'phone': '986.343.8016x895',
 'address': '\n                40406 Moore Grove \n                Foxstad, AK 00784\n            '}

## Exercises

## 1. Visit Codeup's Blog and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.
Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:

## 2. Scraping text from inshorts, for business, sports, technology, entertainment. 
- End function get_news_articles() that returns a list of dictionaries with title, content, category in a dictionary.

1. Figure out how to deal with one card
1. Figure out how to loop over cards on a page
1. Figure out how to deal with multiple pages (categories)
1. Turn it into a function

In [19]:
response = get('http://www.inshorts.com/en/read/entertainment')
soup = BeautifulSoup(response.text)

In [20]:
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<style>
    /* The Modal (background) */
    .modal_contact {
        display: none; /* Hidden by default */
        position: fixed; /* Stay in place */
        z-index: 8; /* Sit on top */
        left: 0;
        top: 0;
        width: 100%; /* Full width */
        height: 100%;
        overflow: auto; /* Enable scroll if needed */
        background-color: rgb(0,0,0); /* Fallback color */
        background-color: rgba(0,0,0,0.4); /* Black w/ opacity */
    }

    /* Modal Content/Box */
    .modal-content {
        background-color: #fefefe;
        margin: 15% auto;
        padding: 20px !important;
        padding-top: 0 !important;
        /* border: 1px solid #888; */
        text-align: center;
        position: relative;
        border-radius: 6px;
    }

    /* The Close Button */
    .close {
      left: 90%;
      color: #aaa;
      float: right;
      font-size: 28px;
      font-weight: bold;
    /* positio

In [21]:
card = soup.select_one('div.news-card')
output = {}

In [22]:
output['title'] = card.select_one('a.clickable').text.strip()

In [23]:
output

{'title': 'Sonu Sood asks for 50 liver transplants worth ₹12 crore as fees to promote a hospital'}

In [24]:
card_content = card.select_one('div.news-card-content')

In [25]:
card_content

<div class="news-card-content news-right-box">
<div itemprop="articleBody">Sonu Sood, speaking to The Man magazine, recalled how he gave the money from his last few endorsements to charity. Citing an example, he said, "A gentleman...from Aster Hospitals connected with me...and said the group would like to collaborate with me...I said that I'll promote the hospitals, but give me 50 liver transplants. That's almost ₹12 crore in value."</div>
<div class="news-card-author-time news-card-author-time-in-content">
<a href="/prev/en/news/sonu-sood-asks-for-50-liver-transplants-worth-₹12-crore-as-fees-to-promote-a-hospital-1652178215731"><span class="short">short</span></a> by <span class="author">Daisy Mowke</span> / 
      <span class="time" content="2022-05-10T10:23:35.000Z" itemprop="dateModified">03:53 pm</span> on <span class="date">10 May</span>
</div>
</div>

In [26]:
output['content'] = card_content.select_one('div').text

In [27]:
output

{'title': 'Sonu Sood asks for 50 liver transplants worth ₹12 crore as fees to promote a hospital',
 'content': 'Sonu Sood, speaking to The Man magazine, recalled how he gave the money from his last few endorsements to charity. Citing an example, he said, "A gentleman...from Aster Hospitals connected with me...and said the group would like to collaborate with me...I said that I\'ll promote the hospitals, but give me 50 liver transplants. That\'s almost ₹12 crore in value."'}

In [28]:
card_content.select_one('.author').text

'Daisy Mowke'

In [29]:
output['author'] = card_content.select_one('.author').text

In [30]:
card_content.select_one('.time')

<span class="time" content="2022-05-10T10:23:35.000Z" itemprop="dateModified">03:53 pm</span>

In [31]:
card_content.select_one('.time').attrs

{'class': ['time'],
 'itemprop': 'dateModified',
 'content': '2022-05-10T10:23:35.000Z'}

In [32]:
card_content.select_one('.time').attrs['content']

'2022-05-10T10:23:35.000Z'

In [33]:
output['published'] = card_content.select_one('.time').attrs['content']

In [34]:
output

{'title': 'Sonu Sood asks for 50 liver transplants worth ₹12 crore as fees to promote a hospital',
 'content': 'Sonu Sood, speaking to The Man magazine, recalled how he gave the money from his last few endorsements to charity. Citing an example, he said, "A gentleman...from Aster Hospitals connected with me...and said the group would like to collaborate with me...I said that I\'ll promote the hospitals, but give me 50 liver transplants. That\'s almost ₹12 crore in value."',
 'author': 'Daisy Mowke',
 'published': '2022-05-10T10:23:35.000Z'}

In [35]:
def parse_news_card(card, category):
    '''
    Take in a news card and return a dictionary.
    '''
    output = {}
    output['category'] = category
    output['title'] = card.select_one('a.clickable').text.strip()
    card_content = card.select_one('div.news-card-content')
    output['content'] = card_content.select_one('div').text
    output['author'] = card_content.select_one('.author').text
    output['published'] = card_content.select_one('.time').attrs['content']
    
    return output

In [38]:
def parse_entertainment_page():
    url = 'http://www.inshorts.com/en/read/entertainment'
    response = get(url)
    soup = BeautifulSoup(response.text)
    articles = [parse_news_card(card, category) for card in soup.select('.news-card')]
    return articles

In [41]:
def parse_inshorts_page(category):
    url = 'http://www.inshorts.com/en/read/' + category
    response = get(url)
    soup = BeautifulSoup(response.text)
    articles = [parse_news_card(card, category) for card in soup.select('.news-card')]
    return articles

In [42]:
parse_inshorts_page('business')

[{'category': 'business',
  'title': 'Rupee closes at all-time low of 77.50 against US dollar',
  'content': 'The Indian rupee weakened further on Monday to close at a new all-time low of 77.50 against the US dollar, 60 paise over its previous close. During the trading session, the rupee touched its lifetime low of 77.52. The currency was weighed down by elevated crude oil prices and a widening trade deficit.',
  'author': 'Pragya Swastik',
  'published': '2022-05-09T15:27:43.000Z'},
 {'category': 'business',
  'title': "After Musk's Taj Mahal tweet, his mother says his grandparents flew there in 1954",
  'content': 'After Elon Musk tweeted he visited Taj Mahal in 2007 and called it a "wonder of the world", his mother Maye Musk shared that his grandparents flew to the Taj Mahal from South Africa in 1954. She tweeted, "The only people to ever do this trip in a single-engine propeller plane, without a radio or GPS. Their motto \'Live dangerously...carefully\'."',
  'author': 'Apaar Sharm

In [43]:
categories = ['business', 'sports', 'entertainment', 'technology']
articles = []

for category in categories:
    category_articles = parse_inshorts_page(category)
    articles.extend(category_articles)

In [44]:
articles

[{'category': 'business',
  'title': 'Rupee closes at all-time low of 77.50 against US dollar',
  'content': 'The Indian rupee weakened further on Monday to close at a new all-time low of 77.50 against the US dollar, 60 paise over its previous close. During the trading session, the rupee touched its lifetime low of 77.52. The currency was weighed down by elevated crude oil prices and a widening trade deficit.',
  'author': 'Pragya Swastik',
  'published': '2022-05-09T15:27:43.000Z'},
 {'category': 'business',
  'title': "Layout of 'world's first Bitcoin City' in El Salvador unveiled, President shares pics",
  'content': "El Salvador's President Nayib Bukele has shared the layout of the 'world's first Bitcoin City' that will initially be funded by Bitcoin-backed bonds. Bukele shared on Twitter pictures of a gold-coloured 3D model and rendered images of the city. 'Bitcoin City' will be constructed at the foot of a volcano which is being used to mine Bitcoin using geothermal energy.",
  '

In [45]:
pd.DataFrame(articles)

Unnamed: 0,category,title,content,author,published
0,business,Rupee closes at all-time low of 77.50 against ...,The Indian rupee weakened further on Monday to...,Pragya Swastik,2022-05-09T15:27:43.000Z
1,business,Layout of 'world's first Bitcoin City' in El S...,El Salvador's President Nayib Bukele has share...,Hiral Goyal,2022-05-10T13:24:11.000Z
2,business,"After Musk's Taj Mahal tweet, his mother says ...",After Elon Musk tweeted he visited Taj Mahal i...,Apaar Sharma,2022-05-10T04:18:35.000Z
3,business,Musk's $44 bn Twitter deal at risk of being re...,Elon Musk's $44 billion offer to buy Twitter c...,Pragya Swastik,2022-05-10T09:16:16.000Z
4,business,"Office as we know it, is over: Airbnb CEO on l...",After Airbnb allowed its employees to work rem...,Sakshita Khosla,2022-05-10T09:19:54.000Z
...,...,...,...,...,...
95,technology,Tesla recalls 1.3 lakh vehicles in US over dis...,Tesla is recalling about 1.3 lakh vehicles in ...,Ashley Paul,2022-05-10T13:48:25.000Z
96,technology,Blue Origin mission to fly first Mexican-origi...,Billionaire Jeff Bezos-led Blue Origin has ann...,Ananya Goyal,2022-05-10T10:47:22.000Z
97,technology,"Canada, US' electric aircraft firms want to ma...",Several electric vertical take-off and landing...,Ananya Goyal,2022-05-10T12:08:12.000Z
98,technology,Indonesian President plans to meet Musk over n...,Indonesian President Joko Widodo is planning t...,Ananya Goyal,2022-05-10T12:13:57.000Z


In [46]:
def get_news_articles():
    categories = ['business', 'sports', 'entertainment', 'technology']
    articles = []

    for category in categories:
        print(f'getting articles for {category}')
        category_articles = parse_inshorts_page(category)
        articles.extend(category_articles)
        
    return pd.DataFrame(articles)

In [48]:
df = get_news_articles()
df.head(100)

getting articles for business
getting articles for sports
getting articles for entertainment
getting articles for technology


Unnamed: 0,category,title,content,author,published
0,business,Rupee closes at all-time low of 77.50 against ...,The Indian rupee weakened further on Monday to...,Pragya Swastik,2022-05-09T15:27:43.000Z
1,business,When are you coming to deliver 1st Tesla? Payt...,Paytm CEO Vijay Shekhar Sharma took to Twitter...,Ridham Gambhir,2022-05-10T05:08:13.000Z
2,business,Microsoft to help cover US employees' travel c...,Microsoft has said that it will cover travel c...,Ridham Gambhir,2022-05-10T03:42:26.000Z
3,business,Twitter will comply with EU content rules afte...,Tesla CEO Elon Musk has said that Twitter will...,Ridham Gambhir,2022-05-10T07:37:59.000Z
4,business,"Bitcoin briefly drops below $30,000 for first ...","Bitcoin, in the early hours of Tuesday, fell b...",Ridham Gambhir,2022-05-10T04:33:15.000Z
...,...,...,...,...,...
95,technology,Govt aiming to make India a global drone hub b...,Union Minister Jyotiraditya Scindia has said t...,Aishwarya Awasthi,2022-05-10T11:42:06.000Z
96,technology,Indonesian President plans to meet Musk over n...,Indonesian President Joko Widodo is planning t...,Ananya Goyal,2022-05-10T12:13:57.000Z
97,technology,Tesla recalls 1.3 lakh vehicles in US over dis...,Tesla is recalling about 1.3 lakh vehicles in ...,Ashley Paul,2022-05-10T13:48:25.000Z
98,technology,"Canada, US' electric aircraft firms want to ma...",Several electric vertical take-off and landing...,Ananya Goyal,2022-05-10T12:08:12.000Z
