# Web Scraping

In [1]:
import os

from requests import get
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = 'https://web-scraping-demo.zgulde.net/news'
response = get(url)
response

<Response [200]>

In [3]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>News Example Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap


In [4]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
articles = soup.select('div.grid.grid-cols-4')

In [6]:
article = articles[0]
article

<div class="grid grid-cols-4 gap-x-4 border rounded pr-3 bg-green-50 hover:shadow-lg transition duration-500">
<img src="/static/placeholder.png"/>
<div class="col-span-3 space-y-3 py-3">
<h2 class="text-2xl text-green-900">notice company establish</h2>
<div class="grid grid-cols-2 italic">
<p> 1997-11-15 </p>
<p class="text-right">By David Sandoval </p>
</div>
<p>Must sister free production process short. Both case authority lose.
Science play campaign wrong loss side. Why about paper big performance nothing activity.</p>
</div>
</div>

In [7]:
def parse_news_article(article):
    """
    Turn one news-article element into a flat dictionary.

    The headline is the text of the element's <h2>; the element's <p> tags
    are taken, in document order, as the date, the byline and the
    description. Text is returned exactly as found (no whitespace trimming).

    Raises ValueError when the element does not contain exactly three <p>
    tags, and AttributeError when it has no <h2>.
    """
    headline = article.find('h2').text
    # Positional unpacking: the page always lays the paragraphs out in this order.
    date, byline, description = (paragraph.text for paragraph in article.find_all('p'))
    return {
        'headline': headline,
        'date': date,
        'byline': byline,
        'description': description,
    }

In [8]:
pd.DataFrame([parse_news_article(article) for article in articles])

Unnamed: 0,headline,date,byline,description
0,notice company establish,1997-11-15,By David Sandoval,Must sister free production process short. Bot...
1,what drive edge,2010-03-25,By Rhonda Sanford,Difficult Mr development budget and budget at ...
2,one series area,1994-06-10,By Heather Garcia,Than voice of family operation trade pressure....
3,girl along everything,1986-07-05,By Joseph Spencer,Phone process from name style. Section investm...
4,politics term wide,1982-06-07,By Kathryn Becker,Program one later various. Green market securi...
5,now same truth,1979-07-09,By Courtney Harris,Bad home upon able court dog low. Trade true c...
6,reflect guess laugh,1985-06-08,By Mary Neal,Answer cost modern. Skin could must trip. Chai...
7,church let pattern,2017-10-27,By Lisa Williams,Knowledge change admit summer sell. Recognize ...
8,about affect box,1980-05-13,By Joseph Miranda,Lawyer fast receive whom world. Threat couple ...
9,property before whether,1995-01-24,By Douglas Barber,Difficult tree forward open hot of. Image top ...


In [9]:
url = 'https://web-scraping-demo.zgulde.net/people'
response = get(url)
response

<Response [200]>

In [10]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Example People Page</title>
    <link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstr


In [11]:
# Make a soup variable holding the response content
soup = BeautifulSoup(response.text, 'html.parser')

In [12]:
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Example People Page</title>
<link href="https://unpkg.com/tailwindcss@^2/dist/tailwind.min.css" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.4.1/font/bootstrap-icons.css" rel="stylesheet"/>
</head>
<body class="mx-auto max-w-screen-lg pb-32">
<h1 class="my-5 text-4xl text-center">People</h1>
<div class="my-5 text-red-800 px-5 py-3 bg-red-100 font-bold">
<p>
<i class="bi bi-exclamation-circle text-xl"></i>
        All data on this page is strictly for demonstration purposes and fake.
    </p>
</div>
<div class="grid grid-cols-2 gap-x-12 gap-y-16" id="people">
<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Nathan Campbel

In [13]:
people = soup.select('div.person.border.rounded.px-3')

In [14]:
people1 = people[0]

In [15]:
people1

<div class="person border rounded px-3 py-5 grid grid-cols-2 gap-x-3 bg-purple-50 hover:shadow-lg transition duration-500">
<h2 class="text-2xl text-purple-800 name col-span-full border-b">Nathan Campbell</h2>
<p class="quote col-span-full px-5 py-5 text-center text-gray-500">
            "Reverse-engineered systematic infrastructure"
        </p>
<div class="grid grid-cols-9">
<i class="bi bi-envelope-fill text-purple-800"></i>
<p class="email col-span-8">brian72@hotmail.com</p>
<i class="bi bi-telephone-fill text-purple-800"></i>
<p class="phone col-span-8">674-546-2220x3857</p>
</div>
<div class="address grid grid-cols-9">
<i class="bi bi-geo-fill text-purple-800"></i>
<p class="col-span-8">
                364 Mackenzie Ways <br/>
                West Mark, TN 74251
            </p>
</div>
</div>

In [16]:
def parse_people(person):
    """
    Turn one person card into a flat dictionary.

    The name is the text of the card's <h2>; the card's <p> tags are taken,
    in document order, as the description (quote), email, phone and address.
    Text is returned exactly as found, including surrounding whitespace and
    embedded newlines.

    Raises ValueError when the card does not contain exactly four <p> tags,
    and AttributeError when it has no <h2>.
    """
    name = person.find('h2').text
    # Positional unpacking relies on the card's fixed paragraph order.
    description, email, phone, address = (paragraph.text for paragraph in person.find_all('p'))
    return {
        'name': name,
        'description': description,
        'email': email,
        'phone': phone,
        'address': address,
    }

In [17]:
output = parse_people(people1)

In [18]:
output

{'name': 'Nathan Campbell',
 'description': '\n            "Reverse-engineered systematic infrastructure"\n        ',
 'email': 'brian72@hotmail.com',
 'phone': '674-546-2220x3857',
 'address': '\n                364 Mackenzie Ways \n                West Mark, TN 74251\n            '}

## Exercises

## 1. Visit Codeup's Blog and record the urls for at least 5 distinct blog posts. For each post, you should scrape at least the post's title and content.
Encapsulate your work in a function named get_blog_articles that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:
- 'title': 
- 'content':

In [44]:
url = 'https://codeup.com/blog/'
# The site doesn't accept the python-requests User-Agent, so send a custom one instead.
headers = {'User-Agent' : 'Codeup Data Science'}
response = get(url, headers=headers)

In [45]:
response

<Response [200]>

In [46]:
print(response.text[:400])

<!DOCTYPE html>
<html lang="en-US">
<head>
	<meta charset="UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=edge">
	<link rel="pingback" href="https://codeup.com/xmlrpc.php" />

	<script type="text/javascript">
		document.documentElement.className = 'js';
	</script>
	
	<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin /><script id="diviarea-loader">window.DiviPopupData=wi


In [48]:
# Doing a single article breakdown first
test = 'https://codeup.com/workshops/virtual/learn-to-code-python-workshop-on-4-16/'
headers = {'User-Agent' : 'Codeup Data Science'}
response2 = get(test, headers=headers)

In [49]:
# Parse both responses: `soup` is built from `response`, `soup2` from `response2`
# (the single test article requested above).
soup = BeautifulSoup(response.text, 'html.parser')
soup2 = BeautifulSoup(response2.text, 'html.parser')

In [50]:
soup

<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<link href="https://codeup.com/xmlrpc.php" rel="pingback"/>
<script type="text/javascript">
		document.documentElement.className = 'js';
	</script>
<link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/><script id="diviarea-loader">window.DiviPopupData=window.DiviAreaConfig={"zIndex":1000000,"animateSpeed":400,"triggerClassPrefix":"show-popup-","idAttrib":"data-popup","modalIndicatorClass":"is-modal","blockingIndicatorClass":"is-blocking","defaultShowCloseButton":true,"withCloseClass":"with-close","noCloseClass":"no-close","triggerCloseClass":"close","singletonClass":"single","darkModeClass":"dark","noShadowClass":"no-shadow","altCloseClass":"close-alt","popupSelector":".et_pb_section.popup","initializeOnEvent":"et_pb_after_init_modules","popupWrapperClass":"area-outer-wrap","fullHeightClass":"full-height","openPopupClass":"da-overlay-visible","ove

In [51]:
soup2

<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<link href="https://codeup.com/xmlrpc.php" rel="pingback"/>
<script type="text/javascript">
		document.documentElement.className = 'js';
	</script>
<link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/><script id="diviarea-loader">window.DiviPopupData=window.DiviAreaConfig={"zIndex":1000000,"animateSpeed":400,"triggerClassPrefix":"show-popup-","idAttrib":"data-popup","modalIndicatorClass":"is-modal","blockingIndicatorClass":"is-blocking","defaultShowCloseButton":true,"withCloseClass":"with-close","noCloseClass":"no-close","triggerCloseClass":"close","singletonClass":"single","darkModeClass":"dark","noShadowClass":"no-shadow","altCloseClass":"close-alt","popupSelector":".et_pb_section.popup","initializeOnEvent":"et_pb_after_init_modules","popupWrapperClass":"area-outer-wrap","fullHeightClass":"full-height","openPopupClass":"da-overlay-visible","ove

In [52]:
# Getting the blog urls
urls = [a.attrs['href'] for a in soup.select('a.more-link')]

In [53]:
urls

['https://codeup.com/workshops/virtual/learn-to-code-html-css-on-4-30/',
 'https://codeup.com/workshops/virtual/learn-to-code-python-workshop-on-4-16/',
 'https://codeup.com/codeup-news/coming-soon-cloud-administration/',
 'https://codeup.com/featured/5-books-every-woman-in-tech-should-read/',
 'https://codeup.com/codeup-news/codeup-start-dates-for-march-2022/',
 'https://codeup.com/codeup-news/vet-tec-funding-dallas/',
 'https://codeup.com/codeup-news/dallas-campus-re-opens-with-new-grant-partner/',
 'https://codeup.com/codeup-news/codeups-placement-team-continues-setting-records/',
 'https://codeup.com/it-training/it-certifications-101/',
 'https://codeup.com/cybersecurity/a-rise-in-cyber-attacks-means-opportunities-for-veterans-in-san-antonio/',
 'https://codeup.com/codeup-news/use-your-gi-bill-benefits-to-land-a-job-in-tech/',
 'https://codeup.com/tips-for-prospective-students/which-program-is-right-for-me-cyber-security-or-systems-engineering/',
 'https://codeup.com/it-training/wh

In [62]:
def get_blog_article_urls():
    """
    Fetch the Codeup blog index page and return the article links found on it.

    Returns a list of href strings, one per `a.more-link` anchor on
    https://codeup.com/blog/, in document order.
    """
    # A custom User-Agent is sent with the request, as elsewhere in this notebook.
    headers = {'user-agent': 'Codeup Data Science'}
    response = get('https://codeup.com/blog/', headers=headers)
    # Name the parser explicitly so the result does not depend on which parsers
    # happen to be installed (and to avoid bs4's "no parser specified" warning).
    soup = BeautifulSoup(response.text, 'html.parser')
    return [a.attrs['href'] for a in soup.select('a.more-link')]

In [63]:
def parse_blog_article(soup):
    """
    Pull the title, publish date and body text out of a parsed article page.

    Returns a dictionary with the keys 'title', 'published' and 'content'.
    Only 'content' has surrounding whitespace stripped. Raises AttributeError
    when any of the three elements is missing from the page.
    """
    title = soup.select_one('h1.entry-title').text
    published = soup.select_one('.published').text
    content = soup.select_one('.entry-content').text.strip()
    return dict(title=title, published=published, content=content)

In [64]:
def get_blog_articles():
    """
    Scrape every article linked from the Codeup blog index page.

    Collects the article URLs with get_blog_article_urls(), downloads each
    one, parses it with parse_blog_article(), and returns the results as a
    pandas DataFrame with one row per article.
    """
    # Defined locally: the function previously read a notebook-level `headers`
    # variable, so it only worked if an earlier cell had happened to create it.
    headers = {'user-agent': 'Codeup Data Science'}

    articles = []
    for url in get_blog_article_urls():
        response = get(url, headers=headers)
        # Explicit parser keeps the result independent of the installed parsers.
        soup = BeautifulSoup(response.text, 'html.parser')
        articles.append(parse_blog_article(soup))

    return pd.DataFrame(articles)

In [65]:
df = get_blog_articles()

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      15 non-null     object
 1   published  15 non-null     object
 2   content    15 non-null     object
dtypes: object(3)
memory usage: 488.0+ bytes


### Example of looping through the entire blog site to collect every blog post (this is not my work; it is Ben Smith's: https://github.com/bensmith07/nlp-exercises/blob/main/acquire_notebook.ipynb)

In [68]:
def get_blog_articles(use_cache=True):
    """
    Scrape every article on the Codeup blog, following pagination, with a CSV cache.

    Parameters
    ----------
    use_cache : bool, default True
        When True and 'codeup_blog_articles.csv' exists in the working
        directory, that file is read and returned without any web requests.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns 'title' and 'content'. After a
        fresh scrape the frame is also written to 'codeup_blog_articles.csv'.
    """
    # establish a filename for the local csv
    filename = 'codeup_blog_articles.csv'

    # return the local copy when caching is requested and the file already exists
    if use_cache and os.path.exists(filename):
        print('Reading from local CSV...')
        return pd.read_csv(filename)

    # otherwise, scrape the data from codeup.com
    print('Reading blog articles from codeup.com...')

    headers = {'user-agent': 'Innis Data Science Cohort'}
    articles = []
    page_counter = 0

    # start at the blog homepage, then follow each page's "next page" link
    page_url = 'https://codeup.com/blog/'
    while page_url is not None:
        response = get(page_url, headers=headers)
        page_soup = BeautifulSoup(response.text, 'html.parser')

        # link to the next page of articles; None when there are no more pages
        # (a page without a pagination block also ends the crawl instead of
        # raising AttributeError)
        pagination = page_soup.select_one('.pagination.clearfix')
        if pagination is not None and pagination.div is not None:
            next_page = pagination.div.a
        else:
            next_page = None

        # urls of all the articles listed on this page
        urls = [
            link.attrs['href']
            for article in page_soup.select('article')
            for link in article.select('.entry-featured-image-url')
        ]

        # go to each article page, pull its info and append it to the list
        for url in urls:
            response = get(url, headers=headers)
            article_soup = BeautifulSoup(response.text, 'html.parser')
            articles.append({
                'title': article_soup.select_one('.entry-title').text,
                'content': article_soup.select_one('.entry-content').text.strip(),
            })

        page_counter += 1
        # '\r' keeps the progress message on a single, overwritten line
        print(f'{page_counter} pages complete     ', end='\r')

        page_url = next_page.attrs['href'] if next_page is not None else None

    print(f'{page_counter} pages scraped. No more pages available.')

    articles = pd.DataFrame(articles)

    # cache local copy
    print('Writing to local CSV...')
    articles.to_csv(filename, index=False)

    return articles

## 2. Scrape text from inshorts for the business, sports, technology, and entertainment categories.
- End goal: a function `get_news_articles()` that returns a list of dictionaries, each with `title`, `content`, and `category` keys.

1. Figure out how to deal with one card
1. Figure out how to loop over cards on a page
1. Figure out how to deal with multiple pages (categories)
1. Turn it into a function

In [19]:
response = get('http://www.inshorts.com/en/read/entertainment')
# Name the parser explicitly (as the earlier cells do) so the parse does not
# depend on which parsers are installed.
soup = BeautifulSoup(response.text, 'html.parser')

In [20]:
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<style>
    /* The Modal (background) */
    .modal_contact {
        display: none; /* Hidden by default */
        position: fixed; /* Stay in place */
        z-index: 8; /* Sit on top */
        left: 0;
        top: 0;
        width: 100%; /* Full width */
        height: 100%;
        overflow: auto; /* Enable scroll if needed */
        background-color: rgb(0,0,0); /* Fallback color */
        background-color: rgba(0,0,0,0.4); /* Black w/ opacity */
    }

    /* Modal Content/Box */
    .modal-content {
        background-color: #fefefe;
        margin: 15% auto;
        padding: 20px !important;
        padding-top: 0 !important;
        /* border: 1px solid #888; */
        text-align: center;
        position: relative;
        border-radius: 6px;
    }

    /* The Close Button */
    .close {
      left: 90%;
      color: #aaa;
      float: right;
      font-size: 28px;
      font-weight: bold;
    /* positio

In [21]:
card = soup.select_one('div.news-card')
output = {}

In [22]:
output['title'] = card.select_one('a.clickable').text.strip()

In [23]:
output

{'title': "Why should we go to another industry: Mahesh Babu on 'B'wood can't afford me' remark"}

In [24]:
card_content = card.select_one('div.news-card-content')

In [25]:
card_content

<div class="news-card-content news-right-box">
<div itemprop="articleBody">Telugu actor Mahesh Babu, who courted controversy for saying 'Bollywood can't afford me', stated, "I've always wanted to do Telugu films." "I've always wished for Telugu movies to do well across the country. I strongly feel, why should we go to another industry by leaving ours behind?" he added. Mahesh further clarified that he respects all languages.</div>
<div class="news-card-author-time news-card-author-time-in-content">
<a href="/prev/en/news/why-should-we-go-to-another-industry-mahesh-babu-on-bwood-cant-afford-me-remark-1652281134612"><span class="short">short</span></a> by <span class="author">Daisy Mowke</span> / 
      <span class="time" content="2022-05-11T14:58:54.000Z" itemprop="dateModified">08:28 pm</span> on <span class="date">11 May</span>
</div>
</div>

In [26]:
output['content'] = card_content.select_one('div').text

In [27]:
output

{'title': "Why should we go to another industry: Mahesh Babu on 'B'wood can't afford me' remark",
 'content': 'Telugu actor Mahesh Babu, who courted controversy for saying \'Bollywood can\'t afford me\', stated, "I\'ve always wanted to do Telugu films." "I\'ve always wished for Telugu movies to do well across the country. I strongly feel, why should we go to another industry by leaving ours behind?" he added. Mahesh further clarified that he respects all languages.'}

In [28]:
card_content.select_one('.author').text

'Daisy Mowke'

In [29]:
output['author'] = card_content.select_one('.author').text

In [30]:
card_content.select_one('.time')

<span class="time" content="2022-05-11T14:58:54.000Z" itemprop="dateModified">08:28 pm</span>

In [31]:
card_content.select_one('.time').attrs

{'class': ['time'],
 'itemprop': 'dateModified',
 'content': '2022-05-11T14:58:54.000Z'}

In [32]:
card_content.select_one('.time').attrs['content']

'2022-05-11T14:58:54.000Z'

In [33]:
output['published'] = card_content.select_one('.time').attrs['content']

In [34]:
output

{'title': "Why should we go to another industry: Mahesh Babu on 'B'wood can't afford me' remark",
 'content': 'Telugu actor Mahesh Babu, who courted controversy for saying \'Bollywood can\'t afford me\', stated, "I\'ve always wanted to do Telugu films." "I\'ve always wished for Telugu movies to do well across the country. I strongly feel, why should we go to another industry by leaving ours behind?" he added. Mahesh further clarified that he respects all languages.',
 'author': 'Daisy Mowke',
 'published': '2022-05-11T14:58:54.000Z'}

In [35]:
def parse_news_card(card, category):
    '''
    Convert a single inshorts news card into a dictionary.

    `category` is copied into the result unchanged. The title comes from the
    card's `a.clickable` link (whitespace stripped); the content, author and
    publish timestamp come from the card's `div.news-card-content` section,
    with the timestamp read from the `content` attribute of the `.time` element.
    '''
    title = card.select_one('a.clickable').text.strip()
    body = card.select_one('div.news-card-content')
    return {
        'category': category,
        'title': title,
        # the first <div> inside the content section holds the article text
        'content': body.select_one('div').text,
        'author': body.select_one('.author').text,
        'published': body.select_one('.time').attrs['content'],
    }

In [36]:
def parse_entertainment_page():
    """
    Scrape the inshorts entertainment page.

    Returns a list of dictionaries, one per `.news-card` element on the page,
    each built by parse_news_card() and labelled with the category
    'entertainment'.
    """
    # Previously `category` was never defined here, so the function either
    # raised NameError or picked up whatever a notebook-level `category`
    # variable last held.
    category = 'entertainment'
    url = 'http://www.inshorts.com/en/read/' + category
    response = get(url)
    # Explicit parser keeps the result independent of the installed parsers.
    soup = BeautifulSoup(response.text, 'html.parser')
    return [parse_news_card(card, category) for card in soup.select('.news-card')]

In [37]:
def parse_inshorts_page(category):
    """
    Scrape one inshorts category page.

    `category` is appended to 'http://www.inshorts.com/en/read/' to form the
    page URL (e.g. 'business', 'sports'). Returns a list of dictionaries, one
    per `.news-card` element on the page, each built by parse_news_card() and
    labelled with `category`.
    """
    url = 'http://www.inshorts.com/en/read/' + category
    response = get(url)
    # Name the parser explicitly so the parse does not depend on which parsers
    # are installed (and to avoid bs4's "no parser specified" warning).
    soup = BeautifulSoup(response.text, 'html.parser')
    return [parse_news_card(card, category) for card in soup.select('.news-card')]

In [38]:
parse_inshorts_page('business')

[{'category': 'business',
  'title': 'Bill Gates tests positive for COVID-19',
  'content': 'Microsoft Co-founder Bill Gates took to Twitter on Tuesday and wrote, "I\'ve tested positive for COVID-19. I\'m experiencing mild symptoms and am following the experts\' advice by isolating until I\'m healthy again." "I\'m fortunate to be vaccinated and boosted and have access to testing and great medical care," he added.',
  'author': 'Ankush Verma',
  'published': '2022-05-11T03:05:42.000Z'},
 {'category': 'business',
  'title': "Stablecoin UST, that's meant to maintain a $1 peg, crashes to 40 cents",
  'content': 'TerraUSD or \'UST\', a stablecoin that\'s meant to maintain a $1 peg, plunged over 55% on Wednesday from 24 hours earlier to around 40 cents. Its sister token fell more than 80% to $5 and other cryptocurrencies like Bitcoin and Ether also plunged. A stablecoin is a digital currency pegged to a "stable" reserve asset like the US dollar.',
  'author': 'Pragya Swastik',
  'published':

In [39]:
categories = ['business', 'sports', 'entertainment', 'technology']
articles = []

# Scrape each category page in turn and pool every parsed card into one flat list.
for category in categories:
    category_articles = parse_inshorts_page(category)
    articles.extend(category_articles)

In [40]:
articles

[{'category': 'business',
  'title': "Stablecoin UST, that's meant to maintain a $1 peg, crashes to 40 cents",
  'content': 'TerraUSD or \'UST\', a stablecoin that\'s meant to maintain a $1 peg, plunged over 55% on Wednesday from 24 hours earlier to around 40 cents. Its sister token fell more than 80% to $5 and other cryptocurrencies like Bitcoin and Ether also plunged. A stablecoin is a digital currency pegged to a "stable" reserve asset like the US dollar.',
  'author': 'Pragya Swastik',
  'published': '2022-05-11T08:59:37.000Z'},
 {'category': 'business',
  'title': 'Bill Gates tests positive for COVID-19',
  'content': 'Microsoft Co-founder Bill Gates took to Twitter on Tuesday and wrote, "I\'ve tested positive for COVID-19. I\'m experiencing mild symptoms and am following the experts\' advice by isolating until I\'m healthy again." "I\'m fortunate to be vaccinated and boosted and have access to testing and great medical care," he added.',
  'author': 'Ankush Verma',
  'published':

In [41]:
pd.DataFrame(articles)

Unnamed: 0,category,title,content,author,published
0,business,"Stablecoin UST, that's meant to maintain a $1 ...","TerraUSD or 'UST', a stablecoin that's meant t...",Pragya Swastik,2022-05-11T08:59:37.000Z
1,business,Bill Gates tests positive for COVID-19,Microsoft Co-founder Bill Gates took to Twitte...,Ankush Verma,2022-05-11T03:05:42.000Z
2,business,Investors lose over ₹13 lakh crore in a four-d...,A four-day decline in stocks has wiped out ove...,Pragya Swastik,2022-05-11T13:11:15.000Z
3,business,"Health, digital adoption among '5 megatrends' ...",Tata Sons Chairman N Chandrasekaran has listed...,Pragya Swastik,2022-05-11T15:22:51.000Z
4,business,Zomato's market cap falls below its last priva...,Zomato's market capitalisation fell below its ...,Hiral Goyal,2022-05-10T14:15:44.000Z
...,...,...,...,...,...
95,technology,Apple testing folding panel for iPhone and iPa...,Apple is developing a new folding OLED screen ...,Ananya Goyal,2022-05-11T12:13:28.000Z
96,technology,Elon Musk's tweets on taking Tesla private wer...,A US judge said in an April judgement that Tes...,Aishwarya Awasthi,2022-05-11T07:21:55.000Z
97,technology,Musk says Tesla is open to buying a mining fir...,Tesla CEO Elon Musk has said that Tesla is ope...,Aishwarya Awasthi,2022-05-11T09:22:22.000Z
98,technology,"Starlink resisted Russian hacks, but they're r...",SpaceX CEO Elon Musk in response to a report o...,Aishwarya Awasthi,2022-05-11T05:19:24.000Z


In [42]:
def get_news_articles():
    """
    Scrape the business, sports, entertainment and technology pages of inshorts.

    Prints a progress line before each category is fetched and returns all
    parsed articles together as a single pandas DataFrame.
    """
    collected = []
    for topic in ('business', 'sports', 'entertainment', 'technology'):
        print(f'getting articles for {topic}')
        collected += parse_inshorts_page(topic)

    return pd.DataFrame(collected)

In [43]:
df = get_news_articles()
df.head(100)

getting articles for business
getting articles for sports
getting articles for entertainment
getting articles for technology


Unnamed: 0,category,title,content,author,published
0,business,"Stablecoin UST, that's meant to maintain a $1 ...","TerraUSD or 'UST', a stablecoin that's meant t...",Pragya Swastik,2022-05-11T08:59:37.000Z
1,business,Bill Gates tests positive for COVID-19,Microsoft Co-founder Bill Gates took to Twitte...,Ankush Verma,2022-05-11T03:05:42.000Z
2,business,Investors lose over ₹13 lakh crore in a four-d...,A four-day decline in stocks has wiped out ove...,Pragya Swastik,2022-05-11T13:11:15.000Z
3,business,"Health, digital adoption among '5 megatrends' ...",Tata Sons Chairman N Chandrasekaran has listed...,Pragya Swastik,2022-05-11T15:22:51.000Z
4,business,Zomato's market cap falls below its last priva...,Zomato's market capitalisation fell below its ...,Hiral Goyal,2022-05-10T14:15:44.000Z
...,...,...,...,...,...
95,technology,"Chinese burn 3 am oil, Americans avoid going t...",The world's richest person Elon Musk praised C...,Ananya Goyal,2022-05-11T13:29:14.000Z
96,technology,"Google to pay over 300 news publishers in EU, ...",Google has agreed to pay over 300 publishers a...,Aishwarya Awasthi,2022-05-11T07:50:01.000Z
97,technology,Elon Musk's tweets on taking Tesla private wer...,A US judge said in an April judgement that Tes...,Aishwarya Awasthi,2022-05-11T07:21:55.000Z
98,technology,Will stay at Tesla as long as I can be useful:...,Tesla CEO Elon Musk on being asked about how l...,Aishwarya Awasthi,2022-05-11T04:14:36.000Z
