In [1]:
import requests
from collections import namedtuple, Counter
import os
import json
import yaml

In [2]:
# Load the Guardian API key from a local YAML file so the secret is not
# hard-coded in the notebook. The file must contain a top-level `API_KEY`
# entry. The `with` block makes sure the file handle is closed again.
with open('guardianAPI.yaml', 'r') as key_file:
    API_KEY = yaml.safe_load(key_file)['API_KEY']

In [24]:
# Parameters for the request against `search_url`: a boolean query made of a
# group of pandemic terms AND-ed with a group of topic terms, a 'from-date'
# of 2020-01-23 and an initial page number of 1.
# NOTE(review): "environmental" appears twice in the topic group; presumably
# one occurrence was meant to be a different word. Confirm before changing.
# Earlier alternatives that were tried: 'q': "coronavirus OR covid-19" and
# an extra 'section': 'business' filter.
search_params = {
    'q': '(coronavirus covid pandemic crisis lockdown) AND (economy economic environmental environmental legal racial racism "human right" research funds funding funder donate donation sponsor security social council protest)',
    'from-date': '2020-01-23',
    'api-key': API_KEY,
    'page': 1,
}

# Parameters for the request against `section_url`; asks for the "main"
# block only ('show-fields': 'all' was an earlier alternative).
section_params = {
    'show-blocks': 'main',
    'api-key': API_KEY,
}

In [19]:
# Guardian Content API endpoints used by the cells below.
# NOTE: despite its name, `section_url` points at the /tags endpoint.
search_url, section_url = (
    'https://content.guardianapis.com/search',
    'https://content.guardianapis.com/tags',
)

In [20]:
# Run the search query once. An explicit timeout is passed because requests
# otherwise waits indefinitely when the server does not answer.
response = requests.get(url=search_url, params=search_params, timeout=30)

In [21]:
# Show the decoded JSON payload of the search request to inspect its layout
# (paging metadata plus a 'results' list of article descriptors).
response.json()

{'response': {'status': 'ok',
  'userTier': 'developer',
  'total': 4435,
  'startIndex': 1,
  'pageSize': 10,
  'currentPage': 1,
  'pages': 444,
  'orderBy': 'relevance',
  'results': [{'id': 'business/2020/apr/07/wellcome-trust-urges-firms-to-donate-6bn-for-covid-19-research',
    'type': 'article',
    'sectionId': 'business',
    'sectionName': 'Business',
    'webPublicationDate': '2020-04-07T10:44:07Z',
    'webTitle': 'Wellcome Trust urges firms to donate £6bn for Covid-19 research',
    'webUrl': 'https://www.theguardian.com/business/2020/apr/07/wellcome-trust-urges-firms-to-donate-6bn-for-covid-19-research',
    'apiUrl': 'https://content.guardianapis.com/business/2020/apr/07/wellcome-trust-urges-firms-to-donate-6bn-for-covid-19-research',
    'isHosted': False,
    'pillarId': 'pillar/news',
    'pillarName': 'News'},
   {'id': 'commentisfree/2020/mar/19/coronavirus-pandemic-human-rights',
    'type': 'article',
    'sectionId': 'commentisfree',
    'sectionName': 'Opinion',

In [251]:
# Query the endpoint stored in `section_url`. An explicit timeout is passed
# because requests otherwise waits indefinitely when the server does not answer.
response = requests.get(url=section_url, params=section_params, timeout=30)

In [252]:
# Show the decoded JSON payload returned for the `section_url` request.
response.json()

{'response': {'status': 'ok',
  'userTier': 'developer',
  'total': 61972,
  'startIndex': 1,
  'pageSize': 10,
  'currentPage': 1,
  'pages': 6198,
  'results': [{'id': '2019-family-gift-guide/2019-family-gift-guide',
    'type': 'paid-content',
    'webTitle': '2019 family gift guide',
    'webUrl': 'https://www.theguardian.com/2019-family-gift-guide/2019-family-gift-guide',
    'apiUrl': 'https://content.guardianapis.com/2019-family-gift-guide/2019-family-gift-guide',
    'activeSponsorships': [{'sponsorshipType': 'paid-content',
      'sponsorName': 'Google',
      'sponsorLogo': 'https://static.theguardian.com/commercial/sponsor/11/Nov/2019/522e3225-cc9f-4aca-b779-1885b95b2241-GoogleNest_Logo_Horizontal-280 copy-v2.png',
      'sponsorLink': 'https://nest.com/',
      'sponsorLogoDimensions': {'width': 280, 'height': 97}}],
    'paidContentType': 'Topic'},
   {'id': '20th-century-fox-the-favourite/20th-century-fox-the-favourite',
    'type': 'paid-content',
    'webTitle': '20th C

In [25]:
# Fetch one known article through its API URL, asking for the "body" blocks
# so that the article text is included in the response
# ('show-fields': 'all' was an earlier alternative).
params = {
    'show-blocks': 'body',
    'api-key': API_KEY,
}
aurl = 'https://content.guardianapis.com/business/2020/mar/15/markets-face-more-turmoil-as-fears-for-global-economy-grow-coronavirus'
# An explicit timeout is passed because requests otherwise waits
# indefinitely when the server does not answer.
response = requests.get(url=aurl, params=params, timeout=30)

In [273]:
# Decode the JSON body of the most recent request.
# NOTE: `response` is reassigned by several cells, so the content of `d`
# depends on which request cell was executed last.
d = response.json()

In [275]:
# Display the decoded payload to inspect its structure.
d

{'response': {'status': 'ok',
  'userTier': 'developer',
  'total': 1,
  'content': {'id': 'business/2020/mar/15/markets-face-more-turmoil-as-fears-for-global-economy-grow-coronavirus',
   'type': 'article',
   'sectionId': 'business',
   'sectionName': 'Business',
   'webPublicationDate': '2020-03-15T17:39:08Z',
   'webTitle': 'Markets face more turmoil as fears for global economy grow',
   'webUrl': 'https://www.theguardian.com/business/2020/mar/15/markets-face-more-turmoil-as-fears-for-global-economy-grow-coronavirus',
   'apiUrl': 'https://content.guardianapis.com/business/2020/mar/15/markets-face-more-turmoil-as-fears-for-global-economy-grow-coronavirus',
   'blocks': {'body': [{'id': '5e6e02788f085e564ad84680',
      'bodyHtml': '<p>Financial markets face another volatile week as the escalating coronavirus crisis tips the global economy into a downturn that some companies will struggle to survive.</p> <p>With France, Spain and Italy in lockdown, a sharp eurozone recession looks i

In [279]:
# Plain-text summary of the first body block of the article payload in `d`.
d['response']['content']['blocks']['body'][0]['bodyTextSummary']

'Financial markets face another volatile week as the escalating coronavirus crisis tips the global economy into a downturn that some companies will struggle to survive. With France, Spain and Italy in lockdown, a sharp eurozone recession looks inevitable – despite shock emergency action by the US central bank on Sunday night. And while falling share prices captured the headlines last week, analysts believe a corporate debt crisis is building as global growth goes into reverse. Fears of a cashflow crunch are also rising as self-isolating consumers shun shops and restaurants, and travel links are curbed.\n“We cannot underplay the challenge at hand here. A huge proportion of UK businesses face significant cashflow pressures and without cash firms can’t survive for long,” Karim Haji, the head of financial services at KPMG UK warned. “Banks’ margins are already squeezed, asset managers are especially vulnerable to the current market situation and insurers face the potential double hit of in

In [34]:
# Record holding what is kept for each article: headline, section name,
# publication timestamp, web URL, API URL and the extracted body text.
INFO = namedtuple(
    'INFO',
    ['title', 'section', 'pdate', 'weburl', 'apiurl', 'text'],
)

def fetch_news(url, params, start_page, timeout=30):
    """Collect the ``results`` of every page of a paginated API query.

    Pages are requested one after another, beginning at ``start_page``. The
    total number of pages is read from the ``pages`` field of each response,
    so the first request is always issued, whatever ``start_page`` is.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query parameters. The dict is copied; the caller's object is left
        untouched and any ``'page'`` entry in it is overridden.
    start_page : int
        First page to request.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    list
        The concatenated ``results`` entries of all pages fetched. If a
        request fails, the results gathered so far are returned.
    """
    # Work on a copy so the caller's dict does not end up holding the last
    # page number after the call.
    query = dict(params)
    page = start_page
    # Unknown until the first response arrives. Starting from a fixed value of
    # 1 would make the loop skip every request when start_page > 1.
    total_pages = None
    all_results = []
    while total_pages is None or page <= total_pages:
        shown_total = '?' if total_pages is None else total_pages
        print(f"calling page {page}/{shown_total}")
        query['page'] = page
        try:
            resp = requests.get(url=url, params=query, timeout=timeout)
            if not resp.ok:
                # Report the failure instead of stopping silently.
                print(f"api call failed with HTTP status {resp.status_code}, stopping")
                break
            data = resp.json()['response']
            total_pages = data['pages']
            all_results.extend(data['results'])
        except Exception as e:
            # Best effort: keep what has been collected so far.
            print('api call runs into exceptions...')
            print(e)
            break
        page += 1
    return all_results

def extract_article_info(results):
    """Turn search results into ``INFO`` records that include the body text.

    Parameters
    ----------
    results : iterable of dict
        Search result items. Each must provide the keys ``'webTitle'``,
        ``'sectionName'``, ``'webPublicationDate'``, ``'webUrl'`` and
        ``'apiUrl'``.

    Returns
    -------
    list of INFO
        One record per input item, in input order. The ``text`` field is
        whatever ``fetch_text`` returns for the item's API URL, so one extra
        request is made per article.
    """
    # The progress bar is cosmetic. Import it locally, because the notebook's
    # import cell does not provide `tqdm`, and fall back to plain iteration
    # when the package is not installed.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    infos = []
    for art in progress(results):
        apiurl = art['apiUrl']
        infos.append(INFO(
            title=art['webTitle'],
            section=art['sectionName'],
            pdate=art['webPublicationDate'],
            weburl=art['webUrl'],
            apiurl=apiurl,
            text=fetch_text(apiurl),
        ))
    return infos

def fetch_text(apiurl, timeout=30):
    """Download one article and return the text summary of its first body block.

    Parameters
    ----------
    apiurl : str
        API URL of the article.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    str
        The ``bodyTextSummary`` of the first entry in the article's ``body``
        blocks, or an empty string when the request fails or the response does
        not have the expected layout.
    """
    params = {
        'show-blocks': 'body',
        'api-key': API_KEY,
    }
    try:
        response = requests.get(apiurl, params=params, timeout=timeout)
        data = response.json()['response']
        # Only the first body block is read; any further blocks are ignored.
        return data['content']['blocks']['body'][0]['bodyTextSummary']
    except Exception as e:
        # Best effort: report the problem and return an empty text, so that
        # one bad article does not abort a long extraction run.
        print("error when fetching body text")
        print(e)
        return ''

In [27]:
# Collect the result items of the search query page by page, starting at page 1.
articles = fetch_news(url=search_url, params=search_params, start_page=1)

calling page 1/1
calling page 2/444
calling page 3/444
calling page 4/444
calling page 5/444
calling page 6/444
calling page 7/444
calling page 8/444
calling page 9/444
calling page 10/444
calling page 11/444
calling page 12/444
calling page 13/444
calling page 14/444
calling page 15/444
calling page 16/444
calling page 17/444
calling page 18/444
calling page 19/444
calling page 20/444
calling page 21/444
calling page 22/444
calling page 23/444
calling page 24/444
calling page 25/444
calling page 26/444
calling page 27/444
calling page 28/444
calling page 29/444
calling page 30/444
calling page 31/444
calling page 32/444
calling page 33/444
calling page 34/444
calling page 35/444
calling page 36/444
calling page 37/444
calling page 38/444
calling page 39/444
calling page 40/444
calling page 41/444
calling page 42/444
calling page 43/444
calling page 44/444
calling page 45/444
calling page 46/444
calling page 47/444
calling page 48/444
calling page 49/444
calling page 50/444
calling pag

calling page 398/444
calling page 399/444
calling page 400/444
calling page 401/444
calling page 402/444
calling page 403/444
calling page 404/444
calling page 405/444
calling page 406/444
calling page 407/444
calling page 408/444
calling page 409/444
calling page 410/444
calling page 411/444
calling page 412/444
calling page 413/444
calling page 414/444
calling page 415/444
calling page 416/444
calling page 417/444
calling page 418/444
calling page 419/444
calling page 420/444
calling page 421/444
calling page 422/444
calling page 423/444
calling page 424/444
calling page 425/444
calling page 426/444
calling page 427/444
calling page 428/444
calling page 429/444
calling page 430/444
calling page 431/444
calling page 432/444
calling page 433/444
calling page 434/444
calling page 435/444
calling page 436/444
calling page 437/444
calling page 438/444
calling page 439/444
calling page 440/444
calling page 441/444
calling page 442/444
calling page 443/444
calling page 444/444


In [28]:
# Number of search result items collected.
len(articles)

4435

In [33]:
# Build one INFO record per search result. This issues one extra API request
# per article (through fetch_text), so the cell takes a while to run.
infos = extract_article_info(articles)

In [35]:
# Sanity check: there should be one record per collected search result.
len(infos)

4435

In [36]:
# Persist the collected records to news.json. Each INFO namedtuple is written
# by the json module as a plain array, so the field names are not stored.
with open('news.json', 'w') as news_file:
    json.dump(infos, news_file)

In [37]:
# Publication timestamps look like '2020-03-10T22:01:22Z'; keep only the
# calendar date in front of the 'T'.
dates = [info.pdate.partition('T')[0] for info in infos]

In [38]:
# One date string per record.
len(dates)

4435

In [39]:
# Number of articles per publication date, most frequent date first.
Counter(dates).most_common()

[('2020-03-19', 163),
 ('2020-03-25', 144),
 ('2020-03-20', 143),
 ('2020-03-17', 143),
 ('2020-03-24', 140),
 ('2020-03-18', 134),
 ('2020-03-27', 134),
 ('2020-03-23', 131),
 ('2020-04-01', 131),
 ('2020-03-26', 130),
 ('2020-04-03', 124),
 ('2020-03-30', 124),
 ('2020-04-02', 119),
 ('2020-03-31', 116),
 ('2020-04-06', 110),
 ('2020-03-11', 108),
 ('2020-03-16', 104),
 ('2020-04-05', 102),
 ('2020-03-29', 101),
 ('2020-03-13', 99),
 ('2020-03-12', 97),
 ('2020-03-22', 95),
 ('2020-04-07', 94),
 ('2020-03-15', 89),
 ('2020-04-04', 87),
 ('2020-03-21', 77),
 ('2020-03-28', 72),
 ('2020-03-09', 66),
 ('2020-03-10', 65),
 ('2020-03-03', 64),
 ('2020-03-14', 53),
 ('2020-03-06', 52),
 ('2020-03-04', 51),
 ('2020-02-26', 44),
 ('2020-03-02', 42),
 ('2020-03-05', 41),
 ('2020-03-08', 40),
 ('2020-02-27', 39),
 ('2020-02-25', 39),
 ('2020-03-01', 39),
 ('2020-02-24', 32),
 ('2020-02-28', 32),
 ('2020-02-29', 29),
 ('2020-02-05', 29),
 ('2020-02-07', 28),
 ('2020-03-07', 27),
 ('2020-02-13',

In [40]:
# Headline of every collected record, in the same order as `infos`.
titles = [info.title for info in infos]

In [41]:
# Display all headlines (this prints the whole list).
titles

['Wellcome Trust urges firms to donate £6bn for Covid-19 research',
 'The coronavirus pandemic threatens a crisis for human rights too | Afua Hirsch',
 'Council workers face increasing abuse amid coronavirus pandemic',
 'Cancer Research UK to cut funding for research by £44m',
 'Australian prime minister to ask global leaders to fund urgent research for coronavirus vaccine',
 "Australia's private health funds could reap windfall from coronavirus – report",
 'Businesses affected by coronavirus should have access to disaster relief funds, says Queensland',
 'Postgraduate students urge funders to extend grants over coronavirus',
 'The coronavirus pandemic: visualising the global crisis',
 'Carnival cruises seeks $6bn funding amid coronavirus fallout',
 'Climate monitoring and research could fall victim to coronavirus, scientists fear',
 'Some UK property funds ban withdrawals over coronavirus',
 'Morning mail: British PM in hospital, Ruby Princess investigation, health fund windfall',
 'E