# 0. Using the Guardian API to gather all articles on Climate Change
The code-block below creates a folder in which for each day all articles with the tag 'environment/climate-change' are gathered in a separate json-file. An API-key has been obtained by filling out a developer application for the Guardian API.

In [None]:
import json
import requests
import pandas as pd
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta

# Make directory
ARTICLES_DIR = join('guardian', 'articles')
makedirs(ARTICLES_DIR, exist_ok=True)

API_ENDPOINT = 'http://content.guardianapis.com/search'
my_params = {
    'tag' : "environment/climate-change",
    'from-date': "",
    'to-date': "",
    'order-by': "newest",
    'show-fields': 'all',
    'page-size': 200,
    'api-key': '7ca76969-9b4f-4186-a7f4-0965214465b7'
}

# Select dates for which I want articles
start_date = date(2000, 01, 01)
end_date = date(2019,12, 05)
dayrange = range((end_date - start_date).days + 1)

# Gather json files for each day
for daycount in dayrange:
    dt = start_date + timedelta(days=daycount)
    datestr = dt.strftime('%Y-%m-%d')
    filename = join(ARTICLES_DIR, datestr + '.json')
    if not exists(filename):
        all_results = []
        my_params['from-date'] = datestr
        my_params['to-date'] = datestr
        current_page = 1
        total_pages = 1
        while current_page <= total_pages:
            my_params['page'] = current_page
            resp = requests.get(API_ENDPOINT, my_params)
            data = resp.json()
            all_results.extend(data['response']['results'])
            current_page += 1
            total_pages = data['response']['pages']

        with open(filename, 'w') as f:
            #print("Writing to", filename)

            # re-serialize it for pretty indentation
            f.write(json.dumps(all_results, indent=2))

The block below turns all different json files into one json file. 

In [1]:
import json
import requests
import pandas as pd
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta

dates = pd.date_range(start = '2000-01-01', end = '2019-12-04')

allArticles = pd.DataFrame()
for date in dates:
    #print(date)
    datestring = str(date).split(' ')[0]
    articleInDay = pd.read_json('../project-2020-superlaut/guardian/articles/' + datestring + '.json')
    allArticles = pd.concat([allArticles, articleInDay], ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




As all articles are now gathered in one pandas DataFrame, now the relevant columns are selected. Selecting just three column and turning it into one file instead of multiple makes the file 100 MB in size, instead of 300 MB.

In [4]:
import pandas as pd

guardian = allArticles
guardian[['standfirst', 'bodyText', 'wordcount']] = pd.io.json.json_normalize(guardian['fields'])[['standfirst', 'bodyText', 'wordcount']]
guardian = guardian[['id', 'sectionName','webPublicationDate','webTitle', 'standfirst', 'bodyText', 'wordcount']]
guardian['wordcount'] = guardian['wordcount'].astype(dtype = 'int64')
guardian = guardian[['bodyText','webPublicationDate','wordcount']]
# orient 'records' because PySpark cannot read any other format, this took me some time to figure out...
guardian.to_json('../project-2020-superlaut/climateChangeArticlesGuardian0019.json', orient = 'records')