<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-the-json-and-pprint-libraries" data-toc-modified-id="Import-the-json-and-pprint-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import the json and pprint libraries</a></span></li><li><span><a href="#Load-the-JSON-data-and-look-for-potential-issues" data-toc-modified-id="Load-the-JSON-data-and-look-for-potential-issues-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the JSON data and look for potential issues</a></span></li><li><span><a href="#Check-for-differences-in-the-structure-of-the-dictionaries" data-toc-modified-id="Check-for-differences-in-the-structure-of-the-dictionaries-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Check for differences in the structure of the dictionaries</a></span></li><li><span><a href="#Generate-counts-from-the-JSON-data" data-toc-modified-id="Generate-counts-from-the-JSON-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Generate counts from the JSON data</a></span></li><li><span><a href="#Get-the-source-data-and-confirm-that-it-has-the-anticipated-length" data-toc-modified-id="Get-the-source-data-and-confirm-that-it-has-the-anticipated-length-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Get the source data and confirm that it has the anticipated length</a></span></li><li><span><a href="#Fix-any-errors-in-the-values-in-the-dictionary" data-toc-modified-id="Fix-any-errors-in-the-values-in-the-dictionary-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Fix any errors in the values in the dictionary</a></span></li><li><span><a href="#Create-a-pandas-DataFrame" data-toc-modified-id="Create-a-pandas-DataFrame-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Create a pandas DataFrame</a></span></li><li><span><a href="#Confirm-that-we-are-getting-the-expected-values-for-source" data-toc-modified-id="Confirm-that-we-are-getting-the-expected-values-for-source-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Confirm that we are getting the 
expected values for source</a></span></li></ul></div>

# Import the json and pprint libraries

In [1]:
import pandas as pd
import numpy as np
import json
import pprint
from collections import Counter

In [4]:
# Importing watermark makes its own version appear in the -iv report below.
import watermark

# Record the environment this notebook was run in: the recorded output shows
# the Python/IPython versions (-v) and the versions of the imported modules (-iv).
# NOTE(review): no date line appears in the recorded output although -n is
# passed — confirm the flag is still wanted.
%load_ext watermark
%watermark -n -v -iv

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.20.0

numpy    : 1.19.2
json     : 2.0.9
pandas   : 1.2.1
watermark: 2.1.0



# Load the JSON data and look for potential issues

In [5]:
# Read the sample file as UTF-8 explicitly. Without `encoding`, open() falls
# back to the platform's locale encoding, so the same file could decode
# differently (or fail) on another machine; the titles contain non-ASCII
# punctuation such as curly quotes.
with open('data/allcandidatenewssample.json', encoding='utf-8') as f:
    candidatenews = json.load(f)

In [6]:
# Number of top-level records in the loaded list.
len(candidatenews)

60000

In [7]:
# Preview the first two records to see which keys each story carries.
pprint.pprint(candidatenews[:2])

[{'date': '2019-12-25 10:00:00',
  'domain': 'www.nbcnews.com',
  'panel_position': 1,
  'query': 'Michael Bloomberg',
  'source': 'NBC News',
  'story_position': 6,
  'time': '18 hours ago',
  'title': 'Bloomberg cuts ties with company using prison inmates to make '
           'campaign calls',
  'url': 'https://www.nbcnews.com/politics/2020-election/bloomberg-cuts-ties-company-using-prison-inmates-make-campaign-calls-n1106971'},
 {'date': '2019-11-09 08:00:00',
  'domain': 'www.townandcountrymag.com',
  'panel_position': 1,
  'query': 'Amy Klobuchar',
  'source': 'Town & Country Magazine',
  'story_position': 3,
  'time': '18 hours ago',
  'title': "Democratic Candidates React to Michael Bloomberg's Potential Run",
  'url': 'https://www.townandcountrymag.com/society/politics/a29739854/michael-bloomberg-democratic-candidates-campaign-reactions/'}]


In [8]:
# Pull a single field ('source') out of the first record.
pprint.pprint(candidatenews[0]['source'])

'NBC News'


# Check for differences in the structure of the dictionaries

In [9]:
# Tally how many keys each record has; more than one distinct count means
# the records do not all share the same structure.
Counter(len(record) for record in candidatenews)

Counter({9: 57202, 2: 2382, 10: 416})

In [10]:
# Show the first record that has fewer than nine keys.
pprint.pprint(
    next(record for record in candidatenews if len(record) < 9)
)

{'date': '2019-09-11 18:00:00', 'reason': 'Not collected'}


In [11]:
# Why next() is needed: without it the generator expression is never advanced,
# so pprint displays the generator object itself rather than a matching record.
pprint.pprint((item for item in candidatenews if len(item) < 9))

<generator object <genexpr> at 0x0000024D4AC52948>


In [13]:
# Show the first record that has more than nine keys.
pprint.pprint(
    next(record for record in candidatenews if len(record) > 9)
)

{'category': 'Satire',
 'date': '2019-08-21 04:00:00',
 'domain': 'politics.theonion.com',
 'panel_position': 1,
 'query': 'John Hickenlooper',
 'source': 'Politics | The Onion',
 'story_position': 8,
 'time': '4 days ago',
 'title': '‘And Then There Were 23,’ Says Wayne Messam Crossing Out '
          'Hickenlooper Photo \n'
          'In Elaborate Grid Of Rivals',
 'url': 'https://politics.theonion.com/and-then-there-were-23-says-wayne-messam-crossing-ou-1837311060'}


In [14]:
# Show the first ten of the records that have exactly two keys.
pprint.pprint(
    [record for record in candidatenews if len(record) == 2][:10]
)

[{'date': '2019-09-11 18:00:00', 'reason': 'Not collected'},
 {'date': '2019-07-24 00:00:00', 'reason': 'No Top stories'},
 {'date': '2019-08-19 20:00:00', 'reason': 'Not collected'},
 {'date': '2019-09-13 16:00:00', 'reason': 'Not collected'},
 {'date': '2019-10-16 20:00:00', 'reason': 'No Top stories'},
 {'date': '2019-10-17 18:00:00', 'reason': 'Not collected'},
 {'date': '2019-08-02 14:00:00', 'reason': 'Not collected'},
 {'date': '2019-05-27 12:00:00', 'reason': 'Not collected'},
 {'date': '2019-12-03 12:00:00', 'reason': 'No Top stories'},
 {'date': '2019-01-03 00:00:00', 'reason': 'No Top stories'}]


In [15]:
# Keep only records with more than two keys. In the sample printed above, the
# two-key records hold just a date and a reason rather than a story.
candidatenews = [record for record in candidatenews if len(record) > 2]

In [16]:
# Number of records remaining after the two-key records were dropped.
len(candidatenews)

57618

# Generate counts from the JSON data

In [17]:
# Collect every story whose source is exactly "Politico".
politico = [story for story in candidatenews if story['source'] == "Politico"]

In [18]:
# Number of stories attributed to Politico.
len(politico)

2732

In [19]:
# Preview the first two Politico stories.
pprint.pprint(politico[:2])

[{'date': '2019-05-18 18:00:00',
  'domain': 'www.politico.com',
  'panel_position': 1,
  'query': 'Marianne Williamson',
  'source': 'Politico',
  'story_position': 7,
  'time': '1 week ago',
  'title': 'Marianne Williamson reaches donor threshold for Dem debates',
  'url': 'https://www.politico.com/story/2019/05/09/marianne-williamson-2020-election-1315133'},
 {'date': '2018-12-27 06:00:00',
  'domain': 'www.politico.com',
  'panel_position': 1,
  'query': 'Julian Castro',
  'source': 'Politico',
  'story_position': 1,
  'time': '1 hour ago',
  'title': "O'Rourke and Castro on collision course in Texas",
  'url': 'https://www.politico.com/story/2018/12/27/orourke-julian-castro-collision-texas-election-1073720'}]


# Get the source data and confirm that it has the anticipated length

In [20]:
# One source name per record; .get() yields None where the key is absent.
sources = [record.get('source') for record in candidatenews]

In [21]:
# Confirm that the comprehension produced a plain list.
type(sources)

list

In [22]:
# One entry is created per record, so this should equal len(candidatenews).
len(sources)

57618

In [23]:
# Peek at the first five source names.
sources[:5]

['NBC News', 'Town & Country Magazine', 'TheHill', 'CNBC.com', 'Fox News']

In [24]:
# Ten most frequent source names with their counts. The recorded output lists
# both 'TheHill' and 'The Hill' as separate entries.
pprint.pprint(Counter(sources).most_common(10))

[('Fox News', 3530),
 ('CNN.com', 2750),
 ('Politico', 2732),
 ('TheHill', 2383),
 ('The New York Times', 1804),
 ('Washington Post', 1770),
 ('Washington Examiner', 1655),
 ('The Hill', 1342),
 ('New York Post', 1275),
 ('Vox', 941)]


# Fix any errors in the values in the dictionary

In [26]:
# Normalize the 'TheHill' spelling to 'The Hill' so both variants are counted
# together. The key is looked up directly instead of scanning every item and
# calling update() on the dict while iterating over its own items view.
# Records without a 'source' key are left untouched (.get() returns None).
for newsdict in candidatenews:
    if newsdict.get('source') == 'TheHill':
        newsdict['source'] = 'The Hill'

In [27]:
# Use item.get('source') rather than item['source']: get() returns None
# instead of raising KeyError when the key is missing, and an optional second
# argument supplies a different fallback value.

sources = [item.get('source') for item in candidatenews]

In [28]:
# Ten most frequent source names, recomputed from the rebuilt `sources` list.
pprint.pprint(Counter(sources).most_common(10))

[('The Hill', 3725),
 ('Fox News', 3530),
 ('CNN.com', 2750),
 ('Politico', 2732),
 ('The New York Times', 1804),
 ('Washington Post', 1770),
 ('Washington Examiner', 1655),
 ('New York Post', 1275),
 ('Vox', 941),
 ('Breitbart', 799)]


# Create a pandas DataFrame

In [29]:
# Build a DataFrame from the list of record dicts; the dict keys become columns.
candidatenewsdf = pd.DataFrame(candidatenews)

In [30]:
# Inspect the inferred column types.
# NOTE(review): in the recorded output panel_position is `object` while
# story_position is int64 — presumably mixed or missing values; verify.
candidatenewsdf.dtypes

title             object
url               object
source            object
time              object
date              object
query             object
story_position     int64
panel_position    object
domain            object
category          object
dtype: object

# Confirm that we are getting the expected values for source

In [31]:
# Rename 'date' to 'storydate'. Reassigning the result instead of using
# inplace=True avoids mutating the frame in place across cells.
candidatenewsdf = candidatenewsdf.rename(columns={'date': 'storydate'})

In [33]:
# Convert the storydate column (dtype `object` in the dtypes listing above)
# to datetime64[ns].
candidatenewsdf['storydate'] = candidatenewsdf['storydate'].astype(
    'datetime64[ns]')

In [34]:
# (rows, columns) of the frame.
candidatenewsdf.shape

(57618, 10)

In [35]:
# Ten most frequent sources, this time computed from the DataFrame column;
# value_counts() sorts by descending count by default.
candidatenewsdf['source'].value_counts().head(10)

The Hill               3725
Fox News               3530
CNN.com                2750
Politico               2732
The New York Times     1804
Washington Post        1770
Washington Examiner    1655
New York Post          1275
Vox                     941
Breitbart               799
Name: source, dtype: int64