### Converting data from NPR's API into a Python pandas DataFrame

In [1]:
from json import loads
import pandas as pd

In [2]:
## Load sample json data from NPR's API
## A context manager closes the file handle as soon as its contents have been read
with open("npr_api_sample.json") as json_file:
    json_obj = loads(json_file.read())

In [3]:
## Get to know the data: print the id, title and story date of every story
## print() is called with a single parenthesised argument, which behaves the same
## on Python 2 and Python 3 (the bare print statement is a SyntaxError on Python 3)
for story in json_obj['list']['story']:
    print('ID: ' + story['id'])
    print('TITLE: ' + story['title']['$text'])
    # The trailing '\n' leaves a blank line between stories
    print('DATE: ' + story['storyDate']['$text'] + '\n')


ID: 482689024
TITLE: Colorado Mother Fights Off Mountain Lion To Save Her Son
DATE: Sun, 19 Jun 2016 14:40:53 -0400

ID: 482678701
TITLE: Former Vanderbilt University Football Player Found Guilty Of Rape — Again
DATE: Sun, 19 Jun 2016 12:18:00 -0400

ID: 482669249
TITLE: PHOTOS: Thousands Protest Against U.S. Military Presence In Okinawa, Japan
DATE: Sun, 19 Jun 2016 10:16:00 -0400

ID: 482509752
TITLE: Food To Celebrate Freedom: Tea Cakes For Juneteenth!
DATE: Sun, 19 Jun 2016 09:45:28 -0400

ID: 482612712
TITLE: A Pair O' Definitions For A Pile Of 'Para' Puns
DATE: Sun, 19 Jun 2016 07:57:30 -0400

ID: 482514949
TITLE: Welcome To Mongolia's New Postal System: An Atlas Of Random Words
DATE: Sun, 19 Jun 2016 07:57:30 -0400

ID: 482508765
TITLE: Why My Mom Left Me Out Of Her Book
DATE: Sun, 19 Jun 2016 07:57:30 -0400

ID: 482351977
TITLE: Neko Case, k.d. lang And Laura Veirs On The Art Of Working Together
DATE: Sun, 19 Jun 2016 07:57:30 -0400

ID: 482658774
TITLE: In Songs, Stories, Lati

In [4]:
## For our visualization, we're interested in the story category tags, which are nested
## in each story's 'parent' list in the json

for story in json_obj['list']['story']:
    for parent in story['parent']:
        # Format both values into one string so the same "id category" line is
        # printed on Python 2 and Python 3
        print('%s %s' % (story['id'], parent['title']['$text']))

482689024 News
482689024 mountain lion
482689024 Must Reads
482689024 Colorado
482689024 The Two-Way
482689024 Animals
482689024 Around the Nation
482689024 U.S.
482689024 Home Page Top Stories
482689024 News
482678701 News
482678701 News
482678701 Latest From NPR News
482678701 rape
482678701 America
482678701 The Two-Way
482678701 Around the Nation
482678701 Law
482678701 U.S.
482678701 Home Page Top Stories
482669249 World
482669249 marine corps
482669249 Latest From NPR News
482669249 Okinawa
482669249 U.S.
482669249 Must Reads
482669249 Japan
482669249 The Two-Way
482669249 Asia
482669249 The Impact of War
482669249 World
482669249 U.S.
482669249 Home Page Top Stories
482669249 News
482509752 Food
482509752 tea cakes
482509752 Etha Robinson
482509752 NPR Stories For Apple News
482509752 Code Switch
482509752 Food
482509752 Emancipation Proclamation
482509752 Juneteenth
482509752 History
482509752 Around the Nation
482509752 Food
482509752 Race
482509752 U.S.
482509752 Home Page To

In [5]:
## Build the data frame by creating a list of dictionaries, then converting the list of dictionaries into a data frame

## First data frame: one row per story/category pair (a story appears once for each of its category tags)

In [6]:
## Start with an empty list that will collect one dictionary per dataframe row
dicts_list = list()

In [7]:
## Fill the list with dictionaries -- each dictionary will be a row in our dataframe
##
## The list is rebuilt from scratch in a single expression, so re-running this cell
## never appends a second copy of every row.

dicts_list = [
    {
        'id': story['id'],
        'title': story['title']['$text'],
        'category': parent['title']['$text'],
    }
    # One row for every (story, parent tag) combination
    for story in json_obj['list']['story']
    for parent in story['parent']
]

In [8]:
## Turn the collected row dictionaries into a pandas dataframe with a fixed column order

df_columns = ('id', 'title', 'category')
df = pd.DataFrame(data=dicts_list, columns=df_columns)

In [9]:
## Preview the first five rows
df.head(n=5)

Unnamed: 0,id,title,category
0,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,News
1,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,mountain lion
2,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,Must Reads
3,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,Colorado
4,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,The Two-Way


In [10]:
## Second data frame: one row per story, with one column containing a list of all the story's categories


In [11]:
## Start with an empty list that will collect one dictionary per story
dicts_reshape = list()

In [12]:
## Fill the list with dictionaries -- each dictionary will be a row in our dataframe
##
## The list is rebuilt from scratch here, so re-running this cell never duplicates rows.

dicts_reshape = []

for story in json_obj['list']['story']:
    dicts_reshape.append({
        'id': story['id'],
        'title': story['title']['$text'],
        # Built once per story (not once per parent), so a story with an empty
        # parent list still gets an empty list instead of a missing key.
        'category': [parent['title']['$text'] for parent in story['parent']],
        # Empty placeholder kept from the original rows; no cell visible here reads it
        # -- TODO confirm whether it is still needed.
        'top_category': [],
    })

In [13]:
## Turn the per-story dictionaries into a pandas dataframe, keeping only the listed columns

df_reshape_columns = ('id', 'title', 'category')
df_reshape = pd.DataFrame(data=dicts_reshape, columns=df_reshape_columns)

In [14]:
## Preview the first five rows (five is also the default for head())
df_reshape.head(n=5)

Unnamed: 0,id,title,category
0,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,"[News, mountain lion, Must Reads, Colorado, Th..."
1,482678701,Former Vanderbilt University Football Player F...,"[News, News, Latest From NPR News, rape, Ameri..."
2,482669249,PHOTOS: Thousands Protest Against U.S. Militar...,"[World, marine corps, Latest From NPR News, Ok..."
3,482509752,Food To Celebrate Freedom: Tea Cakes For Junet...,"[Food, tea cakes, Etha Robinson, NPR Stories F..."
4,482612712,A Pair O' Definitions For A Pile Of 'Para' Puns,"[Games & Humor, Weekend Edition Sunday for Jun..."


In [18]:
## Write the one-row-per-story dataframe to disk as json
output_path = 'npr_dataframe_sample.json'
df_reshape.to_json(output_path)