In [6]:
from json import loads
import pandas as pd
import itertools

In [7]:
## Load sample json data from NPR's API
json_obj = loads(open("npr_api_sample.json").read())

In [8]:
## Build the data frame by creating a list of dictionaries, then converting the list of dictionaries into a data frame

##First data frame: one row per distinct story category

dicts_list = []

In [9]:
for story in json_obj['list']['story']:
    for parent in story['parent']:
        d = {
            'id': story['id'],
            'title': story['title']['$text'],
            'category': parent['title']['$text'] 
        }
        dicts_list.append(d)

In [10]:
df = pd.DataFrame(dicts_list, columns=('id', 'title', 'category'))

In [11]:
df.head(5)

Unnamed: 0,id,title,category
0,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,News
1,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,mountain lion
2,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,Must Reads
3,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,Colorado
4,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,The Two-Way


In [14]:
##Second data frame: one row per story, with one column containing a list of all the story's categories

dicts_reshape = []

In [16]:
for story in json_obj['list']['story']:
    categories_list = []
    d = {
        'id': story['id'],
        'title': story['title']['$text']
        }
    for parent in story['parent']:
        category = parent['title']['$text']
        categories_list.append(category)   
        d['category'] = categories_list
        d['top_category'] = []
    dicts_reshape.append(d)

In [17]:
df_reshape = pd.DataFrame(dicts_reshape, columns=('id', 'title', 'category', 'top_category'))

In [18]:
df_reshape.head()

Unnamed: 0,id,title,category,top_category
0,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,"[News, mountain lion, Must Reads, Colorado, Th...",[]
1,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,"[News, mountain lion, Must Reads, Colorado, Th...",[]
2,482678701,Former Vanderbilt University Football Player F...,"[News, News, Latest From NPR News, rape, Ameri...",[]
3,482669249,PHOTOS: Thousands Protest Against U.S. Militar...,"[World, marine corps, Latest From NPR News, Ok...",[]
4,482509752,Food To Celebrate Freedom: Tea Cakes For Junet...,"[Food, tea cakes, Etha Robinson, NPR Stories F...",[]


In [19]:
df_reshape['category'][0]

['News',
 'mountain lion',
 'Must Reads',
 'Colorado',
 'The Two-Way',
 'Animals',
 'Around the Nation',
 'U.S.',
 'Home Page Top Stories',
 'News']

In [20]:
##determine most popular categories
all_categories = df['category'].tolist()

In [21]:
stop_categories = ['#MemeOfTheWeek','All Things Considered','All Things Considered for June 10, 2016',
                   'All Things Considered for June 11, 2016','Classical Notes Newsletter','Code Switch',
                   'Goats and Soda','Home Page Top Stories','Latest From NPR News','MPX Break','MPX Invest',
                   'MPX Music interviews','MPX Open Door','MPX Parking Lot','MPX election stories ',
                   'Must Reads','Music Makers','NPR Selects','NPR Stories For Apple News',
                   "NPR.org's Most Popular Stories",'Policy-ish','Shots - Health News','Story of the Day',
                   'The Two-Way','Two-Way Featured Post Two',"Wait Wait...Don't Tell Me!",
                   "Wait Wait...Don't Tell Me! For June 11, 2016",'Weekend Edition Saturday',
                   'Weekend Edition Saturday for June 11, 2016','Weekend Edition Sunday',
                   'Weekend Edition Sunday for June 12, 2016','World Story of the Day','Your Health',
                   'nprreads','sunday puzzle','A No. 1']

In [22]:
all_categories_filtered = [x for x in all_categories if x not in stop_categories]

In [23]:
len(all_categories)

609

In [24]:
len(all_categories_filtered)

468

In [25]:
category_counter = {}  

In [26]:
for category in all_categories_filtered:
    if category in category_counter:
        category_counter[category] += 1
    else:
        category_counter[category] = 1

In [27]:
popular_categories = sorted(category_counter, key = category_counter.get, reverse = True)

In [28]:
top_5 = popular_categories[:5]

In [29]:
top_5

[u'News', u'U.S.', u'World', u'Around the Nation', u'Health']

In [30]:
top_30 = popular_categories[:30]

In [31]:
top_30

[u'News',
 u'U.S.',
 u'World',
 u'Around the Nation',
 u'Health',
 u'Politics',
 u'Arts & Life',
 u'Law',
 u'Global Health',
 u'Opinion',
 u'Commentary',
 u'Food',
 u'Weekend Edition Sunday for June 19, 2016',
 u'Race',
 u'Elections',
 u'Music',
 u'Books',
 u'Games & Humor',
 u'Book Reviews',
 u'Sports',
 u'International',
 u'Donald Trump',
 u'Asia',
 u'Weekend Edition Saturday for June 18, 2016',
 u'History',
 u"Children's Health",
 u'Music Interviews',
 u'Republicans',
 u'Latin America',
 u'Education']

In [32]:
##filter category lists to retain only top categories

In [33]:
for index, row in df_reshape.iterrows():
    compressed_category = [x for x in row['category'] if x in top_30]
    df_reshape['top_category'][index] = compressed_category

In [34]:
df_reshape.head()

Unnamed: 0,id,title,category,top_category
0,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,"[News, mountain lion, Must Reads, Colorado, Th...","[News, Around the Nation, U.S., News]"
1,482689024,Colorado Mother Fights Off Mountain Lion To Sa...,"[News, mountain lion, Must Reads, Colorado, Th...","[News, Around the Nation, U.S., News]"
2,482678701,Former Vanderbilt University Football Player F...,"[News, News, Latest From NPR News, rape, Ameri...","[News, News, Around the Nation, Law, U.S.]"
3,482669249,PHOTOS: Thousands Protest Against U.S. Militar...,"[World, marine corps, Latest From NPR News, Ok...","[World, U.S., Asia, World, U.S., News]"
4,482509752,Food To Celebrate Freedom: Tea Cakes For Junet...,"[Food, tea cakes, Etha Robinson, NPR Stories F...","[Food, Food, History, Around the Nation, Food,..."


In [35]:
##d3 export:  create a list of pairs of relationships to convert to json

pairs_d3 = []

In [36]:
for category_list in df_reshape['top_category']:
    for pair in itertools.combinations(category_list, 2):
        if pair not in pairs_d3 and pair[::-1] not in pairs_d3 and pair[0] <> pair[1]:
            pairs_d3.append(pair)

In [37]:
df_d3 = pd.DataFrame(pairs_d3, columns=('source', 'target'))

In [38]:
df_d3

Unnamed: 0,source,target
0,News,Around the Nation
1,News,U.S.
2,Around the Nation,U.S.
3,News,Law
4,Around the Nation,Law
5,Law,U.S.
6,World,U.S.
7,World,Asia
8,World,News
9,U.S.,Asia


In [39]:
df_d3.to_csv('links.csv')

In [40]:
nodes = pd.DataFrame()

In [41]:
nodes['node'] = top_30

In [42]:
nodes

Unnamed: 0,node
0,News
1,U.S.
2,World
3,Around the Nation
4,Health
5,Politics
6,Arts & Life
7,Law
8,Global Health
9,Opinion


In [43]:
nodes.to_csv('nodes.csv')