In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

## JSON

In [2]:
# Load the elections table from a local CSV file into a DataFrame.
# The recorded output below shows an 'Unnamed: 0' column next to the
# Candidate / Party / % / Year / Result columns.
elections = pd.read_csv("elections.csv")
# Bare last expression: display the DataFrame as the cell's output.
elections

Unnamed: 0,Candidate,Party,%,Year,Result
0,Reagan,Republican,50.7,1980,win
1,Carter,Democratic,41.0,1980,loss
2,Anderson,Independent,6.6,1980,loss
3,Reagan,Republican,58.8,1984,win
4,Mondale,Democratic,37.6,1984,loss
5,Bush,Republican,53.4,1988,win
6,Dukakis,Democratic,45.6,1988,loss
7,Clinton,Democratic,43.0,1992,win
8,Bush,Republican,37.4,1992,loss
9,Perot,Independent,18.9,1992,loss


## Hierarchical Data

A lot of structured data isn't in CSV format, but in HTML, XML, JSON, YAML, etc. Formats such as JSON and YAML can nest records inside one another, so they might have a hierarchical structure that Pandas can't read directly into a table.

Here's an example: a group of people collected information about US congressional legislators in YAML format.

https://github.com/unitedstates/congress-legislators

Here's one of the data files:

https://github.com/unitedstates/congress-legislators/blob/master/legislators-current.yaml

In [3]:
import requests
from pathlib import Path

# Root of the raw files in the congress-legislators GitHub repository;
# a file name is appended to it to form a download URL.
base_url = 'https://github.com/unitedstates/congress-legislators/raw/master/'

# File name of the current-legislators data, used both as the remote file
# name and as the local path it is saved to.
legislators_path = 'legislators-current.yaml'

def download(url, path):
    """Download the contents of a URL to a local file, unless it already exists.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET request.
    path : str or pathlib.Path
        Local destination. If a file is already there, nothing is fetched
        and the existing file is left untouched.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Raised *before* anything
        is written, so an error page is never saved (and then silently reused
        by the exists-check on the next run).
    """
    path = Path(path)  # If path was a string, now it's a Path
    if path.exists():
        return  # Cached copy present; skip the network entirely.

    print('Downloading...', end=' ')
    # Timeout so a stalled connection fails instead of hanging the notebook.
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()
    with path.open('wb') as f:
        f.write(resp.content)
    print('Done!')
        
# Fetch the legislators file into the working directory; download() skips
# the request when a file already exists at that path.
download(base_url + legislators_path, legislators_path)

Downloading... Done!


- id:
    bioguide: B000944
    thomas: '00136'
    lis: S307
    govtrack: 400050
    opensecrets: N00003535
    votesmart: 27018
    fec:
    - H2OH13033
    - S6OH00163


In [None]:
import yaml

# Parse the downloaded YAML file into plain Python objects.
# safe_load only builds basic types (dicts, lists, strings, numbers, ...),
# which is all a data file needs and avoids executing arbitrary YAML tags.
with open(legislators_path, encoding='utf-8') as f:
    legislators = yaml.safe_load(f)

# Number of top-level records that were loaded.
len(legislators)

In [None]:
# Flatten the nested legislator records into one row per legislator.
# Each record is indexed as record['id']['bioguide'], record['name'][...]
# and record['bio']['birthday']; a record missing any of these raises KeyError.
leg_df = pd.DataFrame(
    columns=['id', 'first', 'last', 'birthday'],
    data=[[leg['id']['bioguide'],
           leg['name']['first'],
           leg['name']['last'],
           leg['bio']['birthday']] for leg in legislators]
).assign(
    # No `to_date` helper is defined in this notebook, so convert the whole
    # birthday column with pandas instead of calling it per row.
    birthday=lambda frame: pd.to_datetime(frame['birthday'])
)
leg_df.head()

## Missing Values

## Joins

In [None]:
# File names of the committee list and the committee-membership data; each is
# used as both the remote file name and the local path.
comm_path = 'committees-current.yaml'
comm_membership_path = 'committee-membership-current.yaml'
for p in [comm_path, comm_membership_path]:
    download(base_url + p, p)

# yaml.load() without an explicit Loader is rejected by current PyYAML and is
# unsafe on older versions; safe_load builds only plain Python data.
# The `with` blocks make sure both file handles are closed after parsing.
with open(comm_path, encoding='utf-8') as f:
    comms = yaml.safe_load(f)
with open(comm_membership_path, encoding='utf-8') as f:
    comm_membership = yaml.safe_load(f)

# Sizes of the two loaded top-level containers.
print(len(comms), len(comm_membership))

In [None]:
# Peek at one committee record to see which fields it carries.
comms[0]

In [None]:
# Build a committee table with one row per entry of `comms`, keeping only
# the three fields listed below (a record lacking one raises KeyError).
comm_fields = ['name', 'thomas_id', 'type']
comm_rows = [[committee[field] for field in comm_fields] for committee in comms]
comm_df = pd.DataFrame(data=comm_rows, columns=comm_fields)
comm_df.head()

In [None]:
# List the top-level keys of the membership mapping; these keys become the
# 'comm_id' values when member_df is built.
comm_membership.keys()

In [None]:
# Look at the first member record stored under the 'HSAG' key to see its fields.
comm_membership['HSAG'][0]

In [None]:
# Flatten the {key: [member, ...]} mapping into (comm_id, leg_id) pairs:
# one row for every member record listed under every key.
membership_pairs = [
    [committee_key, member['bioguide']]
    for committee_key, roster in comm_membership.items()
    for member in roster
]
member_df = pd.DataFrame(data=membership_pairs, columns=['comm_id', 'leg_id'])
member_df.head()

In [None]:
# Count how many rows of member_df each legislator id appears in, then turn
# that Series into a two-column DataFrame with a fresh integer index.
leg_id_counts = member_df['leg_id'].value_counts()
member_count = leg_id_counts.to_frame().reset_index()
member_count.head()

In [None]:
# Rename the two columns in place: 'id' matches the key column of leg_df used
# by the merge, and 'num_comms' holds the per-legislator count.
member_count.columns = ['id', 'num_comms']
member_count.head()

In [None]:
# Join legislator details to their committee counts on the shared 'id' column.
# The default join type is inner, so only ids present in both tables are kept.
df = pd.merge(leg_df, member_count, on='id')
df.head()

In [None]:
import seaborn as sns

# The merged frame is built from id/first/last/birthday plus num_comms, so it
# has no 'age' column unless one was added separately. Derive it from the
# birthday when missing, without mutating `df`.
if 'age' in df.columns:
    plot_data = df
else:
    plot_data = df.assign(
        # Age in (fractional) years as of today; 365.25 averages in leap years.
        age=(pd.Timestamp.today() - pd.to_datetime(df['birthday'])).dt.days / 365.25
    )

# Scatter of committee count against age with a fitted regression line.
sns.lmplot(x='age', y='num_comms', data=plot_data, fit_reg=True);