# Practice with JSON

In [1]:
import pandas as pd

In [2]:
import json

In [3]:
db = json.load(open('data/usda.json'))

In [4]:
db[0].keys()

dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])

In [5]:
db[0]

{'description': 'Cheese, caraway',
 'group': 'Dairy and Egg Products',
 'id': 1008,
 'manufacturer': '',
 'nutrients': [{'description': 'Protein',
   'group': 'Composition',
   'units': 'g',
   'value': 25.18},
  {'description': 'Total lipid (fat)',
   'group': 'Composition',
   'units': 'g',
   'value': 29.2},
  {'description': 'Carbohydrate, by difference',
   'group': 'Composition',
   'units': 'g',
   'value': 3.06},
  {'description': 'Ash', 'group': 'Other', 'units': 'g', 'value': 3.28},
  {'description': 'Energy',
   'group': 'Energy',
   'units': 'kcal',
   'value': 376.0},
  {'description': 'Water',
   'group': 'Composition',
   'units': 'g',
   'value': 39.28},
  {'description': 'Energy', 'group': 'Energy', 'units': 'kJ', 'value': 1573.0},
  {'description': 'Fiber, total dietary',
   'group': 'Composition',
   'units': 'g',
   'value': 0.0},
  {'description': 'Calcium, Ca',
   'group': 'Elements',
   'units': 'mg',
   'value': 673.0},
  {'description': 'Iron, Fe',
   'group': 

In [6]:
db[0]['nutrients'][0]

{'description': 'Protein',
 'group': 'Composition',
 'units': 'g',
 'value': 25.18}

In [10]:
nutrients = pd.DataFrame(db[0]['nutrients'])

In [11]:
nutrients.head()

Unnamed: 0,description,group,units,value
0,Protein,Composition,g,25.18
1,Total lipid (fat),Composition,g,29.2
2,"Carbohydrate, by difference",Composition,g,3.06
3,Ash,Other,g,3.28
4,Energy,Energy,kcal,376.0


In [12]:
info_keys = ['description', 'group', 'id', 'manufacturer']
info = pd.DataFrame(db, columns = info_keys)

In [13]:
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
description     6636 non-null object
group           6636 non-null object
id              6636 non-null int64
manufacturer    5195 non-null object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB


In [14]:
pd.value_counts(info.group)[:10]

Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Legumes and Legume Products          365
Fast Foods                           365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Name: group, dtype: int64

In [15]:
nutrient = []

for rec in db:
    fnuts = pd.DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrient.append(fnuts)
    
nutrients = pd.concat(nutrient, ignore_index = True)

In [16]:
nutrients.head()

Unnamed: 0,description,group,units,value,id
0,Protein,Composition,g,25.18,1008
1,Total lipid (fat),Composition,g,29.2,1008
2,"Carbohydrate, by difference",Composition,g,3.06,1008
3,Ash,Other,g,3.28,1008
4,Energy,Energy,kcal,376.0,1008


In [17]:
nutrients.duplicated().sum()

14179

In [18]:
nutrients = nutrients.drop_duplicates()

In [19]:
col_mapping = {'description': 'food',
              'group': 'fgroup'}

In [20]:
info = info.rename(columns = col_mapping, copy = False)

In [21]:
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
food            6636 non-null object
fgroup          6636 non-null object
id              6636 non-null int64
manufacturer    5195 non-null object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB


In [22]:
nutrients = nutrients.rename(columns = col_mapping, copy = False)

In [23]:
nutrients.head()

Unnamed: 0,food,fgroup,units,value,id
0,Protein,Composition,g,25.18,1008
1,Total lipid (fat),Composition,g,29.2,1008
2,"Carbohydrate, by difference",Composition,g,3.06,1008
3,Ash,Other,g,3.28,1008
4,Energy,Energy,kcal,376.0,1008


In [24]:
ndata = pd.merge(nutrients, info, on = 'id', how = 'outer')

In [25]:
ndata

Unnamed: 0,food_x,fgroup_x,units,value,id,food_y,fgroup_y,manufacturer
0,Protein,Composition,g,25.180,1008,"Cheese, caraway",Dairy and Egg Products,
1,Total lipid (fat),Composition,g,29.200,1008,"Cheese, caraway",Dairy and Egg Products,
2,"Carbohydrate, by difference",Composition,g,3.060,1008,"Cheese, caraway",Dairy and Egg Products,
3,Ash,Other,g,3.280,1008,"Cheese, caraway",Dairy and Egg Products,
4,Energy,Energy,kcal,376.000,1008,"Cheese, caraway",Dairy and Egg Products,
5,Water,Composition,g,39.280,1008,"Cheese, caraway",Dairy and Egg Products,
6,Energy,Energy,kJ,1573.000,1008,"Cheese, caraway",Dairy and Egg Products,
7,"Fiber, total dietary",Composition,g,0.000,1008,"Cheese, caraway",Dairy and Egg Products,
8,"Calcium, Ca",Elements,mg,673.000,1008,"Cheese, caraway",Dairy and Egg Products,
9,"Iron, Fe",Elements,mg,0.640,1008,"Cheese, caraway",Dairy and Egg Products,


# Practicing Scraping



In [26]:
import pandas as pd

In [27]:
import requests

In [28]:
from bs4 import BeautifulSoup

In [29]:
mta = requests.get('http://web.mta.info/status/ServiceStatusSubway.xml')

In [30]:
mta

<Response [200]>

In [31]:
mta.text[:100]

'<?xml version="1.0" encoding="UTF-8" standalone="yes" ?><Siri xmlns:ns2="http://www.ifopt.org.uk/acs'

In [32]:
soup = BeautifulSoup(mta.text, "lxml")

In [33]:
%%html
soup

In [34]:
soup.find('summary').text

'Southbound [2] and [3] trains are running local from 14 St to Chambers St because of a sick passenger at 14 St.'

In [35]:
soup.find('longdescription').text

'<P>Southbound [2] and [3] trains are running local from <STRONG>14 St</STRONG> to <STRONG>Chambers St</STRONG> because of a sick passenger at <STRONG>14 St</STRONG>.</P>\r\n<P>Expect delays in [1], [2] and [3] train service. </P>'

In [36]:
soup.find('longdescription')

<longdescription>&lt;P&gt;Southbound [2] and [3] trains are running local from &lt;STRONG&gt;14 St&lt;/STRONG&gt; to &lt;STRONG&gt;Chambers St&lt;/STRONG&gt; because of a sick passenger at &lt;STRONG&gt;14 St&lt;/STRONG&gt;.&lt;/P&gt;
&lt;P&gt;Expect delays in [1], [2] and [3] train service. &lt;/P&gt;</longdescription>

In [None]:
soup.find_all('summary').text

In [38]:
soup.find_all('summary')

[<summary xml:lang="EN">Southbound [2] and [3] trains are running local from 14 St to Chambers St because of a sick passenger at 14 St.</summary>,
 <summary xml:lang="EN">[4] and [5] train service has resumed following an earlier incident involving a train with mechanical problems at Bowling Green.</summary>,
 <summary xml:lang="EN">Southbound [B] and [D] trains are running with delays because of signal problems at 145 St.</summary>,
 <summary xml:lang="EN">Southbound [N] and [Q] trains are stopping along the [R] line from Canal St to DeKalb Av because of a train with mechanical problems at Canal St.</summary>,
 <summary xml:lang="EN">Rockaways Long-Term Flood Protection  [A] No service  to/from  Rockaway Park-Beach 116 St</summary>,
 <summary xml:lang="EN">Rockaways Long-Term Flood Protection  [S] No Rockaway Park Shuttle service at Broad Channel - Take the [A] instead [A] No rush hour service  to/from  Rockaway Park-Beach 116 St</summary>]

In [39]:
for i in soup.find_all('summary'):
    print(i.text)

Southbound [2] and [3] trains are running local from 14 St to Chambers St because of a sick passenger at 14 St.
[4] and [5] train service has resumed following an earlier incident involving a train with mechanical problems at Bowling Green.
Southbound [B] and [D] trains are running with delays because of signal problems at 145 St.
Southbound [N] and [Q] trains are stopping along the [R] line from Canal St to DeKalb Av because of a train with mechanical problems at Canal St.
Rockaways Long-Term Flood Protection  [A] No service  to/from  Rockaway Park-Beach 116 St
Rockaways Long-Term Flood Protection  [S] No Rockaway Park Shuttle service at Broad Channel - Take the [A] instead [A] No rush hour service  to/from  Rockaway Park-Beach 116 St


### GitHub

https://api.github.com/repositories/858127/milestones/28/labels

In [40]:
url = 'https://api.github.com/repositories/858127/milestones/28/labels'

In [41]:
import requests

In [42]:
req = requests.get('https://api.github.com/repositories/858127/milestones/28/labels')

In [43]:
req

<Response [200]>

In [44]:
data = req.json()

In [45]:
data[:5]

[{'color': 'e10c02',
  'default': False,
  'id': 76811,
  'name': 'Bug',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Bug'},
 {'color': '4E9A06',
  'default': False,
  'id': 76812,
  'name': 'Enhancement',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Enhancement'},
 {'color': 'FCE94F',
  'default': False,
  'id': 127681,
  'name': 'Refactor',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Refactor'},
 {'color': '75507B',
  'default': False,
  'id': 129350,
  'name': 'Build',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Build'},
 {'color': '3465A4',
  'default': False,
  'id': 134699,
  'name': 'Docs',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Docs'}]

In [46]:
issue_labels = pd.DataFrame(data)

In [47]:
issue_labels.head()

Unnamed: 0,color,default,id,name,url
0,e10c02,False,76811,Bug,https://api.github.com/repos/pandas-dev/pandas...
1,4E9A06,False,76812,Enhancement,https://api.github.com/repos/pandas-dev/pandas...
2,FCE94F,False,127681,Refactor,https://api.github.com/repos/pandas-dev/pandas...
3,75507B,False,129350,Build,https://api.github.com/repos/pandas-dev/pandas...
4,3465A4,False,134699,Docs,https://api.github.com/repos/pandas-dev/pandas...


In [48]:
issue_labels.iloc[7]

color                                                 06909A
default                                                False
id                                                   2301354
name                                                 Data IO
url        https://api.github.com/repos/pandas-dev/pandas...
Name: 7, dtype: object