# Practice with JSON

In [1]:
import pandas as pd

In [2]:
import json

In [3]:
db = json.load(open('data/usda.json'))

In [4]:
db[0].keys()

dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])

In [5]:
db[0]

{'description': 'Cheese, caraway',
 'group': 'Dairy and Egg Products',
 'id': 1008,
 'manufacturer': '',
 'nutrients': [{'description': 'Protein',
   'group': 'Composition',
   'units': 'g',
   'value': 25.18},
  {'description': 'Total lipid (fat)',
   'group': 'Composition',
   'units': 'g',
   'value': 29.2},
  {'description': 'Carbohydrate, by difference',
   'group': 'Composition',
   'units': 'g',
   'value': 3.06},
  {'description': 'Ash', 'group': 'Other', 'units': 'g', 'value': 3.28},
  {'description': 'Energy',
   'group': 'Energy',
   'units': 'kcal',
   'value': 376.0},
  {'description': 'Water',
   'group': 'Composition',
   'units': 'g',
   'value': 39.28},
  {'description': 'Energy', 'group': 'Energy', 'units': 'kJ', 'value': 1573.0},
  {'description': 'Fiber, total dietary',
   'group': 'Composition',
   'units': 'g',
   'value': 0.0},
  {'description': 'Calcium, Ca',
   'group': 'Elements',
   'units': 'mg',
   'value': 673.0},
  {'description': 'Iron, Fe',
   'group': 

In [6]:
db[0]['nutrients'][0]

{'description': 'Protein',
 'group': 'Composition',
 'units': 'g',
 'value': 25.18}

In [7]:
nutrients = pd.DataFrame(db[0]['nutrients'])

In [8]:
nutrients.head()

Unnamed: 0,description,group,units,value
0,Protein,Composition,g,25.18
1,Total lipid (fat),Composition,g,29.2
2,"Carbohydrate, by difference",Composition,g,3.06
3,Ash,Other,g,3.28
4,Energy,Energy,kcal,376.0


In [9]:
info_keys = ['description', 'group', 'id', 'manufacturer']
info = pd.DataFrame(db, columns = info_keys)

In [10]:
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
description     6636 non-null object
group           6636 non-null object
id              6636 non-null int64
manufacturer    5195 non-null object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB


In [11]:
pd.value_counts(info.group)[:10]

Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Legumes and Legume Products          365
Fast Foods                           365
Lamb, Veal, and Game Products        345
Sweets                               341
Pork Products                        328
Fruits and Fruit Juices              328
Name: group, dtype: int64

In [12]:
nutrient = []

for rec in db:
    fnuts = pd.DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrient.append(fnuts)
    
nutrients = pd.concat(nutrient, ignore_index = True)

In [13]:
nutrients.head()

Unnamed: 0,description,group,units,value,id
0,Protein,Composition,g,25.18,1008
1,Total lipid (fat),Composition,g,29.2,1008
2,"Carbohydrate, by difference",Composition,g,3.06,1008
3,Ash,Other,g,3.28,1008
4,Energy,Energy,kcal,376.0,1008


In [14]:
nutrients.duplicated().sum()

14179

In [15]:
nutrients = nutrients.drop_duplicates()

In [16]:
col_mapping = {'description': 'food',
              'group': 'fgroup'}

In [17]:
info = info.rename(columns = col_mapping, copy = False)

In [18]:
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
food            6636 non-null object
fgroup          6636 non-null object
id              6636 non-null int64
manufacturer    5195 non-null object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB


In [19]:
nutrients = nutrients.rename(columns = col_mapping, copy = False)

In [20]:
nutrients.head()

Unnamed: 0,food,fgroup,units,value,id
0,Protein,Composition,g,25.18,1008
1,Total lipid (fat),Composition,g,29.2,1008
2,"Carbohydrate, by difference",Composition,g,3.06,1008
3,Ash,Other,g,3.28,1008
4,Energy,Energy,kcal,376.0,1008


In [21]:
ndata = pd.merge(nutrients, info, on = 'id', how = 'outer')

# Practicing Scraping



In [22]:
import pandas as pd

In [23]:
import requests

In [24]:
from bs4 import BeautifulSoup

In [25]:
mta = requests.get('http://web.mta.info/status/ServiceStatusSubway.xml')

In [26]:
mta

<Response [200]>

In [27]:
mta.text[:100]

'<?xml version="1.0" encoding="UTF-8" standalone="yes" ?><Siri xmlns:ns2="http://www.ifopt.org.uk/acs'

In [28]:
soup = BeautifulSoup(mta.text, "lxml")

In [29]:
%%html
soup

In [30]:
soup.find('summary').text

'Some Hudson Yard bound [7] local trains are running express from Mets-Willets Pt to Queensboro Plaza because of signal problems at 74 St.'

In [31]:
soup.find('longdescription').text

'<P>[7] trains are running with delays in both directions.</P>\r\n<P>Some Hudson Yard bound [7] local trains are running express from <STRONG>Mets-Willets Point </STRONG>to <STRONG>Queensboro Plaza</STRONG> </P>\r\n<P>These service changes are because of signal problems at <STRONG>74 St-Broadway</STRONG>. </P>'

In [32]:
soup.find('longdescription')

<longdescription>&lt;P&gt;[7] trains are running with delays in both directions.&lt;/P&gt;
&lt;P&gt;Some Hudson Yard bound [7] local trains are running express from &lt;STRONG&gt;Mets-Willets Point &lt;/STRONG&gt;to &lt;STRONG&gt;Queensboro Plaza&lt;/STRONG&gt; &lt;/P&gt;
&lt;P&gt;These service changes are because of signal problems at &lt;STRONG&gt;74 St-Broadway&lt;/STRONG&gt;. &lt;/P&gt;</longdescription>

### GitHub

https://api.github.com/repositories/858127/milestones/28/labels

In [33]:
url = 'https://api.github.com/repositories/858127/milestones/28/labels'

In [34]:
import requests

In [35]:
req = requests.get('https://api.github.com/repositories/858127/milestones/28/labels')

In [36]:
req

<Response [200]>

In [37]:
data = req.json()

In [38]:
data[:5]

[{'color': 'e10c02',
  'default': False,
  'id': 76811,
  'name': 'Bug',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Bug'},
 {'color': '4E9A06',
  'default': False,
  'id': 76812,
  'name': 'Enhancement',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Enhancement'},
 {'color': 'FCE94F',
  'default': False,
  'id': 127681,
  'name': 'Refactor',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Refactor'},
 {'color': '75507B',
  'default': False,
  'id': 129350,
  'name': 'Build',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Build'},
 {'color': '3465A4',
  'default': False,
  'id': 134699,
  'name': 'Docs',
  'url': 'https://api.github.com/repos/pandas-dev/pandas/labels/Docs'}]

In [39]:
issue_labels = pd.DataFrame(data)

In [40]:
issue_labels.head()

Unnamed: 0,color,default,id,name,url
0,e10c02,False,76811,Bug,https://api.github.com/repos/pandas-dev/pandas...
1,4E9A06,False,76812,Enhancement,https://api.github.com/repos/pandas-dev/pandas...
2,FCE94F,False,127681,Refactor,https://api.github.com/repos/pandas-dev/pandas...
3,75507B,False,129350,Build,https://api.github.com/repos/pandas-dev/pandas...
4,3465A4,False,134699,Docs,https://api.github.com/repos/pandas-dev/pandas...


In [41]:
issue_labels.iloc[7]

color                                                 06909A
default                                                False
id                                                   2301354
name                                                 Data IO
url        https://api.github.com/repos/pandas-dev/pandas...
Name: 7, dtype: object