# JSON format

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import json

In [2]:
obj = """
{
    "name" :  "Kim",
    "places_lived" : ["Seoul", "Korea"],
    "pet" : null,
    "siblings" : [{"name" : "Scott", "age" : 25, "pet" : "Zuko"}]
}"""

In [3]:
type(obj)

str

In [4]:
obj

'\n{\n    "name" :  "Kim",\n    "places_lived" : ["Seoul", "Korea"],\n    "pet" : null,\n    "siblings" : [{"name" : "Scott", "age" : 25, "pet" : "Zuko"}]\n}'

In [5]:
json.loads(obj)

{'name': 'Kim',
 'places_lived': ['Seoul', 'Korea'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'}]}

In [6]:
r = json.loads(obj)  #decoding (json -> dict)
type(r)

dict

In [7]:
json.dumps(r)  #encoding (dict -> json)

'{"name": "Kim", "places_lived": ["Seoul", "Korea"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}]}'

## practical example
- naver real-time search ranking

In [12]:
import requests
from pandas.io.json import json_normalize

In [9]:
r = requests.get("http://rank.search.naver.com/rank.js")

In [10]:
r  # 200이면 정확하게 온 것, 300이나 400이면 서버나 local쪽에서 문제가 있는 것

<Response [200]>

In [11]:
r.text

'{"ts":"2020-11-18T20:25:00+0900","st":"2020-11-18T20:25:00+0900","et":"2020-11-18T20:25:00+0900","data":[{"category":"general","data":[{"rank":1,"keyword":"구창모","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":2,"keyword":"이가은","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":3,"keyword":"최환희","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":4,"keyword":"2020 11월 모의고사","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":5,"keyword":"정바비","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":6,"keyword":"한초원","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":7,"keyword":"최진실","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":8,"keyword":"전직 야구선수 폭행","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":9,"keyword":"금태섭","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{

In [18]:
json.loads(r.text)

{'ts': '2020-10-22T12:30:00+0900',
 'st': '2020-10-22T12:30:00+0900',
 'et': '2020-10-22T12:30:00+0900',
 'data': [{'category': 'general',
   'data': [{'rank': 1,
     'keyword': '소병철',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 2,
     'keyword': '아이린',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 3,
     'keyword': '김진애',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 4,
     'keyword': '윤석열',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 5,
     'keyword': '박순철',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 6,
     'keyword': '윤한홍',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue':

In [13]:
pd.DataFrame(json.loads(r.text))

Unnamed: 0,ts,st,et,data
0,2020-11-18T20:25:00+0900,2020-11-18T20:25:00+0900,2020-11-18T20:25:00+0900,"{'category': 'general', 'data': [{'rank': 1, '..."


In [14]:
json_normalize(json.loads(r.text))

  json_normalize(json.loads(r.text))


Unnamed: 0,ts,st,et,data
0,2020-11-18T20:25:00+0900,2020-11-18T20:25:00+0900,2020-11-18T20:25:00+0900,"[{'category': 'general', 'data': [{'rank': 1, ..."


In [15]:
json_normalize(json.loads(r.text), 'data')

  json_normalize(json.loads(r.text), 'data')


Unnamed: 0,category,data
0,general,"[{'rank': 1, 'keyword': '구창모', 'change': '+', ..."


In [16]:
jj = json_normalize(json.loads(r.text), ['data', 'data'])  # record_path = ['data', 'data']
jj

  jj = json_normalize(json.loads(r.text), ['data', 'data'])  # record_path = ['data', 'data']


Unnamed: 0,rank,keyword,change,score,tvalue,cvalue,ratio,delta
0,1,구창모,+,0,0,0,.,0
1,2,이가은,+,0,0,0,.,0
2,3,최환희,+,0,0,0,.,0
3,4,2020 11월 모의고사,+,0,0,0,.,0
4,5,정바비,+,0,0,0,.,0
5,6,한초원,+,0,0,0,.,0
6,7,최진실,+,0,0,0,.,0
7,8,전직 야구선수 폭행,+,0,0,0,.,0
8,9,금태섭,+,0,0,0,.,0
9,10,조성민,+,0,0,0,.,0


In [17]:
for i in range(len(jj)):
    ranks = (jj.iloc[i])['rank']
    title = (jj.iloc[i])['keyword']
    print(ranks, " : ", title)

1  :  구창모
2  :  이가은
3  :  최환희
4  :  2020 11월 모의고사
5  :  정바비
6  :  한초원
7  :  최진실
8  :  전직 야구선수 폭행
9  :  금태섭
10  :  조성민
11  :  방치찜
12  :  프로듀스 101
13  :  우정바이오
14  :  코로나 2단계
15  :  알테어
16  :  강동호
17  :  최준희
18  :  쇼챔피언
19  :  플렉센
20  :  19코로나 라이브


# HTML Parsing

In [18]:
from bs4 import BeautifulSoup

In [19]:
html_text = """
<html>
<body>
    <h1> reading web page with python </h1>
        <p> page analysis </p>
        <p> page alignment </p>
        <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>
"""

In [20]:
soup = BeautifulSoup(html_text, 'html.parser')
soup


<html>
<body>
<h1> reading web page with python </h1>
<p> page analysis </p>
<p> page alignment </p>
<td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>

In [21]:
type(soup)

bs4.BeautifulSoup

In [22]:
dir(soup)

['ASCII_SPACES',
 'DEFAULT_BUILDER_FEATURES',
 'ROOT_TAG_NAME',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_check_markup_is_url',
 '_decode_markup',
 '_feed',
 '_find_all',
 '_find_one',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_linkage_fixer',
 '_most_recent_element',
 '_namespaces',
 '_popToTag',
 '_should_pretty_print',
 'append',
 'attrs',
 'builder',
 'can_be_empty_element',
 'cdata_list_attributes',
 'childGenerator',
 'children',
 'clear',
 'conta

In [23]:
soup.html

<html>
<body>
<h1> reading web page with python </h1>
<p> page analysis </p>
<p> page alignment </p>
<td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>

In [24]:
soup.h1

<h1> reading web page with python </h1>

In [25]:
soup.p

<p> page analysis </p>

In [26]:
print(soup.p)
print(soup.p.next_sibling.next_sibling)

<p> page analysis </p>
<p> page alignment </p>


In [27]:
soup.td

<td>some text</td>

In [36]:
print(soup.td.text, soup.td.string)

some text some text


In [37]:
html_text2 = """
<html>
<body>
    <h1 id="title"> reading web page with python </h1>
        <p id="body"> page analysis </p>
        <p> page alignment </p>
        <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
        <ul>
            <li><a href = "http://www.naver.com"> naver</a></li>
            <li><a href = "http://www.daum.net"> daum</a></li>
        </ul>
    <div id="xxx">
        <h1> Wiki-books store </h1>
        <ul class="item">
            <li> introduction to game design </li>
            <li> introduction to python </li>
            <li> introduction to web design </li>
        </ul>
    </div>
</body>
</html>
"""

In [30]:
soup = BeautifulSoup(html_text2, 'html.parser')

## access by tags

In [31]:
soup.find(id='title')

<h1 id="title"> reading web page with python </h1>

In [32]:
soup.find(id='body')

<p id="body"> page analysis </p>

In [33]:
soup.find(id='body').text

' page analysis '

In [34]:
print(soup.find('td'))
print(soup.find_all('td'))

<td>some text</td>
[<td>some text</td>, <td></td>, <td><p>more text</p></td>, <td>even <p>more text</p></td>]


In [38]:
soup.find_all('p')

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [39]:
soup.find_all('li')

[<li><a href="http://www.naver.com"> naver</a></li>,
 <li><a href="http://www.daum.net"> daum</a></li>,
 <li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [40]:
soup.find_all('li')[0]

<li><a href="http://www.naver.com"> naver</a></li>

In [41]:
soup.find_all('li')[0].string

' naver'

In [42]:
soup.find_all('a')[0].string, soup.find_all('a')[0].attrs

(' naver', {'href': 'http://www.naver.com'})

In [43]:
for aa in soup.find_all('a'):
    href = aa.attrs['href']
    text = aa.text
    print(text, '-->', href)

 naver --> http://www.naver.com
 daum --> http://www.daum.net


## access by regular expression

In [44]:
import re
soup.find_all(re.compile("^p")) # tags starting with a character 'p'

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [50]:
soup.find_all(re.compile("div"))

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [51]:
soup.find_all(href=re.compile("^http://"))

[<a href="http://www.naver.com"> naver</a>,
 <a href="http://www.daum.net"> daum</a>]

## access by css selector

In [52]:
soup.select('h1')   # by tags

[<h1 id="title"> reading web page with python </h1>,
 <h1> Wiki-books store </h1>]

In [53]:
soup.select('#xxx') # by id

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [54]:
soup.select('.item') # by class name

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [58]:
soup.select('div .item') # multi-components(tag=div, class=item)

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [59]:
soup.select('#xxx > ul > li') # hierarchy (child)

[<li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [60]:
soup.select_one('#xxx > ul > li') # hierarchy (child)

<li> introduction to game design </li>

In [61]:
soup.select('div li')  # hierarchy (div tag >>> ul tag)  (descendants)

[<li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [62]:
soup.find_all('div li')  

[]

In [65]:
text = '<p class="body strikeout"></p>'

css_soup = BeautifulSoup(text,'html.parser')
css_soup.find_all("p", class_="strikeout")  # 다 쓰지 않아도 찾을 수 있다.

[<p class="body strikeout"></p>]

In [67]:
css_soup.find_all("p", class_="body")  # 다 쓰지 않아도 찾을 수 있다.

[<p class="body strikeout"></p>]

In [68]:
css_soup.select('p')

[<p class="body strikeout"></p>]