# JSON format

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import json

In [7]:
# Sample JSON document kept as a Python string; note the JSON literal `null`.
obj = """
{
    "name" :  "Kim",
    "places_lived" : ["Seoul", "Korea"],
    "pet" : null,
    "siblings" : [{"name" : "Scott", "age" : 25, "pet" : "Zuko"}]
}"""

In [8]:
type(obj)  # still a plain str: nothing has been parsed yet

str

In [9]:
obj  # echo the raw string: newlines and indentation are part of it

'\n{\n    "name" :  "Kim",\n    "places_lived" : ["Seoul", "Korea"],\n    "pet" : null,\n    "siblings" : [{"name" : "Scott", "age" : 25, "pet" : "Zuko"}]\n}'

In [10]:
json.loads(obj)  # parse the JSON text; JSON null comes back as Python None

{'name': 'Kim',
 'places_lived': ['Seoul', 'Korea'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'}]}

In [12]:
r = json.loads(obj)  # decoding: JSON text -> Python dict
type(r)

dict

In [13]:
json.dumps(r)  # encoding: Python dict -> JSON text (None is written back as null)

'{"name": "Kim", "places_lived": ["Seoul", "Korea"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}]}'

## Practical example
- Naver real-time search ranking

In [14]:
import requests
# `pandas.io.json.json_normalize` has been deprecated since pandas 1.0 (calling it
# emits a FutureWarning); the supported location is the top-level pandas namespace.
from pandas import json_normalize

In [15]:
# Download the ranking feed. The timeout keeps this cell from hanging forever
# when the server never answers; note that `r` is rebound to the HTTP response.
r = requests.get("http://rank.search.naver.com/rank.js", timeout=10)

In [16]:
r  # 200 means the request succeeded; 3xx is a redirect, 4xx a client-side error, 5xx a server-side error

<Response [200]>

In [17]:
r.text  # response body as a str: JSON text that has not been parsed yet

'{"ts":"2020-10-22T12:30:00+0900","st":"2020-10-22T12:30:00+0900","et":"2020-10-22T12:30:00+0900","data":[{"category":"general","data":[{"rank":1,"keyword":"소병철","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":2,"keyword":"아이린","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":3,"keyword":"김진애","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":4,"keyword":"윤석열","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":5,"keyword":"박순철","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":6,"keyword":"윤한홍","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":7,"keyword":"윤호중","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":8,"keyword":"장제원","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":9,"keyword":"타이어뱅크","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":10,"keyw

In [18]:
json.loads(r.text)  # parse the body into nested dicts and lists

{'ts': '2020-10-22T12:30:00+0900',
 'st': '2020-10-22T12:30:00+0900',
 'et': '2020-10-22T12:30:00+0900',
 'data': [{'category': 'general',
   'data': [{'rank': 1,
     'keyword': '소병철',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 2,
     'keyword': '아이린',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 3,
     'keyword': '김진애',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 4,
     'keyword': '윤석열',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 5,
     'keyword': '박순철',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 6,
     'keyword': '윤한홍',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue':

In [24]:
# A plain DataFrame does not flatten nesting: each 'data' entry stays one dict-valued cell.
pd.DataFrame(json.loads(r.text))

Unnamed: 0,ts,st,et,data
0,2020-10-22T12:30:00+0900,2020-10-22T12:30:00+0900,2020-10-22T12:30:00+0900,"{'category': 'general', 'data': [{'rank': 1, '..."


In [20]:
# Without a record path only the top level is tabulated; the 'data' list stays in one cell.
json_normalize(json.loads(r.text))

  json_normalize(json.loads(r.text))


Unnamed: 0,ts,st,et,data
0,2020-10-22T12:30:00+0900,2020-10-22T12:30:00+0900,2020-10-22T12:30:00+0900,"[{'category': 'general', 'data': [{'rank': 1, ..."


In [21]:
# record_path='data': one row per element of the top-level 'data' list.
json_normalize(json.loads(r.text), 'data')

  json_normalize(json.loads(r.text), 'data')


Unnamed: 0,category,data
0,general,"[{'rank': 1, 'keyword': '소병철', 'change': '+', ..."


In [30]:
# record_path=['data', 'data'] descends two levels, giving one row per ranked keyword.
jj = json_normalize(json.loads(r.text), ['data', 'data'])  # positional arg is record_path
jj

  jj = json_normalize(json.loads(r.text), ['data', 'data'])  # record_path = ['data', 'data']


Unnamed: 0,rank,keyword,change,score,tvalue,cvalue,ratio,delta
0,1,소병철,+,0,0,0,.,0
1,2,아이린,+,0,0,0,.,0
2,3,김진애,+,0,0,0,.,0
3,4,윤석열,+,0,0,0,.,0
4,5,박순철,+,0,0,0,.,0
5,6,윤한홍,+,0,0,0,.,0
6,7,윤호중,+,0,0,0,.,0
7,8,장제원,+,0,0,0,.,0
8,9,타이어뱅크,+,0,0,0,.,0
9,10,박범계,+,0,0,0,.,0


In [32]:
# Print "rank : keyword" for every row of the flattened ranking table.
# Walking the two columns in parallel avoids materialising a row Series
# through .iloc twice per iteration.
for ranks, title in zip(jj['rank'], jj['keyword']):
    print(ranks, " : ", title)

1  :  소병철
2  :  아이린
3  :  김진애
4  :  윤석열
5  :  박순철
6  :  윤한홍
7  :  윤호중
8  :  장제원
9  :  타이어뱅크
10  :  박범계
11  :  지주연
12  :  김도읍
13  :  윤석열와이프
14  :  난색
15  :  백혜련
16  :  레드벨벳
17  :  송기헌
18  :  나를 사랑한 스파이
19  :  국정감사 생중계
20  :  비와이


# HTML Parsing

In [33]:
from bs4 import BeautifulSoup

In [34]:
# Small HTML snippet used below to demonstrate BeautifulSoup navigation.
html_text = """
<html>
<body>
    <h1> reading web page with python </h1>
        <p> page analysis </p>
        <p> page alignment </p>
        <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>
"""

In [36]:
# Parse the snippet with the 'html.parser' backend; the bare name echoes the parsed tree.
soup = BeautifulSoup(html_text, 'html.parser')
soup


<html>
<body>
<h1> reading web page with python </h1>
<p> page analysis </p>
<p> page alignment </p>
<td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>

In [37]:
type(soup)  # the parsed document is a bs4.BeautifulSoup object

bs4.BeautifulSoup

In [38]:
dir(soup)  # list the attributes and methods the soup object exposes (long output)

['ASCII_SPACES',
 'DEFAULT_BUILDER_FEATURES',
 'ROOT_TAG_NAME',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_check_markup_is_url',
 '_decode_markup',
 '_feed',
 '_find_all',
 '_find_one',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_linkage_fixer',
 '_most_recent_element',
 '_namespaces',
 '_popToTag',
 '_should_pretty_print',
 'append',
 'attrs',
 'builder',
 'can_be_empty_element',
 'cdata_list_attributes',
 'childGenerator',
 'children',
 'clear',
 'conta

In [39]:
soup.html  # the <html> element, which here is the whole parsed document

<html>
<body>
<h1> reading web page with python </h1>
<p> page analysis </p>
<p> page alignment </p>
<td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>

In [40]:
soup.h1  # dotted access by tag name: the <h1> heading

<h1> reading web page with python </h1>

In [41]:
soup.p  # dotted access yields only the first <p>, although the snippet has several

<p> page analysis </p>

In [43]:
# Show the first paragraph and the paragraph that follows it.
print(soup.p)
# `.next_sibling` alone lands on the whitespace text between the two tags, so the
# original hopped twice; find_next_sibling('p') skips non-<p> nodes and therefore
# does not depend on how the markup happens to be indented.
print(soup.p.find_next_sibling('p'))

<p> page analysis </p>
<p> page alignment </p>


In [44]:
soup.td  # first <td> only

<td>some text</td>

In [47]:
# .text and .string print the same value here: this <td> holds just one piece of text.
print(soup.td.text, soup.td.string)

some text some text


In [48]:
# Richer snippet: adds id/class attributes, links and a nested <div> so the
# lookups by id, attribute, regex and CSS selector below have something to match.
html_text2 = """
<html>
<body>
    <h1 id="title"> reading web page with python </h1>
        <p id="body"> page analysis </p>
        <p> page alignment </p>
        <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
        <ul>
            <li><a href = "http://www.naver.com"> naver</a></li>
            <li><a href = "http://www.daum.net"> daum</a></li>
        </ul>
    <div id="xxx">
        <h1> Wiki-books store </h1>
        <ul class="item">
            <li> introduction to game design </li>
            <li> introduction to python </li>
            <li> introduction to web design </li>
        </ul>
    </div>
</body>
</html>
"""

In [49]:
# Re-parse: from here on `soup` refers to the html_text2 document.
soup = BeautifulSoup(html_text2, 'html.parser')

## access by tag name or id (`find` / `find_all`)

In [50]:
soup.find(id='title')  # look an element up by its id attribute

<h1 id="title"> reading web page with python </h1>

In [51]:
soup.find(id='body')  # same lookup, this time matching the paragraph with id "body"

<p id="body"> page analysis </p>

In [52]:
soup.find(id='body').text  # .text keeps the surrounding spaces; nothing is stripped

' page analysis '

In [53]:
# find returns the first match only; find_all returns a list of every match.
print(soup.find('td'))
print(soup.find_all('td'))

<td>some text</td>
[<td>some text</td>, <td></td>, <td><p>more text</p></td>, <td>even <p>more text</p></td>]


In [54]:
soup.find_all('p')  # every <p>, including the ones nested inside <td> cells

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [55]:
soup.find_all('li')  # <li> items from both lists, in document order

[<li><a href="http://www.naver.com"> naver</a></li>,
 <li><a href="http://www.daum.net"> daum</a></li>,
 <li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [56]:
soup.find_all('li')[0]  # the result is list-like, so index it to pick one element

<li><a href="http://www.naver.com"> naver</a></li>

In [57]:
soup.find_all('li')[0].string  # .string reaches through the single <a> child to its text

' naver'

In [58]:
# Text and attribute dict of the first link, shown together as a tuple.
soup.find_all('a')[0].string, soup.find_all('a')[0].attrs

(' naver', {'href': 'http://www.naver.com'})

In [59]:
# List every hyperlink as "<link text> --> <target URL>".
for anchor in soup.find_all('a'):
    text, href = anchor.text, anchor['href']
    print(text, '-->', href)

 naver --> http://www.naver.com
 daum --> http://www.daum.net


## access by regular expression

In [60]:
import re
soup.find_all(re.compile("^p")) # regex filter on tag names: those starting with 'p'

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [61]:
soup.find_all(re.compile("div"))  # tag-name regex without an anchor; here it matches the <div>

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [62]:
soup.find_all(href=re.compile("^http://"))  # filter on the href attribute: values starting with http://

[<a href="http://www.naver.com"> naver</a>,
 <a href="http://www.daum.net"> daum</a>]

## access by css selector

In [64]:
soup.select('h1')   # by tag name: every <h1>, returned as a list

[<h1 id="title"> reading web page with python </h1>,
 <h1> Wiki-books store </h1>]

In [65]:
soup.select('#xxx') # by id: the '#' prefix selects the element whose id is xxx

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [66]:
soup.select('.item') # by class name: the '.' prefix selects elements with class item

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [67]:
soup.select('div .item') # descendant combinator (note the space): class "item" elements nested inside a <div>

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [69]:
soup.select('#xxx > ul > li') # child combinator: <li> directly under a <ul> directly under #xxx

[<li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [70]:
soup.select_one('#xxx > ul > li') # same selector; select_one returns only the first match, not a list

<li> introduction to game design </li>

In [71]:
soup.select('div li')  # descendant combinator: every <li> anywhere inside a <div>

[<li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [72]:
# find_all treats the string as a tag name, not a CSS selector, so nothing matches
# and the result is an empty list; use select() for selector syntax.
soup.find_all('div li')  

[]

In [76]:
text = '<p class="body strikeout"></p>'

css_soup = BeautifulSoup(text,'html.parser')
css_soup.find_all("p", class_="strikeout")  # naming just one of the element's classes is enough to match

[<p class="body strikeout"></p>]

In [77]:
css_soup.find_all("p", class_="body")  # the other class matches too; the full class string is not required

[<p class="body strikeout"></p>]

In [78]:
css_soup.select('p')  # a plain tag selector matches the same element

[<p class="body strikeout"></p>]

# Practical example
- Extract job listings (title, company, location) from www.monster.com

In [80]:
# Search-results page for "Data Scientist" jobs in California.
url = "https://www.monster.com/jobs/search/?q=Data-Scientist&where=California"
# Bound the wait so a stalled connection cannot hang the notebook.
page = requests.get(url, timeout=10)
page

<Response [200]>

In [81]:
page.text  # decoded HTML as a str (page.content is the raw-bytes alternative)

'<!DOCTYPE html>\r\n<html xmlns="https://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\r\n<head>\r\n    \r\n<meta http-equiv="X-UA-Compatible" content="IE=edge" />\r\n<meta http-equiv="Expires" content="0" />\r\n<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=2.0, minimum-scale=1" />\r\n<meta name="j_jp" content="1" />\r\n<meta charset="UTF-8">\r\n<title>Data Scientist Jobs in California. California Data Scientist Jobs. | Monster.com</title>\r\n\r\n        <style type="text/css">\r\n                @font-face{font-family:\'Roboto\';font-style:normal;font-weight:100;font-display:optional;src:local(\'Roboto Thin\'),local(\'Roboto-Thin\'),local(\'sans-serif-thin\'),url(https://fonts.gstatic.com/s/roboto/v19/KFOkCnqEu92Fr1MmgVxFIzIXKMnyrYk.woff2) format(\'woff2\');unicode-range:U+0460-052F,U+1C80-1C88,U+20B4,U+2DE0-2DFF,U+A640-A69F,U+FE2E-FE2F}@font-face{font-family:\'Roboto\';font-style:normal;font-weight:100;font-display:optional;src:local(\'Roboto T

In [82]:
# Rebinds `soup` to the downloaded results page.
soup = BeautifulSoup(page.text, 'html.parser')

In [83]:
# Container element whose id is "SearchResults"; the job cards are looked up inside it.
results = soup.find(id = "SearchResults")

In [84]:
# One <section class="card-content"> element per job card.
job_elems = results.find_all('section', class_ = 'card-content')

In [97]:
len(job_elems)  # number of job cards found on the page

29

In [86]:
job_elems[0]  # inspect the raw markup of the first card

<section class="card-content" data-jobid="221272677" data-postingid="f010d383-7ef4-4ff3-9624-2dea36c2a7c3" onclick="MKImpressionTrackingMouseDownHijack(this, event)">
<div class="flex-row">
<div class="mux-company-logo thumbnail is-loaded">
<img alt="CyberCoders" src="https://media.newjobs.com/clu/xcyb/xcyberc3x/branding/6344/CyberCoders-logo.jpg"/>
</div>
<div class="summary">
<header class="card-header">
<h2 class="title"><a data-bypass="true" data-m_impr_a_placement_id="JSR2CW" data-m_impr_j_cid="660" data-m_impr_j_coc="xcyberc3x" data-m_impr_j_jawsid="453866293" data-m_impr_j_jobid="221272677" data-m_impr_j_jpm="1" data-m_impr_j_jpt="1" data-m_impr_j_lat="37.2928" data-m_impr_j_lid="356" data-m_impr_j_long="-121.7979" data-m_impr_j_occid="11787" data-m_impr_j_p="1" data-m_impr_j_postingid="f010d383-7ef4-4ff3-9624-2dea36c2a7c3" data-m_impr_j_pvc="monster" data-m_impr_s_t="t" data-m_impr_uuid="a49f9629-ecc0-4fd0-bb40-54388c5f4301" href="https://job-openings.monster.com/data-scientist

In [90]:
# The title heading wraps an <a> link that carries many tracking attributes.
job_elems[0].find('h2', class_ = 'title')

<h2 class="title"><a data-bypass="true" data-m_impr_a_placement_id="JSR2CW" data-m_impr_j_cid="660" data-m_impr_j_coc="xcyberc3x" data-m_impr_j_jawsid="453866293" data-m_impr_j_jobid="221272677" data-m_impr_j_jpm="1" data-m_impr_j_jpt="1" data-m_impr_j_lat="37.2928" data-m_impr_j_lid="356" data-m_impr_j_long="-121.7979" data-m_impr_j_occid="11787" data-m_impr_j_p="1" data-m_impr_j_postingid="f010d383-7ef4-4ff3-9624-2dea36c2a7c3" data-m_impr_j_pvc="monster" data-m_impr_s_t="t" data-m_impr_uuid="a49f9629-ecc0-4fd0-bb40-54388c5f4301" href="https://job-openings.monster.com/data-scientist-ai-machine-learning-no-sponsorship-san-jose-ca-us-cybercoders/221272677" onclick="clickJobTitle('plid=356&amp;pcid=660&amp;poccid=11787','Data Scientist',''); clickJobTitleSiteCat('{&quot;events.event48&quot;:&quot;true&quot;,&quot;eVar25&quot;:&quot;Data Scientist - AI/Machine Learning - No Sponsorship&quot;,&quot;eVar66&quot;:&quot;Monster&quot;,&quot;eVar67&quot;:&quot;JSR2CW&quot;,&quot;eVar26&quot;:&q

In [93]:
# .text drops the markup and .strip() trims the surrounding whitespace.
print(job_elems[0].find('h2', class_ = 'title').text.strip())

Data Scientist - AI/Machine Learning - No Sponsorship


In [98]:
job_elems[0].find('div', class_='company').text.strip()  # company name from the div with class "company"

'CyberCoders'

In [96]:
job_elems[0].find('div', class_='location').text.strip()  # location from the div with class "location"

'San Jose, CA'

In [101]:
# Walk the job cards and print each one's position, title, company and location.
# The position is the card's 1-based place in job_elems, so skipped cards leave gaps.
for position, card in enumerate(job_elems, start=1):
    title = card.find('h2', class_='title')
    company = card.find('div', class_='company')
    location = card.find('div', class_='location')

    # A card missing any of the three fields is skipped entirely.
    if title is None or company is None or location is None:
        continue

    print(position)
    for field in (title, company, location):
        print(field.text.strip())

1
Data Scientist - AI/Machine Learning - No Sponsorship
CyberCoders
San Jose, CA
3
Water Utility Data Scientist / Data Analyst
Golden State Water Company
San Dimas, CA
4
Sr. to Lead Data Scientist (IoT startup, up to $200k)
Skyrocket Ventures
Sunnyvale, CA
5
Data Scientist
IMS
Newport Beach, CA
6
Senior Data Scientist
Jobot
Pasadena, CA
7
DATA SCIENTIST - INTERMEDIATE
The Judge Group
San Francisco, CA
9
Data Scientist/Tester
Apex Systems
Santa Clara, CA
10
Data Scientist II
System One
Santa Clara, CA
11
Data Scientist Intern
Henkel
Irvine, CA
12
Data Scientist
Tech Mahindra Limited
Sunnyvale, CA
13
Manager of Laboratory Informatics
CyberCoders
Austin, TX
14
Data Scientist
Mastech Digital
Los Angeles, CA
15
Data Scientist II
Randstad Life Sciences
Sunnyvale, CA
16
Principal Data Scientist
Randstad Technologies
Sunnyvale, CA
17
Senior Data Scientist
Company Confidential
San Francisco, CA
18
Sr. Data Scientist
VSolvit LLC
Norco, CA
20
Senior Data Science Manager
Ericsson Inc
Santa Clara ,