# JSON format

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import json

In [3]:
# Sample JSON document held as a Python string: it contains a string value,
# an array, a null, and an array holding one nested object.
obj = """
{
    "name":"Kim",
    "places_lived": ["Seoul","Korea"],
    "pet": null,
    "siblings": [{"name": "Scott", "age":25, "pet": "Zuko"}]
}
"""

In [6]:
# Before decoding, the JSON document is plain Python text.
type(obj)

str

In [7]:
# Decoding (JSON text -> Python dict)
# json.loads() parses a JSON-formatted string into a Python dictionary.
json.loads(obj)

{'name': 'Kim',
 'places_lived': ['Seoul', 'Korea'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'}]}

In [10]:
# Keep the decoded document; the decoded value is a dict.
r = json.loads(obj)
type(r)

dict

In [12]:
# Encoding (Python dict -> JSON-formatted string)
# json.dumps() serializes the dict back into a string.
json.dumps(r)

'{"name": "Kim", "places_lived": ["Seoul", "Korea"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}]}'

## practical example
- naver real-time search ranking

In [13]:
import requests
import json
# NOTE(review): importing json_normalize from pandas.io.json is deprecated since
# pandas 1.0 (it emits a FutureWarning) and may be unavailable in newer releases;
# pd.json_normalize is the supported spelling.
from pandas.io.json import json_normalize

In [14]:
# Fetch the real-time search ranking feed. The timeout stops the cell from hanging
# indefinitely if the server never answers, and raise_for_status() turns an HTTP
# 4xx/5xx reply into an exception here rather than a confusing parse error later.
# NOTE: this rebinds r, which held the decoded sample dict in the cells above.
r = requests.get('http://rank.search.naver.com/rank.js', timeout=10)
r.raise_for_status()

In [16]:
# A 200 status code means the request succeeded; 4xx codes indicate a client-side
# error and 5xx codes a server-side error.
# r.text is the response body as a string (here, JSON-formatted text).
r.text

'{"ts":"2020-10-23T14:00:00+0900","st":"2020-10-23T14:00:00+0900","et":"2020-10-23T14:00:00+0900","data":[{"category":"general","data":[{"rank":1,"keyword":"최철호","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":2,"keyword":"아이린 인성","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":3,"keyword":"상강","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":4,"keyword":"문가영","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":5,"keyword":"아이린","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":6,"keyword":"어쩌다 결혼","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":7,"keyword":"최철호 폭행","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":8,"keyword":"최철호 여배우","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"rank":9,"keyword":"손흥민 골","change":"+","score":0,"tvalue":0,"cvalue":0,"ratio":".","delta":0},{"ra

In [18]:
# Decode the JSON text of the response body into nested Python dicts and lists.
json.loads(r.text)

{'ts': '2020-10-23T14:00:00+0900',
 'st': '2020-10-23T14:00:00+0900',
 'et': '2020-10-23T14:00:00+0900',
 'data': [{'category': 'general',
   'data': [{'rank': 1,
     'keyword': '최철호',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 2,
     'keyword': '아이린 인성',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 3,
     'keyword': '상강',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 4,
     'keyword': '문가영',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 5,
     'keyword': '아이린',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cvalue': 0,
     'ratio': '.',
     'delta': 0},
    {'rank': 6,
     'keyword': '어쩌다 결혼',
     'change': '+',
     'score': 0,
     'tvalue': 0,
     'cva

In [20]:
# The ranking rows are nested two levels deep ('data' -> 'data'), so the plain
# DataFrame constructor leaves each nested record as an opaque dict inside a cell;
# reaching the rows this way means unpacking level by level.
# pandas.json_normalize does that flattening in one call.
ranking_raw = pd.DataFrame(json.loads(r.text))  # parse the response body only once
# Only the last expression of a cell is displayed, so show the nested column here.
pd.DataFrame(ranking_raw['data'])

0    {'category': 'general', 'data': [{'rank': 1, '...
Name: data, dtype: object

In [25]:
# From here on it is ordinary DataFrame handling.
# record_path=['data', 'data'] walks into the outer 'data' list and then each
# entry's inner 'data' list, producing one row per ranking record.
# pd.json_normalize is the supported top-level API; the pandas.io.json import
# path is deprecated and emits a FutureWarning.
jj = pd.json_normalize(json.loads(r.text), record_path=['data', 'data'])

  """Entry point for launching an IPython kernel.


Unnamed: 0,rank,keyword,change,score,tvalue,cvalue,ratio,delta
0,1,최철호,+,0,0,0,.,0
1,2,아이린 인성,+,0,0,0,.,0
2,3,상강,+,0,0,0,.,0
3,4,문가영,+,0,0,0,.,0
4,5,아이린,+,0,0,0,.,0
5,6,어쩌다 결혼,+,0,0,0,.,0
6,7,최철호 폭행,+,0,0,0,.,0
7,8,최철호 여배우,+,0,0,0,.,0
8,9,손흥민 골,+,0,0,0,.,0
9,10,여신강림,+,0,0,0,.,0


In [26]:
# Flatten the nested ranking records into one row per keyword.
# pd.json_normalize replaces the deprecated pandas.io.json.json_normalize import.
jj = pd.json_normalize(json.loads(r.text), record_path=['data', 'data'])
# Walk the two columns in lockstep instead of building a row Series with .iloc
# for every index; the printed "rank keyword" lines are the same.
for ranks, title in zip(jj['rank'], jj['keyword']):
    print(ranks, title)

1 최철호
2 아이린 인성
3 상강
4 문가영
5 아이린
6 어쩌다 결혼
7 최철호 폭행
8 최철호 여배우
9 손흥민 골
10 여신강림
11 토트넘 lask
12 전진 아버지
13 이경실
14 이경실 이혼
15 포스코
16 아일랜드
17 휴림로봇
18 양성우
19 서예지
20 비니시우스


  """Entry point for launching an IPython kernel.


# HTML Parsing

In [None]:
# %pip install beautifulsoup4  # uncomment and run once if the bs4 module is missing

In [27]:
from bs4 import BeautifulSoup

In [28]:
# Small hand-written HTML document used to demonstrate tag navigation:
# one <h1>, two top-level <p> tags and four <td> cells (two of them wrapping a <p>).
html_text = """
<html>
<body>
    <h1> reading web page with python </h1>
        <p> page analysis </p>
        <p> page alignment </p>
        <td> some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>
"""

In [29]:
# Parse the markup using the 'html.parser' backend and display the parsed document.
BeautifulSoup(html_text, 'html.parser')


<html>
<body>
<h1> reading web page with python </h1>
<p> page analysis </p>
<p> page alignment </p>
<td> some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>

In [54]:
# Keep the parsed document in a variable so it can be navigated.
soup = BeautifulSoup(html_text, 'html.parser')
type(soup) # a BeautifulSoup object

bs4.BeautifulSoup

In [31]:
# List the attribute and method names available on the parsed document object.
dir(soup)

['ASCII_SPACES',
 'DEFAULT_BUILDER_FEATURES',
 'ROOT_TAG_NAME',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_check_markup_is_url',
 '_decode_markup',
 '_feed',
 '_find_all',
 '_find_one',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_linkage_fixer',
 '_most_recent_element',
 '_namespaces',
 '_popToTag',
 '_should_pretty_print',
 'append',
 'attrs',
 'builder',
 'can_be_empty_element',
 'cdata_list_attributes',
 'childGenerator',
 'children',
 'clear',
 'conta

In [32]:
# The <html> element of the parsed document
soup.html

<html>
<body>
<h1> reading web page with python </h1>
<p> page analysis </p>
<p> page alignment </p>
<td> some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>

In [33]:
# The <h1> element of the parsed document
soup.h1

<h1> reading web page with python </h1>

In [34]:
# Attribute access by tag name yields a single tag: the first <p>, although the
# document contains several.
soup.p

<p> page analysis </p>

In [43]:
print(soup.p)
print(soup.p.text) # text content of the tag
print(soup.p.next_sibling) # the newline/indentation between the tags is itself a sibling (a text node, not a <p>)
print(soup.p.next_sibling.next_sibling) # so the second <p> is two siblings away

<p> page analysis </p>
 page analysis 


<p> page alignment </p>


In [49]:
# .text vs .string
print(soup.td)
print(soup.td.text) # all text between the opening tag and the closing tag
print(soup.td.string) # equals .text here because the tag's only child is a single string
print(soup.td.next_sibling.next_sibling.text)

# An empty tag: .string is None while .text is an empty string
print(soup.td.next_sibling.string)
print(soup.td.next_sibling.text)

<td> some text</td>
 some text
 some text
more text
None



In [82]:
# Extended sample document: adds id attributes, a list of links, and a
# <div id="xxx"> that wraps a heading and a <ul class="item"> list.
html_text2= """
<html>
<body>
    <h1 id="title"> reading web page with python </h1>
        <p id="body"> page analysis </p>
        <p> page alignment </p>
        <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
        <ul>
            <li><a href = "http://www.naver.com"> naver</a></li>
            <li><a href = "http://www.daum.net"> daum</a></li>
        </ul>
    <div id="xxx">
        <h1> Wiki-books store </h1>
        <ul class="item">
            <li> introduction to game design </li>
            <li> introduction to python </li>
            <li> introduction to web design </li>
        </ul>
    </div>
</body>
</html>
"""

In [51]:
# Parse the extended sample document and display it.
# NOTE(review): the saved output of this cell shows a malformed `<div <h1="" id="xxx">`,
# which does not match html_text2 as currently defined; it looks like it was produced
# before the string was corrected. Re-run the notebook top to bottom to refresh it.
BeautifulSoup(html_text2, 'html.parser')


<html>
<body>
<h1 id="title"> reading web page with python </h1>
<p id="body"> page analysis </p>
<p> page alignment </p>
<td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
<ul>
<li><a href="http://www.naver.com"> naver</a></li>
<li><a href="http://www.daum.net"> daum</a></li>
</ul>
<div <h1="" id="xxx"> Wiki-books store 
<ul class="item">
<li> introduction to game design </li>
<li> introduction to python </li>
<li> introduction to web design </li>
</ul>
</div>
</body>
</html>

In [84]:
# Parsed form of the extended sample document.
soup2 = BeautifulSoup(html_text2, 'html.parser')

## access by tag, id, and attribute

In [57]:
# find() returns a single matching element; here the filter is the id attribute
soup2.find(id = 'title')

<h1 id="title"> reading web page with python </h1>

In [58]:
# Look up the element whose id attribute is "body".
soup2.find(id = 'body')

<p id="body"> page analysis </p>

In [59]:
# .string gives the text inside the matched tag, surrounding spaces included.
soup2.find(id = 'body').string

' page analysis '

In [60]:
# find_all() returns every match as a list; pass the tag name as the argument
soup2.find_all('td')

[<td>some text</td>,
 <td></td>,
 <td><p>more text</p></td>,
 <td>even <p>more text</p></td>]

In [61]:
# Every <p> tag, including the ones nested inside <td> cells.
soup2.find_all('p')

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [62]:
# Every <li> tag, from both the link list and the class="item" list.
soup2.find_all('li')

[<li><a href="http://www.naver.com"> naver</a></li>,
 <li><a href="http://www.daum.net"> daum</a></li>,
 <li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [67]:
# Every anchor (<a>) tag in the document.
soup2.find_all('a')

[<a href="http://www.naver.com"> naver</a>,
 <a href="http://www.daum.net"> daum</a>]

In [68]:
# .attrs is a dict of the attributes written inside the opening tag
soup2.find_all('a')[0].attrs

{'href': 'http://www.naver.com'}

In [72]:
# Collect the links: print each anchor's text together with its target URL
for aa in soup2.find_all('a'):
    href, text = aa['href'], aa.string  # tag['href'] reads from the same attrs dict
    print(f"{text} --> {href}")

 naver --> http://www.naver.com
 daum --> http://www.daum.net


In [64]:
# Each loop item is a Tag (despite the variable's name). .string also reaches
# through a sole child tag, so the <a> text is printed for the link items.
for text in soup2.find_all('li'):
    print(text.string)

 naver
 daum
 introduction to game design 
 introduction to python 
 introduction to web design 


## access by regular expression
- 찾으려는 문자열 패턴을 찾아줌

In [74]:
import re

In [75]:
soup2.find_all(re.compile("^p")) # tags whose name starts with the character 'p'

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [77]:
# Tags whose name starts with "div".
# NOTE(review): the saved output of this cell shows a malformed `<div <h1="" id="xxx">`,
# which does not match html_text2 as currently defined; it appears to be stale.
# Re-run the notebook top to bottom to refresh it.
soup2.find_all(re.compile("^div"))

[<div <h1="" id="xxx"> Wiki-books store 
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [80]:
soup2.find_all(href = re.compile("^http://")) # every tag whose href attribute starts with "http://"

[<a href="http://www.naver.com"> naver</a>,
 <a href="http://www.daum.net"> daum</a>]

## access by css selector

In [85]:
soup2.select('h1') # by tag name: returns every <h1> element

[<h1 id="title"> reading web page with python </h1>,
 <h1> Wiki-books store </h1>]

In [86]:
soup2.select("#xxx") # by id: prefix the id with '#'

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [88]:
soup2.select('.item') # by class: prefix the class name with '.'

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [89]:
soup2.select('div .item') # descendant selector: elements with class "item" anywhere inside a <div> tag

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [90]:
soup2.select_one("#xxx > ul > li") # parent > child hierarchy; select_one returns a single element

<li> introduction to game design </li>

In [91]:
# The same selector with select() returns all matches as a list.
soup2.select('#xxx > ul > li')

[<li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [92]:
soup2.select("div li") # every <li> that is a descendant of a <div>

[<li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [94]:
# A single tag that carries two CSS classes.
text = '<p class="body strikeout"></p>'

In [95]:
# Parse the two-class snippet.
css_soup = BeautifulSoup(text, 'html.parser')

In [96]:
# class_ matches when the given name is one of the tag's classes.
css_soup.find_all("p", class_='strikeout')

[<p class="body strikeout"></p>]

In [97]:
# The other class of the same tag matches as well.
css_soup.find_all("p", class_='body')

[<p class="body strikeout"></p>]

**Refer to the BeautifulSoup documentation**

## practical example
- extract job information from www.monster.com

In [98]:
# Search-results page for "Data Scientist" jobs in CA.
url = 'https://www.monster.com/jobs/search/?q=Data-Scientist&where=CA&intcid=skr_navigation_nhpso_searchMain'
# Bound the wait with a timeout and fail fast on an HTTP error status, so the
# parsing cells never run against an error page.
page = requests.get(url, timeout=10)
page.raise_for_status()

In [103]:
page.content # raw response body as bytes (not decoded text)

b'<!DOCTYPE html>\r\n<html xmlns="https://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\r\n<head>\r\n    \r\n<meta http-equiv="X-UA-Compatible" content="IE=edge" />\r\n<meta http-equiv="Expires" content="0" />\r\n<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=2.0, minimum-scale=1" />\r\n<meta name="j_jp" content="1" />\r\n<meta charset="UTF-8">\r\n<title>Data Scientist Jobs in CA. CA Data Scientist Jobs. | Monster.com</title>\r\n\r\n        <style type="text/css">\r\n                @font-face{font-family:\'Roboto\';font-style:normal;font-weight:100;font-display:optional;src:local(\'Roboto Thin\'),local(\'Roboto-Thin\'),local(\'sans-serif-thin\'),url(https://fonts.gstatic.com/s/roboto/v19/KFOkCnqEu92Fr1MmgVxFIzIXKMnyrYk.woff2) format(\'woff2\');unicode-range:U+0460-052F,U+1C80-1C88,U+20B4,U+2DE0-2DFF,U+A640-A69F,U+FE2E-FE2F}@font-face{font-family:\'Roboto\';font-style:normal;font-weight:100;font-display:optional;src:local(\'Roboto Thin\'),local(\'

In [104]:
# Parse the downloaded page.
# NOTE: this rebinds soup, which previously held the small sample document.
soup = BeautifulSoup(page.content, 'html.parser')

In [120]:
# Element with id="SearchResults".
# NOTE(review): find() returns None when no element has this id (e.g. if the
# site's markup changes), in which case the cells that use results would fail.
results = soup.find(id='SearchResults')

In [121]:
# A matched element is a bs4 Tag.
type(results)

bs4.element.Tag

In [122]:
# Inspect the raw markup of the results container.
results

<div class="mux-card mux-job-card" id="SearchResults">
<section class="card-content" data-jobid="221272677" data-postingid="f010d383-7ef4-4ff3-9624-2dea36c2a7c3" onclick="MKImpressionTrackingMouseDownHijack(this, event)">
<div class="flex-row">
<div class="mux-company-logo thumbnail is-loaded">
<img alt="CyberCoders" src="https://media.newjobs.com/clu/xcyb/xcyberc3x/branding/6344/CyberCoders-logo.jpg"/>
</div>
<div class="summary">
<header class="card-header">
<h2 class="title"><a data-bypass="true" data-m_impr_a_placement_id="JSR2CW" data-m_impr_j_cid="660" data-m_impr_j_coc="xcyberc3x" data-m_impr_j_jawsid="453866293" data-m_impr_j_jobid="221272677" data-m_impr_j_jpm="1" data-m_impr_j_jpt="1" data-m_impr_j_lat="37.2928" data-m_impr_j_lid="356" data-m_impr_j_long="-121.7979" data-m_impr_j_occid="11787" data-m_impr_j_p="1" data-m_impr_j_postingid="f010d383-7ef4-4ff3-9624-2dea36c2a7c3" data-m_impr_j_pvc="monster" data-m_impr_s_t="t" data-m_impr_uuid="d3ecae2a-7514-4cbb-b167-ab669cd89b6e

In [123]:
# All <section class="card-content"> elements inside the results container.
results.find_all('section', class_='card-content')

[<section class="card-content" data-jobid="221272677" data-postingid="f010d383-7ef4-4ff3-9624-2dea36c2a7c3" onclick="MKImpressionTrackingMouseDownHijack(this, event)">
 <div class="flex-row">
 <div class="mux-company-logo thumbnail is-loaded">
 <img alt="CyberCoders" src="https://media.newjobs.com/clu/xcyb/xcyberc3x/branding/6344/CyberCoders-logo.jpg"/>
 </div>
 <div class="summary">
 <header class="card-header">
 <h2 class="title"><a data-bypass="true" data-m_impr_a_placement_id="JSR2CW" data-m_impr_j_cid="660" data-m_impr_j_coc="xcyberc3x" data-m_impr_j_jawsid="453866293" data-m_impr_j_jobid="221272677" data-m_impr_j_jpm="1" data-m_impr_j_jpt="1" data-m_impr_j_lat="37.2928" data-m_impr_j_lid="356" data-m_impr_j_long="-121.7979" data-m_impr_j_occid="11787" data-m_impr_j_p="1" data-m_impr_j_postingid="f010d383-7ef4-4ff3-9624-2dea36c2a7c3" data-m_impr_j_pvc="monster" data-m_impr_s_t="t" data-m_impr_uuid="d3ecae2a-7514-4cbb-b167-ab669cd89b6e" href="https://job-openings.monster.com/data-s

In [125]:
# Number of card-content sections found.
len(results.find_all('section', class_='card-content'))

29

In [126]:
# Keep the list of card-content sections for the extraction steps.
job_el = results.find_all('section', class_='card-content')

In [127]:
# Look at the first card to see which tags and classes hold the fields.
job_el[0]

<section class="card-content" data-jobid="221272677" data-postingid="f010d383-7ef4-4ff3-9624-2dea36c2a7c3" onclick="MKImpressionTrackingMouseDownHijack(this, event)">
<div class="flex-row">
<div class="mux-company-logo thumbnail is-loaded">
<img alt="CyberCoders" src="https://media.newjobs.com/clu/xcyb/xcyberc3x/branding/6344/CyberCoders-logo.jpg"/>
</div>
<div class="summary">
<header class="card-header">
<h2 class="title"><a data-bypass="true" data-m_impr_a_placement_id="JSR2CW" data-m_impr_j_cid="660" data-m_impr_j_coc="xcyberc3x" data-m_impr_j_jawsid="453866293" data-m_impr_j_jobid="221272677" data-m_impr_j_jpm="1" data-m_impr_j_jpt="1" data-m_impr_j_lat="37.2928" data-m_impr_j_lid="356" data-m_impr_j_long="-121.7979" data-m_impr_j_occid="11787" data-m_impr_j_p="1" data-m_impr_j_postingid="f010d383-7ef4-4ff3-9624-2dea36c2a7c3" data-m_impr_j_pvc="monster" data-m_impr_s_t="t" data-m_impr_uuid="d3ecae2a-7514-4cbb-b167-ab669cd89b6e" href="https://job-openings.monster.com/data-scientist

In [133]:
print(job_el[0].find('h2', class_='title').text.strip()) # job title text
print(job_el[0].find('span', class_='name').text.strip()) # company name, located via a <span> tag
print(job_el[0].find('div', class_='company').text.strip()) # the same company name, located via a <div> tag
print(job_el[0].find('div', class_='location').text.strip()) # job location

Data Scientist - AI/Machine Learning - No Sponsorship
CyberCoders
CyberCoders
San Jose, CA


In [135]:
# Print title, company and location for every job card, one blank line between cards.
for i in job_el:
    title, company, location = (
        i.find('h2', class_='title'),
        i.find('div', class_='company'),
        i.find('div', class_='location'),
    )

    # Skip cards where any of the three elements is missing.
    if any(field is None for field in (title, company, location)):
        continue

    print(title.text.strip(), company.text.strip(), location.text.strip(), sep='\n')
    print()

Data Scientist - AI/Machine Learning - No Sponsorship
CyberCoders
San Jose, CA

Water Utility Data Scientist / Data Analyst
Golden State Water Company
San Dimas, CA

Sr. to Lead Data Scientist (IoT startup, up to $200k)
Skyrocket Ventures
Sunnyvale, CA

Data Scientist
Eliassen Group
San Jose, CA

Data Scientist
IMS
Newport Beach, CA

Data Scientist, Auction and Delivery
Facebook
Menlo Park, CA

Lead Data Scientist
Pacific Life
Aliso Viejo, CA

Sr Biotech Data Scientist
Bayer
Berkeley, CA

Senior / Principal Data Scientist
Cylance
Irvine, CA

Senior Data Scientist
Loom
San Francisco, CA

Manager of Laboratory Informatics
CyberCoders
Austin, TX

Data Scientist
C3 IoT
Redwood City, CA

Staff Data Scientist
Intuit
San Diego, CA

Sr. Data Scientist, Sales Analytics
Zoom Video Communications
San Jose, CA

Data Scientist
Internet Brands
Los Angeles, CA

Senior Data Scientist/Statistician
Align Technology, Inc.
San Jose, CA

Sr. Data Scientist
App Annie
San Francisco, CA

Data Scientist, Marke