# 6.1 텍스트 파일 이용하는 방법

## 6.1.4 JSON 데이터

In [1]:
import json
import pandas as pd

In [2]:
# Sample JSON document held as a plain Python string (note the JSON `null`).
obj = """
{"name":"Wes",
 "places_lived":["United States","Spain","Germany"],
 "pet":null,
 "siblings":[{"name":"Scott", "age":25, "pet":"Zuko"},
             {"name":"Katie", "age":33, "pet":"Cisco"}]
}
"""

In [3]:
# Decode the JSON text into native Python objects (dict, list, str, int, None).
result = json.loads(obj)

In [4]:
result

{'name': 'Wes',
 'pet': None,
 'places_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},
  {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}

In [5]:
# Serialise the Python object back into a JSON-formatted string.
asjson = json.dumps(result)

In [6]:
# Build a DataFrame from the list of sibling dicts; columns= keeps only
# "name" and "age", in that order, so the "pet" key is left out.
siblings = pd.DataFrame(result["siblings"], columns=["name","age"])

In [7]:
siblings

Unnamed: 0,name,age
0,Scott,25
1,Katie,33


## 6.1.5 XML과 HTML: 웹 내용 긁어오기

In [8]:
import requests
from lxml.html import parse
from io import StringIO

In [9]:
# Download the NBA record page. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() stops on an
# HTTP error instead of letting the following cells parse an error page.
response = requests.get(
    'https://sports.news.naver.com/basketball/record/index.nhn?category=nba',
    timeout=10,
)
response.raise_for_status()
text = response.text

In [10]:
# parse() is handed a file-like object here, so the downloaded HTML string
# is wrapped in StringIO first.
parsed = parse(StringIO(text))

In [11]:
# Root element of the parsed tree (its repr shows an <html> element).
doc = parsed.getroot()

In [12]:
doc

<Element html at 0x1139c43b8>

In [13]:
# Collect every <a> element under the root; './/a' searches all descendants.
links = doc.findall('.//a')

In [14]:
links[15:20]

[<Element a at 0x1139c4e08>,
 <Element a at 0x1139c4e58>,
 <Element a at 0x1139c4ea8>,
 <Element a at 0x1139c4ef8>,
 <Element a at 0x1139c4f48>]

In [15]:
# NOTE(review): 27 is a hard-coded position in the anchor list and depends on
# the page's markup at fetch time — confirm it still points at the intended link.
lnk = links[27]

In [16]:
lnk

<Element a at 0x116420228>

In [17]:
lnk.get('href')

'http://fd.sports.news.naver.com'

In [18]:
links[2].text_content()

'SPORTS'

In [19]:
# Gather the href attribute of every <a> element in the document.
urls = [
    anchor.get('href')
    for anchor in doc.findall('.//a')
]

In [20]:
urls[-10:]

['#',
 'https://post.naver.com/viewer/postView.nhn?volumeNo=15857371&memberNo=1156373',
 'http://m.post.naver.com/viewer/postView.nhn?volumeNo=14837084&memberNo=1156373',
 'http://www.naver.com/rules/service.html',
 'http://news.naver.com/main/principle.nhn',
 'http://www.naver.com/rules/privacy.html',
 'http://www.naver.com/rules/disclaimer.html',
 '#',
 'http://www.navercorp.com/',
 'http://www.navercorp.com/']

In [21]:
# Collect every <table> element in the document (three in the recorded run).
tables = doc.findall('.//table')

In [22]:
tables

[<Element table at 0x116422e08>,
 <Element table at 0x116422d68>,
 <Element table at 0x116422f98>]

In [23]:
# NOTE(review): the names `calls`/`puts` do not describe the content — in the
# recorded outputs tables[1] parses to team standings and tables[2] to
# per-player statistics. Later cells reference these names.
calls = tables[1]
puts = tables[2]

In [24]:
# All <tr> rows of the selected table; rows[0] is the header row whose <th>
# cells hold the column titles.
rows = calls.findall('.//tr')

In [25]:
rows

[<Element tr at 0x116426368>,
 <Element tr at 0x116426408>,
 <Element tr at 0x116426458>,
 <Element tr at 0x1164264a8>,
 <Element tr at 0x1164264f8>,
 <Element tr at 0x116426548>,
 <Element tr at 0x116426598>,
 <Element tr at 0x1164265e8>,
 <Element tr at 0x116426638>,
 <Element tr at 0x116426688>,
 <Element tr at 0x1164266d8>,
 <Element tr at 0x116426728>,
 <Element tr at 0x116426778>,
 <Element tr at 0x1164267c8>,
 <Element tr at 0x116426818>,
 <Element tr at 0x116426868>]

In [26]:
def _unpack(row, kind='td'):
    elts = row.findall('.//%s'%kind)
    return [val.text_content().strip().split('\n')[0] for val in elts]

In [27]:
_unpack(rows[0], kind='th')

['순위',
 '팀',
 '디비전',
 '경기수',
 '승',
 '패',
 '승률',
 '승차',
 '홈승',
 '홈패',
 '원정승',
 '원정패',
 '디비전승',
 '디비전패',
 '연속']

In [28]:
_unpack(rows[2], kind='td')

['보스턴',
 'ATL',
 '82',
 '55',
 '27',
 '0.671',
 '4.0',
 '27',
 '14',
 '28',
 '13',
 '12',
 '4',
 '1승']

In [29]:
from pandas.io.parsers import TextParser

In [30]:
def parse_options_data(table):
    """Convert an HTML ``<table>`` element into a DataFrame.

    The first ``<tr>`` supplies the column names (text of its ``<th>``
    cells); every following ``<tr>`` supplies one record (text of its
    ``<td>`` cells).

    Parameters
    ----------
    table : element exposing ``findall`` (an lxml element in this notebook)

    Returns
    -------
    The DataFrame produced by ``TextParser(data, names=header).get_chunk()``.

    Raises
    ------
    IndexError
        If the table contains no ``<tr>`` rows.
    """
    rows = table.findall('.//tr')
    header = _unpack(rows[0], kind='th')
    # Start at rows[1]: rows[0] is the header row and must not be read again
    # as a data record.
    data = [_unpack(r) for r in rows[1:]]
    # NOTE(review): only <td> cells are collected per data row. If a body row
    # carries a leading <th> cell, each record has fewer values than `header`
    # has names and the columns shift — confirm against the page markup.
    return TextParser(data, names=header).get_chunk()

In [31]:
# Parse tables[1] (bound to `calls`) into a DataFrame.
column_data = parse_options_data(calls)

In [32]:
# Parse tables[2] (bound to `puts`) the same way.
# NOTE(review): the recorded result has shape (20, 1), which suggests the
# header and data cell counts do not line up for this table — verify.
row_data = parse_options_data(puts)

In [33]:
column_data

Unnamed: 0,순위,팀,디비전,경기수,승,패,승률,승차,홈승,홈패,원정승,원정패,디비전승,디비전패,연속
0,토론토,ATL,82,59,23,0.72,0.0,34,7,25,16,12,4,1패,
1,보스턴,ATL,82,55,27,0.671,4.0,27,14,28,13,12,4,1승,
2,필라델피아,ATL,82,52,30,0.634,7.0,30,11,22,19,9,7,16승,
3,클리블랜드,CEN,82,50,32,0.61,9.0,29,12,21,20,11,5,1패,
4,인디애나,CEN,82,48,34,0.585,11.0,27,14,21,20,10,6,1패,
5,마이애미,SEA,82,44,38,0.537,15.0,26,15,18,23,11,5,1승,
6,밀워키,CEN,82,44,38,0.537,15.0,25,16,19,22,6,10,1패,
7,워싱턴,SEA,82,43,39,0.524,16.0,23,18,20,21,8,8,1패,
8,디트로이트,CEN,82,39,43,0.476,20.0,25,16,14,27,9,7,1승,
9,샬럿,SEA,82,36,46,0.439,23.0,21,20,15,26,11,5,1승,


In [34]:
row_data

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,1
제임스 하든 (휴스턴),72,30.4,8.8,5.4,1.8,0.7,9.0,3.7,8.7,44.9,36.7,85.8
앤서니 데이비스 (뉴올리언즈),75,28.1,2.3,11.1,1.5,2.6,10.4,0.7,6.6,53.4,34.0,82.8
르브론 제임스 (클리블랜드),82,27.5,9.1,8.6,1.4,0.9,10.4,1.8,4.7,54.2,36.7,73.1
데미안 릴라드 (포틀랜드),73,26.9,6.6,4.5,1.0,0.4,8.5,3.1,6.8,43.9,36.1,91.6
야니스 아데토쿤보 (밀워키),75,26.9,4.8,10.0,1.4,1.4,9.9,0.6,6.5,52.9,30.7,76.0
케빈 듀란트 (골든스테이트),68,26.4,5.4,6.8,0.7,1.8,9.3,2.5,5.3,51.6,41.9,88.9
러셀 웨스트브룩 (오클라호마),80,25.4,10.3,10.1,1.8,0.2,9.5,1.2,5.2,44.9,29.8,73.7
카이리 어빙 (보스턴),60,24.4,5.1,3.8,1.1,0.3,8.9,2.8,3.9,49.1,40.8,88.9
라마커스 알드리...(샌안토니오),75,23.1,2.0,8.5,0.6,1.2,9.2,0.4,4.4,51.0,29.3,83.7
빅터 올라디포 (인디애나),75,23.1,4.3,5.2,2.4,0.8,8.5,2.2,3.9,47.7,37.1,79.9


In [35]:
row_data.shape

(20, 1)