## ネットワーク上からリソースをダウンロードする

In [1]:
import urllib
# urllib.parse.quote() is used in a later cell; import the submodule
# explicitly instead of relying on another import to load it as a side effect.
import urllib.parse
import urllib.request

* https://docs.python.org/3/library/urllib.request.html
* https://docs.python.org/ja/3/library/urllib.request.html

In [2]:
# Percent-encode the article title so it can be embedded in a URL.
title = 'Python (programming language)'
quoted_title = urllib.parse.quote(title, safe='/')  # '/' is quote()'s default safe set
quoted_title

'Python%20%28programming%20language%29'

In [3]:
# URL template for the Wikipedia query API; {format} and {titles} are
# substituted per request (the template is reused for other formats).
response_type = "json"
url_format = (
    "https://en.wikipedia.org/w/api.php"
    "?format={format}"
    "&action=query"
    "&prop=revisions"
    "&titles={titles}"
    "&rvprop=content"
)
url = url_format.format(format=response_type, titles=quoted_title)
url

'https://en.wikipedia.org/w/api.php?format=json&action=query&prop=revisions&titles=Python%20%28programming%20language%29&rvprop=content'

In [4]:
# Download the resource; the context manager closes the connection on exit.
with urllib.request.urlopen(url) as response:
    code = response.getcode()  # HTTP status code
    bs = response.read()       # raw response body as bytes

print(code)
print(type(bs))
# Decode as UTF-8 rather than ASCII: UTF-8 is a superset of ASCII, so the
# result is unchanged for pure-ASCII bodies, but decoding no longer raises
# UnicodeDecodeError if the body contains non-ASCII bytes.
s = str(bs, encoding='utf-8')
print(s[:500])

200
<class 'bytes'>
{"batchcomplete":"","query":{"pages":{"23862":{"pageid":23862,"ns":0,"title":"Python (programming language)","revisions":[{"contentformat":"text/x-wiki","contentmodel":"wikitext","*":"{{About|the programming language|the genus and other uses|Python (disambiguation)}}\n{{Use dmy dates|date=August 2015}}\n{{Infobox programming language\n|name                   = Python\n|logo                   = Python logo and wordmark.svg\n|logo size              = 260px\n|paradigm               = [[multi-paradi


## JSON ファイルを読む

In [5]:
import json

* https://docs.python.org/3/library/json.html
* https://docs.python.org/ja/3/library/json.html

In [6]:
# Create a decoder once (it is reused by a later cell) and parse the
# JSON text that was downloaded above.
decoder = json.JSONDecoder()
obj = decoder.decode(s)

# Show which Python type the top-level JSON value was mapped to.
print(type(obj))

<class 'dict'>


* JSONDecoder オブジェクトの decode メソッドで JSON 文字列を Python のオブジェクトに変換できる（JSON の object は `dict`、array は `list` になる）

In [7]:
# Walk down the response: top level -> 'query' -> 'pages',
# printing the type and keys of each nested mapping.
print(obj.keys())
for node in (obj['query'], obj['query']['pages']):
    print(type(node))
    print(node.keys())

dict_keys(['query', 'batchcomplete'])
<class 'dict'>
dict_keys(['pages'])
<class 'dict'>
dict_keys(['23862'])


In [8]:
# Only one title was requested, and 'pages' holds a single entry keyed by
# its page ID. Take that entry directly instead of hard-coding the ID, so
# this cell keeps working if `title` is changed to another article.
page = next(iter(obj['query']['pages'].values()))
print(page.keys())
print(page['title'])
print(type(page['revisions']))

dict_keys(['ns', 'title', 'pageid', 'revisions'])
Python (programming language)
<class 'list'>


In [9]:
# Look at the first revision entry: its keys, its format metadata,
# and the beginning of the wikitext stored under the '*' key.
rev = page['revisions'][0]
print(rev.keys())
for field in ('contentformat', 'contentmodel'):
    print(rev[field])
print(rev['*'][:300])

dict_keys(['contentmodel', '*', 'contentformat'])
text/x-wiki
wikitext
{{About|the programming language|the genus and other uses|Python (disambiguation)}}
{{Use dmy dates|date=August 2015}}
{{Infobox programming language
|name                   = Python
|logo                   = Python logo and wordmark.svg
|logo size              = 260px
|paradigm               = [[mu


### おまけ:ipynb

* *.ipynb ファイルの実体は JSON である（非 ASCII 文字を含むので注意）

In [10]:
# A notebook file is a JSON document, so the same decoder can parse it.
# The file is opened as UTF-8 explicitly.
with open('../20-basic/01-Julian_Day.ipynb', encoding='utf-8') as notebook_file:
    notebook_text = notebook_file.read()

obj = decoder.decode(notebook_text)
print(obj['cells'][1])

{'cell_type': 'markdown', 'metadata': {}, 'source': ['# Exercise\n', '\n', '## プログラム：Julian Day\n', '\n', '* https://en.wikipedia.org/wiki/Julian_day#Converting_Julian_or_Gregorian_calendar_date_to_Julian_day_number\n', '* 以下は <var>year</var> 年 <var>month</var> 月 <var>day</var> 日の Julian Day Number (JDN) を求めるプログラムである\n', '\n', '$$\n', 'a = \\lfloor \\frac{14 - \\mathrm{month}}{12} \\rfloor \\\\\n', 'y = \\mathrm{year} + 4800 - a \\\\\n', 'm = \\mathrm{month} + 12a - 3 \\\\\n', 'JDN = \\mathrm{day} + \\lfloor \\frac{153m + 2}{5} \\rfloor + 365y\n', '+ \\lfloor \\frac{y}{4} \\rfloor - \\lfloor \\frac{y}{100} \\rfloor + \\lfloor \\frac{y}{400} \\rfloor\n', '- 32045\n', '$$\n', '\n', '### 実装例']}


## XML ファイルを読む

In [17]:
# Import the ElementTree submodule itself: importing only the `xml.etree`
# package does not guarantee that `xml.etree.ElementTree` is loaded, and a
# later cell calls `xml.etree.ElementTree.fromstring`.
import xml.etree.ElementTree

* https://docs.python.org/3/library/xml.etree.elementtree.html
* https://docs.python.org/ja/3/library/xml.etree.elementtree.html

* Wikipedia から XML 形式でページを取得する

In [11]:
# Reuse the URL template, this time asking for an XML response.
response_type = "xml"
url = url_format.format(
    format=response_type,
    titles=quoted_title,
)
url

'https://en.wikipedia.org/w/api.php?format=xml&action=query&prop=revisions&titles=Python%20%28programming%20language%29&rvprop=content'

In [12]:
# Download the XML response; the connection is closed when the block exits.
with urllib.request.urlopen(url) as response:
    bs = response.read()       # raw response body as bytes
    code = response.getcode()  # HTTP status code

print(code)
print(type(bs))

# Turn the bytes into text and preview the first 500 characters.
s = bs.decode('utf-8')
print(s[:500])

200
<class 'bytes'>
<?xml version="1.0"?><api batchcomplete=""><query><pages><page _idx="23862" pageid="23862" ns="0" title="Python (programming language)"><revisions><rev contentformat="text/x-wiki" contentmodel="wikitext" xml:space="preserve">{{About|the programming language|the genus and other uses|Python (disambiguation)}}
{{Use dmy dates|date=August 2015}}
{{Infobox programming language
|name                   = Python
|logo                   = Python logo and wordmark.svg
|logo size              = 260px
|para


In [18]:
# Parse the XML text into an element tree; `root` is the top-level element.
root = xml.etree.ElementTree.fromstring(s)

# Bare expression so the notebook displays the element's repr.
root

<Element 'api' at 0x7f2f5c18ae08>

* `xml.etree.ElementTree.fromstring` で XML 文字列をパースできる

In [15]:
# Collect every <rev> element found along this path below the root element.
revs = root.findall(
    './query/pages/page/revisions/rev'
)
print(revs)

[<Element 'rev' at 0x7f2f5c182e08>]


* `findall` に XPath 式（ElementTree がサポートするのは XPath のサブセット）を渡して要素を取得できる

In [16]:
# Take the first matched <rev> element, then show its attributes
# (as name/value pairs) and the first 300 characters of its text.
rev = revs[0]

print(rev.items())
print(rev.text[:300])

[('contentmodel', 'wikitext'), ('{http://www.w3.org/XML/1998/namespace}space', 'preserve'), ('contentformat', 'text/x-wiki')]
{{About|the programming language|the genus and other uses|Python (disambiguation)}}
{{Use dmy dates|date=August 2015}}
{{Infobox programming language
|name                   = Python
|logo                   = Python logo and wordmark.svg
|logo size              = 260px
|paradigm               = [[mu
