### 6. Data Encoding and Processing

主要问题是：

1. 文件格式化处理（包括简单的如csv，tsv，以及复杂的如 xml、html等）
2. 关系型数据库操作，以及其他

涉及 Struct结构， 略去； 涉及 pandas操作，略去。

In [24]:
import os
import io
samples_path = './samples/'

def open_samples(filename, *args, **kwargs):
    """Open a file that lives in the samples directory.

    ``filename`` is joined onto ``samples_path``; all remaining positional
    and keyword arguments are passed straight through to ``io.open``.
    """
    sample_file = os.path.join(samples_path, filename)
    return io.open(sample_file, *args, **kwargs)

open = open_samples

### 6.1 CSV 文件处理




In [25]:
import csv

# (a) Reading as tuples

print('Reading as tuples:')
# newline='' leaves line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed correctly.
with open('stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    headers = next(f_csv)   # consume the header line so only data rows remain
    for row in f_csv:
        # process row
        print('    ', row)


Reading as tuples:
     ['AA', '39.48', '6/11/2007', '9:36am', '-0.18', '181800']
     ['AIG', '71.38', '6/11/2007', '9:36am', '-0.15', '195500']
     ['AXP', '62.58', '6/11/2007', '9:36am', '-0.46', '935000']
     ['BA', '98.31', '6/11/2007', '9:36am', '+0.12', '104800']
     ['C', '53.08', '6/11/2007', '9:36am', '-0.25', '360900']
     ['CAT', '78.29', '6/11/2007', '9:36am', '-0.23', '225400']


In [26]:
# (b) Reading as namedtuples

print('Reading as namedtuples')
from collections import namedtuple
# newline='' leaves line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed correctly.
with open('stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    Row = namedtuple('Row', next(f_csv))   # the header row supplies the field names
    for r in f_csv:
        row = Row(*r)
        # Process row
        print('    ', row)

Reading as namedtuples
     Row(Symbol='AA', Price='39.48', Date='6/11/2007', Time='9:36am', Change='-0.18', Volume='181800')
     Row(Symbol='AIG', Price='71.38', Date='6/11/2007', Time='9:36am', Change='-0.15', Volume='195500')
     Row(Symbol='AXP', Price='62.58', Date='6/11/2007', Time='9:36am', Change='-0.46', Volume='935000')
     Row(Symbol='BA', Price='98.31', Date='6/11/2007', Time='9:36am', Change='+0.12', Volume='104800')
     Row(Symbol='C', Price='53.08', Date='6/11/2007', Time='9:36am', Change='-0.25', Volume='360900')
     Row(Symbol='CAT', Price='78.29', Date='6/11/2007', Time='9:36am', Change='-0.23', Volume='225400')


In [27]:
# (c) Reading as dictionaries

print('Reading as dicts')
# newline='' leaves line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed correctly.
with open('stocks.csv', newline='') as f:
    f_csv = csv.DictReader(f)   # no header skip needed: DictReader uses the first row as keys
    for row in f_csv:
        # process row
        print('    ', row)


Reading as dicts
     OrderedDict([('Symbol', 'AA'), ('Price', '39.48'), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', '-0.18'), ('Volume', '181800')])
     OrderedDict([('Symbol', 'AIG'), ('Price', '71.38'), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', '-0.15'), ('Volume', '195500')])
     OrderedDict([('Symbol', 'AXP'), ('Price', '62.58'), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', '-0.46'), ('Volume', '935000')])
     OrderedDict([('Symbol', 'BA'), ('Price', '98.31'), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', '+0.12'), ('Volume', '104800')])
     OrderedDict([('Symbol', 'C'), ('Price', '53.08'), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', '-0.25'), ('Volume', '360900')])
     OrderedDict([('Symbol', 'CAT'), ('Price', '78.29'), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', '-0.23'), ('Volume', '225400')])


In [28]:
# (d) Reading into tuples with type conversion

print('Reading into named tuples with type conversion')

col_types = [str, float, str, str, float, int]   # one converter per column, in column order
# newline='' leaves line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed correctly.
with open('stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    headers = next(f_csv)   # consume the header line so only data rows are converted
    for row in f_csv:
        # Apply conversions to the row items
        row = tuple(convert(value) for convert, value in zip(col_types, row))
        print(row)

Reading into named tuples with type conversion
('AA', 39.48, '6/11/2007', '9:36am', -0.18, 181800)
('AIG', 71.38, '6/11/2007', '9:36am', -0.15, 195500)
('AXP', 62.58, '6/11/2007', '9:36am', -0.46, 935000)
('BA', 98.31, '6/11/2007', '9:36am', 0.12, 104800)
('C', 53.08, '6/11/2007', '9:36am', -0.25, 360900)
('CAT', 78.29, '6/11/2007', '9:36am', -0.23, 225400)


In [29]:
# (e) Converting selected dict fields

print('Reading as dicts with type conversion')

field_types = [ ('Price', float),
                ('Change', float),
                ('Volume', int) ]   # converters applied to the named fields only

# newline='' leaves line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed correctly.
with open('stocks.csv', newline='') as f:
    for row in csv.DictReader(f):
        # Overwrite just the listed fields with their converted values
        row.update((key, conversion(row[key]))
                   for key, conversion in field_types)
        print(row)


Reading as dicts with type conversion
OrderedDict([('Symbol', 'AA'), ('Price', 39.48), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', -0.18), ('Volume', 181800)])
OrderedDict([('Symbol', 'AIG'), ('Price', 71.38), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', -0.15), ('Volume', 195500)])
OrderedDict([('Symbol', 'AXP'), ('Price', 62.58), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', -0.46), ('Volume', 935000)])
OrderedDict([('Symbol', 'BA'), ('Price', 98.31), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', 0.12), ('Volume', 104800)])
OrderedDict([('Symbol', 'C'), ('Price', 53.08), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', -0.25), ('Volume', 360900)])
OrderedDict([('Symbol', 'CAT'), ('Price', 78.29), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', -0.23), ('Volume', 225400)])


### 6.2 JSON文件处理

- `json.loads`：把 JSON 字符串解析为 Python 对象
- `json.load`：从文件对象中读取并解析 JSON
- `json.dump`：把 Python 对象序列化为 JSON 并写入文件对象
- `json.dumps`：把 Python 对象序列化为 JSON 字符串

In [30]:
# Some advanced JSON examples involving ordered dicts and classes
import json

# Some JSON encoded text
s = '{"name": "ACME", "shares": 50, "price": 490.1}'

# (a) Turning JSON into an OrderedDict

from collections import OrderedDict
data = json.loads(s, object_pairs_hook=OrderedDict)   # decode the JSON object into an OrderedDict
print(data)

OrderedDict([('name', 'ACME'), ('shares', 50), ('price', 490.1)])


### 6.3 XML文件处理

这也是一大类问题

In [38]:
from urllib.request import urlopen
from xml.etree.ElementTree import parse

# Download the RSS feed and parse it.
# The response is used as a context manager so the HTTP connection is closed
# as soon as the document has been parsed.
with urlopen('http://planet.python.org/rss20.xml') as u:
    doc = parse(u)

# Extract and output tags of interest
for item in doc.iterfind('channel/item'):
    title = item.findtext('title')
    date = item.findtext('pubDate')
    link = item.findtext('link')

    print(title)
    print(date)
    print(link)
    print()


Vasudev Ram: Checking if web sites are online with Python
Sat, 31 Mar 2018 01:46:42 +0000
http://jugad2.blogspot.com/2018/03/checking-if-web-sites-are-online-with.html

Continuum Analytics Blog: Improved Security &amp; Performance in Anaconda Distribution 5
Fri, 30 Mar 2018 16:50:17 +0000
https://www.anaconda.com/blog/developer-blog/improved-security-performance-in-anaconda-distribution-5/

Stack Abuse: Single Page Apps with Vue.js and Flask: JWT Authentication
Fri, 30 Mar 2018 14:00:00 +0000
http://stackabuse.com/single-page-apps-with-vue-js-and-flask-jwt-authentication/

Codementor: How to use Python to test your Ethereum Smart Contracts
Fri, 30 Mar 2018 09:17:07 +0000
https://www.codementor.io/mandarvaze/how-to-use-python-to-test-your-ethereum-smart-contracts-i2nlflu5w

Codementor: Parallelizing Builds In Travis CI
Fri, 30 Mar 2018 08:08:04 +0000
https://www.codementor.io/parthshandilya/parallelizing-builds-in-travis-ci-hqb4yktmt

Brandon Rhodes: A New Driver for the Original Twiddl

### 6.4 处理大的XML文件（只使用少量内存，增量处理）


用堆栈记录当前所在的标签路径，按解析事件逐个迭代处理（并不是递归）

In [39]:
# Example of incremental XML parsing
#
# The file 'potholes.xml' is a greatly condensed version of a larger
# file available for download at
#
# https://data.cityofchicago.org/api/views/7as2-ds3y/rows.xml?accessType=DOWNLOAD

from xml.etree.ElementTree import iterparse

def parse_and_remove(filename, path):
    """Incrementally yield the elements found at ``path`` below the root.

    The document is walked with ``iterparse`` using 'start' and 'end' events.
    Two parallel stacks track the tags and elements that are currently open.
    When an element's 'end' event arrives and the open tags equal ``path``,
    the element is yielded and then detached from its parent, so elements
    that have already been handled do not pile up in the tree.

    Args:
        filename: Source handed unchanged to ``iterparse``.
        path: '/'-separated tag names, relative to the root element (the
            root's own tag is not part of the path). A single-component
            path selects direct children of the root.

    Yields:
        Each matching element. It is removed from its parent when the
        consumer asks for the next item.
    """
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    # The first event is the root's 'start'. The root is not pushed on the
    # stacks, but keep a reference so that matches directly below it can be
    # detached as well.
    _, root = next(doc)

    tag_stack = []
    elem_stack = []
    for event, elem in doc:
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            if tag_stack == path_parts:
                yield elem
                # elem_stack[-1] is elem itself; its parent is the entry
                # beneath it, or the root for a one-component path.
                parent = elem_stack[-2] if len(elem_stack) > 1 else root
                parent.remove(elem)
            # The stacks are already empty when the root's own 'end' event
            # arrives, because the root was never pushed.
            if tag_stack:
                tag_stack.pop()
                elem_stack.pop()

# Tally potholes per zip code and list the zip codes, most frequent first

from collections import Counter
potholes_by_zip = Counter()

data = parse_and_remove(samples_path + 'potholes.xml', 'row/row')
for pothole in data:
    # Count one more pothole for this row's <zip> text
    potholes_by_zip.update([pothole.findtext('zip')])

for zipcode, num in potholes_by_zip.most_common():
    print(zipcode, num)


60617 13
60626 8
60651 7
60647 6
60623 6
60613 4
60636 4
60625 4
60628 4
60609 4
60622 3
60657 3
60619 3
60629 3
60641 3
60618 2
60644 2
60654 2
60649 2
60638 2
60656 2
60660 1
60643 1
60634 1
60632 1
60639 1
60630 1
60612 1
60616 1
60614 1
60652 1
60707 1
60631 1
60637 1


### 6.5 将字典转成xml文件

In [44]:
from xml.etree.ElementTree import Element, tostring

def dict_to_xml(tag, d):
    """Build an element named ``tag`` with one child element per dict item.

    Each key becomes a child tag, and ``str(value)`` becomes that child's
    text. The new parent element is returned.
    """
    def make_child(name, value):
        # One <name>str(value)</name> element
        node = Element(name)
        node.text = str(value)
        return node

    parent = Element(tag)
    parent.extend([make_child(name, value) for name, value in d.items()])
    return parent

# Convert a small record to a <stock> element, add an attribute and print the serialized bytes
s = {'name': 'GOOG', 'shares': 100, 'price': 490.1}
e = dict_to_xml('stock', s)
e.set('_id', '1234')   # set an XML attribute on the <stock> element
print(tostring(e))

b'<stock _id="1234"><name>GOOG</name><shares>100</shares><price>490.1</price></stock>'


### 6.6 Parsing, Modifying and Rewriting XML

修改XML

In [49]:
from xml.etree.ElementTree import parse, Element


# Show the document before it is modified
with open('pred.xml') as f:
    print(f.read())
doc = parse(samples_path + 'pred.xml')
root = doc.getroot()

# Remove a few elements: <sri> and <cr>
root.remove(root.find('sri'))
root.remove(root.find('cr'))

# Insert a new element after <nm>...</nm>.
# Element.getchildren() was removed in Python 3.9; list(root) yields the same
# direct children, so the index of <nm> is found the same way.
nm_index = list(root).index(root.find('nm'))

e = Element('spam')
e.text = 'This is a test'
root.insert(nm_index + 1, e)

# Write back to a file
doc.write(samples_path + 'newpred.xml', xml_declaration=True)

# Show the rewritten document
with open('newpred.xml') as f:
    print(f.read())

<?xml version="1.0"?>
<stop>
    <id>14791</id>
    <nm>Clark &amp; Balmoral</nm>
    <sri>
        <rt>22</rt>
        <d>North Bound</d>
        <dd>North Bound</dd>
    </sri>
    <cr>22</cr>
    <pre>
       <pt>5 MIN</pt>
       <fd>Howard</fd>
       <v>1378</v>
       <rn>22</rn>
   </pre>
   <pre>
       <pt>15 MIN</pt>
       <fd>Howard</fd>
       <v>1867</v>
       <rn>22</rn>
   </pre>
</stop>

<?xml version='1.0' encoding='us-ascii'?>
<stop>
    <id>14791</id>
    <nm>Clark &amp; Balmoral</nm>
    <spam>This is a test</spam><pre>
       <pt>5 MIN</pt>
       <fd>Howard</fd>
       <v>1378</v>
       <rn>22</rn>
   </pre>
   <pre>
       <pt>15 MIN</pt>
       <fd>Howard</fd>
       <v>1867</v>
       <rn>22</rn>
   </pre>
</stop>


### 6.7 关系型数据库操作（略）

### 6.8 十六进制编解码

即把每个字节表示成两个十六进制的 ASCII 字符；`binascii.b2a_hex` 输出小写字母，`base64.b16encode` 输出大写字母

In [51]:
s = b'hello'

import binascii

# Encode the bytes as hexadecimal digits
h = binascii.b2a_hex(s)
print(h)

# Decode the hex digits back into the original bytes
print(binascii.a2b_hex(h))

b'68656c6c6f'
b'hello'


In [55]:
import base64
# Base16-encode s; the hex digits come out in upper case
h = base64.b16encode(s)
print(h)
# Decode the Base16 text back into the original bytes
print(base64.b16decode(h))

b'68656C6C6F'
b'hello'


### 6.9 base64 编解码

Base64是一种基于64个可打印字符来表示二进制数据的表示方法，参考 <https://zh.wikipedia.org/wiki/Base64>

Base64 字母表：`ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/`；`=` 用作末尾的填充符（padding）

每 6 个 bit 对应一个可打印字符（占 8 bit），即每 3 个字节编码为 4 个字符，因此编码后的长度约为原来的 $4/3$ 倍

In [57]:
s = b'Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure.'

# Base64-encode the bytes
b = base64.b64encode(s)
print(b)

# Decode the Base64 text back into the original bytes
print(base64.b64decode(b))

b'TWFuIGlzIGRpc3Rpbmd1aXNoZWQsIG5vdCBvbmx5IGJ5IGhpcyByZWFzb24sIGJ1dCBieSB0aGlzIHNpbmd1bGFyIHBhc3Npb24gZnJvbSBvdGhlciBhbmltYWxzLCB3aGljaCBpcyBhIGx1c3Qgb2YgdGhlIG1pbmQsIHRoYXQgYnkgYSBwZXJzZXZlcmFuY2Ugb2YgZGVsaWdodCBpbiB0aGUgY29udGludWVkIGFuZCBpbmRlZmF0aWdhYmxlIGdlbmVyYXRpb24gb2Yga25vd2xlZGdlLCBleGNlZWRzIHRoZSBzaG9ydCB2ZWhlbWVuY2Ugb2YgYW55IGNhcm5hbCBwbGVhc3VyZS4='
b'Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, that by a perseverance of delight in the continued and indefatigable generation of knowledge, exceeds the short vehemence of any carnal pleasure.'
