#  数据加载、存储与文件格式

In [1]:
!cat ch06/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [2]:
from pandas import DataFrame, Series
import pandas as pd
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
df = pd.read_csv('ch06/ex1.csv')

In [4]:
df # 默认第一章作为columns

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
# 我们也可以使用read_table，不过需要指定分隔符
pd.read_table('ch06/ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [8]:
!cat ch06/ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [9]:
# 可以让pandas为其分配默认的列名字，也可以自己定义列名
pd.read_csv('ch06/ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [10]:
pd.read_csv('ch06/ex2.csv', names=['a','b','c','d','message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [11]:
# 假设希望将message列做成DataFrame的索引，可以明确表示将该列放到索引4的位置上。也可以通过index_col参数指定“message”
names = ['a','b','c','d','message']
pd.read_csv('ch06/ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [12]:
# 如果希望将多个列做成一个层次化索引，只需要传入由列编号或列名组成的列表即可

In [13]:
!cat ch06/csv_mindex.csv

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [14]:
parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1','key2'])

In [15]:
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [16]:
# 对于不是固定的分隔符来分割字段的，可以编写正则表达式来作为read_table的分隔符

In [17]:
list(open('ch06/ex3.txt'))

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

In [18]:
result = pd.read_table('ch06/ex3.txt', sep='\s+')

In [19]:
result

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [20]:
# skiprows可以跳过文件的第一行、第三行、第四行等

In [21]:
!cat ch06/ex4.csv

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [22]:
pd.read_csv('ch06/ex4.csv',skiprows=[0,2,3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


#### 对于缺失值，默认情况下，pandas会用一组经常出现的标记值进行识别，如NA，-1，#IND以及NULL等

In [23]:
!cat ch06/ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo


In [25]:
result = pd.read_csv('ch06/ex5.csv')

In [26]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [28]:
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [29]:
# na_values 可以接受一组用于表示缺失值的字符串

In [30]:
result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])

In [32]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [33]:
# 可以用一个字典为各列指定不同的NA标记值
sentinels = {'message':['foo','NA'], 'something':['two']}

In [34]:
pd.read_csv('ch06/ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


### 逐块读取文本文件

####  在处理很大文件的时候，或者找出大文件中的参数以便于后续处理的时候，可能只想读出文件的一部分或者逐块对文件进行迭代

In [35]:
result = pd.read_csv('ch06/ex6.csv')

In [36]:
result

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.817480,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.358480,-0.497572,-0.367016,0.507702,S
9,-1.740877,-1.160417,-1.637830,2.172201,G


####  如果只想读取几行，避免读取整个文件，通过nrows进行指定即可

In [37]:
pd.read_csv('ch06/ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


#### 要逐块读取文件，需要设置chunksize（行数）

In [38]:
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)

In [39]:
chunker

<pandas.io.parsers.TextFileReader at 0xa3dac18>

#### 返回的TextParser对象可以使你根据chunsize对文件进行逐块迭代。
#### 比如我们可以迭代ex6.csv，将值计数聚合到“key”列中

In [127]:
global tot

In [128]:
tot = Series([])
for piece in chunker:
    tot = tot.append(piece['key'].value_counts(), fill_value=0)

In [129]:
tot[:10]

Series([], dtype: float64)

In [115]:
a = Series([1])

In [116]:
b = Series([2
           ])

In [117]:
a.add(b)

0    3
dtype: int64

In [118]:
a+b

0    3
dtype: int64

In [119]:
a.append(b)

0    1
0    2
dtype: int64

### 将数据写出到文本格式

In [145]:
data = pd.read_csv('ch06/ex5.csv')

In [146]:
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


#### 利用DataFrame的to_csv方法，我们可以将数据写到一个以逗号分隔的文件中

In [148]:
data.to_csv('ch06/out.csv')

In [149]:
!cat ch06/out.csv

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [151]:
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


#### 缺失值在输出结果中被表示为空字符串，也可以表示为其他的

In [152]:
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


#### 行和列的标签可以被禁用

In [153]:
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


#### `也可以只写出一部分的列，并以指定的顺序排列

In [155]:
data.to_csv(sys.stdout, index=False, columns=['a','b','c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


#### Series也有to_csv的方法

In [157]:
dates = pd.date_range('1/1/2000', periods=7)

In [158]:
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')

In [159]:
ts = Series(np.arange(7), index=dates)

In [160]:
ts.to_csv('ch06/tseries.csv')

In [161]:
!cat ch06/tseries.csv

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6


In [162]:
Series.from_csv('ch06/tseries.csv', parse_dates=True)

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64

### 手工处理分隔符格式

In [163]:
!cat ch06/ex7.csv

"a","b","c"
"1","2","3"
"1","2","3","4"


In [164]:
import csv
f = open('ch06/ex7.csv')
reader = csv.reader(f)

In [166]:
reader

<_csv.reader at 0xa47bd08>

In [167]:
# 对reader进行迭代，将会为每一行产生一个元祖，并移除所有的引号
for line in reader:
    print line

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']


In [168]:
# 很明显得到的是一个列表，而不是元祖

In [169]:
lines = list(csv.reader(open('ch06/ex7.csv')))

In [170]:
header, values = lines[0], lines[1:]

In [172]:
data_dict = {h : v for h,v in zip(header, zip(*values))}

In [173]:
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [190]:
class my_dialect(csv.Dialect):
    lineterminator  = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = 0

In [191]:
with open('mydata.csv','w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one','two','three'))
    writer.writerow(('1','2','3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))

In [192]:
!cat mydata.csv

one;two;three
1;2;3
4;5;6
7;8;9


###  JSON数据

In [202]:
obj = """
{
    "name" : "Wes",
    "places_lived" : ["United states", "Spain", "Germany"],
    "pet" : null,
    "siblings" : [{"name" : "Scott", "age" :25, "pet" : "Zuko"},
                    {"name" : "Katie", "age" : 33, "pet" : "Cisco"}]
}
"""

In [203]:
import json

In [204]:
result = json.loads(obj)         # json.loads可将JSON字符串转换成Python形式

In [205]:
result

{u'name': u'Wes',
 u'pet': None,
 u'places_lived': [u'United states', u'Spain', u'Germany'],
 u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'},
  {u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}

In [208]:
asjson = json.dumps(result)    # json.dumps将Python对象转换成JSON格式

In [207]:
asjson

'{"pet": null, "siblings": [{"pet": "Zuko", "age": 25, "name": "Scott"}, {"pet": "Cisco", "age": 33, "name": "Katie"}], "name": "Wes", "places_lived": ["United states", "Spain", "Germany"]}'

#### JSON转DataFrame

In [211]:
siblings = DataFrame(result['siblings'], columns=['name', 'age'])

In [212]:
siblings

Unnamed: 0,name,age
0,Scott,25
1,Katie,33


### XML和HTML： Web信息收集

In [213]:
from lxml.html import parse
from urllib2 import urlopen

In [218]:
parsed = parse(urlopen('https://finance.yahoo.com/quote/AAPL/options?ltr=1'))

In [219]:
doc = parsed.getroot()

In [220]:
links = doc.findall('.//a')

In [221]:
links[15:20]

[<Element a at 0xa6c6ae8>,
 <Element a at 0xa6c6b38>,
 <Element a at 0xa6c6b88>,
 <Element a at 0xa6c6bd8>,
 <Element a at 0xa6c6c28>]

In [222]:
lnk = links[28]

In [223]:
lnk

<Element a at 0xa6c6ef8>

In [224]:
lnk.get('href')

'/quote/FB?p=FB'

In [225]:
lnk.text_content()

'FB'

In [227]:
urls = [lnk.get('href') for lnk in doc.findall('.//a')]

In [228]:
urls[-10:]

['https://smallbusiness.yahoo.com',
 'https://help.yahoo.com/kb/index?page=content&y=PROD_FIN_DESK&locale=en_US&id=SLN2310',
 'http://help.yahoo.com/l/us/yahoo/finance/',
 'https://yahoo.uservoice.com/forums/382977',
 'http://info.yahoo.com/privacy/us/yahoo/',
 'http://info.yahoo.com/relevantads/',
 'http://info.yahoo.com/legal/us/yahoo/utos/utos-173.html',
 'https://twitter.com/YahooFinance',
 'https://facebook.com/yahoofinance',
 'http://yahoofinance.tumblr.com']

In [229]:
tables = doc.findall('.//table')

In [231]:
tables

[<Element table at 0xa5fe408>,
 <Element table at 0xa5fe3b8>,
 <Element table at 0xa5fe458>]

In [232]:
calls = tables[2]

In [234]:
puts = tables[1]

In [235]:
rows = calls.findall('.//tr')

In [238]:
def _unpack(row, kind='td'):
    elts = row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]

In [239]:
_unpack(rows[0], kind ='th')

['Contract Name',
 'Last Trade Date',
 'Strike',
 'Last Price',
 'Bid',
 'Ask',
 'Change',
 '% Change',
 'Volume',
 'Open Interest',
 'Implied Volatility']

In [240]:
_unpack(rows[1], kind='td')

['AAPL170714P00110000',
 '2017-06-09 11:48PM EDT',
 '110.00',
 '0.02',
 '0.00',
 '0.07',
 '0.00',
 '-',
 '3',
 '0',
 '95.31%']

In [241]:
from pandas.io.parsers import TextParser

In [242]:
def parse_options_data(table):
    rows = table.findall('.//tr')
    hearder = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    return TextParser(data, name=header).get_chunk()

In [243]:
call_data = parse_options_data(calls)

In [244]:
put_data = parse_options_data(puts)

In [245]:
call_data

Unnamed: 0,AAPL170714P00110000,2017-06-09 11:48PM EDT,110.00,0.02,0.00,0.07,0.00.1,-,3,0,95.31%
0,AAPL170714P00115000,2017-06-30 3:47PM EDT,115.0,0.02,0.0,0.02,0.01,100.00%,70,874,70.31%
1,AAPL170714P00120000,2017-07-07 9:34AM EDT,120.0,0.02,0.0,0.01,0.01,100.00%,35,237,54.69%
2,AAPL170714P00123000,2017-07-07 11:45PM EDT,123.0,0.04,0.0,0.03,0.0,-,100,50,53.91%
3,AAPL170714P00124000,2017-07-07 11:45PM EDT,124.0,0.03,0.0,0.03,0.0,-,10,10,50.78%
4,AAPL170714P00125000,2017-07-07 3:55PM EDT,125.0,0.01,0.0,0.01,0.0,-,100,562,46.88%
5,AAPL170714P00126000,2017-07-05 10:01AM EDT,126.0,0.03,0.0,0.03,0.0,-,2,20,50.39%
6,AAPL170714P00127000,2017-07-07 11:45PM EDT,127.0,0.05,0.0,0.03,0.0,-,210,200,47.66%
7,AAPL170714P00128000,2017-07-07 9:30AM EDT,128.0,0.03,0.0,0.02,0.0,-,10,510,42.97%
8,AAPL170714P00129000,2017-07-06 2:42PM EDT,129.0,0.04,0.01,0.03,0.0,-,104,100,42.58%
9,AAPL170714P00130000,2017-07-07 3:53PM EDT,130.0,0.02,0.01,0.02,-0.04,-66.67%,137,1231,37.89%


### 利用lxml.objectify解析XML

In [247]:
from lxml import objectify

path = 'ch06/mta_perf/Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()

In [248]:
data = []
skip_fields = ['PARENT_REQ', 'INDICATOR_SEQ', "DESIRED_CHANGE", 'DECIMAL_PLACES']

In [253]:
# root.INDICATOR 返回一个用于产生各个<INDICATOR> XML元素的生成器
for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)

In [254]:
pref = DataFrame(data)

In [255]:
pref

Unnamed: 0,AGENCY_NAME,CATEGORY,DESCRIPTION,FREQUENCY,INDICATOR_NAME,INDICATOR_UNIT,MONTHLY_ACTUAL,MONTHLY_TARGET,PARENT_SEQ,PERIOD_MONTH,PERIOD_YEAR,YTD_ACTUAL,YTD_TARGET
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
5,,,,,,,,,,,,,
6,,,,,,,,,,,,,
7,,,,,,,,,,,,,
8,,,,,,,,,,,,,
9,,,,,,,,,,,,,


In [256]:
from StringIO import StringIO

In [258]:
tag = '<a href="http://www.google.com">Google</a>'

In [259]:
root = objectify.parse(StringIO(tag)).getroot()

In [260]:
root

<Element a at 0xbed0ec8>

In [261]:
root.get('href')

'http://www.google.com'

In [262]:
root.text

'Google'

### 二进制数据格式

####  实现数据的二进制格式存储最简单的办法之一是使用Python内置的pickle序列化。pandas对象都有一个用于将数据以pickle形式保存到磁盘上的save方法

In [263]:
frame = pd.read_csv('ch06/ex1.csv')

In [264]:
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [265]:
frame.save('ch06/frame_pickle')

AttributeError: 'DataFrame' object has no attribute 'save'

### 使用HDF5格式

####  1. c库，有多重语言接口
####  2. HDF指的是层次型数据格式（hierarchical data format）
####  3. 每个HDF5文件都含有一个文件系统式的节点结构，能存储多个数据集并支持元数据
####  4. 支持多重压缩器的及时压缩，还能更高效地存储重复模式数据
####  5. 对于非常大的无法直接放入内存的数据集，HDF5支持有效地分块读写 

#### Python中HDF5有两个接口（即PyTables和h5py）

In [266]:
store = pd.HDFStore('mydata.h5')

In [267]:
store['obj1'] = frame

In [268]:
store['obj1_col'] = frame['a']

In [269]:
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
/obj1                frame        (shape->[3,5])
/obj1_col            series       (shape->[3])  

In [270]:
store['obj1']

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


#### HDF5不是数据库，适合一次写多次读的数据集

### 读取Excel文件

####  pandas的ExcelFile类支持读取存储在Excel 2003或更高版本的表格型数据，用到的包为：ExcelFile用到了xlrd和openpyxl包

In [273]:
xls_file = pd.ExcelFile('data.xls')

In [274]:
table = xls_file.parse('Sheet1')

In [275]:
table

Unnamed: 0,name,age
0,text1,12
1,test2,23
2,deqw,13
3,qweq,3


### 使用HTML和Web API

In [276]:
import requests

In [277]:
url = 'http://search.twitter.com/search.json?q=python%20pandas'

In [278]:
resp = requests.get(url)

ConnectionError: HTTPConnectionPool(host='search.twitter.com', port=80): Max retries exceeded with url: /search.json?q=python%20pandas (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x000000000A3DA390>: Failed to establish a new connection: [Errno 10060] ',))

### 使用数据库

In [279]:
import sqlite3

In [280]:
query = """
    CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER);
"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()

In [281]:
data = [('Atlanta', 'Georgia', 1.25, 6),
       ('Tallahassee', 'Florida', 2.6, 3),
       ('Sacramento', 'California', 1.7, 5)]

In [282]:
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

con.executemany(stmt, data)
con.commit()

In [283]:
cursor = con.execute('select * from test')

In [284]:
rows = cursor.fetchall()

In [285]:
rows

[(u'Atlanta', u'Georgia', 1.25, 6),
 (u'Tallahassee', u'Florida', 2.6, 3),
 (u'Sacramento', u'California', 1.7, 5)]

In [286]:
# 可以将这个元祖列表穿个DataFrame的构造器，但是还需要列名（位于游标的description属性中）
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [289]:
frame3 = DataFrame(rows, columns = zip(*cursor.description)[0])

In [290]:
frame3

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [303]:
import pandas.io.sql as psql

In [310]:
# sql.read_frame('select * frame test')改为
s = psql.read_sql_query('select * from test', con)

In [311]:
s

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5
