In [62]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import pandas_datareader.data as web
from numpy import nan as NA

## pandas的数据结构介绍

### Series

Series是一种类似于一维数组的对象，它由一组数据（各种NumPy数据类型）以及一组与之相关的数据标签（即索引）组成。仅由一组数据即可产生最简单的Series。

In [3]:
obj = Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

Series的字符串表现形式为：索引在左边，值在右边。没有指定索引时，默认使用从0开始的整数索引。

In [4]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj2 = Series([4,7,-5,3], index=['a', 'b', 'c', 'd'])
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [8]:
# 通过索引找值
obj2['a']

4

In [9]:
obj2[obj2 > 0]

a    4
b    7
d    3
dtype: int64

In [10]:
'b' in obj2

True

In [12]:
# Build a Series from a dict: the dict keys become the index labels
# and the dict values become the data.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = Series(data=sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [13]:
# 如果只传入一个字典，则结果Series中的索引就是原字典的键；这里额外传入了index，只保留与index匹配的键，找不到的标签对应NaN
states = ['Califonia', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4
# NaN: not a number

Califonia        NaN
Ohio         35000.0
Oregon       16000.0
Texas        71000.0
dtype: float64

In [15]:
pd.isnull(obj4)

Califonia     True
Ohio         False
Oregon       False
Texas        False
dtype: bool

In [16]:
# 数组名字和索引名字
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
Califonia        NaN
Ohio         35000.0
Oregon       16000.0
Texas        71000.0
Name: population, dtype: float64

In [17]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

DataFrame是一个表格型的数据结构，它含有一组有序的列，每列可以是不同的值类型。DataFrame既有行索引也有列索引，它可以被看作由Series组成的字典。跟其他类似的数据结构相比，DataFrame中面向行和面向列的操作基本上是平衡的。

In [18]:
data = {
    'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
    'year':[2000,2001,2002,2001,2002],
    'pop':[1.5,1.7,3.6,2.4,2.9]
}
frame = DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [19]:
# 指定列顺序
DataFrame(data, columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [20]:
frame2 = DataFrame(data, columns=['year','state','pop','debt'], index=['one','two','three','four','five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [21]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [23]:
# 按行取值
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [24]:
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [26]:
frame2['debt'] = np.arange(5)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [27]:
# 为不存在的列赋值会创建出一个新列
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,0,True
two,2001,Ohio,1.7,1,True
three,2002,Ohio,3.6,2,True
four,2001,Nevada,2.4,3,False
five,2002,Nevada,2.9,4,False


In [29]:
del frame2['eastern']
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


通过索引方式返回的列只是相应数据的视图而已，并不是副本。因此，对返回的Series所做的任何就地修改都会反映到源DataFrame上。

### 索引对象

In [30]:
obj = Series(range(3), index=['a','b','c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [31]:
index[1:]

Index(['b', 'c'], dtype='object')

index对象是不可以修改的，这样才能使index对象在多个数据结构之间安全共享

In [32]:
index = pd.Index(np.arange(3))
obj2 = Series([1.5,2.5,0], index=index)
obj2

0    1.5
1    2.5
2    0.0
dtype: float64

In [33]:
obj2.index is index

True

## 基本功能

### 重新索引

In [34]:
obj = Series([4.5,7.2,-5.3,3.6], index=['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [35]:
# 重新设定索引
obj2 = obj.reindex(['a','b','c','d','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [36]:
# 插值处理，向前填充
obj3 = Series(['blue','purple','yellow'], index=[0,2,4])
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [37]:
frame = DataFrame(np.arange(9).reshape((3,3)), index=['a','b','c'], columns=['Ohio','Texas','California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [38]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,,,


In [39]:
states = ['Texas','Utah','California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,4,,5
c,7,,8


In [41]:
frame.reindex(index=['a','b','c','d'], columns=states).ffill()

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,4.0,,5.0
c,7.0,,8.0
d,7.0,,8.0


### 丢弃指定轴上的项

In [42]:
obj = Series(np.arange(5), index=['a','b','c','d','e'])
new_obj = obj.drop('c')
new_obj

a    0
b    1
d    3
e    4
dtype: int32

In [43]:
data = DataFrame(np.arange(16).reshape((4,4)), index=['Ohio','Colorado','Utah','New York'], columns=['one','two','three','four'])
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [44]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


### 索引、选取和过滤

In [45]:
obj = Series(np.arange(4), index=['a','b','c','d'])
obj

a    0
b    1
c    2
d    3
dtype: int32

In [46]:
obj['b']

1

In [48]:
# Select by integer position explicitly. `obj` carries string labels, so a bare
# integer key only works through the positional fallback, which recent pandas
# versions deprecate.
obj.iloc[1]

1

In [49]:
# Select several rows by integer position. `.iloc` makes the positional intent
# explicit instead of relying on the deprecated integer-key fallback on a
# string-labelled index.
obj.iloc[[1, 3]]

b    1
d    3
dtype: int32

In [50]:
# Positional slice: the end point is excluded, as with ordinary Python slicing.
obj.iloc[2:4]

c    2
d    3
dtype: int32

利用标签的切片运算与普通的Python切片运算不同，是闭区间

In [51]:
obj['b':'c']

b    1
c    2
dtype: int32

In [52]:
# Assign through a label slice (both end points included), matching the
# closed-interval label slicing shown in the previous cell. The original
# `obj['b','c']` passed a tuple key rather than a slice, which is not a valid
# key for a single-level index.
obj['b':'c'] = 5
obj

a    0
b    5
c    5
d    3
dtype: int32

In [53]:
data = DataFrame(np.arange(16).reshape((4,4)), index=['Ohio','Colorado','Utah','New York'], columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [54]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [55]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [56]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [58]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [59]:
data.loc['Colorado',['two','three']]

two      5
three    6
Name: Colorado, dtype: int32

### 算术运算和数据对齐

In [60]:
s1 = Series([7.3,-2.5,3.4,1.5], index=['a','c','d','e'])
s2 = Series([-2.1,3.6,-1.5,4,3.1], index=['a','c','e','f','g'])
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [61]:
df1 = DataFrame(np.arange(9).reshape((3,3)), index=['Ohio','Texas','Colorado'], columns=list('bcd'))
df2 = DataFrame(np.arange(12).reshape((4,3)), index=['Utah','Ohio','Texas','Oregon'], columns=list('bde'))
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


### 在算术方法中填充值

In [3]:
# Two frames that only partly overlap: df1 is 3 rows x 4 columns (a-d),
# df2 is 4 rows x 5 columns (a-e).
df1 = DataFrame(np.arange(12).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.arange(20).reshape(4, 5), columns=['a', 'b', 'c', 'd', 'e'])

In [4]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [5]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [6]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,0
1,4,5,6,7,0
2,8,9,10,11,0


### DataFrame和Series之间的运算

In [7]:
arr = np.arange(12).reshape((3,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [9]:
arr - arr[0]

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [11]:
frame = DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [16]:
series = frame.iloc[0]
series

b    0
d    1
e    2
Name: Utah, dtype: int32

In [17]:
frame - series

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


### 函数应用和映射

NumPy的ufuncs（元素级数组方法）也可用于操作pandas对象

In [2]:
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-1.318639,0.478683,0.383246
Ohio,1.755283,-0.569915,-0.090714
Texas,-0.771674,-1.043,1.902752
Oregon,0.437007,-1.127813,-0.027327


另一个常见的操作是，将函数应用到由各列或行所形成的一维数组上

In [4]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest
frame.apply(f)

b    3.073922
d    1.606496
e    1.993466
dtype: float64

In [5]:
frame.apply(f, axis=1)

Utah      1.797322
Ohio      2.325198
Texas     2.945752
Oregon    1.564820
dtype: float64

许多最为常见的数组统计功能被实现成DataFrame的方法，因此无需使用apply方法。除了标量外，传递给apply的函数还可以返回由多个值组成的Series

In [7]:
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return Series([lowest, highest], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.318639,-1.127813,-0.090714
max,1.755283,0.478683,1.902752


此外，元素级的Python函数也是可以用的。假如你想得到frame中各个浮点值的格式化字符串，使用applymap即可

In [8]:
def format(x):
    """Render the number ``x`` as a string with two digits after the decimal point."""
    # NOTE: this name shadows Python's built-in ``format``; it is kept because
    # the following cells refer to it by this name.
    return '%.2f' % x
# Apply `format` to every element of the frame.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour of
# DataFrame.map; switch once the environment's pandas version supports it.
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-1.32,0.48,0.38
Ohio,1.76,-0.57,-0.09
Texas,-0.77,-1.04,1.9
Oregon,0.44,-1.13,-0.03


之所以叫做applymap，是因为Series有一个用于应用元素级函数的map方法

In [9]:
frame['e'].map(format)

Utah       0.38
Ohio      -0.09
Texas      1.90
Oregon    -0.03
Name: e, dtype: object

### 排序和排名

根据条件对数据集排序也是一种重要的内置运算。要对行列或索引进行排序，可使用sort_index方法，它将返回一个已排序的新对象

In [10]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

而对于DataFrame，则可以根据任意一个轴上的索引进行排序

In [12]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [13]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


数据默认是按升序排序的，但也可以降序排序

In [14]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


在排序时，任何缺失值默认都会被放到Series的末尾

In [15]:
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [17]:
# obj.order()
# 新版pandas已移除Series.order方法（这与Python版本无关），按值排序应改用sort_values
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

在DataFrame上，你可能希望根据一个或多个列中的值进行排序。将一个或多个列的名字传递给by选项即可

In [18]:
frame = DataFrame({'b':[4,7,-3,2], 'a':[0,1,0,1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [19]:
# Sort the rows by the values in column 'b'. Sorting by column values is done
# with sort_values; the `by` argument of sort_index was deprecated and is no
# longer accepted by current pandas.
frame.sort_values(by='b')

  """Entry point for launching an IPython kernel.


Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [20]:
# Sort the rows by column 'a' first and break ties with column 'b'.
# sort_values replaces the deprecated sort_index(by=...) form.
frame.sort_values(by=['a', 'b'])

  """Entry point for launching an IPython kernel.


Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


排名跟排序关系密切，且它会增设一个排名值（从1开始，一直到数组中有效数据的数量）。它跟numpy.argsort产生的间接排序索引差不多，只不过它可以根据某种规则破坏平级关系。默认情况下，rank是通过为各组分配一个平均排名的方式破坏平级关系的。

In [21]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

也可以根据值在源数据中出现的顺序给出排名

In [22]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

降序排列

In [23]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [24]:
frame = DataFrame({'b':[4,7,-3,2], 'a':[0,1,0,1]})
frame.rank(axis=1)

Unnamed: 0,b,a
0,2.0,1.0
1,2.0,1.0
2,1.0,2.0
3,2.0,1.0


### 带有重复值的轴索引

直到目前为止，我所介绍的所有范例都有着唯一的轴标签。虽然许多pandas函数都要求标签唯一，但这并不是强制性的。

In [26]:
obj = Series(range(5), index=['a', 'b', 'c', 'd', 'd'])
obj

a    0
b    1
c    2
d    3
d    4
dtype: int64

检测索引是否唯一

In [27]:
obj.index.is_unique

False

对于带有重复值的索引，数据选取的行为将会有些不同。如果某个索引对应多个值，则返回一个Series；而对应单个值的，则返回一个标量值。

In [28]:
obj['a']

0

In [29]:
obj['d']

d    3
d    4
dtype: int64

对DataFrame的行进行索引时也是如此

In [30]:
df = DataFrame(np.random.randn(4,3), index=['a','a','b','b'])
df

Unnamed: 0,0,1,2
a,1.14827,0.048054,0.273974
a,-1.405207,-0.394627,2.10389
b,0.814022,-2.252754,1.853922
b,0.765728,-0.167237,-2.520983


In [32]:
df.loc['b']

Unnamed: 0,0,1,2
b,0.814022,-2.252754,1.853922
b,0.765728,-0.167237,-2.520983


In [34]:
df.iloc[0]

0    1.148270
1    0.048054
2    0.273974
Name: a, dtype: float64

## 汇总和计算描述统计

In [None]:
pandas对象拥有一组常用的数学和统计方法。它们大部分都属于约简和汇总统计，用于从Series中提取单个值或从DataFrame的行或列中提取一个Series。跟对应的NumPy数组方法相比，它们都是基于没有缺失数据的假设而构建的。

In [35]:
df = DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]], index=['a','b','c','d'],columns=['one','two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


调用DataFrame的sum方法将会返回一个含有列小计的Series

In [36]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [37]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

NA值会自动被排除，除非整个切片（这里指的是行或列）都是NA。通过skipna选项可以禁用此功能

In [38]:
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [39]:
df.idxmax()

one    b
two    d
dtype: object

In [40]:
# 累计统计
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [41]:
# 一次性产生多个汇总统计
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


### 相关系数与协方差

有些汇总统计是通过参数对计算出来的。下面的数据来自Yahoo!Finance的股票价格和成交量

In [45]:
# Fetch quotes for each ticker over 2000-2010 from Yahoo! Finance, then build
# one frame of adjusted closing prices and one of traded volumes, with one
# column per ticker.
# NOTE(review): get_data_yahoo depends on the remote Yahoo! endpoint being
# reachable — confirm it still responds before relying on this cell.
all_data = {
    ticker: web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']
}
price = DataFrame({symbol: quotes['Adj Close'] for symbol, quotes in all_data.items()})
volume = DataFrame({symbol: quotes['Volume'] for symbol, quotes in all_data.items()})

In [46]:
# 价格百分数变化
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-24,0.03434,0.004385,0.002587,0.011117
2009-12-28,0.012294,0.013326,0.005484,0.007098
2009-12-29,-0.011861,-0.003477,0.007058,-0.005571
2009-12-30,0.012147,0.005461,-0.013699,0.005376
2009-12-31,-0.004299,-0.012597,-0.015504,-0.004416


Series的corr方法用于计算两个Series中重叠的、非NA的、按索引对齐的值的相关系数。cov协方差也类似

In [47]:
returns.MSFT.corr(returns.IBM)

0.49435816026600404

In [48]:
returns.MSFT.cov(returns.IBM)

0.00021582139787637014

DataFrame的corr和cov方法将以DataFrame的形式返回完整的相关系数或协方差矩阵

In [49]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.412393,0.423598,0.470676
IBM,0.412393,1.0,0.494358,0.390689
MSFT,0.423598,0.494358,1.0,0.443586
GOOG,0.470676,0.390689,0.443586,1.0


In [50]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.00103,0.000254,0.000309,0.000303
IBM,0.000254,0.000369,0.000216,0.000142
MSFT,0.000309,0.000216,0.000516,0.000205
GOOG,0.000303,0.000142,0.000205,0.00058


利用DataFrame的corrwith方法，你可以计算其列或行跟另一个Series或DataFrame之间的相关系数。

In [51]:
returns.corrwith(returns.IBM)

AAPL    0.412393
IBM     1.000000
MSFT    0.494358
GOOG    0.390689
dtype: float64

In [52]:
returns.corrwith(volume)

AAPL   -0.057665
IBM    -0.006592
MSFT   -0.014228
GOOG    0.062647
dtype: float64

## 唯一值、值计算以及成员资格

In [54]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [55]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [56]:
# Count occurrences of each distinct value without sorting by frequency.
# The Series method is used because the top-level pd.value_counts function is
# deprecated in recent pandas versions.
obj.value_counts(sort=False)

c    3
b    2
a    3
d    1
dtype: int64

In [57]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [58]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

## 处理缺失数据

In [None]:
缺失数据在大部分数据分析应用中都很常见。pandas的设计目标之一就是让缺失数据的处理任务尽量轻松。例如，pandas对象上的所有描述统计都排除了缺失数据。pandas使用浮点值NaN表示浮点和非浮点数组中的缺失数据。它只是一个便于被检测出来的标记而已。

In [59]:
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [60]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

Python内置的None值也会被当做NA处理

### 滤除缺失数据

过滤掉缺失数据的方法有很多种。纯手工操作永远都是一个办法，但dropna可能会更实用。

In [63]:
data = Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [64]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

而对于DataFrame对象，事情就有点复杂了。你可能希望丢弃全NA或含有NA的行或列，dropna默认丢弃任何含有缺失值的行

In [66]:
# NA must be written in upper case: it is the alias bound by
# `from numpy import nan as NA` in the import cell.
rows = [
    [1, 3, 4],
    [1, NA, NA],
    [NA, NA, NA],
    [NA, 4, 5],
]
data = DataFrame(rows)
data

Unnamed: 0,0,1,2
0,1.0,3.0,4.0
1,1.0,,
2,,,
3,,4.0,5.0


In [68]:
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,3.0,4.0


In [70]:
# how=all丢弃行全为NA的值
cleaned = data.dropna(how='all')
cleaned

Unnamed: 0,0,1,2
0,1.0,3.0,4.0
1,1.0,,
3,,4.0,5.0


In [71]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,3.0,4.0,
1,1.0,,,
2,,,,
3,,4.0,5.0,


In [72]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,3.0,4.0
1,1.0,,
2,,,
3,,4.0,5.0


### 填充缺失数据

你可能不想滤除缺失数据（有可能会丢弃跟它有关的其他数据），而是希望通过其他方式填补那些“空洞”。对于大多数情况而言，fillna方法是最主要的函数。

In [73]:
df.fillna(0)

Unnamed: 0,one,two
a,1.4,0.0
b,7.1,-4.5
c,0.0,0.0
d,0.75,-1.3


### 层次化索引

层次化索引是pandas的一项重要功能，它使你能在一个轴上拥有多个索引级别。抽象点说，它使你能以低维度形式处理高维度数据。

In [74]:
data = Series(np.random.randn(10), index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
data

a  1    2.465786
   2    0.030054
   3   -0.766532
b  1    1.740273
   2    0.802297
   3    0.656885
c  1    0.860777
   2   -0.926302
d  2    0.614864
   3   -0.182352
dtype: float64

In [76]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [77]:
data['b']

1    1.740273
2    0.802297
3    0.656885
dtype: float64

In [78]:
data['b':'c']

b  1    1.740273
   2    0.802297
   3    0.656885
c  1    0.860777
   2   -0.926302
dtype: float64

In [79]:
data.loc[['b','d']]

b  1    1.740273
   2    0.802297
   3    0.656885
d  2    0.614864
   3   -0.182352
dtype: float64

In [81]:
data[:,2]

a    0.030054
b    0.802297
c   -0.926302
d    0.614864
dtype: float64

In [82]:
data.unstack()

Unnamed: 0,1,2,3
a,2.465786,0.030054,-0.766532
b,1.740273,0.802297,0.656885
c,0.860777,-0.926302,
d,,0.614864,-0.182352


In [83]:
data.unstack().stack()

a  1    2.465786
   2    0.030054
   3   -0.766532
b  1    1.740273
   2    0.802297
   3    0.656885
c  1    0.860777
   2   -0.926302
d  2    0.614864
   3   -0.182352
dtype: float64

In [85]:
frame = DataFrame(np.arange(12).reshape((4,3)), index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


### 重排分级顺序

有时，你需要重新调整某条轴上各级别的顺序，或根据指定级别上的值对数据进行排序。swaplevel接收两个级别编号或名称，并返回一个互换级别的新对象。

## 其他有关pandas的话题

### 整数索引

### 面板数据

pandas有一个Panel数据结构，你可以将其看做一个三维版的DataFrame。pandas的大部分开发工作都集中在表格型数据的操作上，因为这些数据更常见，而且层次化索引也使得多数情况下没必要使用真正的N维数组。