In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

### pandas的数据结构介绍

#### Series

In [2]:
obj = Series([2, 8, 5])

In [3]:
obj

0    2
1    8
2    5
dtype: int64

In [4]:
obj.values

array([2, 8, 5])

In [5]:
obj.index

RangeIndex(start=0, stop=3, step=1)

In [6]:
obj2 = Series([1, 8, 6], index=["x", "y", "z"])

In [7]:
obj2

x    1
y    8
z    6
dtype: int64

In [8]:
obj2.index

Index(['x', 'y', 'z'], dtype='object')

In [9]:
obj2["z"] 

6

In [10]:
obj2

x    1
y    8
z    6
dtype: int64

In [11]:
obj2[obj2 > 1]

y    8
z    6
dtype: int64

In [12]:
obj2 * 2

x     2
y    16
z    12
dtype: int64

In [13]:
np.exp(obj2)

x       2.718282
y    2980.957987
z     403.428793
dtype: float64

In [14]:
# 可以将Series看成一个定长的有序字典，因为它是索引值到数据值的一个映射。
"x" in obj2

True

In [15]:
"k" in obj2

False

In [16]:
# 通过字典创建Series
company_dict = {"GOOG":"谷歌", "APPL":"苹果", "NFLX":"奈飞"}
obj3 = Series(company_dict)

In [17]:
obj3

APPL    苹果
GOOG    谷歌
NFLX    奈飞
dtype: object

In [18]:
# 有序排列
company_index_list = ["GOOG","BABA", "TSLA", "NFLX", "APPL"]
obj4 = Series(company_dict, index=company_index_list)
obj4

GOOG     谷歌
BABA    NaN
TSLA    NaN
NFLX     奈飞
APPL     苹果
dtype: object

In [19]:
# pandas的isnull和notnull函数可用于检测缺失数据
pd.isnull(obj4)

GOOG    False
BABA     True
TSLA     True
NFLX    False
APPL    False
dtype: bool

In [20]:
pd.notnull(obj4)

GOOG     True
BABA    False
TSLA    False
NFLX     True
APPL     True
dtype: bool

In [21]:
# Series类似的实例方法
obj4.isnull()

GOOG    False
BABA     True
TSLA     True
NFLX    False
APPL    False
dtype: bool

In [22]:
# 对于许多应用而言，Series最重要的一个功能是：它在算术运算中会自动对齐不同索引的数据。
obj3

APPL    苹果
GOOG    谷歌
NFLX    奈飞
dtype: object

In [23]:
obj4

GOOG     谷歌
BABA    NaN
TSLA    NaN
NFLX     奈飞
APPL     苹果
dtype: object

In [24]:
obj3 + obj4

APPL    苹果苹果
BABA     NaN
GOOG    谷歌谷歌
NFLX    奈飞奈飞
TSLA     NaN
dtype: object

In [25]:
# Series对象本身及其索引都有一个name属性，该属性跟pandas其他的关键功能关系非常密切
obj4.name = "公司名称"

In [26]:
obj4.index.name = "股票代码"

In [27]:
obj4

股票代码
GOOG     谷歌
BABA    NaN
TSLA    NaN
NFLX     奈飞
APPL     苹果
Name: 公司名称, dtype: object

In [28]:
# 通过赋值的方式修改Series的索引
obj4.index = ["GOOG", "AMZN", "HK0700", "NFLX", "APPL"]

In [29]:
obj4

GOOG       谷歌
AMZN      NaN
HK0700    NaN
NFLX       奈飞
APPL       苹果
Name: 公司名称, dtype: object

#### DataFrame

In [30]:
# 构建DataFrame
company_data = {"name":["谷歌","奈飞"], "stock_no":["GOOG", "NFLX"]}
company_data_frame= DataFrame(company_data)

In [31]:
company_data_frame

Unnamed: 0,name,stock_no
0,谷歌,GOOG
1,奈飞,NFLX


In [32]:
# 指定列序列
DataFrame(company_data, columns=["stock_no", "name"])

Unnamed: 0,stock_no,name
0,GOOG,谷歌
1,NFLX,奈飞


In [33]:
# 列在数据中找不到，就会产生NA值
company_data_frame2 = DataFrame(company_data, columns=["stock_no", "name", "address"],
                                index=["one", "two"])

In [34]:
company_data_frame2

Unnamed: 0,stock_no,name,address
one,GOOG,谷歌,
two,NFLX,奈飞,


In [35]:
company_data_frame2.columns

Index(['stock_no', 'name', 'address'], dtype='object')

In [36]:
company_data_frame2.name

one    谷歌
two    奈飞
Name: name, dtype: object

In [37]:
company_data_frame2['name']

one    谷歌
two    奈飞
Name: name, dtype: object

In [38]:
company_data_frame2.ix['two']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


stock_no    NFLX
name          奈飞
address      NaN
Name: two, dtype: object

In [39]:
# 通过赋值的方式修改列
company_data_frame2.address = "美国"

In [40]:
company_data_frame2

Unnamed: 0,stock_no,name,address
one,GOOG,谷歌,美国
two,NFLX,奈飞,美国


In [41]:
# 为不存在的列赋值会创建一个新列
company_data_frame2["is_usd"] = company_data_frame2.address == "美国"

In [42]:
company_data_frame2

Unnamed: 0,stock_no,name,address,is_usd
one,GOOG,谷歌,美国,True
two,NFLX,奈飞,美国,True


In [43]:
# 删除列
del company_data_frame2["is_usd"]

In [44]:
company_data_frame2

Unnamed: 0,stock_no,name,address
one,GOOG,谷歌,美国
two,NFLX,奈飞,美国


In [45]:
# 嵌套字典（字典的字典）
company_data = {"name":{"one":"谷歌", "two":"奈飞"},
                   "stock_no":{"one":"GOOG", "two":"NFLX"},
                   "address":{"one":"美国", "two":"美国"},
               }

In [46]:
company_data_frame3 = DataFrame(company_data)
company_data_frame3

Unnamed: 0,address,name,stock_no
one,美国,谷歌,GOOG
two,美国,奈飞,NFLX


In [47]:
company_data_frame3.T

Unnamed: 0,one,two
address,美国,美国
name,谷歌,奈飞
stock_no,GOOG,NFLX


In [50]:
DataFrame(company_data, index=["one", "two"])

Unnamed: 0,address,name,stock_no
one,美国,谷歌,GOOG
two,美国,奈飞,NFLX


In [57]:
company_data_frame3.index.name = "记录号"

In [58]:
company_data_frame3.columns.name = "字段"

In [59]:
company_data_frame3

字段,address,name,stock_no
记录号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,美国,谷歌,GOOG
two,美国,奈飞,NFLX


In [60]:
# 索引对象
obj = Series(range(3), index=["x", "y", "z"])

In [61]:
index = obj.index

In [62]:
index

Index(['x', 'y', 'z'], dtype='object')

In [63]:
index[1:]

Index(['y', 'z'], dtype='object')

In [65]:
# Index对象是不可修改的（immutable）
# 不可修改性非常重要，因为这样才能使Index对象在多个数据结构之间安全共享
index["x"] = "g"

TypeError: Index does not support mutable operations

### 基本功能

#### 1.重新索引

In [66]:
company_obj = Series(["google", "alibaba", "apple"], index=["GOOG", "BABA", "APPL"])

In [67]:
company_obj

GOOG     google
BABA    alibaba
APPL      apple
dtype: object

In [68]:
# 根据新索引进行重新排序
company_obj2= company_obj.reindex(["APPL", "GOOG", "BABA", "NFLX"])

In [69]:
company_obj2

APPL      apple
GOOG     google
BABA    alibaba
NFLX        NaN
dtype: object

In [70]:
# 对于时间序列这样的有序数据，重新索引时可能需要做一些插值处理
color_obj = Series(["blue", "purple", "yellow"], index=[0, 2, 4])

In [71]:
color_obj

0      blue
2    purple
4    yellow
dtype: object

In [72]:
color_obj.reindex(range(6), method="ffill")

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [73]:
# 对于DataFrame，reindex可以修改（行）索引、列，或者连个都修改。
company_frame = DataFrame(np.arange(9).reshape((3, 3)), index=["GOOG", "BABA", "APPL"],
                          columns=["谷歌", "阿里巴巴", "苹果"]
                         )

In [74]:
company_frame

Unnamed: 0,谷歌,阿里巴巴,苹果
GOOG,0,1,2
BABA,3,4,5
APPL,6,7,8


In [75]:
company_frame2 = company_frame.reindex(["BABA", "APPL", "GOOG"])

In [76]:
company_frame2

Unnamed: 0,谷歌,阿里巴巴,苹果
BABA,3,4,5
APPL,6,7,8
GOOG,0,1,2


In [77]:
company_list = ["谷歌", "苹果", "阿里巴巴"]

In [79]:
company_frame.reindex(columns=company_list)

Unnamed: 0,谷歌,苹果,阿里巴巴
GOOG,0,2,1
BABA,3,5,4
APPL,6,8,7


In [87]:
# 可以同时对行和列进行重新索引
company_frame.reindex(index=["BABA", "APPL", "GOOG"],
                        columns=company_list)

Unnamed: 0,谷歌,苹果,阿里巴巴
BABA,3,5,4
APPL,6,8,7
GOOG,0,2,1


In [90]:
# 利用ix的标签索引功能，重新索引任务可以变得更简洁
company_frame.ix[["APPL", "BABA", "GOOG"], company_list]

Unnamed: 0,谷歌,苹果,阿里巴巴
APPL,6,8,7
BABA,3,5,4
GOOG,0,2,1


#### 2.丢弃指定轴上的项

In [92]:
obj = Series(np.arange(3.), index=["x", "y", "z"])

In [93]:
obj

x    0.0
y    1.0
z    2.0
dtype: float64

In [94]:
new_obj = obj.drop("y")

In [95]:
new_obj

x    0.0
z    2.0
dtype: float64

In [96]:
# 输出任意轴上的索引值
new_obj = obj.drop(["x", "z"])

In [97]:
new_obj

y    1.0
dtype: float64

#### 3.索引、选取和过滤

In [98]:
obj = Series(np.arange(3.), index=["x", "y", "z"])

In [99]:
obj

x    0.0
y    1.0
z    2.0
dtype: float64

In [100]:
obj["y"]

1.0

In [101]:
obj[0]

0.0

In [103]:
obj[1:3]

y    1.0
z    2.0
dtype: float64

In [104]:
obj[["x", "y"]]

x    0.0
y    1.0
dtype: float64

In [105]:
obj[[1, 2]]

y    1.0
z    2.0
dtype: float64

In [106]:
obj[obj < 2]

x    0.0
y    1.0
dtype: float64

In [108]:
# 对DataFrame进行索引其实就是获取一个或多个实例
data = DataFrame(np.arange(16).reshape((4, 4)),
                index=["Google", "Apple", "Amazon", "Alibaba"],
                columns=["one", "two", "three", "four"])

In [109]:
data

Unnamed: 0,one,two,three,four
Google,0,1,2,3
Apple,4,5,6,7
Amazon,8,9,10,11
Alibaba,12,13,14,15


In [110]:
data["two"]

Google      1
Apple       5
Amazon      9
Alibaba    13
Name: two, dtype: int64

In [111]:
data[["three", "one"]]

Unnamed: 0,three,one
Google,2,0
Apple,6,4
Amazon,10,8
Alibaba,14,12


In [113]:
# 通过切片或布尔型数组选取行
data[:2]

Unnamed: 0,one,two,three,four
Google,0,1,2,3
Apple,4,5,6,7


In [114]:
data[data["three"] > 5]

Unnamed: 0,one,two,three,four
Apple,4,5,6,7
Amazon,8,9,10,11
Alibaba,12,13,14,15


In [115]:
# 通过布尔型DataFrame进行索引
# DataFrame在语法上更像ndarray
data < 5

Unnamed: 0,one,two,three,four
Google,True,True,True,True
Apple,True,False,False,False
Amazon,False,False,False,False
Alibaba,False,False,False,False


In [116]:
data[data < 5] = 0

In [117]:
data

Unnamed: 0,one,two,three,four
Google,0,0,0,0
Apple,0,5,6,7
Amazon,8,9,10,11
Alibaba,12,13,14,15


In [120]:
# 使用索引字段ix在DataFrame的行上进行标签索引
data.ix["Apple", ["two", "three"]]

two      5
three    6
Name: Apple, dtype: int64

In [121]:
data.ix[["Google", "Apple"], [3, 0, 1]]

Unnamed: 0,four,one,two
Google,0,0,0
Apple,7,0,5


In [122]:
data.ix[2]

one       8
two       9
three    10
four     11
Name: Amazon, dtype: int64

In [123]:
data.ix[:"Amazon", "two"]

Google    0
Apple     5
Amazon    9
Name: two, dtype: int64

In [125]:
data.ix[data.three > 5, :3]

Unnamed: 0,one,two,three
Apple,0,5,6
Amazon,8,9,10
Alibaba,12,13,14


#### 4.算术运算和数据对齐

In [126]:
s1 = Series([1, 3, 5, 7], index=['a', 'b', 'c', 'd'])

In [132]:
s2 = Series([2, 1, 4, 6, 4, 3], index=['a', 'b', 'c', 'd', 'e', 'f'])

In [133]:
s1

a    1
b    3
c    5
d    7
dtype: int64

In [134]:
s2

a    2
b    1
c    4
d    6
e    4
f    3
dtype: int64

In [136]:
# 自动的数据对齐操作在不重叠的索引出引入了NA值
# 缺失值会在算术过程中传播
s1 + s2

a     3.0
b     4.0
c     9.0
d    13.0
e     NaN
f     NaN
dtype: float64

In [137]:
# 对于DataFrame，对齐操作会同时在行和列上
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list("abc"),
               index=list("xyz"))

In [138]:
df1

Unnamed: 0,a,b,c
x,0.0,1.0,2.0
y,3.0,4.0,5.0
z,6.0,7.0,8.0


In [139]:
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bcd"),
               index=list("axzh"))

In [140]:
df2

Unnamed: 0,b,c,d
a,0.0,1.0,2.0
x,3.0,4.0,5.0
z,6.0,7.0,8.0
h,9.0,10.0,11.0


In [141]:
# 相加，返回新的DataFrame，其索引和列为原来那两个DataFrame的并集
df1 + df2

Unnamed: 0,a,b,c,d
a,,,,
h,,,,
x,,4.0,6.0,
y,,,,
z,,13.0,15.0,


#### 5.在算术方法中填充值

In [142]:
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list("abcd"))

In [143]:
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list("abcde"))

In [144]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [145]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [146]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [147]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [149]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


#### 6.DataFrame和Series之间的运算

In [150]:
arr = np.arange(12.).reshape((3, 4))

In [151]:
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [152]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [154]:
# 广播（broadcasting）
arr - arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [156]:
# DataFrame和Series之间的运算
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bde"),
                 index=["Google", "Apple", "Amazon", "Alibaba"])

In [158]:
series = frame.ix[0]

In [159]:
frame

Unnamed: 0,b,d,e
Google,0.0,1.0,2.0
Apple,3.0,4.0,5.0
Amazon,6.0,7.0,8.0
Alibaba,9.0,10.0,11.0


In [160]:
series

b    0.0
d    1.0
e    2.0
Name: Google, dtype: float64

In [161]:
frame - series

Unnamed: 0,b,d,e
Google,0.0,0.0,0.0
Apple,3.0,3.0,3.0
Amazon,6.0,6.0,6.0
Alibaba,9.0,9.0,9.0


In [168]:
series2 = Series(range(3), index=["b", "e", "f"])

In [169]:
frame + series2

Unnamed: 0,b,d,e,f
Google,0.0,,3.0,
Apple,3.0,,6.0,
Amazon,6.0,,9.0,
Alibaba,9.0,,12.0,


In [170]:
series3 = frame["d"]

In [171]:
frame

Unnamed: 0,b,d,e
Google,0.0,1.0,2.0
Apple,3.0,4.0,5.0
Amazon,6.0,7.0,8.0
Alibaba,9.0,10.0,11.0


In [172]:
series3

Google      1.0
Apple       4.0
Amazon      7.0
Alibaba    10.0
Name: d, dtype: float64

In [173]:
# 匹配DataFrame的行索引并进行广播
frame.sub(series3, axis=0)

Unnamed: 0,b,d,e
Google,-1.0,0.0,1.0
Apple,-1.0,0.0,1.0
Amazon,-1.0,0.0,1.0
Alibaba,-1.0,0.0,1.0
