In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

### pandas的数据结构介绍

#### Series

In [2]:
obj = Series([2, 8, 5])

In [3]:
obj

0    2
1    8
2    5
dtype: int64

In [4]:
obj.values

array([2, 8, 5])

In [5]:
obj.index

RangeIndex(start=0, stop=3, step=1)

In [6]:
obj2 = Series([1, 8, 6], index=["x", "y", "z"])

In [7]:
obj2

x    1
y    8
z    6
dtype: int64

In [8]:
obj2.index

Index(['x', 'y', 'z'], dtype='object')

In [9]:
obj2["z"] 

6

In [10]:
obj2

x    1
y    8
z    6
dtype: int64

In [11]:
obj2[obj2 > 1]

y    8
z    6
dtype: int64

In [12]:
obj2 * 2

x     2
y    16
z    12
dtype: int64

In [13]:
np.exp(obj2)

x       2.718282
y    2980.957987
z     403.428793
dtype: float64

In [14]:
# A Series can be thought of as a fixed-length, ordered dict: it maps index values to data values.
"x" in obj2

True

In [15]:
"k" in obj2

False

In [16]:
# Build a Series from a dict: the dict keys become the index labels.
# NOTE(review): Apple's ticker is normally written "AAPL"; "APPL" is kept as-is because later cells reuse it.
company_dict = {
    "GOOG": "谷歌",
    "APPL": "苹果",
    "NFLX": "奈飞",
}
obj3 = Series(company_dict)

In [17]:
obj3

APPL    苹果
GOOG    谷歌
NFLX    奈飞
dtype: object

In [18]:
# An explicit index fixes the order of the result; labels absent from the dict get NaN
company_index_list = ["GOOG","BABA", "TSLA", "NFLX", "APPL"]
obj4 = Series(company_dict, index=company_index_list)
obj4

GOOG     谷歌
BABA    NaN
TSLA    NaN
NFLX     奈飞
APPL     苹果
dtype: object

In [19]:
# The pandas functions isnull and notnull can be used to detect missing data
pd.isnull(obj4)

GOOG    False
BABA     True
TSLA     True
NFLX    False
APPL    False
dtype: bool

In [20]:
pd.notnull(obj4)

GOOG     True
BABA    False
TSLA    False
NFLX     True
APPL     True
dtype: bool

In [21]:
# Series offers the same check as an instance method
obj4.isnull()

GOOG    False
BABA     True
TSLA     True
NFLX    False
APPL    False
dtype: bool

In [22]:
# For many applications the most important Series feature is that arithmetic
# operations automatically align data that have different indexes.
obj3

APPL    苹果
GOOG    谷歌
NFLX    奈飞
dtype: object

In [23]:
obj4

GOOG     谷歌
BABA    NaN
TSLA    NaN
NFLX     奈飞
APPL     苹果
dtype: object

In [24]:
obj3 + obj4

APPL    苹果苹果
BABA     NaN
GOOG    谷歌谷歌
NFLX    奈飞奈飞
TSLA     NaN
dtype: object

In [25]:
# Both the Series object and its index have a name attribute, which ties in closely with other key pandas features
obj4.name = "公司名称"

In [26]:
obj4.index.name = "股票代码"

In [27]:
obj4

股票代码
GOOG     谷歌
BABA    NaN
TSLA    NaN
NFLX     奈飞
APPL     苹果
Name: 公司名称, dtype: object

In [28]:
# The index of a Series can be replaced by assignment
obj4.index = ["GOOG", "AMZN", "HK0700", "NFLX", "APPL"]

In [29]:
obj4

GOOG       谷歌
AMZN      NaN
HK0700    NaN
NFLX       奈飞
APPL       苹果
Name: 公司名称, dtype: object

#### DataFrame

In [30]:
# Build a DataFrame from a dict of equal-length lists (one list per column)
company_data = {
    "name": ["谷歌", "奈飞"],
    "stock_no": ["GOOG", "NFLX"],
}
company_data_frame = DataFrame(company_data)

In [31]:
company_data_frame

Unnamed: 0,name,stock_no
0,谷歌,GOOG
1,奈飞,NFLX


In [32]:
# Specify the column order explicitly
DataFrame(company_data, columns=["stock_no", "name"])

Unnamed: 0,stock_no,name
0,GOOG,谷歌
1,NFLX,奈飞


In [33]:
# A requested column that is not present in the data is filled with NA values
company_data_frame2 = DataFrame(
    company_data,
    index=["one", "two"],
    columns=["stock_no", "name", "address"],
)

In [34]:
company_data_frame2

Unnamed: 0,stock_no,name,address
one,GOOG,谷歌,
two,NFLX,奈飞,


In [35]:
company_data_frame2.columns

Index(['stock_no', 'name', 'address'], dtype='object')

In [36]:
company_data_frame2.name

one    谷歌
two    奈飞
Name: name, dtype: object

In [37]:
company_data_frame2['name']

one    谷歌
two    奈飞
Name: name, dtype: object

In [38]:
# Select a row by its label; .loc replaces the deprecated .ix indexer.
company_data_frame2.loc['two']


stock_no    NFLX
name          奈飞
address      NaN
Name: two, dtype: object

In [39]:
# Modify a column by assignment. Bracket syntax is used because attribute-style
# assignment only updates a column that already exists and never creates one.
company_data_frame2["address"] = "美国"

In [40]:
company_data_frame2

Unnamed: 0,stock_no,name,address
one,GOOG,谷歌,美国
two,NFLX,奈飞,美国


In [41]:
# Assigning to a column that does not exist creates a new column
company_data_frame2["is_usd"] = company_data_frame2.address == "美国"

In [42]:
company_data_frame2

Unnamed: 0,stock_no,name,address,is_usd
one,GOOG,谷歌,美国,True
two,NFLX,奈飞,美国,True


In [43]:
# Delete a column
del company_data_frame2["is_usd"]

In [44]:
company_data_frame2

Unnamed: 0,stock_no,name,address
one,GOOG,谷歌,美国
two,NFLX,奈飞,美国


In [45]:
# Nested dict (a dict of dicts): outer keys name the columns, inner keys name the rows
company_data = {
    "name": {"one": "谷歌", "two": "奈飞"},
    "stock_no": {"one": "GOOG", "two": "NFLX"},
    "address": {"one": "美国", "two": "美国"},
}

In [46]:
company_data_frame3 = DataFrame(company_data)
company_data_frame3

Unnamed: 0,address,name,stock_no
one,美国,谷歌,GOOG
two,美国,奈飞,NFLX


In [47]:
company_data_frame3.T

Unnamed: 0,one,two
address,美国,美国
name,谷歌,奈飞
stock_no,GOOG,NFLX


In [48]:
# Build the frame from the nested dict with an explicit row index.
# The labels are wrapped in pd.Index because passing a plain list here raised
# "AttributeError: 'list' object has no attribute 'astype'" when this notebook was last run.
DataFrame(company_data, index=pd.Index(["one", "two"]))

In [None]:
company_data_frame3.index.name = "记录号"

In [None]:
company_data_frame3.columns.name = "字段"

In [None]:
company_data_frame3

In [None]:
# Index objects
obj = Series(range(3), index=list("xyz"))

In [None]:
index = obj.index

In [None]:
index

In [None]:
index[1:]

In [None]:
# Index objects are immutable, so item assignment raises TypeError.
# Immutability matters because it lets an Index be shared safely between data structures.
# The error is caught and printed so the demonstration does not abort a full notebook run.
try:
    index["x"] = "g"
except TypeError as err:
    print(err)

### 基本功能

#### 1.重新索引

In [None]:
company_obj = Series(["google", "alibaba", "apple"], index=["GOOG", "BABA", "APPL"])

In [None]:
company_obj

In [None]:
# Rearrange the data according to the new index
company_obj2= company_obj.reindex(["APPL", "GOOG", "BABA", "NFLX"])

In [None]:
company_obj2

In [None]:
# For ordered data such as time series, reindexing may need to interpolate or fill values
color_obj = Series(["blue", "purple", "yellow"], index=[0, 2, 4])

In [None]:
color_obj

In [None]:
color_obj.reindex(range(6), method="ffill")

In [None]:
# For a DataFrame, reindex can change the (row) index, the columns, or both.
company_frame = DataFrame(
    np.arange(9).reshape(3, 3),
    index=["GOOG", "BABA", "APPL"],
    columns=["谷歌", "阿里巴巴", "苹果"],
)

In [None]:
company_frame

In [None]:
company_frame2 = company_frame.reindex(["BABA", "APPL", "GOOG"])

In [None]:
company_frame2

In [None]:
company_list = ["谷歌", "苹果", "阿里巴巴"]

In [None]:
company_frame.reindex(columns=company_list)

In [None]:
# Rows and columns can be reindexed in a single call
company_frame.reindex(
    index=["BABA", "APPL", "GOOG"],
    columns=company_list,
)

In [None]:
# Label-based indexing makes the same reordering more concise;
# .loc is used because the .ix indexer is deprecated.
company_frame.loc[["APPL", "BABA", "GOOG"], company_list]

#### 2.丢弃指定轴上的项

In [None]:
obj = Series(np.arange(3.), index=["x", "y", "z"])

In [None]:
obj

In [None]:
new_obj = obj.drop("y")

In [None]:
new_obj

In [None]:
# Drop index values; passing a list of labels removes several entries at once
new_obj = obj.drop(["x", "z"])

In [None]:
new_obj

#### 3.索引、选取和过滤

In [None]:
obj = Series(np.arange(3.), index=["x", "y", "z"])

In [None]:
obj

In [None]:
obj["y"]

In [None]:
# Positional access: .iloc is explicit, whereas an integer passed to plain []
# on a string-labelled Series relies on a deprecated positional fallback.
obj.iloc[0]

In [None]:
obj[1:3]

In [None]:
obj[["x", "y"]]

In [None]:
# Select several entries by position; .iloc avoids the ambiguous integer
# lookup that plain [] performs on a string-labelled Series.
obj.iloc[[1, 2]]

In [None]:
obj[obj < 2]

In [None]:
# Indexing a DataFrame with a column name (or a list of names) retrieves one or more columns
data = DataFrame(
    np.arange(16).reshape(4, 4),
    index=["Google", "Apple", "Amazon", "Alibaba"],
    columns=["one", "two", "three", "four"],
)

In [None]:
data

In [None]:
data["two"]

In [None]:
data[["three", "one"]]

In [None]:
# Select rows with a slice or a boolean array
data[:2]

In [None]:
data[data["three"] > 5]

In [None]:
# Indexing with a boolean DataFrame
# (syntactically this makes a DataFrame behave more like an ndarray)
data < 5

In [None]:
data[data < 5] = 0

In [None]:
data

In [None]:
# Label-based selection of a row and a subset of columns.
# .loc replaces the deprecated .ix indexer.
data.loc["Apple", ["two", "three"]]

In [None]:
# Rows are chosen by label and columns by position. The positions are turned
# into labels through data.columns so that a single .loc call can replace .ix.
data.loc[["Google", "Apple"], data.columns[[3, 0, 1]]]

In [None]:
# Select the third row by position; .iloc replaces the positional use of the deprecated .ix.
data.iloc[2]

In [None]:
# Label slice (end label inclusive) on the rows, single column by label; .loc replaces .ix.
data.loc[:"Amazon", "two"]

In [None]:
# Boolean row mask combined with the first three columns. The positional slice
# is applied to data.columns so that .loc can replace the deprecated .ix.
data.loc[data.three > 5, data.columns[:3]]

#### 4.算术运算和数据对齐

In [None]:
s1 = Series([1, 3, 5, 7], index=['a', 'b', 'c', 'd'])

In [None]:
s2 = Series([2, 1, 4, 6, 4, 3], index=['a', 'b', 'c', 'd', 'e', 'f'])

In [None]:
s1

In [None]:
s2

In [None]:
# Automatic alignment introduces NA values at index labels that do not overlap
# Missing values propagate through arithmetic
s1 + s2

In [None]:
# For a DataFrame, alignment is applied to both the rows and the columns
df1 = DataFrame(
    np.arange(9.0).reshape(3, 3),
    index=list("xyz"),
    columns=list("abc"),
)

In [None]:
df1

In [None]:
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bcd"),
               index=list("axzh"))

In [None]:
df2

In [None]:
# Adding returns a new DataFrame whose index and columns are the unions of those of the two operands
df1 + df2

#### 5.在算术方法中填充值

In [None]:
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list("abcd"))

In [None]:
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list("abcde"))

In [None]:
df1

In [None]:
df2

In [None]:
df1 + df2

In [None]:
df1.add(df2, fill_value=0)

In [None]:
df1.reindex(columns=df2.columns, fill_value=0)

#### 6.DataFrame和Series之间的运算

In [None]:
arr = np.arange(12.).reshape((3, 4))

In [None]:
arr

In [None]:
arr[0]

In [None]:
# Broadcasting
arr - arr[0]

In [None]:
# Operations between a DataFrame and a Series
frame = DataFrame(
    np.arange(12.0).reshape(4, 3),
    index=["Google", "Apple", "Amazon", "Alibaba"],
    columns=["b", "d", "e"],
)

In [None]:
# Take the first row by position; .iloc replaces the deprecated .ix indexer.
series = frame.iloc[0]

In [None]:
frame

In [None]:
series

In [None]:
frame - series

In [None]:
series2 = Series(range(3), index=["b", "e", "f"])

In [None]:
frame + series2

In [None]:
series3 = frame["d"]

In [None]:
frame

In [None]:
series3

In [None]:
# Match on the DataFrame's row index and broadcast
frame.sub(series3, axis=0)

#### 7.函数应用和映射

In [50]:
# NOTE(review): no random seed is set in the cells shown, so a re-run will not
# reproduce the values in the saved output.
frame = DataFrame(
    np.random.randn(4, 3),
    index=["Google", "Amazon", "Apple", "Alibaba"],
    columns=list("bde"),
)

In [51]:
frame

Unnamed: 0,b,d,e
Google,-1.828146,-0.899532,-0.456173
Amazon,0.207476,-0.374757,0.311922
Apple,0.671386,-1.427171,0.190694
Alibaba,-0.508251,0.231013,-0.411191


In [52]:
np.abs(frame)

Unnamed: 0,b,d,e
Google,1.828146,0.899532,0.456173
Amazon,0.207476,0.374757,0.311922
Apple,0.671386,1.427171,0.190694
Alibaba,0.508251,0.231013,0.411191


In [53]:
def f(x):
    """Return the spread (maximum minus minimum) of ``x``.

    ``x`` is any object exposing ``max()`` and ``min()``, such as the
    row or column that ``DataFrame.apply`` passes in.
    """
    return x.max() - x.min()
frame.apply(f)

b    2.499532
d    1.658183
e    0.768095
dtype: float64

In [54]:
frame.apply(f, axis=1)

Google     1.371973
Amazon     0.686679
Apple      2.098557
Alibaba    0.739264
dtype: float64

In [55]:
def f(x):
    """Summarise ``x`` as a two-element Series labelled "min" and "max"."""
    # NOTE: this rebinds the name ``f`` that an earlier cell used for the range helper.
    lowest, highest = x.min(), x.max()
    return Series([lowest, highest], index=["min", "max"])

In [56]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.828146,-1.427171,-0.456173
max,0.671386,0.231013,0.311922


In [57]:
def format(x):
    """Render the number ``x`` as a string with two decimal places."""
    # NOTE(review): this name shadows the built-in format(); it is kept because later cells call it.
    return "%.2f" % x

In [58]:
frame.applymap(format)

Unnamed: 0,b,d,e
Google,-1.83,-0.9,-0.46
Amazon,0.21,-0.37,0.31
Apple,0.67,-1.43,0.19
Alibaba,-0.51,0.23,-0.41


In [59]:
frame["e"].map(format)

Google     -0.46
Amazon      0.31
Apple       0.19
Alibaba    -0.41
Name: e, dtype: object

#### 8.排序和排名

In [60]:
# Four consecutive integers under deliberately unsorted labels, ready for sort_index
obj = Series(range(4), index=list("dabc"))

In [61]:
obj

d    0
a    1
b    2
c    3
dtype: int64

In [62]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [63]:
# Small frame whose row and column labels are both out of order
frame = DataFrame(
    np.arange(8).reshape(2, 4),
    index=["three", "one"],
    columns=list("dabc"),
)

In [64]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [65]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [66]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [70]:
# Two integer columns used to demonstrate sorting by values
frame = DataFrame({
    "b": [4, 7, -3, 2],
    "a": [0, 1, 0, 1],
})

In [71]:
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [74]:
frame.sort_values(by="b")

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [75]:
frame.sort_values(by=["a", "b"])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [76]:
# By default, rank breaks ties by assigning each tied group the average of its ranks
obj = Series([7, -5, 7, 4, 2, 0, 4])

In [77]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [78]:
# Assign ranks to tied values in the order in which they appear in the data
obj.rank(method="first")

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [80]:
# Rank in descending order
obj.rank(ascending=False, method="max")

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [81]:
# Three numeric columns used to demonstrate ranking across each row
frame = DataFrame({
    "b": [4.3, 7, -3, 2],
    "a": [0, 1, 0, 1],
    "c": [-2, 5, 8, -2.5],
})

In [82]:
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [83]:
frame.rank(axis=1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


#### 9.带有重复值的轴索引

In [84]:
# A Series whose index contains repeated labels
obj = Series(range(5), index=list("aabbc"))

In [85]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [86]:
obj.index.is_unique

False

In [87]:
obj["a"]

a    0
a    1
dtype: int64

In [88]:
obj["c"]

4

In [89]:
# Random frame with duplicated row labels.
# NOTE(review): no random seed is set in the cells shown, so the values differ on every run.
df = DataFrame(np.random.randn(4, 3), index=["a", "a", "b", "b"])

In [90]:
df

Unnamed: 0,0,1,2
a,0.730068,1.971555,1.525696
a,0.608506,-1.810814,-0.068516
b,0.312319,-0.192871,-0.191239
b,0.40975,-1.285392,-0.70656


In [92]:
df.loc["b"]

Unnamed: 0,0,1,2
b,0.312319,-0.192871,-0.191239
b,0.40975,-1.285392,-0.70656


### 汇总和计算汇总统计