In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# 4.2.1 Series

In [3]:
obj = Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [7]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [10]:
obj2['a']

-5

In [11]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    4
dtype: int64

In [12]:
obj2[0:2]

d    4
b    7
dtype: int64

In [14]:
obj2['b':'c']

b    7
a   -5
c    3
dtype: int64

In [15]:
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

In [18]:
'b' in obj2

True

In [29]:
obj3 = Series()
obj3.empty

True

In [31]:
# 可以给索引和值设置名称
obj2.name = 'population'
obj2.index.name = 'state'
obj2

state
d    4
b    7
a   -5
c    3
Name: population, dtype: int64

In [34]:
# 修改Series值
obj2['d'] = 99
obj2

state
d    99
b     7
a    -5
c     3
Name: population, dtype: int64

# 4.2.2 DataFrame

In [98]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)

In [36]:
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [9]:
frame.index

RangeIndex(start=0, stop=5, step=1)

In [10]:
frame.columns

Index([u'pop', u'state', u'year'], dtype='object')

In [99]:
frame.dtypes

pop      float64
state     object
year       int64
dtype: object

# 4.2.3 Index objects

In [15]:
obj = Series(range(3), index=['a', 'b', 'c'])
obj.index

Index([u'a', u'b', u'c'], dtype='object')

In [16]:
obj.index[1:]

Index([u'b', u'c'], dtype='object')

In [14]:
frame.index[3]

3

In [17]:
'pop' in frame.columns

True

# 5 基本操作

## 5.3 创建表

In [22]:
frame1 = DataFrame([[1,2],[3,4]], columns=['year', 'state'],
                   index=['one', 'two' ])
frame1

Unnamed: 0,year,state
one,1,2
two,3,4


In [23]:
frame1 = DataFrame([[1,2],[3,4]], columns=['year', 'state'])
frame1

Unnamed: 0,year,state
0,1,2
1,3,4


In [25]:
# 利用Python的字典生成表，形式如下
frame1 = DataFrame({'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}})
frame1

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


## 5.4 访问表

### 5.4.1 返回值类型

In [None]:
# 访问一行返回值类型
type(frame1.loc[2000])

In [32]:
# 访问一列返回值类型
type(frame1['Ohio'])

pandas.core.series.Series

In [49]:
type(frame1.loc[:'ohio'])

pandas.core.frame.DataFrame

In [34]:
type(frame1.iloc[0,1])

numpy.float64

In [41]:
frame1.loc[[2000,2002]]

Unnamed: 0,Nevada,Ohio
2000,,1.5
2002,2.9,3.6


In [52]:
type(frame1.iloc[:,0])

pandas.core.series.Series

In [55]:
type(frame1.ix[:,'Ohio'])

pandas.core.series.Series

In [42]:
type(frame1.loc[[2000,2002]])

pandas.core.frame.DataFrame

### 5.4.2 loc/iloc/ix函数

In [59]:
#loc函数 根据名称访问

In [66]:
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [69]:
frame.loc[1:3,'pop':'year']

Unnamed: 0,pop,state,year
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001


In [70]:
frame.loc[[1,3],['pop','year']]

Unnamed: 0,pop,year
1,1.7,2001
3,2.4,2001


In [71]:
# 访问单行时可以省略列参数
frame.loc[2]

pop       3.6
state    Ohio
year     2002
Name: 2, dtype: object

In [73]:
# 与上面省略列参数的访问一样
frame.loc[2,:]

pop       3.6
state    Ohio
year     2002
Name: 2, dtype: object

In [72]:
frame.loc[:, 'pop']

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
Name: pop, dtype: float64

In [74]:
#iloc函数 根据名称访问

In [77]:
# 访问单行时同样可以省略列参数
frame.iloc[4]

pop         2.9
state    Nevada
year       2002
Name: 4, dtype: object

In [78]:
# 注意编号从0开始
frame.iloc[:,1]

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

In [80]:
frame.iloc[1:3, 0:1]

Unnamed: 0,pop
1,1.7
2,3.6


In [81]:
frame.iloc[[0,3],[1,2]]

Unnamed: 0,state,year
0,Ohio,2000
3,Nevada,2001


In [82]:
#ix函数 loc和iloc的混合体

In [83]:
frame.ix[1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.


pop       1.7
state    Ohio
year     2001
Name: 1, dtype: object

In [84]:
frame.ix[0:2,['pop']]

Unnamed: 0,pop
0,1.5
1,1.7
2,3.6


### 5.4.3 索引访问

In [146]:
frame1

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [147]:
frame1['Ohio']

2000    1.5
2001    1.7
2002    3.6
Name: Ohio, dtype: float64

In [151]:
frame1[0:2]

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


### 5.4.4 条件访问

# 5.5 修改表

## 5.5.1 添加表列 

In [85]:
# 添加一列数据

In [109]:
# 注意列长度要和原表一致
frame['value'] = [5,4,7,8,0]

In [110]:
frame

Unnamed: 0,pop,state,year,value
0,1.5,Ohio,2000,5
1,1.7,Ohio,2001,4
2,3.6,Ohio,2002,7
3,2.4,Nevada,2001,8
4,2.9,Nevada,2002,0


In [114]:
# 如果列值相同，可以简单赋值
frame['mod'] = -1
frame

Unnamed: 0,pop,state,year,value,mod
0,1.5,Ohio,2000,5,-1
1,1.7,Ohio,2001,4,-1
2,3.6,Ohio,2002,7,-1
3,2.4,Nevada,2001,8,-1
4,2.9,Nevada,2002,0,-1


In [115]:
s = Series([6,7,8,9])
frame['add_one'] = s

In [116]:
frame

Unnamed: 0,pop,state,year,value,mod,add_one
0,1.5,Ohio,2000,5,-1,6.0
1,1.7,Ohio,2001,4,-1,7.0
2,3.6,Ohio,2002,7,-1,8.0
3,2.4,Nevada,2001,8,-1,9.0
4,2.9,Nevada,2002,0,-1,


In [117]:
f = frame.iloc[:,1:3]

In [118]:
pd.concat([frame,f],axis=1)

Unnamed: 0,pop,state,year,value,mod,add_one,state.1,year.1
0,1.5,Ohio,2000,5,-1,6.0,Ohio,2000
1,1.7,Ohio,2001,4,-1,7.0,Ohio,2001
2,3.6,Ohio,2002,7,-1,8.0,Ohio,2002
3,2.4,Nevada,2001,8,-1,9.0,Nevada,2001
4,2.9,Nevada,2002,0,-1,,Nevada,2002


In [119]:
frame = pd.concat([frame,f],axis=1)

In [120]:
frame

Unnamed: 0,pop,state,year,value,mod,add_one,state.1,year.1
0,1.5,Ohio,2000,5,-1,6.0,Ohio,2000
1,1.7,Ohio,2001,4,-1,7.0,Ohio,2001
2,3.6,Ohio,2002,7,-1,8.0,Ohio,2002
3,2.4,Nevada,2001,8,-1,9.0,Nevada,2001
4,2.9,Nevada,2002,0,-1,,Nevada,2002


In [126]:
# 利用Python循环语句添加多列数据
# 首先创建一个包含2列的表
d = DataFrame({'new1':[3,4,6,7,8],'new2':['s','g','d','g','t']})
d

Unnamed: 0,new1,new2
0,3,s
1,4,g
2,6,d
3,7,g
4,8,t


In [130]:
for c in d:
    frame[c] = d[c]

In [131]:
frame

Unnamed: 0,pop,state,year,value,mod,add_one,state.1,year.1,new1,new2
0,1.5,Ohio,2000,5,-1,6.0,Ohio,2000,3,s
1,1.7,Ohio,2001,4,-1,7.0,Ohio,2001,4,g
2,3.6,Ohio,2002,7,-1,8.0,Ohio,2002,6,d
3,2.4,Nevada,2001,8,-1,9.0,Nevada,2001,7,g
4,2.9,Nevada,2002,0,-1,,Nevada,2002,8,t


### 5.5.2 添加行

In [None]:
# concat函数

In [143]:
frame.loc[5] = [3, 'beijing', 2006, 2, -1, 5, 'shanghai', 2007, 4, 'u']

In [144]:
frame

Unnamed: 0,pop,state,year,value,mod,add_one,state.1,year.1,new1,new2
0,1.5,Ohio,2000,5,-1,6.0,Ohio,2000,3,s
1,1.7,Ohio,2001,4,-1,7.0,Ohio,2001,4,g
2,3.6,Ohio,2002,7,-1,8.0,Ohio,2002,6,d
3,2.4,Nevada,2001,8,-1,9.0,Nevada,2001,7,g
4,2.9,Nevada,2002,0,-1,,Nevada,2002,8,t
5,3.0,beijing,2006,2,-1,5.0,shanghai,2007,4,u


In [158]:
s = Series({'pop':4.4, 'mod': -1})

In [159]:
frame.loc[6] = s
frame

Unnamed: 0,pop,state,year,value,mod,add_one,state.1,year.1,new1,new2
0,1.5,Ohio,2000.0,5.0,-1.0,6.0,Ohio,2000.0,3.0,s
1,1.7,Ohio,2001.0,4.0,-1.0,7.0,Ohio,2001.0,4.0,g
2,3.6,Ohio,2002.0,7.0,-1.0,8.0,Ohio,2002.0,6.0,d
3,2.4,Nevada,2001.0,8.0,-1.0,9.0,Nevada,2001.0,7.0,g
4,2.9,Nevada,2002.0,0.0,-1.0,,Nevada,2002.0,8.0,t
5,3.0,beijing,2006.0,2.0,-1.0,5.0,shanghai,2007.0,4.0,u
6,4.4,,,,-1.0,,,,,


In [162]:
# 为了方便截取frame的两行作为要添加的行
f = frame.loc[[1,3]]

In [165]:
frame = pd.concat([frame,f])
frame

Unnamed: 0,pop,state,year,value,mod,add_one,state.1,year.1,new1,new2
0,1.5,Ohio,2000.0,5.0,-1.0,6.0,Ohio,2000.0,3.0,s
1,1.7,Ohio,2001.0,4.0,-1.0,7.0,Ohio,2001.0,4.0,g
2,3.6,Ohio,2002.0,7.0,-1.0,8.0,Ohio,2002.0,6.0,d
3,2.4,Nevada,2001.0,8.0,-1.0,9.0,Nevada,2001.0,7.0,g
4,2.9,Nevada,2002.0,0.0,-1.0,,Nevada,2002.0,8.0,t
5,3.0,beijing,2006.0,2.0,-1.0,5.0,shanghai,2007.0,4.0,u
6,4.4,,,,-1.0,,,,,
1,1.7,Ohio,2001.0,4.0,-1.0,7.0,Ohio,2001.0,4.0,g
3,2.4,Nevada,2001.0,8.0,-1.0,9.0,Nevada,2001.0,7.0,g


In [171]:
s = DataFrame({2000:{'Nevada':5.5}, 2001:{'Ohio':6.7, 'Nevada':3.2}})

In [174]:
# 这个命令是转置的意思
s = s.T
s

Unnamed: 0,Nevada,Ohio
2000,5.5,
2001,3.2,6.7


In [170]:
frame1

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [191]:
frame1 = pd.concat([frame1,s], ignore_index=True)
frame1

Unnamed: 0,Nevada,Ohio
0,,1.5
1,2.4,1.7
2,2.9,3.6
3,5.5,
4,3.2,6.7


In [None]:
# 循环添加（参考）

In [192]:
s

Unnamed: 0,Nevada,Ohio
2000,5.5,
2001,3.2,6.7


In [195]:
s1 = DataFrame({2003:{'Nevada':5.5}, 2004:{'Ohio':6.7, 'Nevada':3.2}})
s1 = s1.T
s1

Unnamed: 0,Nevada,Ohio
2003,5.5,
2004,3.2,6.7


In [196]:
for i in xrange(len(s1)):
    s.loc[s1.index[i]] = s1.iloc[i]

In [197]:
s

Unnamed: 0,Nevada,Ohio
2000,5.5,
2001,3.2,6.7
2003,5.5,
2004,3.2,6.7


### 5.5.3 删除行

In [204]:
# 删除一行
s = s.drop(2000)
s

Unnamed: 0,Nevada,Ohio
2001,3.2,6.7
2003,5.5,
2004,3.2,6.7


In [205]:
# 删除多行
s.drop([2001,2004])

Unnamed: 0,Nevada,Ohio
2003,5.5,


### 5.5.4删除列

In [208]:
frame

Unnamed: 0,pop,state,year,value,mod,add_one,state.1,year.1,new1,new2
0,1.5,Ohio,2000.0,5.0,-1.0,6.0,Ohio,2000.0,3.0,s
1,1.7,Ohio,2001.0,4.0,-1.0,7.0,Ohio,2001.0,4.0,g
2,3.6,Ohio,2002.0,7.0,-1.0,8.0,Ohio,2002.0,6.0,d
3,2.4,Nevada,2001.0,8.0,-1.0,9.0,Nevada,2001.0,7.0,g
4,2.9,Nevada,2002.0,0.0,-1.0,,Nevada,2002.0,8.0,t
5,3.0,beijing,2006.0,2.0,-1.0,5.0,shanghai,2007.0,4.0,u
6,4.4,,,,-1.0,,,,,
1,1.7,Ohio,2001.0,4.0,-1.0,7.0,Ohio,2001.0,4.0,g
3,2.4,Nevada,2001.0,8.0,-1.0,9.0,Nevada,2001.0,7.0,g


In [None]:
# 删除一列

In [212]:
frame = frame.drop('mod', axis=1)
frame

Unnamed: 0,pop,state,year,value,add_one,state.1,year.1,new1,new2
0,1.5,Ohio,2000.0,5.0,6.0,Ohio,2000.0,3.0,s
1,1.7,Ohio,2001.0,4.0,7.0,Ohio,2001.0,4.0,g
2,3.6,Ohio,2002.0,7.0,8.0,Ohio,2002.0,6.0,d
3,2.4,Nevada,2001.0,8.0,9.0,Nevada,2001.0,7.0,g
4,2.9,Nevada,2002.0,0.0,,Nevada,2002.0,8.0,t
5,3.0,beijing,2006.0,2.0,5.0,shanghai,2007.0,4.0,u
6,4.4,,,,,,,,
1,1.7,Ohio,2001.0,4.0,7.0,Ohio,2001.0,4.0,g
3,2.4,Nevada,2001.0,8.0,9.0,Nevada,2001.0,7.0,g


In [None]:
# 删除多列

In [216]:
frame = frame.drop(['mod','add_one','new1','new2'], axis=1)
frame

Unnamed: 0,pop,state,year,value,state.1,year.1
0,1.5,Ohio,2000.0,5.0,Ohio,2000.0
1,1.7,Ohio,2001.0,4.0,Ohio,2001.0
2,3.6,Ohio,2002.0,7.0,Ohio,2002.0
3,2.4,Nevada,2001.0,8.0,Nevada,2001.0
4,2.9,Nevada,2002.0,0.0,Nevada,2002.0
5,3.0,beijing,2006.0,2.0,shanghai,2007.0
6,4.4,,,,,
1,1.7,Ohio,2001.0,4.0,Ohio,2001.0
3,2.4,Nevada,2001.0,8.0,Nevada,2001.0


In [219]:
del frame['value']

In [220]:
frame

Unnamed: 0,pop,state,year,state.1,year.1
0,1.5,Ohio,2000.0,Ohio,2000.0
1,1.7,Ohio,2001.0,Ohio,2001.0
2,3.6,Ohio,2002.0,Ohio,2002.0
3,2.4,Nevada,2001.0,Nevada,2001.0
4,2.9,Nevada,2002.0,Nevada,2002.0
5,3.0,beijing,2006.0,shanghai,2007.0
6,4.4,,,,
1,1.7,Ohio,2001.0,Ohio,2001.0
3,2.4,Nevada,2001.0,Nevada,2001.0


### 5.5.5 添加表头

In [222]:
s

Unnamed: 0,Nevada,Ohio
2001,3.2,6.7
2003,5.5,
2004,3.2,6.7


In [225]:
s.index.name = 'year'
s.columns.name = 'city'
s

city,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,3.2,6.7
2003,5.5,
2004,3.2,6.7


In [236]:
type(s.loc[2001:2001])

pandas.core.frame.DataFrame

In [None]:
# 数值比较 > < == >= <=

In [240]:
frame = frame.iloc[:,[0,1,3]]
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000.0
1,1.7,Ohio,2001.0
2,3.6,Ohio,2002.0
3,2.4,Nevada,2001.0
4,2.9,Nevada,2002.0
5,3.0,beijing,2006.0
6,4.4,,
1,1.7,Ohio,2001.0
3,2.4,Nevada,2001.0


In [241]:
frame[frame['pop'] > 2]

Unnamed: 0,pop,state,year
2,3.6,Ohio,2002.0
3,2.4,Nevada,2001.0
4,2.9,Nevada,2002.0
5,3.0,beijing,2006.0
6,4.4,,
3,2.4,Nevada,2001.0


In [242]:
frame[(frame['pop'] > 2) & (frame['year'] < 2002)]

Unnamed: 0,pop,state,year
3,2.4,Nevada,2001.0
3,2.4,Nevada,2001.0


In [244]:
frame[(frame['pop'] > 2) | (frame['year'] >2002)]

Unnamed: 0,pop,state,year
2,3.6,Ohio,2002.0
3,2.4,Nevada,2001.0
4,2.9,Nevada,2002.0
5,3.0,beijing,2006.0
6,4.4,,
3,2.4,Nevada,2001.0


In [248]:
cy = ['Ohio', 'beijing']
frame[frame['state'].isin(cy)]

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000.0
1,1.7,Ohio,2001.0
2,3.6,Ohio,2002.0
5,3.0,beijing,2006.0
1,1.7,Ohio,2001.0


In [250]:
# 利用Python列表的 not in特性
cy = ['Ohio', 'beijing']
f = frame['state'].map(lambda x: x not in cy)
frame[f]

Unnamed: 0,pop,state,year
3,2.4,Nevada,2001.0
4,2.9,Nevada,2002.0
6,4.4,,
3,2.4,Nevada,2001.0


In [251]:
e = ['a','b','c']

In [252]:
'a' in e

True

In [253]:
'c' not in e

False

In [254]:
f

0    False
1    False
2    False
3     True
4     True
5    False
6     True
1    False
3     True
Name: state, dtype: bool

In [257]:
frame.loc[frame['pop'] > 3]

Unnamed: 0,pop,state,year
2,3.6,Ohio,2002.0
6,4.4,,


### 5.5.6 修改索引

In [2]:
frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [5]:
frame = frame.reindex(['a', 'b', 'c', 'd'])
frame

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [6]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [13]:
frame = frame.drop('b')

In [16]:
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill')

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,0.0,1.0,2.0
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [3]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [13]:
frame.rename(columns={'Ohio':'beijing'})

Unnamed: 0,beijing,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [14]:
frame.rename(index={'a':'d'})

Unnamed: 0,Ohio,Texas,California
d,0,1,2
c,3,4,5
d,6,7,8


## 5.6 多重索引

### 5.6.1 访问

In [21]:
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [22]:
frame.index

MultiIndex(levels=[[u'a', u'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [23]:
frame.columns

MultiIndex(levels=[[u'Colorado', u'Ohio'], [u'Green', u'Red']],
           labels=[[1, 1, 0], [0, 1, 0]])

In [None]:
# 访问行

In [33]:
frame.loc['a']

Unnamed: 0_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Green,Red,Green
1,0,1,2
2,3,4,5


In [45]:
frame.loc['a'].loc[1]

Ohio      Green    0
          Red      1
Colorado  Green    2
Name: 1, dtype: int32

In [None]:
# 访问列

In [37]:
frame.loc[:,'Ohio']

Unnamed: 0,Unnamed: 1,Green,Red
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [43]:
frame.loc[:,'Ohio']['Green']

a  1    0
   2    3
b  1    6
   2    9
Name: Green, dtype: int32

In [None]:
# 访问行列的另一种方式

In [15]:
frame['Ohio','Green']

a  1    0
   2    3
b  1    6
   2    9
Name: (Ohio, Green), dtype: int32

In [17]:
frame.loc['a',1]

Ohio      Green    0
          Red      1
Colorado  Green    2
Name: (a, 1), dtype: int32

### 5.6.2 修改多重索引

In [47]:
frame.swaplevel(0,1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [48]:
frame.swaplevel(0,1,axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Green,Red,Green
Unnamed: 0_level_1,Unnamed: 1_level_1,Ohio,Ohio,Colorado
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


### 5.6.3 恢复索引为列

In [50]:
frame.reset_index(0)

Unnamed: 0_level_0,level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [52]:
frame.reset_index(1)

Unnamed: 0_level_0,level_1,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


### 5.6.4 指定列为索引

In [2]:
frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [3]:
frame.set_index('Ohio')

Unnamed: 0_level_0,Texas,California
Ohio,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,2
3,4,5
6,7,8


In [4]:
frame.set_index(['Ohio', 'Texas'])

Unnamed: 0_level_0,Unnamed: 1_level_0,California
Ohio,Texas,Unnamed: 2_level_1
0,1,2
3,4,5
6,7,8


In [None]:
#多层次索引情况

In [62]:
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [63]:
frame.set_index([frame['Ohio','Green']])

Unnamed: 0_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Green,Red,Green
"(Ohio, Green)",Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0,1,2
3,3,4,5
6,6,7,8
9,9,10,11


In [64]:
frame.set_index([frame['Ohio','Green']],append=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Green,Red,Green
Unnamed: 0_level_2,Unnamed: 1_level_2,"(Ohio, Green)",Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,0,1,2
a,2,3,3,4,5
b,1,6,6,7,8
b,2,9,9,10,11


### 5.6.5 删除多索引下的行

In [107]:
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [38]:
frame.drop(1, level=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,2,3,4,5
b,2,9,10,11


In [39]:
frame.drop('a', level=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
b,1,6,7,8
b,2,9,10,11


### 5.6.6 删除多索引下的列

In [40]:
frame.drop('Red',level=1, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Green
a,1,0,2
a,2,3,5
b,1,6,8
b,2,9,11


In [41]:
frame.drop('Ohio',level=0, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green
a,1,2
a,2,5
b,1,8
b,2,11


In [108]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


### 5.6.7 多层次索引的变换

In [None]:
# 行索引变换为列索引

In [113]:
# 默认转换内层索引
frame.unstack()

Unnamed: 0_level_0,Ohio,Ohio,Ohio,Ohio,Colorado,Colorado
Unnamed: 0_level_1,Green,Green,Red,Red,Green,Green
Unnamed: 0_level_2,1,2,1,2,1,2
a,0,3,1,4,2,5
b,6,9,7,10,8,11


In [114]:
frame.unstack(0)

Unnamed: 0_level_0,Ohio,Ohio,Ohio,Ohio,Colorado,Colorado
Unnamed: 0_level_1,Green,Green,Red,Red,Green,Green
Unnamed: 0_level_2,a,b,a,b,a,b
1,0,6,1,7,2,8
2,3,9,4,10,5,11


In [None]:
# 列索引变换为行索引

In [119]:
frame.stack() # 等效frame.stack(1)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Colorado,Ohio
a,1,Green,2.0,0
a,1,Red,,1
a,2,Green,5.0,3
a,2,Red,,4
b,1,Green,8.0,6
b,1,Red,,7
b,2,Green,11.0,9
b,2,Red,,10


In [117]:
frame.stack(0)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Green,Red
a,1,Colorado,2,
a,1,Ohio,0,1.0
a,2,Colorado,5,
a,2,Ohio,3,4.0
b,1,Colorado,8,
b,1,Ohio,6,7.0
b,2,Colorado,11,
b,2,Ohio,9,10.0


### 5.6.8 添加表头

In [120]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


## 5.7 表之间的计算

In [73]:
series1 = Series(range(3), index=['b', 'e', 'f'])
series1

b    0
e    1
f    2
dtype: int64

In [74]:
series2 = Series([6,7,8], index=['b', 'e', 'f'])
series2

b    6
e    7
f    8
dtype: int64

In [75]:
series1+series2

b     6
e     8
f    10
dtype: int64

In [76]:
series1.add(series2)

b     6
e     8
f    10
dtype: int64

In [78]:
series3 = Series([6,7,8,9], index=['b', 'e', 'f','g'])
series3

b    6
e    7
f    8
g    9
dtype: int64

In [79]:
series1+series3

b     6.0
e     8.0
f    10.0
g     NaN
dtype: float64

In [81]:
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [82]:
frame+frame

Unnamed: 0,b,d,e
Utah,0.0,2.0,4.0
Ohio,6.0,8.0,10.0
Texas,12.0,14.0,16.0
Oregon,18.0,20.0,22.0


In [83]:
frame.add(frame)

Unnamed: 0,b,d,e
Utah,0.0,2.0,4.0
Ohio,6.0,8.0,10.0
Texas,12.0,14.0,16.0
Oregon,18.0,20.0,22.0


In [96]:
series2 = Series([6,7,8], index=frame.columns)
series2

b    6
d    7
e    8
dtype: int64

In [98]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [99]:
frame.sub(series2)

Unnamed: 0,b,d,e
Utah,-6.0,-6.0,-6.0
Ohio,-3.0,-3.0,-3.0
Texas,0.0,0.0,0.0
Oregon,3.0,3.0,3.0


In [100]:
series3 = Series([6,7,8,9], index=frame.index)
series3

Utah      6
Ohio      7
Texas     8
Oregon    9
dtype: int64

In [103]:
frame.sub(series3, axis=0)

Unnamed: 0,b,d,e
Utah,-6.0,-5.0,-4.0
Ohio,-4.0,-3.0,-2.0
Texas,-2.0,-1.0,0.0
Oregon,0.0,1.0,2.0


## 5.8 表的合并连接

### 5.8.1 合并

In [None]:
# Series的合并

In [130]:
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])

In [131]:
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [132]:
pd.concat([s1, s2, s3], axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [None]:
# DataFrame合并

In [137]:
df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                columns=['three', 'four'])


In [None]:
# 按列合并

In [138]:
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [134]:
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [135]:
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [None]:
# 按行合并

In [136]:
pd.concat([df1, df2])

Unnamed: 0,four,one,three,two
a,,0.0,,1.0
b,,2.0,,3.0
c,,4.0,,5.0
a,6.0,,5.0,
c,8.0,,7.0,


In [None]:
# 索引无关合并

In [139]:
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,four,one,three,two
0,,0.0,,1.0
1,,2.0,,3.0
2,,4.0,,5.0
3,6.0,,5.0,
4,8.0,,7.0,


### 5.8.2 连接

In [None]:
# 1. 依据指定列连接

In [121]:
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],
                 'data2': range(3)})

In [122]:
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [123]:
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,d


In [124]:
pd.merge(df1, df2, on='key')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [126]:
pd.merge(df1, df2, how='outer')

Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
6,3.0,c,
7,,d,2.0


In [125]:
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

Unnamed: 0,data1,lkey,data2,rkey
0,0,b,1,b
1,1,b,1,b
2,6,b,1,b
3,2,a,0,a
4,4,a,0,a
5,5,a,0,a


In [128]:
left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [129]:
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [None]:
# 2. 依据索引连接

In [140]:
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                  'value': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

In [142]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [144]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [141]:
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [145]:
left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
                 columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                   index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])

In [146]:
left2

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [147]:
right2

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [148]:
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


# 6. 数据处理

## 6.1 无效值处理

### 6.1.1 填充无效值

In [172]:
from numpy import nan as NA
df = DataFrame(np.random.randn(7, 3))
df[3] = NA
df[1] = NA
df.iloc[3,2] = NA

In [173]:
df

Unnamed: 0,0,1,2,3
0,-1.303019,,0.490286,
1,0.686045,,-0.395774,
2,0.345563,,-0.528029,
3,1.363409,,,
4,-1.40302,,0.518871,
5,-0.056937,,0.560997,
6,-0.27384,,-0.833538,


In [174]:
df.fillna(0)

Unnamed: 0,0,1,2,3
0,-1.303019,0.0,0.490286,0.0
1,0.686045,0.0,-0.395774,0.0
2,0.345563,0.0,-0.528029,0.0
3,1.363409,0.0,0.0,0.0
4,-1.40302,0.0,0.518871,0.0
5,-0.056937,0.0,0.560997,0.0
6,-0.27384,0.0,-0.833538,0.0


In [175]:
df.fillna({1: 0.5, 3: -1})

Unnamed: 0,0,1,2,3
0,-1.303019,0.5,0.490286,-1.0
1,0.686045,0.5,-0.395774,-1.0
2,0.345563,0.5,-0.528029,-1.0
3,1.363409,0.5,,-1.0
4,-1.40302,0.5,0.518871,-1.0
5,-0.056937,0.5,0.560997,-1.0
6,-0.27384,0.5,-0.833538,-1.0


In [216]:
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})

In [217]:
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [218]:
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [223]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


In [221]:
df1.combine_first(df2.loc[:,'a':'a'])

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,,10.0
3,3.0,6.0,14.0
4,7.0,,


### 6.1.2 删除无效值

In [176]:
df.dropna(axis=1)

Unnamed: 0,0
0,-1.303019
1,0.686045
2,0.345563
3,1.363409
4,-1.40302
5,-0.056937
6,-0.27384


In [178]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2,3
0,-1.303019,,0.490286,
1,0.686045,,-0.395774,
2,0.345563,,-0.528029,
4,-1.40302,,0.518871,
5,-0.056937,,0.560997,
6,-0.27384,,-0.833538,


## 6.2 数据去重

In [185]:
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
frame = frame.set_index('a', drop=False)
frame

Unnamed: 0_level_0,a,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,4.3,-2.0
1,1,7.0,5.0
0,0,-3.0,8.0
1,1,2.0,-2.5


### 6.2.1 重复判断

In [None]:
# 判断列是否有重复数据

In [186]:
frame['a'].is_unique

False

In [187]:
frame['a'].is_unique

False

In [188]:
frame.index.is_unique

False

### 6.2.2 去重

In [189]:
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [191]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [192]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2
0,one,1
3,two,3


In [201]:
d = data.set_index('k1')
d

Unnamed: 0_level_0,k2
k1,Unnamed: 1_level_1
one,1
one,1
one,2
two,3
two,3
two,4
two,4


In [202]:
d.index.drop_duplicates()

Index([u'one', u'two'], dtype='object', name=u'k1')

## 6.3 排序

### 6.3.1 按索引排序

In [209]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame = frame.T

In [210]:
frame

Unnamed: 0,three,one
d,0,4
a,1,5
b,2,6
c,3,7


In [211]:
frame.sort_index()

Unnamed: 0,three,one
a,1,5
b,2,6
c,3,7
d,0,4


In [213]:
frame.sort_index(axis=1)

Unnamed: 0,one,three
d,4,0
a,5,1
b,6,2
c,7,3


### 6.3.2 按列排序

In [214]:
frame.sort_values(by='one')

Unnamed: 0,three,one
d,0,4
a,1,5
b,2,6
c,3,7


In [215]:
frame.sort_values(by=['one', 'three'], ascending=False)

Unnamed: 0,three,one
c,3,7
b,2,6
a,1,5
d,0,4


# 6.4 函数

### 6.4.1 字符串函数

In [2]:
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [None]:
# 字符串长度

In [225]:
data['food'].str.len()

0     5
1    11
2     5
3     8
4    11
5     5
6     8
7     9
8     8
Name: food, dtype: int64

In [234]:
# 转换为大写字母
data['food'].str.upper()

0          BACON
1    PULLED PORK
2          BACON
3       PASTRAMI
4    CORNED BEEF
5          BACON
6       PASTRAMI
7      HONEY HAM
8       NOVA LOX
Name: food, dtype: object

In [5]:
# 在列中查找字符串
data['food'].str.find('st')

0   -1
1   -1
2   -1
3    2
4   -1
5   -1
6    2
7   -1
8   -1
Name: food, dtype: int64

In [47]:
data['city'] = ['beijing', 'shanghai', 'beijing', 'guangzhou', 
                'shenzhen', 'beijing','shanghai', 
                'shenzhen', 'shanghai']

In [48]:
# 合并两列（也可以是指定字符列表）
data['food'].str.cat(data['city'], sep='_')

0           bacon_beijing
1    pulled pork_shanghai
2           bacon_beijing
3      Pastrami_guangzhou
4    corned beef_shenzhen
5           Bacon_beijing
6       pastrami_shanghai
7      honey ham_shenzhen
8       nova lox_shanghai
Name: food, dtype: object

In [49]:
# 替换列中字符（串）
data['food'].str.replace('ba', 'pa')

0          pacon
1    pulled pork
2          pacon
3       Pastrami
4    corned beef
5          Bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [50]:
data['food'].replace(['bacon', 'pastrami'], ['shaoji', 'kaoya'])

0         shaoji
1    pulled pork
2         shaoji
3       Pastrami
4    corned beef
5          Bacon
6          kaoya
7      honey ham
8       nova lox
Name: food, dtype: object

### 6.4.2 map函数

In [51]:
data

Unnamed: 0,food,ounces,city,nums
0,bacon,8.0,beijing,10
1,pulled pork,6.0,shanghai,21
2,bacon,24.0,beijing,29
3,Pastrami,12.0,guangzhou,20
4,corned beef,15.0,shenzhen,755
5,Bacon,16.0,beijing,25
6,pastrami,6.0,shanghai,27
7,honey ham,10.0,shenzhen,28
8,nova lox,12.0,shanghai,30


In [52]:
# 根据城市名添加区号（随便写的，不是实际数据）
# 字典类型
city_nums = {'beijing':10, 'shanghai':21, 'nanjing':29, 'guangzhou':20, 
                'shenzhen':755, 'hangzhou':25,'wuhan':27, 
                'chengdu':28, 'chongqing':30}


In [53]:
data['nums'] = data['city'].map(lambda x : city_nums[x])

In [55]:
data

Unnamed: 0,food,ounces,city,nums
0,bacon,8.0,beijing,10
1,pulled pork,6.0,shanghai,21
2,bacon,24.0,beijing,10
3,Pastrami,12.0,guangzhou,20
4,corned beef,15.0,shenzhen,755
5,Bacon,16.0,beijing,10
6,pastrami,6.0,shanghai,21
7,honey ham,10.0,shenzhen,755
8,nova lox,12.0,shanghai,21


In [56]:
# 将ounces列的数据增加一倍
def ounces_twice(val):
    return val*2;

data['ounces'] = data['ounces'].map(ounces_twice)

In [57]:
data

Unnamed: 0,food,ounces,city,nums
0,bacon,16.0,beijing,10
1,pulled pork,12.0,shanghai,21
2,bacon,48.0,beijing,10
3,Pastrami,24.0,guangzhou,20
4,corned beef,30.0,shenzhen,755
5,Bacon,32.0,beijing,10
6,pastrami,12.0,shanghai,21
7,honey ham,20.0,shenzhen,755
8,nova lox,24.0,shanghai,21


### 6.4.3 apply函数

In [60]:
def func(df):
    return df['ounces'].max()
data.groupby('city').apply(func)

city
beijing      48.0
guangzhou    24.0
shanghai     24.0
shenzhen     30.0
dtype: float64

## 6.5 数据区间分组

In [63]:
data

Unnamed: 0,food,ounces,city,nums
0,bacon,16.0,beijing,10
1,pulled pork,12.0,shanghai,21
2,bacon,48.0,beijing,10
3,Pastrami,24.0,guangzhou,20
4,corned beef,30.0,shenzhen,755
5,Bacon,32.0,beijing,10
6,pastrami,12.0,shanghai,21
7,honey ham,20.0,shenzhen,755
8,nova lox,24.0,shanghai,21


In [64]:
bins = [0, 20, 100]

In [65]:
pd.cut(data['ounces'], bins)

0      (0, 20]
1      (0, 20]
2    (20, 100]
3    (20, 100]
4    (20, 100]
5    (20, 100]
6      (0, 20]
7      (0, 20]
8    (20, 100]
Name: ounces, dtype: category
Categories (2, interval[int64]): [(0, 20] < (20, 100]]

In [67]:
# 改为左闭右开区间
data1 = pd.cut(data['ounces'], bins, right=False)
data1

0      [0, 20)
1      [0, 20)
2    [20, 100)
3    [20, 100)
4    [20, 100)
5    [20, 100)
6      [0, 20)
7    [20, 100)
8    [20, 100)
Name: ounces, dtype: category
Categories (2, interval[int64]): [[0, 20) < [20, 100)]

In [69]:
data1.values.categories

IntervalIndex([[0, 20), [20, 100)]
              closed='left',
              dtype='interval[int64]')

In [70]:
data1.values.codes

array([0, 0, 1, 1, 1, 1, 0, 1, 1], dtype=int8)

In [74]:
g_name = [u'大于0小于20', u'大于等于20小于100']
data[u'分组'] = pd.cut(data['ounces'], bins, labels=g_name)
data

Unnamed: 0,food,ounces,city,nums,分组
0,bacon,16.0,beijing,10,大于0小于20
1,pulled pork,12.0,shanghai,21,大于0小于20
2,bacon,48.0,beijing,10,大于等于20小于100
3,Pastrami,24.0,guangzhou,20,大于等于20小于100
4,corned beef,30.0,shenzhen,755,大于等于20小于100
5,Bacon,32.0,beijing,10,大于等于20小于100
6,pastrami,12.0,shanghai,21,大于0小于20
7,honey ham,20.0,shenzhen,755,大于0小于20
8,nova lox,24.0,shanghai,21,大于等于20小于100


In [75]:
pd.cut(data['ounces'], 4, right=False)

0      [12.0, 21.0)
1      [12.0, 21.0)
2    [39.0, 48.036)
3      [21.0, 30.0)
4      [30.0, 39.0)
5      [30.0, 39.0)
6      [12.0, 21.0)
7      [12.0, 21.0)
8      [21.0, 30.0)
Name: ounces, dtype: category
Categories (4, interval[float64]): [[12.0, 21.0) < [21.0, 30.0) < [30.0, 39.0) < [39.0, 48.036)]

## 6.6 统计信息计算

In [76]:
data['ounces'].describe()

count     9.000000
mean     24.222222
std      11.421228
min      12.000000
25%      16.000000
50%      24.000000
75%      30.000000
max      48.000000
Name: ounces, dtype: float64

In [77]:
data['ounces'].sum()

218.0

In [78]:
data['ounces'].mean()

24.22222222222222

In [79]:
data['ounces'].max()

48.0

In [80]:
data['ounces'].min()

12.0

In [81]:
data['ounces'].std()

11.421227799341212

# 7. 分组和数据透视表

## 7.1 分组聚合

### 7.1.2 groupby函数

In [85]:
df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                'key2' : ['one', 'two', 'one', 'two', 'one'],
                'data1' : np.random.randn(5),
                'data2' : np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,-1.699096,-0.126146,a,one
1,-0.887453,1.80196,a,two
2,1.017835,0.999763,b,one
3,-0.471564,-0.639938,b,two
4,-1.419464,1.877192,a,one


In [86]:
# 按照key1进行分组
grouped = df.groupby('key1')

In [88]:
# 迭代查看分组数据
# 注意Python2和Python3的print函数语法有差异
# 这里默认Python2环境，如果是Python3环境，使用
# print(x)
# print(y)
for x,y in grouped:
    print x
    print y

a
      data1     data2 key1 key2
0 -1.699096 -0.126146    a  one
1 -0.887453  1.801960    a  two
4 -1.419464  1.877192    a  one
b
      data1     data2 key1 key2
2  1.017835  0.999763    b  one
3 -0.471564 -0.639938    b  two


In [95]:
# 若想查看分组信息，有一个简单的办法
# 先将分组信息转换为list，然后在转换成字典类型
p = dict(list(df.groupby('key1')))
p

{'a':       data1     data2 key1 key2
 0 -1.699096 -0.126146    a  one
 1 -0.887453  1.801960    a  two
 4 -1.419464  1.877192    a  one, 'b':       data1     data2 key1 key2
 2  1.017835  0.999763    b  one
 3 -0.471564 -0.639938    b  two}

In [96]:
# 查看一项
p['b']

Unnamed: 0,data1,data2,key1,key2
2,1.017835,0.999763,b,one
3,-0.471564,-0.639938,b,two


In [91]:
# 分组后求data1列的均值
df.groupby('key1')['data1'].mean()

key1
a   -1.335338
b    0.273135
Name: data1, dtype: float64

In [92]:
# 分组后分别求data1，data2列的均值
df.groupby('key1')['data1','data2'].mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.335338,1.184335
b,0.273135,0.179912


In [100]:
# 按类型分组
grouped = df.groupby(df.dtypes, axis=1)
dict(list(grouped))

{dtype('float64'):       data1     data2
 0 -1.699096 -0.126146
 1 -0.887453  1.801960
 2  1.017835  0.999763
 3 -0.471564 -0.639938
 4 -1.419464  1.877192, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [103]:
# 分组并选取每组中data1的值最小的数据
def func(df):
    return df.sort_values('data1').iloc[0:1]
df.groupby('key1').apply(func)

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2,key1,key2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,0,-1.699096,-0.126146,a,one
b,3,-0.471564,-0.639938,b,two
