In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a Series from a plain list; the NaN entry makes the dtype float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Six consecutive calendar days starting on 2020-10-11 (daily frequency).
dates = pd.date_range(start='20201011', periods=6, freq='D')
dates

DatetimeIndex(['2020-10-11', '2020-10-12', '2020-10-13', '2020-10-14',
               '2020-10-15', '2020-10-16'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# 6x4 frame of standard-normal draws, one row per entry of `dates`.
# NOTE: the generator is not seeded, so the values change on every run.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2020-10-11,0.444876,-0.467661,-0.375443,-1.979512
2020-10-12,-1.472198,0.358472,-3.082573,-0.952117
2020-10-13,0.028777,1.015571,0.140181,1.443431
2020-10-14,1.322536,1.537445,-0.436917,0.904359
2020-10-15,0.059951,1.262102,0.51269,0.359931
2020-10-16,-0.850823,-0.707714,-2.117373,0.879086


In [5]:
# Same shape as before but with columns named c1..c4; this rebinds `df`.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=[f'c{i}' for i in range(1, 5)],
)
df

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-12,-0.509706,-0.76671,1.461399,0.080241
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137
2020-10-14,0.550959,1.263393,-0.845099,1.723545
2020-10-15,0.847408,0.261318,-0.866097,0.89747
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795


In [8]:
# pandas aligns the inputs automatically: the scalar values (A, B, F) are
# repeated to match the length of the array-like columns (C, D), so every
# column ends up with four rows.
df1 = pd.DataFrame(dict(
    A=1.0,
    B=pd.Timestamp('20201101'),
    C=np.full(4, 3, dtype='int32'),
    D=pd.Categorical(['test', 'train'] * 2),
    F='foo',
))
df1

Unnamed: 0,A,B,C,D,F
0,1.0,2020-11-01,3,test,foo
1,1.0,2020-11-01,3,train,foo
2,1.0,2020-11-01,3,test,foo
3,1.0,2020-11-01,3,train,foo


In [9]:
# Per-column dtypes of the mixed-type frame.
df1.dtypes

A           float64
B    datetime64[ns]
C             int32
D          category
F            object
dtype: object

In [10]:
# Summary statistics; the output covers only the numeric columns A and C.
df1.describe()

Unnamed: 0,A,C
count,4.0,4.0
mean,1.0,3.0
std,0.0,0.0
min,1.0,3.0
25%,1.0,3.0
50%,1.0,3.0
75%,1.0,3.0
max,1.0,3.0


In [11]:
# Leading rows of the frame (all four rows here).
df1.head()

Unnamed: 0,A,B,C,D,F
0,1.0,2020-11-01,3,test,foo
1,1.0,2020-11-01,3,train,foo
2,1.0,2020-11-01,3,test,foo
3,1.0,2020-11-01,3,train,foo


In [12]:
# Last two rows of the frame.
df1.tail(2)

Unnamed: 0,A,B,C,D,F
2,1.0,2020-11-01,3,test,foo
3,1.0,2020-11-01,3,train,foo


In [15]:
# Row labels: a default RangeIndex.
df1.index

RangeIndex(start=0, stop=4, step=1)

In [16]:
# Column labels.
df1.columns

Index(['A', 'B', 'C', 'D', 'F'], dtype='object')

In [19]:
## Convert a DataFrame to a NumPy array
array = df.to_numpy()
array


array([[ 1.12316907,  1.22958861,  0.9183056 , -0.18783349],
       [-0.5097061 , -0.76671028,  1.46139918,  0.08024094],
       [-1.30342291,  0.88250641, -0.86988193, -1.21513682],
       [ 0.55095905,  1.26339265, -0.84509925,  1.72354472],
       [ 0.84740772,  0.26131783, -0.86609682,  0.89746959],
       [ 1.9158666 , -2.40377492, -0.82311776, -3.19479508]])

In [20]:
# With mixed column dtypes the result is an object-dtype array.
df1.to_numpy()

array([[1.0, Timestamp('2020-11-01 00:00:00'), 3, 'test', 'foo'],
       [1.0, Timestamp('2020-11-01 00:00:00'), 3, 'train', 'foo'],
       [1.0, Timestamp('2020-11-01 00:00:00'), 3, 'test', 'foo'],
       [1.0, Timestamp('2020-11-01 00:00:00'), 3, 'train', 'foo']],
      dtype=object)

In [18]:
## Convert a Series to a NumPy array
s.to_numpy()

array([ 1.,  3.,  5., nan,  6.,  8.])

In [21]:
### Transpose the DataFrame (rows become columns and vice versa)
df.T

Unnamed: 0,2020-10-11,2020-10-12,2020-10-13,2020-10-14,2020-10-15,2020-10-16
c1,1.123169,-0.509706,-1.303423,0.550959,0.847408,1.915867
c2,1.229589,-0.76671,0.882506,1.263393,0.261318,-2.403775
c3,0.918306,1.461399,-0.869882,-0.845099,-0.866097,-0.823118
c4,-0.187833,0.080241,-1.215137,1.723545,0.89747,-3.194795


In [25]:
## Sorting
## Sort by axis labels: axis='columns' orders the column names (descending here).
df.sort_index(axis='columns', ascending=False)


Unnamed: 0,c4,c3,c2,c1
2020-10-11,-0.187833,0.918306,1.229589,1.123169
2020-10-12,0.080241,1.461399,-0.76671,-0.509706
2020-10-13,-1.215137,-0.869882,0.882506,-1.303423
2020-10-14,1.723545,-0.845099,1.263393,0.550959
2020-10-15,0.89747,-0.866097,0.261318,0.847408
2020-10-16,-3.194795,-0.823118,-2.403775,1.915867


In [26]:
## Sort rows by the values of column c3 (ascending).
df.sort_values('c3')

Unnamed: 0,c1,c2,c3,c4
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137
2020-10-15,0.847408,0.261318,-0.866097,0.89747
2020-10-14,0.550959,1.263393,-0.845099,1.723545
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-12,-0.509706,-0.76671,1.461399,0.080241


In [27]:
## Column selection: bracket access returns the column as a Series.
col = df['c1']
col


2020-10-11    1.123169
2020-10-12   -0.509706
2020-10-13   -1.303423
2020-10-14    0.550959
2020-10-15    0.847408
2020-10-16    1.915867
Freq: D, Name: c1, dtype: float64

In [29]:
# Attribute access selects the same column as df['c1'].
col = df.c1
col

2020-10-11    1.123169
2020-10-12   -0.509706
2020-10-13   -1.303423
2020-10-14    0.550959
2020-10-15    0.847408
2020-10-16    1.915867
Freq: D, Name: c1, dtype: float64

In [31]:
## Slicing: take the first three rows by integer position.
df_temp = df.iloc[:3]
df_temp

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-12,-0.509706,-0.76671,1.461399,0.080241
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137


In [32]:
# Label-based row slice on the date index; both endpoints are included.
df_temp = df.loc['2020-10-11':'2020-10-14']
df_temp

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-12,-0.509706,-0.76671,1.461399,0.080241
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137
2020-10-14,0.550959,1.263393,-0.845099,1.723545


In [33]:
# A list of column names selects a sub-frame containing just those columns.
df_temp = df[['c1', 'c3', 'c4']]
df_temp


Unnamed: 0,c1,c3,c4
2020-10-11,1.123169,0.918306,-0.187833
2020-10-12,-0.509706,1.461399,0.080241
2020-10-13,-1.303423,-0.869882,-1.215137
2020-10-14,0.550959,-0.845099,1.723545
2020-10-15,0.847408,-0.866097,0.89747
2020-10-16,1.915867,-0.823118,-3.194795


In [39]:
# Label-based selection of a single row (returned as a Series).
df.loc[df.index[1]]

c1   -0.509706
c2   -0.766710
c3    1.461399
c4    0.080241
Name: 2020-10-12 00:00:00, dtype: float64

In [40]:
# Label-based selection: first two row labels, columns c1 and c3.
df.loc[df.index[0:2], ['c1', 'c3']]

Unnamed: 0,c1,c3
2020-10-11,1.123169,0.918306
2020-10-12,-0.509706,1.461399


In [41]:
# One row label with a list of columns: the result is a Series (length one here).
df.loc[df.index[1], ['c2']]

c2   -0.76671
Name: 2020-10-12 00:00:00, dtype: float64

In [42]:
### Select by integer position
df.iloc[2]


c1   -1.303423
c2    0.882506
c3   -0.869882
c4   -1.215137
Name: 2020-10-13 00:00:00, dtype: float64

In [43]:
# Positional slices (end-exclusive): rows 0-1, columns 1-2.
df.iloc[0:2, 1:3]

Unnamed: 0,c2,c3
2020-10-11,1.229589,0.918306
2020-10-12,-0.76671,1.461399


In [46]:
# Positional selection with explicit lists of row and column positions.
df.iloc[[0,2,3], [1,2,3]]

Unnamed: 0,c2,c3,c4
2020-10-11,1.229589,0.918306,-0.187833
2020-10-13,0.882506,-0.869882,-1.215137
2020-10-14,1.263393,-0.845099,1.723545


In [47]:
# All rows, columns at positions 1-2.
df.iloc[:, 1:3]

Unnamed: 0,c2,c3
2020-10-11,1.229589,0.918306
2020-10-12,-0.76671,1.461399
2020-10-13,0.882506,-0.869882
2020-10-14,1.263393,-0.845099
2020-10-15,0.261318,-0.866097
2020-10-16,-2.403775,-0.823118


In [48]:
# Scalar at row position 1, column position 1.
df.iloc[1,1]

-0.7667102764831915

In [49]:
## Filtering: keep the rows where c1 is positive.
df[df.c1 > 0]

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-14,0.550959,1.263393,-0.845099,1.723545
2020-10-15,0.847408,0.261318,-0.866097,0.89747
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795


In [50]:
# Element-wise boolean mask: entries that fail the test are shown as NaN.
df[df>0]

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,
2020-10-12,,,1.461399,0.080241
2020-10-13,,0.882506,,
2020-10-14,0.550959,1.263393,,1.723545
2020-10-15,0.847408,0.261318,,0.89747
2020-10-16,1.915867,,,


In [54]:
# Combine row conditions with & (each comparison needs its own parentheses).
df[(df.c1>0)&(df.c2>0)]

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-14,0.550959,1.263393,-0.845099,1.723545
2020-10-15,0.847408,0.261318,-0.866097,0.89747


In [56]:
# Work on a separate frame so df stays untouched: assign() returns a new
# DataFrame with the extra string column c5 added at the end.
df2 = df.assign(c5=['one', 'one', 'three', 'four', 'five', 'six'])
df2

Unnamed: 0,c1,c2,c3,c4,c5
2020-10-11,1.123169,1.229589,0.918306,-0.187833,one
2020-10-12,-0.509706,-0.76671,1.461399,0.080241,one
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137,three
2020-10-14,0.550959,1.263393,-0.845099,1.723545,four
2020-10-15,0.847408,0.261318,-0.866097,0.89747,five
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795,six


In [59]:
# Keep the rows whose c5 value is one of the listed labels.
df2[df2['c5'].isin(['one', 'six'])]

Unnamed: 0,c1,c2,c3,c4,c5
2020-10-11,1.123169,1.229589,0.918306,-0.187833,one
2020-10-12,-0.509706,-0.76671,1.461399,0.080241,one
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795,six


### 赋值

In [67]:
# Integer Series 1..6 indexed by six consecutive days from 2020-10-11.
s1 = pd.Series(
    list(range(1, 7)),
    index=pd.date_range('20201011', periods=6),
)
s1

2020-10-11    1
2020-10-12    2
2020-10-13    3
2020-10-14    4
2020-10-15    5
2020-10-16    6
Freq: D, dtype: int64

In [68]:
### Column assignment aligns on the index: make sure s1 and df2 share the same
### index labels, otherwise the unmatched rows of c6 are filled with NaN.
df2['c6'] = s1
df2

Unnamed: 0,c1,c2,c3,c4,c5,c6
2020-10-11,1.123169,1.229589,0.918306,-0.187833,one,1
2020-10-12,-0.509706,-0.76671,1.461399,0.080241,one,2
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137,three,3
2020-10-14,0.550959,1.263393,-0.845099,1.723545,four,4
2020-10-15,0.847408,0.261318,-0.866097,0.89747,five,5
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795,six,6


In [69]:
# Label-based scalar assignment: set c6 for the row labelled dates[0].
df2.at[dates[0], 'c6'] = 111
df2

Unnamed: 0,c1,c2,c3,c4,c5,c6
2020-10-11,1.123169,1.229589,0.918306,-0.187833,one,111
2020-10-12,-0.509706,-0.76671,1.461399,0.080241,one,2
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137,three,3
2020-10-14,0.550959,1.263393,-0.845099,1.723545,four,4
2020-10-15,0.847408,0.261318,-0.866097,0.89747,five,5
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795,six,6


In [70]:
# Position-based scalar assignment: row position 1, column position 2 (c3).
df2.iat[1,2] = 2.2211

In [71]:
# Display df2 to confirm the scalar update.
df2

Unnamed: 0,c1,c2,c3,c4,c5,c6
2020-10-11,1.123169,1.229589,0.918306,-0.187833,one,111
2020-10-12,-0.509706,-0.76671,2.2211,0.080241,one,2
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137,three,3
2020-10-14,0.550959,1.263393,-0.845099,1.723545,four,4
2020-10-15,0.847408,0.261318,-0.866097,0.89747,five,5
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795,six,6


In [72]:
# Overwrite the whole c4 column with a constant-valued NumPy array.
df2.loc[:, 'c4'] = np.full(6, 1.1)
df2

Unnamed: 0,c1,c2,c3,c4,c5,c6
2020-10-11,1.123169,1.229589,0.918306,1.1,one,111
2020-10-12,-0.509706,-0.76671,2.2211,1.1,one,2
2020-10-13,-1.303423,0.882506,-0.869882,1.1,three,3
2020-10-14,0.550959,1.263393,-0.845099,1.1,four,4
2020-10-15,0.847408,0.261318,-0.866097,1.1,five,5
2020-10-16,1.915867,-2.403775,-0.823118,1.1,six,6


In [76]:
# Show df again: the edits above were made on df2, so df is unchanged.
df

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-12,-0.509706,-0.76671,1.461399,0.080241
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137
2020-10-14,0.550959,1.263393,-0.845099,1.723545
2020-10-15,0.847408,0.261318,-0.866097,0.89747
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795


In [82]:
# Mask out the entries that are not greater than -0.8 (they become NaN).
df[df>-0.8]

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-12,-0.509706,-0.76671,1.461399,0.080241
2020-10-13,,0.882506,,
2020-10-14,0.550959,1.263393,,1.723545
2020-10-15,0.847408,0.261318,,0.89747
2020-10-16,1.915867,,,


In [83]:
## Drop every row that contains at least one NaN
df[df>-0.8].dropna(how='any')

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-12,-0.509706,-0.76671,1.461399,0.080241


In [84]:
## Replace NaN entries with a fill value
df[df>-0.8].fillna(value = 11)

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-12,-0.509706,-0.76671,1.461399,0.080241
2020-10-13,11.0,0.882506,11.0,11.0
2020-10-14,0.550959,1.263393,11.0,1.723545
2020-10-15,0.847408,0.261318,11.0,0.89747
2020-10-16,1.915867,11.0,11.0,11.0


In [85]:
# Boolean frame marking which entries are NaN.
pd.isna(df[df>-0.8])

Unnamed: 0,c1,c2,c3,c4
2020-10-11,False,False,False,False
2020-10-12,False,False,False,False
2020-10-13,True,False,True,True
2020-10-14,False,False,True,False
2020-10-15,False,False,True,False
2020-10-16,False,True,True,True


### 运算

In [86]:
# Current contents of df, shown for reference.
df

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-12,-0.509706,-0.76671,1.461399,0.080241
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137
2020-10-14,0.550959,1.263393,-0.845099,1.723545
2020-10-15,0.847408,0.261318,-0.866097,0.89747
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795


In [87]:
### Mean of each column
df.mean()

c1    0.437379
c2    0.077720
c3   -0.170748
c4   -0.316085
dtype: float64

In [88]:
### Mean of each row
df.mean(axis = 1)

2020-10-11    0.770807
2020-10-12    0.066306
2020-10-13   -0.626484
2020-10-14    0.673199
2020-10-15    0.285025
2020-10-16   -1.126455
Freq: D, dtype: float64

In [89]:
## shift(2): move the values two rows down the index (toward later dates);
## the vacated leading rows are filled with NaN. This rebinds `s`.
s = pd.Series([1,3,5,np.nan, 6,8], index=dates).shift(2)
s

2020-10-11    NaN
2020-10-12    NaN
2020-10-13    1.0
2020-10-14    3.0
2020-10-15    5.0
2020-10-16    NaN
Freq: D, dtype: float64

In [90]:
## shift(-2): move the values two rows up the index (toward earlier dates);
## the vacated trailing rows are filled with NaN. This rebinds `s`.
s = pd.Series([1,3,5,np.nan, 6,8], index=dates).shift(-2)
s

2020-10-11    5.0
2020-10-12    NaN
2020-10-13    6.0
2020-10-14    8.0
2020-10-15    NaN
2020-10-16    NaN
Freq: D, dtype: float64

In [92]:
### Subtraction: subtract s from every column, matching on the row index.
### Rows where s is NaN come out as NaN.
df.sub(s, axis='index')

Unnamed: 0,c1,c2,c3,c4
2020-10-11,-3.876831,-3.770411,-4.081694,-5.187833
2020-10-12,,,,
2020-10-13,-7.303423,-5.117494,-6.869882,-7.215137
2020-10-14,-7.449041,-6.736607,-8.845099,-6.276455
2020-10-15,,,,
2020-10-16,,,,


In [93]:
# Show df again: sub() returned a new frame and left df untouched.
df

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-12,-0.509706,-0.76671,1.461399,0.080241
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137
2020-10-14,0.550959,1.263393,-0.845099,1.723545
2020-10-15,0.847408,0.261318,-0.866097,0.89747
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795


### apply函数

In [94]:
# Input data for the apply examples.
df

Unnamed: 0,c1,c2,c3,c4
2020-10-11,1.123169,1.229589,0.918306,-0.187833
2020-10-12,-0.509706,-0.76671,1.461399,0.080241
2020-10-13,-1.303423,0.882506,-0.869882,-1.215137
2020-10-14,0.550959,1.263393,-0.845099,1.723545
2020-10-15,0.847408,0.261318,-0.866097,0.89747
2020-10-16,1.915867,-2.403775,-0.823118,-3.194795


In [95]:
# Range (max - min) of each column: apply passes one column at a time.
df.apply(lambda column: column.max() - column.min())

c1    3.219290
c2    3.667168
c3    2.331281
c4    4.918340
dtype: float64

In [96]:
# Range (max - min) of each row: axis=1 passes one row at a time.
df.apply(lambda row: row.max() - row.min(), axis=1)

2020-10-11    1.417422
2020-10-12    2.228109
2020-10-13    2.185929
2020-10-14    2.568644
2020-10-15    1.763566
2020-10-16    5.110662
Freq: D, dtype: float64

In [97]:
### Concatenation

# Fresh 10x4 frame of random values with default integer labels (rebinds df).
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-1.234955,0.158617,1.401628,0.635126
1,-0.244391,-0.357813,0.888058,0.370729
2,-0.946385,-1.801802,0.044887,1.546102
3,-0.706324,0.320509,0.309978,-0.328156
4,0.673385,0.285387,-1.8791,0.954688
5,-0.666104,-0.202782,-2.844582,0.337295
6,2.72727,-0.846959,-0.809408,0.331394
7,-0.775907,1.100612,-1.012533,1.012593
8,1.754139,-0.184373,0.569623,-1.604083
9,0.075601,-1.274096,-0.789244,0.200005


In [99]:
# Split the frame into three row chunks, deliberately out of order
# (rows 0-2, then 7-9, then 3-6). Note that df1 is a list of frames here.
df1 = [df.iloc[:3], df.iloc[7:], df.iloc[3:7]]

In [100]:
# Concatenate the pieces row-wise; the original index labels are kept,
# so the result is ordered 0-2, 7-9, 3-6.
df2 = pd.concat(df1)
df2

Unnamed: 0,0,1,2,3
0,-1.234955,0.158617,1.401628,0.635126
1,-0.244391,-0.357813,0.888058,0.370729
2,-0.946385,-1.801802,0.044887,1.546102
7,-0.775907,1.100612,-1.012533,1.012593
8,1.754139,-0.184373,0.569623,-1.604083
9,0.075601,-1.274096,-0.789244,0.200005
3,-0.706324,0.320509,0.309978,-0.328156
4,0.673385,0.285387,-1.8791,0.954688
5,-0.666104,-0.202782,-2.844582,0.337295
6,2.72727,-0.846959,-0.809408,0.331394


In [108]:
## Join (merge)
# Left table: three distinct keys.
left = pd.DataFrame({'key':['foo', 'foo1', 'foo2'], 'val':[1, 2, 3]})
left



Unnamed: 0,key,val
0,foo,1
1,foo1,2
2,foo2,3


In [109]:
# Right table: only two keys (no 'foo2').
right = pd.DataFrame({'key':['foo', 'foo1'], 'val':[4, 5]})
right

Unnamed: 0,key,val
0,foo,4
1,foo1,5


In [110]:
# Default merge keeps only the keys present in both tables; the clashing
# 'val' columns are disambiguated with _x / _y suffixes.
joined = pd.merge(left, right, on = 'key')
joined

Unnamed: 0,key,val_x,val_y
0,foo,1,4
1,foo1,2,5


In [112]:
# Left join: every row of `left` is kept; val_y is NaN where `right` has no match.
joined = pd.merge(left, right, on = 'key', how='left')
joined

Unnamed: 0,key,val_x,val_y
0,foo,1,4.0
1,foo1,2,5.0
2,foo2,3,


In [120]:
### Append
# New 8x4 random frame to append a row to (rebinds df).
df = pd.DataFrame(np.random.randn(8, 4), columns=['c1', 'c2', 'c3', 'c4'])
df

Unnamed: 0,c1,c2,c3,c4
0,-0.118416,2.397098,0.29988,-0.611653
1,-0.537739,0.525762,-1.905174,-1.354133
2,-0.501981,-1.627718,-0.028233,-1.874207
3,0.626649,-0.083126,1.211901,-1.052068
4,0.238119,0.431277,-0.511072,-0.050622
5,1.237203,0.271006,0.375436,0.797702
6,0.294896,-0.578069,1.103484,-0.328705
7,0.425609,-1.032103,0.737583,-0.040792


In [121]:
# Take the row at position 3 as a Series.
temp = df.iloc[3,:]
temp

c1    0.626649
c2   -0.083126
c3    1.211901
c4   -1.052068
Name: 3, dtype: float64

In [122]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is added with pd.concat instead. to_frame().T turns the Series into a
# one-row frame with the same columns, and ignore_index=True renumbers the
# rows 0..n-1, matching what append(temp, ignore_index=True) used to return.
# NOTE: this rebinds df, so re-running the cell appends the row again.
df = pd.concat([df, temp.to_frame().T], ignore_index=True)
df

Unnamed: 0,c1,c2,c3,c4
0,-0.118416,2.397098,0.29988,-0.611653
1,-0.537739,0.525762,-1.905174,-1.354133
2,-0.501981,-1.627718,-0.028233,-1.874207
3,0.626649,-0.083126,1.211901,-1.052068
4,0.238119,0.431277,-0.511072,-0.050622
5,1.237203,0.271006,0.375436,0.797702
6,0.294896,-0.578069,1.103484,-0.328705
7,0.425609,-1.032103,0.737583,-0.040792
8,0.626649,-0.083126,1.211901,-1.052068


In [123]:
### Grouping (groupby)
# Two string key columns (c1, c2) and two random numeric columns (c3, c4).
df = pd.DataFrame({
    'c1': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'c2': ['one', 'one', 'three', 'two', 'one', 'two', 'three', 'one'],
    'c3': np.random.randn(8),
    'c4': np.random.randn(8)
})
df


Unnamed: 0,c1,c2,c3,c4
0,foo,one,0.461809,-0.112531
1,bar,one,0.018133,0.026181
2,foo,three,-0.250586,0.4838
3,bar,two,1.443538,0.431073
4,foo,one,-0.748881,-0.350749
5,bar,two,0.259967,-0.019033
6,foo,three,0.151223,-0.291199
7,foo,one,-0.032681,0.237194


In [125]:
# Sum the numeric columns (c3, c4) within each c1 group.
# numeric_only=True is stated explicitly: older pandas silently dropped the
# string column c2 from the sum, whereas pandas 2.x would concatenate its
# strings, so the bare .sum() no longer yields the c3/c4-only result.
df1 = df.groupby('c1').sum(numeric_only=True)
df1

Unnamed: 0_level_0,c3,c4
c1,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.721638,0.438221
foo,-0.419116,-0.033485


In [128]:
# Per-c2 maximum of every other column (the string column c1 is included).
df1 = df.groupby('c2').max()
df1

Unnamed: 0_level_0,c1,c3,c4
c2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,foo,0.461809,0.237194
three,foo,0.151223,0.4838
two,bar,1.443538,0.431073


In [129]:
# Group by both keys; the result is indexed by the (c1, c2) pairs.
df1 = df.groupby(['c1','c2']).mean()
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,c3,c4
c1,c2,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.018133,0.026181
bar,two,0.851752,0.20602
foo,one,-0.106584,-0.075362
foo,three,-0.049681,0.0963


In [131]:
# Standard deviation per (c1, c2) group; a group with a single row gives NaN.
df1 = df.groupby(['c1','c2']).std()
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,c3,c4
c1,c2,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,,
bar,two,0.836911,0.318274
foo,one,0.608719,0.295728
foo,three,0.284122,0.548007


In [133]:
## Stacking (stack)
# Pair each first-level label with its second-level label.
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))

# Two-level row index with named levels.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [134]:
# 8x2 random frame indexed by the (first, second) MultiIndex (rebinds df).
df = pd.DataFrame(np.random.randn(8,2), index=index, columns = ['A', 'B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.452043,0.966761
bar,two,-0.190751,0.020714
baz,one,-0.00149,2.260213
baz,two,1.677084,-0.532336
foo,one,0.632506,1.774714
foo,two,1.072156,0.155703
qux,one,-0.74693,1.006989
qux,two,-1.460233,2.725604


In [135]:
## Pivot the column labels into a new innermost level of the row index
## (columns -> rows); the result here is a Series.
stacked = df.stack()  
stacked

first  second   
bar    one     A    1.452043
               B    0.966761
       two     A   -0.190751
               B    0.020714
baz    one     A   -0.001490
               B    2.260213
       two     A    1.677084
               B   -0.532336
foo    one     A    0.632506
               B    1.774714
       two     A    1.072156
               B    0.155703
qux    one     A   -0.746930
               B    1.006989
       two     A   -1.460233
               B    2.725604
dtype: float64

In [136]:
### unstack is the inverse of stack: with no argument it pivots the
### innermost index level (A/B) back into columns.
unstack = stacked.unstack()
unstack


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.452043,0.966761
bar,two,-0.190751,0.020714
baz,one,-0.00149,2.260213
baz,two,1.677084,-0.532336
foo,one,0.632506,1.774714
foo,two,1.072156,0.155703
qux,one,-0.74693,1.006989
qux,two,-1.460233,2.725604


In [143]:
# Pivot index level 0 ('first') into the columns.
unstack = stacked.unstack(0)
unstack

Unnamed: 0_level_0,first,bar,baz,foo,qux
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,A,1.452043,-0.00149,0.632506,-0.74693
one,B,0.966761,2.260213,1.774714,1.006989
two,A,-0.190751,1.677084,1.072156,-1.460233
two,B,0.020714,-0.532336,0.155703,2.725604


In [144]:
# Pivot index level 1 ('second') into the columns.
unstack = stacked.unstack(1)
unstack

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,1.452043,-0.190751
bar,B,0.966761,0.020714
baz,A,-0.00149,1.677084
baz,B,2.260213,-0.532336
foo,A,0.632506,1.072156
foo,B,1.774714,0.155703
qux,A,-0.74693,-1.460233
qux,B,1.006989,2.725604


In [145]:
# Pivot index level 2 (the A/B level); same result as the default unstack().
unstack = stacked.unstack(2)
unstack

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.452043,0.966761
bar,two,-0.190751,0.020714
baz,one,-0.00149,2.260213
baz,two,1.677084,-0.532336
foo,one,0.632506,1.774714
foo,two,1.072156,0.155703
qux,one,-0.74693,1.006989
qux,two,-1.460233,2.725604


In [146]:
# Pivot levels 0 and 1 together into the columns, leaving A/B as the rows.
unstack = stacked.unstack([0, 1])
unstack

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,1.452043,-0.190751,-0.00149,1.677084,0.632506,1.072156,-0.74693,-1.460233
B,0.966761,0.020714,2.260213,-0.532336,1.774714,0.155703,1.006989,2.725604
