In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
df = pd.DataFrame({
    'key1':['a','a','b','b','a'],
    'key2':['one','two','one','two','one'],
    'data1':np.random.randn(5),
    'data2':np.random.randn(5)
})

In [12]:
list1 = [True,False,True,False,True]
d = pd.Series(list1)
df[d]

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.273387,-0.550245
2,b,one,0.230299,-0.204204
4,a,one,-0.119497,0.733446


In [4]:
grouped = df['data1'].groupby(df['data1'])

In [5]:
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fed6e8b98a0>

In [9]:
list(grouped)

[(-1.4228331519893986,
  2   -1.422833
  Name: data1, dtype: float64),
 (0.12860485554810794,
  0    0.128605
  Name: data1, dtype: float64),
 (0.49207946777965605,
  1    0.492079
  Name: data1, dtype: float64),
 (1.0767690238222885,
  4    1.076769
  Name: data1, dtype: float64),
 (1.440102072152684,
  3    1.440102
  Name: data1, dtype: float64)]

In [10]:
means = df['data1'].groupby([df['key1'],df['key2']]).mean()

In [11]:
means

key1  key2
a     one     0.602687
      two     0.492079
b     one    -1.422833
      two     1.440102
Name: data1, dtype: float64

In [13]:
for name,group in df.groupby('key1'):
    print(name)
    print(group)

a
  key1 key2     data1     data2
0    a  one  0.128605  0.240140
1    a  two  0.492079 -0.457660
4    a  one  1.076769 -0.330855
b
  key1 key2     data1     data2
2    b  one -1.422833 -0.813879
3    b  two  1.440102 -1.312965


In [14]:
for (k1,k2),group in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)

('a', 'one')
  key1 key2     data1     data2
0    a  one  0.128605  0.240140
4    a  one  1.076769 -0.330855
('a', 'two')
  key1 key2     data1    data2
1    a  two  0.492079 -0.45766
('b', 'one')
  key1 key2     data1     data2
2    b  one -1.422833 -0.813879
('b', 'two')
  key1 key2     data1     data2
3    b  two  1.440102 -1.312965


In [16]:
a = df.groupby('key1')['data1']

In [17]:
list(a)

[('a',
  0    0.128605
  1    0.492079
  4    1.076769
  Name: data1, dtype: float64),
 ('b',
  2   -1.422833
  3    1.440102
  Name: data1, dtype: float64)]

In [18]:
people = pd.DataFrame(np.random.randn(5,5),
                      columns=['a','b','c','d','e'],
                      index=['Joe','Steve','Wes','Jim','Travis'])
people.iloc[2:3,[1,2]] = np.nan

In [19]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.343212,-0.511119,0.71935,0.075816,0.38789
Steve,-0.373848,1.688375,-1.389028,3.032972,0.236493
Wes,-1.148513,,,-0.582643,0.467557
Jim,-0.324187,-1.582209,1.120964,-2.604839,-0.118904
Travis,-0.614453,0.794641,-1.124122,0.511303,-0.982299


In [20]:
mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}

In [21]:
by_column = people.groupby(mapping,axis=1)

In [22]:
by_column

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fed6e5ddde0>

In [23]:
list(by_column)

[('blue',
                 c         d
  Joe     0.719350  0.075816
  Steve  -1.389028  3.032972
  Wes          NaN -0.582643
  Jim     1.120964 -2.604839
  Travis -1.124122  0.511303),
 ('red',
                 a         b         e
  Joe    -1.343212 -0.511119  0.387890
  Steve  -0.373848  1.688375  0.236493
  Wes    -1.148513       NaN  0.467557
  Jim    -0.324187 -1.582209 -0.118904
  Travis -0.614453  0.794641 -0.982299)]

In [24]:
by_column.sum()

Unnamed: 0,blue,red
Joe,0.795166,-1.466441
Steve,1.643944,1.55102
Wes,-0.582643,-0.680957
Jim,-1.483874,-2.0253
Travis,-0.612818,-0.802111


In [25]:
map_series = pd.Series(mapping)

In [26]:
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [30]:
people.groupby(map_series,axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [32]:
# 函数分组是针对索引的分组
list(people.groupby(len))

[(3,
              a         b         c         d         e
  Joe -1.343212 -0.511119  0.719350  0.075816  0.387890
  Wes -1.148513       NaN       NaN -0.582643  0.467557
  Jim -0.324187 -1.582209  1.120964 -2.604839 -0.118904),
 (5,
                a         b         c         d         e
  Steve -0.373848  1.688375 -1.389028  3.032972  0.236493),
 (6,
                 a         b         c         d         e
  Travis -0.614453  0.794641 -1.124122  0.511303 -0.982299)]

In [29]:
states = ['Ohio','New York','Vermont','Florida','Oregon','Nevada','California','Idaho']
group_key = ['East'] * 4 + ['West'] * 4

In [30]:
data = pd.Series(np.random.randn(8),index=states)

In [31]:
data

Ohio         -1.142032
New York      0.722589
Vermont       0.829238
Florida      -0.332280
Oregon        0.620844
Nevada        0.496515
California   -1.410945
Idaho        -0.487048
dtype: float64

In [32]:
data[['Vermont','Nevada','Idaho']] = np.nan

In [33]:
data

Ohio         -1.142032
New York      0.722589
Vermont            NaN
Florida      -0.332280
Oregon        0.620844
Nevada             NaN
California   -1.410945
Idaho              NaN
dtype: float64

In [34]:
list(data.groupby(group_key))

[('East',
  Ohio       -1.142032
  New York    0.722589
  Vermont          NaN
  Florida    -0.332280
  dtype: float64),
 ('West',
  Oregon        0.620844
  Nevada             NaN
  California   -1.410945
  Idaho              NaN
  dtype: float64)]

In [35]:
fill_mean = lambda  g:g.fillna(g.mean())

In [36]:
data.groupby(group_key).apply(fill_mean)

Ohio         -1.142032
New York      0.722589
Vermont      -0.250575
Florida      -0.332280
Oregon        0.620844
Nevada       -0.395051
California   -1.410945
Idaho        -0.395051
dtype: float64

In [3]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.134436,1.360594
1,a,two,-0.244901,0.84988
2,b,one,-0.648574,-0.87736
3,b,two,-0.59732,-0.551749
4,a,one,1.134185,-1.790316


In [4]:
df[df.data1 > df.data1.mean()]

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.134436,1.360594
4,a,one,1.134185,-1.790316


In [5]:
df.head()

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.134436,1.360594
1,a,two,-0.244901,0.84988
2,b,one,-0.648574,-0.87736
3,b,two,-0.59732,-0.551749
4,a,one,1.134185,-1.790316


In [7]:
df.head()[[True,True,False,False,False]]

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.134436,1.360594
1,a,two,-0.244901,0.84988


In [8]:
df.head()['key1']

0    a
1    a
2    b
3    b
4    a
Name: key1, dtype: object