In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Seed NumPy's global random generator so the random draws made later in
# the notebook are reproducible.
SEED = 12345
np.random.seed(SEED)

In [2]:
# Small example frame: two label columns (key1, key2) used as group keys
# and two integer columns (data1, data2) holding 0..4.
df = pd.DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.arange(5),
    data2=np.arange(5),
))
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0,0
1,a,two,1,1
2,b,one,2,2
3,b,two,3,3
4,a,one,4,4


In [14]:
# Pull out the data1 column as a Series (all rows, one column).
df.loc[:, 'data1']

0    0
1    1
2    2
3    3
4    4
Name: data1, dtype: int32

In [3]:
# Split data1 into groups labelled by key1. The value displayed below is a
# SeriesGroupBy object, not an aggregated result.
key1_labels = df['key1']
grouped = df['data1'].groupby(key1_labels)
grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000001869DDC4A20>

In [13]:
# Average of data1 within each key1 group.
grouped.agg('mean')

key1
a    1.666667
b    2.500000
Name: data1, dtype: float64

In [4]:
# Same per-key1 mean of data1, written as one chained expression.
(
    df['data1']
    .groupby(df['key1'])
    .mean()
)

key1
a    1.666667
b    2.500000
Name: data1, dtype: float64

In [5]:
# Aggregate data1 over every (key1, key2) pair; the result is a Series with
# a two-level index.
# NOTE(review): despite its name, `means` holds per-group *sums* (the
# aggregation is .sum()). The name is kept because a later cell calls
# means.unstack().
pair_keys = [df['key1'], df['key2']]
means = df['data1'].groupby(pair_keys).sum()
means

key1  key2
a     one     4
      two     1
b     one     2
      two     3
Name: data1, dtype: int64

In [6]:
# Pivot the innermost index level (key2) into columns, leaving key1 as rows.
means.unstack(level=-1)

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4,1
b,2,3


In [11]:
# Group keys do not have to be columns of df: here two standalone arrays,
# each with one label per row of df, are used as the keys.
states = np.array('Ohio California California Ohio Ohio'.split())
years = np.array([2005, 2005, 2006, 2005, 2006])
external_keys = [states, years]
df['data1'].groupby(external_keys).mean()

California  2005    1.0
            2006    2.0
Ohio        2005    1.5
            2006    4.0
Name: data1, dtype: float64

In [6]:
# Re-display the example frame before the DataFrame-level groupby cells.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0,0
1,a,two,1,1
2,b,one,2,2
3,b,two,3,3
4,a,one,4,4


In [15]:
# Average the numeric columns within each key1 group.
# numeric_only=True keeps the string column key2 out of the aggregation:
# pandas >= 2.0 no longer drops such columns silently and raises a
# TypeError when asked to average them.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.666667,1.666667
b,2.5,2.5


In [16]:
# Group by both key columns; the remaining columns (data1, data2) are
# averaged within each (key1, key2) pair.
key_columns = ['key1', 'key2']
df.groupby(key_columns).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,2,2
a,two,1,1
b,one,2,2
b,two,3,3


**`.size()`**: returns a Series containing group sizes.

In [17]:
# Number of rows in each (key1, key2) group.
(
    df
    .groupby(['key1', 'key2'])
    .size()
)

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### Selecting a Column or Subset of Columns
Indexing a GroupBy object created from a DataFrame with a column name or array of column names has the effect of column subsetting for aggregation. This means that:

    df.groupby('key1')['data1']
    df.groupby('key1')[['data2']]
are syntactic sugar for:

    df['data1'].groupby(df['key1'])
    df[['data2']].groupby(df['key1'])


In [23]:
# Column selection on the GroupBy object: shorthand for
# df['data1'].groupby(df['key1']).mean().
df.groupby('key1')['data1'].mean()

key1
a    1.666667
b    2.500000
Name: data1, dtype: float64

In [18]:
# Group by key1, then by key2 within each key1 group, and compute the mean.
# Selecting with a list ([['data2']]) keeps the result a DataFrame.
(
    df
    .groupby(['key1', 'key2'])[['data2']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,2
a,two,1
b,one,2
b,two,3


> The object returned by this indexing operation is a **grouped DataFrame** if a **list or array** is passed, or a **grouped Series** if only a **single column name** is passed as a **scalar**.

In [19]:
# Selecting a single column name (a scalar) from the grouped frame gives a
# SeriesGroupBy object, as the repr below shows.
by_both_keys = df.groupby(['key1', 'key2'])
s_grouped = by_both_keys['data2']
s_grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x000001869E0B8550>

In [20]:
# Average data2 within each (key1, key2) group; the result is a Series with
# a two-level (key1, key2) index.
s_grouped.mean()

key1  key2
a     one     2
      two     1
b     one     2
      two     3
Name: data2, dtype: int32

### Grouping with Dicts and Series
Grouping information may exist in a form other than an array. Let’s consider another example **DataFrame**:

In [26]:
# 5x5 frame holding 0..24, with people as row labels and letters as columns.
row_labels = ['Joe', 'Steve', 'Wes', 'Jim', 'Travis']
column_labels = list('abcde')
people = pd.DataFrame(np.arange(25).reshape(5, 5),
                      index=row_labels,
                      columns=column_labels)
# Blank out Wes's values in columns 'b' and 'c' so the examples include NAs.
people.iloc[2:3, [1, 2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0,1.0,2.0,3,4
Steve,5,6.0,7.0,8,9
Wes,10,,,13,14
Jim,15,16.0,17.0,18,19
Travis,20,21.0,22.0,23,24


In [27]:
# Column-name -> group-label lookup. It can be passed to groupby directly
# (you could also build an array from it). The 'f' entry does not match any
# column of `people`.
mapping = dict(a='red', b='red', c='blue',
               d='blue', e='red', f='orange')

In [35]:
# Sum the columns of `people` by the colour assigned in `mapping`.
# groupby(..., axis=1) is deprecated since pandas 2.1, so the columns are
# grouped as rows of the transposed frame and the result is transposed
# back, giving one row per person and one column per colour.
by_column = people.T.groupby(mapping)
by_column.sum().T


Unnamed: 0,blue,red
Joe,5.0,5.0
Steve,15.0,20.0
Wes,13.0,24.0
Jim,35.0,50.0
Travis,45.0,65.0


In [32]:
# The same grouping works with a Series, which can be viewed as a
# fixed-size mapping from its index labels to its values.
map_series = pd.Series(data=mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [33]:
# Count the non-NA values per colour group for each person.
# groupby(..., axis=1) is deprecated since pandas 2.1, so the columns are
# grouped as rows of the transposed frame and the result is transposed back.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


### Grouping with Functions
Suppose you wanted to group by the length of the names; while you could compute an array of string lengths, it’s simpler to just pass the **`len` function**:

In [36]:
# Group the rows by the length of each name in the index, then sum.
by_name_length = people.groupby(len)
by_name_length.sum()

Unnamed: 0,a,b,c,d,e
3,25,17.0,19.0,34,37
5,5,6.0,7.0,8,9
6,20,21.0,22.0,23,24


Mixing functions with arrays, dicts, or Series is not a problem as everything gets converted to **arrays** internally:

In [10]:
# Combine a function key (name length) with an explicit list of labels,
# one label per row of `people`, and take the per-group minimum.
key_list = ['one'] * 3 + ['two'] * 2
mixed_keys = [len, key_list]
people.groupby(mixed_keys).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0,1.0,2.0,3,4
3,two,15,16.0,17.0,18,19
5,one,5,6.0,7.0,8,9
6,two,20,21.0,22.0,23,24


### Grouping by Index Levels
A final convenience for hierarchically indexed datasets is the ability to aggregate using one of the levels of an axis index. Let’s look at an example:

In [37]:
# Two-level column index: a country code ('cty') over a tenor ('tenor').
country_codes = ['US', 'US', 'US', 'JP', 'JP']
tenors = [1, 3, 5, 1, 3]
columns_M = pd.MultiIndex.from_arrays([country_codes, tenors],
                                      names=['cty', 'tenor'])

In [38]:
# 4x5 frame of standard-normal draws under the hierarchical columns.
random_values = np.random.randn(4, 5)
hier_df = pd.DataFrame(random_values, columns=columns_M)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.204708,0.478943,-0.519439,-0.55573,1.965781
1,1.393406,0.092908,0.281746,0.769023,1.246435
2,1.007189,-1.296221,0.274992,0.228913,1.352917
3,0.886429,-2.001637,-0.371843,1.669025,-0.43857


In [39]:
# To group by an index level, pass the level number or name via `level`.
# The 'cty' level lives on the columns; groupby(..., axis=1) is deprecated
# since pandas 2.1, so the frame is transposed to put that level on the
# rows, grouped, counted, and transposed back.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
