# 分组与聚合
    目录：
    1. GroupBy对象
    2. GroupBy分组对象迭代
    3. 其他分组方式
    4. 聚合

### 1. GroupBy对象

#### · groupedby函数中的参数：
    as_index的作用：控制聚合输出是否以组标签为索引值，默认为True，就是分层次的索引，若为False多加一列默认索引索引，相当于非其他数据排序好了。
    但是这两组标签索引值不同有什么作用呢？=== 作用就是，根据的一列是否为索引列。
    sort_values的作用：对选定的一列数值数据从上往下从小到大进行排序（如果传值没成功===设置本体覆盖，传值覆盖）

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot 
%matplotlib inline

In [2]:
dict_obj = {'key1' : ['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randn(8),
            'data2': np.random.randn(8)}
df_obj = pd.DataFrame(dict_obj)
print (df_obj)

  key1   key2     data1     data2
0    a    one  0.182229 -0.648371
1    b    one  0.977752  1.749360
2    a    two  0.162701 -0.176894
3    b  three -0.026076 -1.352676
4    a    two  0.338570  1.342475
5    b    two -0.505157  1.015842
6    a    one -1.583258 -1.313521
7    a  three  0.588953 -0.768686


In [3]:
'''1. dataframe根据key2进行分组'''
print(df_obj.groupby('key2')['key1'].count())

print (type(df_obj.groupby('key1')))
#没有可视化的输出 

key2
one      3
three    2
two      3
Name: key1, dtype: int64
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


In [4]:
'''2. 指定列根据key1进行分组'''
print (type(df_obj['data1'].groupby(df_obj['key1'])))

<class 'pandas.core.groupby.generic.SeriesGroupBy'>


In [5]:
# 分组运算
grouped1 = df_obj.groupby('key1',as_index=False)
print (grouped1.mean())

grouped2 = df_obj['data1'].groupby(df_obj['key1'])#指定某一列的数据在该索引下进行分组并且加以聚合
print (grouped2.mean())

  key1     data1     data2
0    a -0.062161 -0.312999
1    b  0.148839  0.470842
key1
a   -0.062161
b    0.148839
Name: data1, dtype: float64


In [6]:
'''3. 按自定义key分组，列表'''
self_def_key = [1, 1, 2, 2, 2, 1, 1, 1]
df_obj.groupby(self_def_key).mean()

Unnamed: 0,data1,data2
1,-0.067896,0.006925
2,0.158398,-0.062365


In [7]:
df_obj

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.182229,-0.648371
1,b,one,0.977752,1.74936
2,a,two,0.162701,-0.176894
3,b,three,-0.026076,-1.352676
4,a,two,0.33857,1.342475
5,b,two,-0.505157,1.015842
6,a,one,-1.583258,-1.313521
7,a,three,0.588953,-0.768686


In [8]:
'''4. 按多个列多层分组 = = = 通过列表'''
grouped2 = df_obj.groupby(['key1', 'key2'],as_index=False)
print (grouped2.mean())
print('--------比较asindex的差异-------')
grouped2 = df_obj.groupby(['key1', 'key2'],as_index=True)
print (grouped2.mean())

  key1   key2     data1     data2
0    a    one -0.700514 -0.980946
1    a  three  0.588953 -0.768686
2    a    two  0.250636  0.582790
3    b    one  0.977752  1.749360
4    b  three -0.026076 -1.352676
5    b    two -0.505157  1.015842
--------比较asindex的差异-------
               data1     data2
key1 key2                     
a    one   -0.700514 -0.980946
     three  0.588953 -0.768686
     two    0.250636  0.582790
b    one    0.977752  1.749360
     three -0.026076 -1.352676
     two   -0.505157  1.015842


In [9]:
# 多层分组按key的顺序进行===和上面的asindex作用一样，把所选取的列数据当成索引，这才是区别之处
grouped3 = df_obj.groupby(['key2', 'key1'])
print (grouped3.mean())
print ('=============================================')
'''PS：如果想按照列进行分组聚合运算 === unstack===也可以通过转置'''
print (grouped3.mean().unstack())

               data1     data2
key2  key1                    
one   a    -0.700514 -0.980946
      b     0.977752  1.749360
three a     0.588953 -0.768686
      b    -0.026076 -1.352676
two   a     0.250636  0.582790
      b    -0.505157  1.015842
          data1               data2          
key1          a         b         a         b
key2                                         
one   -0.700514  0.977752 -0.980946  1.749360
three  0.588953 -0.026076 -0.768686 -1.352676
two    0.250636 -0.505157  0.582790  1.015842


### 2. GroupBy对象分组迭代===“遍历”

In [10]:
grouped1

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001A4727FA518>

In [11]:
# 单层分组
print(grouped1.head(5))
print("------------------------------------分割线------------------------------------------")
for group_name, group_data in grouped1:
    print (group_name)
    print (group_data['data1'])

  key1   key2     data1     data2
0    a    one  0.182229 -0.648371
1    b    one  0.977752  1.749360
2    a    two  0.162701 -0.176894
3    b  three -0.026076 -1.352676
4    a    two  0.338570  1.342475
5    b    two -0.505157  1.015842
6    a    one -1.583258 -1.313521
7    a  three  0.588953 -0.768686
------------------------------------分割线------------------------------------------
a
0    0.182229
2    0.162701
4    0.338570
6   -1.583258
7    0.588953
Name: data1, dtype: float64
b
1    0.977752
3   -0.026076
5   -0.505157
Name: data1, dtype: float64


In [12]:
# 多层分组
for group_name, group_data in grouped2:
    print (group_name)
    print (group_data)

('a', 'one')
  key1 key2     data1     data2
0    a  one  0.182229 -0.648371
6    a  one -1.583258 -1.313521
('a', 'three')
  key1   key2     data1     data2
7    a  three  0.588953 -0.768686
('a', 'two')
  key1 key2     data1     data2
2    a  two  0.162701 -0.176894
4    a  two  0.338570  1.342475
('b', 'one')
  key1 key2     data1    data2
1    b  one  0.977752  1.74936
('b', 'three')
  key1   key2     data1     data2
3    b  three -0.026076 -1.352676
('b', 'two')
  key1 key2     data1     data2
5    b  two -0.505157  1.015842


In [13]:
# GroupBy对象转换list
print(grouped1.mean())
list(grouped1)

  key1     data1     data2
0    a -0.062161 -0.312999
1    b  0.148839  0.470842


[('a',   key1   key2     data1     data2
  0    a    one  0.182229 -0.648371
  2    a    two  0.162701 -0.176894
  4    a    two  0.338570  1.342475
  6    a    one -1.583258 -1.313521
  7    a  three  0.588953 -0.768686), ('b',   key1   key2     data1     data2
  1    b    one  0.977752  1.749360
  3    b  three -0.026076 -1.352676
  5    b    two -0.505157  1.015842)]

In [14]:
# GroupBy对象转换dict
dict(list(grouped1))

{'a':   key1   key2     data1     data2
 0    a    one  0.182229 -0.648371
 2    a    two  0.162701 -0.176894
 4    a    two  0.338570  1.342475
 6    a    one -1.583258 -1.313521
 7    a  three  0.588953 -0.768686, 'b':   key1   key2     data1     data2
 1    b    one  0.977752  1.749360
 3    b  three -0.026076 -1.352676
 5    b    two -0.505157  1.015842}

In [15]:
# 按列分组
print (df_obj.dtypes)

# 按数据类型分组
df_obj.groupby(df_obj.dtypes, axis=1).size()
df_obj.groupby(df_obj.dtypes, axis=1).sum()

key1      object
key2      object
data1    float64
data2    float64
dtype: object


Unnamed: 0,float64,object
0,-0.466142,aone
1,2.727112,bone
2,-0.014193,atwo
3,-1.378753,bthree
4,1.681045,atwo
5,0.510685,btwo
6,-2.896778,aone
7,-0.179732,athree


### 3. 其他分组方法

0. 其实列表也是分组的一种方式
    ===用到列表时候，一般都是多层索引了

In [16]:
df_obj2 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['A', 'B', 'C', 'D', 'E'])
df_obj2.ix[1, 1:4] = np.NaN
df_obj2

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


Unnamed: 0,a,b,c,d,e
A,9,1.0,9.0,8.0,5
B,6,,,,1
C,4,7.0,5.0,9.0,8
D,6,8.0,6.0,1.0,2
E,6,1.0,8.0,3.0,5


1. 通过字典分组

In [17]:
# 通过字典分组
mapping_dict = {'A':'python', 'B':'python', 'C':'java', 'D':'C', 'E':'java'}
#df_obj2.groupby(mapping_dict, axis=1).size()
#df_obj2.groupby(mapping_dict, axis=1).count() # 非NaN的个数
print(df_obj2.groupby(mapping_dict, axis=0).sum())

         a    b     c     d   e
C        6  8.0   6.0   1.0   2
java    10  8.0  13.0  12.0  13
python  15  1.0   9.0   8.0   6


2. 通过函数分组

In [18]:
# 通过函数分组
df_obj3 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['AA', 'BBB', 'CC', 'D', 'EE'])
#df_obj3

def group_key(idx):
    """
        idx 为列索引或行索引
    """
    #return idx
    return len(idx)

df_obj3.groupby(group_key).size()

# 以上自定义函数等价于
#df_obj3.groupby(len).size()

1    1
2    3
3    1
dtype: int64

3. 通过层级索引级别分组

In [19]:
# 通过索引级别分组
columns = pd.MultiIndex.from_arrays([['Python', 'Java', 'Python', 'Java', 'Python'],
                                     ['A', 'A', 'B', 'C', 'B']], names=['language', 'index'])
df_obj4 = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
df_obj4

language,Python,Java,Python,Java,Python
index,A,A,B,C,B.1
0,4,2,1,8,3
1,5,1,5,4,5
2,6,7,6,5,1
3,3,5,3,9,5
4,5,4,5,6,3


In [20]:
# 根据language进行分组
df_obj4.groupby(level='language', axis=1).sum()
df_obj4.groupby(level='index', axis=1).sum()

index,A,B,C
0,6,4,8
1,6,10,4
2,13,7,5
3,8,8,9
4,9,8,6


### 4. 聚合

In [21]:
dict_obj = {'key1' : ['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randint(1,10, 8),
            'data2': np.random.randint(1,10, 8)}
df_obj5 = pd.DataFrame(dict_obj)
print (df_obj5)

  key1   key2  data1  data2
0    a    one      4      8
1    b    one      7      9
2    a    two      1      4
3    b  three      3      3
4    a    two      6      2
5    b    two      7      3
6    a    one      2      3
7    a  three      7      5


1. 内置的聚合函数

In [22]:
df_obj5

Unnamed: 0,key1,key2,data1,data2
0,a,one,4,8
1,b,one,7,9
2,a,two,1,4
3,b,three,3,3
4,a,two,6,2
5,b,two,7,3
6,a,one,2,3
7,a,three,7,5


In [29]:
# 内置的聚合函数
#print (df_obj5.groupby('key1').sum())
#print (df_obj5.groupby('key1').max())
#print (df_obj5.groupby('key1').min())
print (df_obj5.groupby('key1').mean())
#print (df_obj5.groupby('key1').size())
#print (df_obj5.groupby('key1').count())
#print (df_obj5.groupby('key1').describe())
'''
count：分组中非NA的值
std：标准差
var：方差
median：非NA中的中位数
mean：非NA的平均值
25%||50%||75%是什么意思==不造？
'''

         data1  data2
key1                 
a     4.000000    4.4
b     5.666667    5.0


'\ncount：分组中非NA的值\nstd：标准差\nvar：方差\nmedian：非NA中的中位数\nmean：非NA的平均值\n25%||50%||75%是什么意思==不造？\n'

2. 自定义聚合函数

In [24]:
# 自定义聚合函数
def peak_range(df):
    """
        返回数值范围
    """
    #print type(df) #参数为索引所对应的记录
    return df.max() - df.min()

print (df_obj5.groupby('key1').agg(peak_range))
#print df_obj.groupby('key1').agg(lambda df : df.max() - df.min())
#默认列名就是函数名。

      data1  data2
key1              
a         6      6
b         4      6


3. 同时应用多个聚合函数：agg

In [25]:
# 同时应用多个聚合函数：agg
print (df_obj.groupby('key1').agg(['mean', 'std', 'count'])) 


         data1                    data2                
          mean      std count      mean       std count
key1                                                   
a    -0.062161  0.86726     5 -0.312999  1.009986     5
b     0.148839  0.75677     3  0.470842  1.621242     3


In [26]:
print (df_obj.groupby('key1').agg(['mean', 'std', 'count', ('range', peak_range)])) # 通过元组提供新的列名

         data1                              data2                          
          mean      std count     range      mean       std count     range
key1                                                                       
a    -0.062161  0.86726     5  2.172211 -0.312999  1.009986     5  2.655995
b     0.148839  0.75677     3  1.482909  0.470842  1.621242     3  3.102036


In [27]:
# 每列作用不同的聚合函数
dict_mapping = {'data1':'mean',
                'data2':'sum'}
print (df_obj.groupby('key1').agg(dict_mapping))

         data1     data2
key1                    
a    -0.062161 -1.564997
b     0.148839  1.412526


In [28]:
dict_mapping = {'data1':['mean','max'],
                'data2':'sum'}
print (df_obj.groupby('key1').agg(dict_mapping))

         data1               data2
          mean       max       sum
key1                              
a    -0.062161  0.588953 -1.564997
b     0.148839  0.977752  1.412526
