In [2]:
import pandas as pd
import numpy as np


# 分组

In [3]:
dict_obj = {'key1' : ['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randn(8),
            'data2': np.random.randn(8)}
df_obj = pd.DataFrame(dict_obj)
df_obj

Unnamed: 0,data1,data2,key1,key2
0,-0.766085,0.929862,a,one
1,-0.044919,2.451428,b,one
2,1.512204,0.398841,a,two
3,-0.966868,-2.248148,b,three
4,0.033175,-1.047117,a,two
5,1.408734,-0.656034,b,two
6,-0.391598,0.138052,a,one
7,0.653694,-0.747137,a,three


## groupby分组

In [4]:
# 1.分组操作，创建groupby对象

# 如果对整个数据集进行分组，groupby参数直接指定列名即可
grouped = df_obj.groupby('key2')
grouped


<pandas.core.groupby.DataFrameGroupBy object at 0x0000000008F17898>

In [5]:
# 2.分组运算，对groupby对象进行运算
# 在分组的基础上，对分组对象调用方法进行运算
# 注意： 分组运算只会作用于数据部分，非数据部分不参与运算
grouped.sum()

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
one,-1.202602,3.519343
three,-0.313174,-2.995285
two,2.954113,-1.30431


In [6]:
# 如果对单独某个数据部分的列进行分组运算，那么groupby的参数必须制定数据集的某一列
group2 = df_obj['data2'].groupby(df_obj['key1'])
group2.mean()

key1
a   -0.065500
b   -0.150918
Name: data2, dtype: float64

In [7]:
# 自定义分组运算
# 如果现有的分组不满足业务需求，可以自己创建一个分组规则，实现数据分组运算


In [8]:
self_key = ["aa", "bb", "cc", "dd", "aa", "bb", "cc", "dd"]
grouped3 = df_obj.groupby(self_key)
grouped3.sum()

Unnamed: 0,data1,data2
aa,-0.73291,-0.117254
bb,1.363815,1.795394
cc,1.120606,0.536893
dd,-0.313174,-2.995285


In [12]:
# 多层分组，索引顺序按列表里的参数顺序来决定
#print(df_obj)
group4 = df_obj.groupby(["key2", "key1"])
#grouped4 = df_obj["data1"].groupby([df_obj["key2"], df_obj["key1"]])
group4.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key2,key1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,-1.157683,1.067914
one,b,-0.044919,2.451428
three,a,0.653694,-0.747137
three,b,-0.966868,-2.248148
two,a,1.545379,-0.648276
two,b,1.408734,-0.656034


## groupby 支持迭代操作


In [13]:
group4 = df_obj.groupby(['key2', 'key1'])
for name,data in group4:
    print(name)
    print(data)

('one', 'a')
      data1     data2 key1 key2
0 -0.766085  0.929862    a  one
6 -0.391598  0.138052    a  one
('one', 'b')
      data1     data2 key1 key2
1 -0.044919  2.451428    b  one
('three', 'a')
      data1     data2 key1   key2
7  0.653694 -0.747137    a  three
('three', 'b')
      data1     data2 key1   key2
3 -0.966868 -2.248148    b  three
('two', 'a')
      data1     data2 key1 key2
2  1.512204  0.398841    a  two
4  0.033175 -1.047117    a  two
('two', 'b')
      data1     data2 key1 key2
5  1.408734 -0.656034    b  two


In [14]:
# GroupBy对象可以直接转换为字典和列表

In [18]:
print(list(group4))
print('*'*50)
print(dict(list(group4)))

[(('one', 'a'),       data1     data2 key1 key2
0 -0.766085  0.929862    a  one
6 -0.391598  0.138052    a  one), (('one', 'b'),       data1     data2 key1 key2
1 -0.044919  2.451428    b  one), (('three', 'a'),       data1     data2 key1   key2
7  0.653694 -0.747137    a  three), (('three', 'b'),       data1     data2 key1   key2
3 -0.966868 -2.248148    b  three), (('two', 'a'),       data1     data2 key1 key2
2  1.512204  0.398841    a  two
4  0.033175 -1.047117    a  two), (('two', 'b'),       data1     data2 key1 key2
5  1.408734 -0.656034    b  two)]
**************************************************
{('one', 'a'):       data1     data2 key1 key2
0 -0.766085  0.929862    a  one
6 -0.391598  0.138052    a  one, ('one', 'b'):       data1     data2 key1 key2
1 -0.044919  2.451428    b  one, ('three', 'a'):       data1     data2 key1   key2
7  0.653694 -0.747137    a  three, ('three', 'b'):       data1     data2 key1   key2
3 -0.966868 -2.248148    b  three, ('two', 'a'):       data1

### 按照数据类型分组

In [20]:
# 可以通过数据乐行分组， 默认是按列，axis=1 表示轴方向为行,size() 返回每个分组的元素个数
print(df_obj.groupby(df_obj.dtypes, axis=1).size())

float64    2
object     2
dtype: int64


In [22]:
# 构建一个数据部分区间为1~10的5行5列，行索引是index列表，列索引是columns列表
df_obj2 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['A', 'B', 'C', 'D', 'E'])
# 给指定某个部分的数据重新赋值为np.NaN
df_obj2.ix[1, 1:4] = np.NaN
print(df_obj2)

   a    b    c    d  e
A  4  8.0  4.0  3.0  4
B  7  NaN  NaN  NaN  8
C  1  2.0  5.0  3.0  8
D  7  7.0  7.0  2.0  7
E  3  7.0  1.0  7.0  4


In [25]:
# 按字典分组
dic = {"a" : "C++", "b": "Python", "c" : "Java", "d" : "PHP", "e" : "C#"}
print(df_obj2.groupby(dic, axis = 1).size())

C#        1
C++       1
Java      1
PHP       1
Python    1
dtype: int64


In [26]:
# 通过自定义函数分组
def group_key(idx):
    """
        idx 为列索引或行索引
    """
    #return idx
    return len(idx)

#print(df_obj2.groupby(group_key, axis=1).sum())
print(df_obj2.groupby(len, axis = 1).sum())

      1
A  23.0
B  15.0
C  19.0
D  30.0
E  22.0


In [27]:
## 通过层级索引来分组
columns = pd.MultiIndex.from_arrays([['Python', 'Java', 'Python', 'Java', 'Python'],
                                     ['A', 'A', 'B', 'C', 'B']], names=['language', 'index'])
df_obj4 = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
print(df_obj4)

language Python Java Python Java Python
index         A    A      B    C      B
0             5    5      7    5      7
1             6    7      2    3      3
2             2    1      2    3      4
3             3    1      3    4      6
4             9    8      7    6      8


In [28]:
print(df_obj4.groupby(level ="language", axis=1).sum())

language  Java  Python
0           10      19
1           10      11
2            4       8
3            5      12
4           14      24
