In [1]:
import pandas as pd
import numpy as np

## 分组

In [2]:
# Seed NumPy's global generator so the random columns (and every later
# np.random call in the notebook) are reproducible on Restart & Run All.
np.random.seed(42)

# Two label columns used as grouping keys plus two columns of random
# standard-normal values that act as the data to aggregate.
dict_obj = {'key1' : ['a', 'b', 'a', 'b',
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randn(8),
            'data2': np.random.randn(8)}
df_obj = pd.DataFrame(dict_obj)
print(df_obj)

      data1     data2 key1   key2
0  1.614626 -1.571301    a    one
1  0.460157  0.041038    b    one
2 -2.070866 -0.885139    a    two
3  0.770066  0.278396    b  three
4  0.040073 -1.834590    a    two
5 -0.105759 -0.115968    b    two
6  0.442917 -1.719780    a    one
7  1.368152 -0.798185    a  three


#### groupby 分组

In [7]:
# Step 1: split the data, which yields a GroupBy object.
#
# To group the whole DataFrame, passing the name of the key column to
# groupby is enough.
grouped = df_obj.groupby(by="key2")
print(grouped)

<pandas.core.groupby.DataFrameGroupBy object at 0x00000134964429E8>


In [8]:
# Step 2: run an aggregation on the GroupBy object.
#
# Aggregation methods are called on the grouped object itself.
# numeric_only=True restricts the sum to the numeric data columns; without
# it, pandas >= 2.0 also "sums" the string column key1 by concatenating it.
print(grouped.sum(numeric_only=True))

          data1     data2
key2                     
one    2.517700 -3.250042
three  2.138218 -0.519789
two   -2.136552 -2.835698


In [12]:
# To aggregate a single data column, select that column first and pass the
# key column itself (a Series of the same DataFrame) to groupby, because the
# selected Series no longer contains the key column to look up by name.
# NOTE(review): the recorded output below is indexed by key2, which does not
# match this key1 grouping - re-run the cell or confirm the intended key.
grouped2 = df_obj["data2"].groupby(by=df_obj["key1"])
grouped2.mean()

key2
one     -1.083347
three   -0.259895
two     -0.945233
Name: data2, dtype: float64

In [None]:
# 3. 自定义分组运算
# 如果现有的分组不满足业务需求，可以自己创建一个分组规则，实现数据分组运算

In [16]:
# Custom grouping: one label per row (same length as df_obj); rows that share
# a label end up in the same group.
self_key = ["aa", "bb", "cc", "dd", "aa", "bb", "cc", "dd"]
grouped3 = df_obj.groupby(self_key)
# numeric_only=True keeps the string columns key1/key2 out of the result;
# pandas >= 2.0 would otherwise concatenate them.
grouped3.sum(numeric_only=True)

Unnamed: 0,data1,data2
aa,1.654699,-3.405891
bb,0.354398,-0.07493
cc,-1.627948,-2.604919
dd,2.138218,-0.519789


In [25]:
# Step 4: multi-level grouping. The levels of the resulting index follow the
# order of the keys in the list.
# The same grouping can be applied to a single column with
# df_obj["data1"].groupby([df_obj["key2"], df_obj["key1"]]).
grouped4 = df_obj.groupby(by=["key2", "key1"])
grouped4.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key2,key1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,2.057543,-3.29108
one,b,0.460157,0.041038
three,a,1.368152,-0.798185
three,b,0.770066,0.278396
two,a,-2.030793,-2.719729
two,b,-0.105759,-0.115968


### groupby 支持迭代操作

In [30]:
# Iterating a GroupBy object yields (group key, sub-DataFrame) pairs; with
# two grouping columns the key is a tuple.
grouped4 = df_obj.groupby(by=["key2", "key1"])
for name, data in grouped4:
    print(name, data, sep="\n")

('one', 'a')
      data1     data2 key1 key2
0  1.614626 -1.571301    a  one
6  0.442917 -1.719780    a  one
('one', 'b')
      data1     data2 key1 key2
1  0.460157  0.041038    b  one
('three', 'a')
      data1     data2 key1   key2
7  1.368152 -0.798185    a  three
('three', 'b')
      data1     data2 key1   key2
3  0.770066  0.278396    b  three
('two', 'a')
      data1     data2 key1 key2
2 -2.070866 -0.885139    a  two
4  0.040073 -1.834590    a  two
('two', 'b')
      data1     data2 key1 key2
5 -0.105759 -0.115968    b  two


In [None]:
## GroupBy对象可以直接转换为字典和列表

In [33]:
# Materialise the GroupBy object as a list of (key, sub-DataFrame) tuples.
group_pairs = list(grouped4)
print(group_pairs)

(('one', 'b'),       data1     data2 key1 key2
1  0.460157  0.041038    b  one)


In [35]:
# Build a {group key: sub-DataFrame} mapping from the GroupBy object.
print({key: frame for key, frame in grouped4})

{('one', 'a'):       data1     data2 key1 key2
0  1.614626 -1.571301    a  one
6  0.442917 -1.719780    a  one, ('one', 'b'):       data1     data2 key1 key2
1  0.460157  0.041038    b  one, ('three', 'a'):       data1     data2 key1   key2
7  1.368152 -0.798185    a  three, ('three', 'b'):       data1     data2 key1   key2
3  0.770066  0.278396    b  three, ('two', 'a'):       data1     data2 key1 key2
2 -2.070866 -0.885139    a  two
4  0.040073 -1.834590    a  two, ('two', 'b'):       data1     data2 key1 key2
5 -0.105759 -0.115968    b  two}


In [None]:
## 按数据类型分组

In [38]:
# Show the dtype of every column; these dtypes serve as grouping keys next.
column_dtypes = df_obj.dtypes
print(column_dtypes)

data1    float64
data2    float64
key1      object
key2      object
dtype: object


In [43]:
# Group the columns by their dtype and count the columns in each group.
# groupby splits along the rows (axis=0) by default, and groupby(axis=1) is
# deprecated since pandas 2.1, so the frame is transposed to turn its columns
# into rows; df_obj.dtypes (indexed by column label) then aligns with them.
print(df_obj.T.groupby(df_obj.dtypes).size())

float64    2
object     2
dtype: int64


In [44]:
# Build a 5x5 frame of random integers in [1, 10) with row labels A..E and
# column labels a..e.
df_obj2 = pd.DataFrame(np.random.randint(1, 10, (5, 5)),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['A', 'B', 'C', 'D', 'E'])
# Columns b..d receive NaN below; cast them to float up front so the
# assignment does not depend on silent int -> float upcasting.
df_obj2 = df_obj2.astype({'b': float, 'c': float, 'd': float})
# Blank out row position 1 (label 'B'), column positions 1..3 (b, c, d).
# .iloc replaces the removed .ix indexer; np.nan replaces the removed np.NaN alias.
df_obj2.iloc[1, 1:4] = np.nan
print(df_obj2)

   a    b    c    d  e
A  7  3.0  6.0  7.0  4
B  8  NaN  NaN  NaN  7
C  4  4.0  5.0  9.0  8
D  7  3.0  3.0  2.0  3
E  4  8.0  6.0  8.0  5


In [52]:
# Group columns through a dict that maps each column label to a group name,
# then count the columns per group.
dic = {"a" : "C++", "b": "Python", "c" : "Java", "d" : "PHP", "e" : "C#"}
# groupby(axis=1) is deprecated since pandas 2.1; transposing turns the
# column labels into row labels so the dict is applied to them.
print(df_obj2.T.groupby(dic).size())

C#        1
C++       1
Java      1
PHP       1
Python    1
dtype: int64


In [58]:
## Grouping with a custom function
def group_key(idx):
    """Return the group key for one index label: the length of the label.

    idx is a single row label or column label.
    """
    label_length = len(idx)
    return label_length

# Group the columns by the key that group_key computes from each column label
# (its length) and sum within each group. groupby(axis=1) is deprecated since
# pandas 2.1, so group the rows of the transposed frame and transpose back.
print(df_obj2.T.groupby(group_key).sum().T)

      1
A  27.0
B  15.0
C  30.0
D  18.0
E  31.0


In [59]:
## Grouping by a level of a hierarchical index
# The columns carry a two-level MultiIndex: "language" (outer) and "index"
# (inner); the cells are random integers in [1, 10).
level_values = [
    ['Python', 'Java', 'Python', 'Java', 'Python'],
    ['A', 'A', 'B', 'C', 'B'],
]
columns = pd.MultiIndex.from_arrays(level_values, names=['language', 'index'])
random_cells = np.random.randint(1, 10, (5, 5))
df_obj4 = pd.DataFrame(random_cells, columns=columns)
print(df_obj4)

language Python Java Python Java Python
index         A    A      B    C      B
0             5    2      6    7      9
1             1    2      2    2      2
2             1    2      3    9      3
3             4    5      5    7      6
4             6    7      1    6      7


In [62]:
# Sum the columns that share the same value in the "language" level of the
# column MultiIndex (pass level="index" to group by the inner level instead).
# groupby(axis=1) is deprecated since pandas 2.1, so group the rows of the
# transposed frame and transpose the result back.
print(df_obj4.T.groupby(level="language").sum().T)

language  Java  Python
0            9      20
1            4       5
2           11       7
3           12      15
4           13      14
