In [2]:
import pandas as pd
import numpy as np
# pd.set_option("display.show_dimensions", False)
# pd.set_option("display.float_format", "{:4.2g}".format)

## 分组运算

In [3]:
# Load the dataset that every later cell groups and aggregates.
dose_df = pd.read_csv("dose.csv")
# NOTE: this preview is not the last expression of the cell, so its value is
# discarded and nothing is displayed for it.
dose_df.head(3)
# List the distinct values of the Tmt and Age columns. %C is a display magic
# defined outside this notebook; judging by the output it renders each
# expression in its own column.
%C dose_df.Tmt.unique();dose_df.Age.unique()

dose_df.Tmt.unique()   dose_df.Age.unique()
--------------------  ---------------------
['C', 'D', 'A', 'B']  ['60s', '50s', '40s']


### `groupby()`方法

> **TIP**

> `groupby()`并不立即执行分组操作，而只是返回保存源数据和分组数据的`GroupBy`对象。在需要获取每个分组的实际数据时，`GroupBy`对象才会执行分组操作。

In [4]:
# Build a GroupBy object keyed on the treatment column and show its type.
tmt_group = dose_df.groupby(by="Tmt")
print(type(tmt_group))

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


In [5]:
# Group on two columns at once; each group key is then a (Tmt, Age) tuple.
tmt_age_group = dose_df.groupby(by=["Tmt", "Age"])

In [6]:
# Group by external data: an array holding one integer label (0-4) per row.
# The labels come from a seeded generator so that this grouping, and every
# result derived from it, is identical after "Restart Kernel -> Run All".
label_rng = np.random.default_rng(0)
random_values = label_rng.integers(0, 5, size=dose_df.shape[0])
random_group = dose_df.groupby(random_values)

In [7]:
# Group by a value computed from the row index: the callable is applied to
# each index label and its return value (label modulo 3) is the group key.
alternating_group = dose_df.groupby(by=lambda row_label: row_label % 3)

In [8]:
# Combined grouping: a column name, a function of the row label and an
# external label array can all appear in the same list of keys.
crazy_keys = ["Gender", lambda row_label: row_label % 2, random_values]
crazy_group = dose_df.groupby(crazy_keys)

### `GroupBy`对象

In [9]:
# len() of a GroupBy object is its number of groups.
group_counts = (len(tmt_age_group), len(crazy_group))
print(group_counts)

(10, 20)


In [10]:
# Iterate over the GroupBy object: every item is a (key, sub-frame) pair.
for pair in tmt_age_group:
    key, df = pair
    print(f"key ={key},shape ={df.shape}")

key =('A', '50s'),shape =(39, 6)
key =('A', '60s'),shape =(26, 6)
key =('B', '40s'),shape =(13, 6)
key =('B', '50s'),shape =(13, 6)
key =('B', '60s'),shape =(39, 6)
key =('C', '40s'),shape =(13, 6)
key =('C', '50s'),shape =(13, 6)
key =('C', '60s'),shape =(39, 6)
key =('D', '50s'),shape =(52, 6)
key =('D', '60s'),shape =(13, 6)


In [11]:
# Give each treatment group's sub-frame its own name. Iterating the GroupBy
# yields (key, sub-frame) pairs; with four treatments this unpacks to A-D.
# The keys are discarded inside the generator expression so that the global
# name `_` is not rebound: assigning to `_` at cell level would stop IPython
# from using it as the "last output" variable.
df_A, df_B, df_C, df_D = (group_df for _key, group_df in tmt_group)
df_A

Unnamed: 0,Dose,Response1,Response2,Tmt,Age,Gender
6,1.0,0.000,0.000,A,50s,F
10,15.0,5.225,5.163,A,60s,F
12,5.0,0.000,0.001,A,60s,F
17,5.0,0.000,0.003,A,50s,M
32,100.0,9.295,10.103,A,60s,F
...,...,...,...,...,...,...
247,60.0,10.527,10.745,A,50s,F
248,50.0,10.312,10.131,A,60s,F
255,80.0,9.239,10.103,A,60s,F
256,5.0,0.000,0.000,A,50s,M


> **TIP**

> 由于`GroupBy`对象有`keys`属性，因此无法通过`dict(tmt_group)`直接将其转换为字典，可以先将其转换为迭代器，再转换为字典`dict(iter(tmt_group))`。

In [12]:
# Fetch a single group by its key. A grouping on several columns takes a
# tuple key, e.g. ("A", "50s") for the (Tmt, Age) grouping.
%C tmt_group.get_group("A").head(3);; tmt_age_group.get_group(("A", "50s")).head(3)

       tmt_group.get_group("A").head(3)       
----------------------------------------------
    Dose  Response1  Response2 Tmt  Age Gender
6    1.0      0.000      0.000   A  50s      F
10  15.0      5.225      5.163   A  60s      F
12   5.0      0.000      0.001   A  60s      F

tmt_age_group.get_group(("A", "50s")).head(3) 
----------------------------------------------
    Dose  Response1  Response2 Tmt  Age Gender
6    1.0      0.000      0.000   A  50s      F
17   5.0      0.000      0.003   A  50s      M
34  40.0     10.825     10.464   A  50s      M


In [13]:
#分组数据中筛选出某些列
%C tmt_group["Dose"].get_group("A")
print((tmt_group[["Response1", "Response2"]]))

   tmt_group["Dose"].get_group("A")   
--------------------------------------
6        1.0                          
10      15.0                          
12       5.0                          
17       5.0                          
32     100.0                          
       ...                            
247     60.0                          
248     50.0                          
255     80.0                          
256      5.0                          
257     25.0                          
Name: Dose, Length: 65, dtype: float64
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C6F0BC7100>


In [14]:
# Attribute access has the same effect as the [] selection in the previous cell.
%C tmt_group.Dose

                             tmt_group.Dose                             
------------------------------------------------------------------------
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001C6FF85E8C0>


### 分组－运算－合并

#### `agg()`－聚合
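`agg()`优先将每个分组df的每一列（`Series`）传入回调函数进行计算，然后将结果聚合。旧版本的pandas在回调函数无法按列计算时，会尝试将每个分组的df整体传入；较新的版本不再回退——回调函数始终收到`Series`，此时按df的方式访问列会抛出`AttributeError`，需要按整个df计算时应改用`apply()`。  
`agg()`中回调函数对每一列的返回值必须是单个数值。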

In [15]:
agg_res1 = tmt_group.agg(np.mean) #❶ 每组数据求均值
#idxmax()返回每列第一次出现最大值时的索引
agg_res2 = tmt_group.agg(lambda df:df.loc[df.Response1.idxmax()]) #❷ 每组中Response1最大的那一行
%C 4 agg_res1; agg_res2

AttributeError: 'Series' object has no attribute 'Response1'

#### `transform()`－转换
transform操作逻辑与agg类似，但transform传入列后，每个元素都会返回一个值。因此transform的结果为df且与原df的index一致

In [None]:
# Reference view for the transform() examples below: the per-group means
# shown next to the first rows of the source frame.
# NOTE(review): aggregating with np.mean across the non-numeric columns may
# fail on newer pandas releases — confirm against the installed version.
%C 4 tmt_group.agg(np.mean); dose_df.head()

       tmt_group.agg(np.mean)                           dose_df.head()               
------------------------------------    ---------------------------------------------
          Dose  Response1  Response2       Dose  Response1  Response2 Tmt  Age Gender
Tmt                                     0  50.0      9.872     10.032   C  60s      F
A    33.546154   6.728985   6.863185    1  15.0      0.002      0.004   D  60s      F
B    33.546154   5.573354   5.456415    2  25.0      0.626      0.803   C  50s      M
C    33.546154   4.040415   4.115323    3  25.0      1.372      1.557   C  60s      F
D    33.546154   3.320646   3.188369    4  15.0      0.010      0.020   C  60s      F


In [None]:
transform_res1 = tmt_group.transform(lambda s:s - s.mean()) #❶
# transform_res2 = tmt_group.transform(
#     lambda df:df.assign(Response1=df.Response1 - df.Response1.mean())) #❷
#第二种对df的操作不生效，原因未知
%C transform_res1.head(5)

      transform_res1.head(5)      
----------------------------------
        Dose  Response1  Response2
0  16.453846   5.831585   5.916677
1 -18.546154  -3.318646  -3.184369
2  -8.546154  -3.414415  -3.312323
3  -8.546154  -2.668415  -2.558323
4 -18.546154  -4.030415  -4.095323


  transform_res1 = tmt_group.transform(lambda s:s - s.mean()) #❶


#### `filter()`－过滤
过滤后返回的是合格的组组成的df

In [None]:
# Per-group maximum of every column, used to choose the filter() threshold in
# the next cell. The string alias "max" is used instead of the NumPy callable:
# it names the built-in GroupBy reduction directly rather than relying on
# pandas translating np.max to it.
tmt_group.agg("max")

Unnamed: 0_level_0,Dose,Response1,Response2,Age,Gender
Tmt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,100.0,11.226,10.745,60s,M
B,100.0,10.824,10.34,60s,M
C,100.0,10.49,11.246,60s,M
D,100.0,10.911,9.863,60s,M


In [None]:
# filter() keeps or drops whole groups: every row of a group whose predicate
# is False is removed from the result.
def _peak_response1_above_11(group_df):
    """Return True when the group's largest Response1 value exceeds 11."""
    return group_df["Response1"].max() > 11

tmt_group.filter(_peak_response1_above_11).head()


Unnamed: 0,Dose,Response1,Response2,Tmt,Age,Gender
6,1.0,0.0,0.0,A,50s,F
10,15.0,5.225,5.163,A,60s,F
12,5.0,0.0,0.001,A,60s,F
17,5.0,0.0,0.003,A,50s,M
32,100.0,9.295,10.103,A,60s,F


#### `apply()`－运用

> **WARNING**

> 注意目前的版本采用`is`判断索引是否相同，很容易引起混淆，未来的版本可能会对这一点进行修改。

In [21]:
# apply() calls the function once per group with that group's DataFrame;
# here the unbound DataFrame.max / DataFrame.mean methods reduce each group
# to a single row, and the rows are stacked under the group keys.
%C 4 tmt_group.apply(pd.DataFrame.max); tmt_group.apply(pd.DataFrame.mean)

       tmt_group.apply(pd.DataFrame.max)             tmt_group.apply(pd.DataFrame.mean) 
------------------------------------------------    ------------------------------------
      Dose  Response1  Response2 Tmt  Age Gender              Dose  Response1  Response2
Tmt                                                 Tmt                                 
A    100.0     11.226     10.745   A  60s      M    A    33.546154   6.728985   6.863185
B    100.0     10.824     10.340   B  60s      M    B    33.546154   5.573354   5.456415
C    100.0     10.490     11.246   C  60s      M    C    33.546154   4.040415   4.115323
D    100.0     10.911      9.863   D  60s      M    D    33.546154   3.320646   3.188369




In [22]:
# Show the rows of treatment group "A" for comparison with the apply() results.
treatment_a = tmt_group.get_group("A")
treatment_a

Unnamed: 0,Dose,Response1,Response2,Tmt,Age,Gender
6,1.0,0.000,0.000,A,50s,F
10,15.0,5.225,5.163,A,60s,F
12,5.0,0.000,0.001,A,60s,F
17,5.0,0.000,0.003,A,50s,M
32,100.0,9.295,10.103,A,60s,F
...,...,...,...,...,...,...
247,60.0,10.527,10.745,A,50s,F
248,50.0,10.312,10.131,A,60s,F
255,80.0,9.239,10.103,A,60s,F
256,5.0,0.000,0.000,A,50s,M


In [23]:
sample_res1 = tmt_group.apply(lambda df:df.Response1.sample(2)) #❶
sample_res2 = tmt_group.apply(
    lambda df:df.Response1.sample(2).reset_index(drop=True)) #❷
%C 4 sample_res1; sample_res2

          sample_res1                     sample_res2       
-------------------------------    -------------------------
Tmt                                Response1       0       1
A    257     9.976                 Tmt                      
     102     9.644                 A           0.000   9.689
B    239    10.275                 B          10.073   9.858
     31      0.398                 C           9.025   0.038
C    38      9.345                 D           9.858  10.568
     37      0.000                                          
D    253     1.239                                          
     30      0.000                                          
Name: Response1, dtype: float64                             


In [34]:
group = tmt_group[["Response1", "Response1"]]
apply_res1 = group.apply(lambda df:df - df.mean())
apply_res2 = group.apply(lambda df:(df - df.mean())[:])

%C 4 apply_res1.head(); apply_res2.head()

   apply_res1.head()          apply_res2.head()   
-----------------------    -----------------------
   Response1  Response1       Response1  Response1
0   5.831585   5.831585    0   5.831585   5.831585
1  -3.318646  -3.318646    1  -3.318646  -3.318646
2  -3.414415  -3.414415    2  -3.414415  -3.414415
3  -2.668415  -2.668415    3  -2.668415  -2.668415
4  -4.030415  -4.030415    4  -4.030415  -4.030415


In [35]:
# Returning None from the callback leaves that group out of the result, so
# apply() can filter and transform in one pass: groups whose mean Response1 is
# below 5 are dropped, the others contribute two randomly sampled rows.
# The seeded generator makes the sampled rows repeatable across re-runs.
filter_rng = np.random.default_rng(0)
tmt_group.apply(
    lambda df: None if df.Response1.mean() < 5
    else df.sample(2, random_state=filter_rng))

Unnamed: 0_level_0,Unnamed: 1_level_0,Dose,Response1,Response2,Tmt,Age,Gender
Tmt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,34,40.0,10.825,10.464,A,50s,M
A,10,15.0,5.225,5.163,A,60s,F
B,171,60.0,10.133,10.158,B,50s,M
B,101,100.0,8.866,10.234,B,60s,F


In [40]:
# Built-in reductions can be called directly on a GroupBy object: per-group
# mean next to the per-group 75th percentile of the numeric columns.
# NOTE(review): mean() is called without a column selection here while
# quantile() selects the numeric columns explicitly; mean() over the text
# columns may fail on newer pandas releases — confirm.
%C 4 tmt_group.mean(); tmt_group[["Dose","Response1","Response2"]].quantile(q=0.75)

          tmt_group.mean()              tmt_group[["Dose","Response1","Response2"]].quantile(q=0.75)
------------------------------------    ------------------------------------------------------------
          Dose  Response1  Response2         Dose  Response1  Response2                             
Tmt                                     Tmt                                                         
A    33.546154   6.728985   6.863185    A    50.0     10.283     10.217                             
B    33.546154   5.573354   5.456415    B    50.0      9.814      9.997                             
C    33.546154   4.040415   4.115323    C    50.0      9.627      9.561                             
D    33.546154   3.320646   3.188369    D    50.0      8.899      8.351                             
