<img src="https://pandas.pydata.org/static/img/pandas.svg" width="250">

## <center> Transform your data with `groupby`

In [8]:
import pandas as pd

In [9]:
iris = pd.read_csv('/iris.csv')
iris.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


A simple groupby on one dimension with one aggregation for all variables

In [10]:
# can flatten hierarchical index with reset_index()
iris.groupby(['species']).max()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.8,4.4,1.9,0.6
versicolor,7.0,3.4,5.1,1.8
virginica,7.9,3.8,6.9,2.5


Multiple aggregation methods to different variables

In [11]:
df = iris.groupby(['species']).agg({'sepal_length':['mean','min','max'],'sepal_width':'count'})
df

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_length,sepal_width
Unnamed: 0_level_1,mean,min,max,count
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
setosa,5.006,4.3,5.8,50
versicolor,5.936,4.9,7.0,50
virginica,6.588,4.9,7.9,50


In [12]:
df['sepal_length']

Unnamed: 0_level_0,mean,min,max
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,5.006,4.3,5.8
versicolor,5.936,4.9,7.0
virginica,6.588,4.9,7.9


Flattening hierarchical indexes

In [13]:
df.columns = ['_'.join(col).strip() for col in df.columns.values]
df.reset_index()
df

Unnamed: 0_level_0,sepal_length_mean,sepal_length_min,sepal_length_max,sepal_width_count
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,4.3,5.8,50
versicolor,5.936,4.9,7.0,50
virginica,6.588,4.9,7.9,50


Specify groupings prior to any aggregation

In [14]:
groupings = iris.groupby(['species'])

In [15]:
groupings.get_group('setosa').head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [16]:
groupings.max()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.8,4.4,1.9,0.6
versicolor,7.0,3.4,5.1,1.8
virginica,7.9,3.8,6.9,2.5


In [17]:
groupings.apply(lambda x: x.max())

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.8,4.4,1.9,0.6
versicolor,7.0,3.4,5.1,1.8
virginica,7.9,3.8,6.9,2.5


In [18]:
groupings.filter(lambda x: x['petal_length'].max() <5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1
