In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Fetch seaborn's "planets" example dataset; the bare last line displays its shape.
planets = sns.load_dataset(name='planets')
planets.shape

(1035, 6)

In [3]:
# Preview the first five rows of the dataset.
planets.head(n=5)

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [4]:
# Seeded legacy generator so the values shown below are reproducible.
rng = np.random.RandomState(seed=42)
ser = pd.Series(data=rng.rand(5))
ser

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [5]:
# Total of all values in the Series (missing values skipped, the pandas default).
ser.sum(skipna=True)

2.811925491708157

In [6]:
# Arithmetic mean of the Series (missing values skipped, the pandas default).
ser.mean(skipna=True)

0.5623850983416314

In [7]:
# Two columns of five random draws each; 'A' is drawn before 'B' from the shared rng,
# so the values depend on how many draws earlier cells already consumed.
df = pd.DataFrame({label: rng.rand(5) for label in ('A', 'B')})
df

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


In [8]:
# Mean of each column: the reduction runs down the index (rows).
df.mean(axis='index')

A    0.477888
B    0.443420
dtype: float64

In [9]:
# Mean of each row: the reduction runs across the columns.
df.mean(axis=1)

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

In [10]:
# Drop every row that has a missing value, then summarize what remains.
(
    planets
    .dropna(how='any')
    .describe()
)

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [12]:
# Small example frame: three keys repeated twice, with a running counter as data.
df = pd.DataFrame(
    {
        'key': list('ABCABC'),
        'data': range(6),
    },
    columns=['key', 'data'],
)
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [13]:
# Grouping on its own returns a DataFrameGroupBy object, not a result table;
# nothing is aggregated until a method is called on it.
df.groupby(by='key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000161DBEE5BB0>

In [14]:
# Sum the data column within each key.
df.groupby(by='key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [15]:
# Reminder of the planets columns before grouping on them.
planets.head(n=5)

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [16]:
# Selecting a column from the grouped frame gives a SeriesGroupBy; still no computation.
planets.groupby(by='method')['orbital_period']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000161DBF064F0>

In [17]:
# Median orbital period for each discovery method.
(
    planets
    .groupby(by='method')['orbital_period']
    .median()
)

method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

In [18]:
# Iterating a GroupBy yields (group label, sub-DataFrame) pairs.
for method, group in planets.groupby(by='method'):
    line = "{0:30s} shape={1}".format(method, group.shape)
    print(line)

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


In [19]:
# Summary statistics of discovery year per method, unstacked into one long
# Series indexed by (statistic, method).
(
    planets
    .groupby(by='method')['year']
    .describe()
    .unstack()
)

       method                       
count  Astrometry                          2.0
       Eclipse Timing Variations           9.0
       Imaging                            38.0
       Microlensing                       23.0
       Orbital Brightness Modulation       3.0
                                         ...  
max    Pulsar Timing                    2011.0
       Pulsation Timing Variations      2007.0
       Radial Velocity                  2014.0
       Transit                          2014.0
       Transit Timing Variations        2014.0
Length: 80, dtype: float64

In [23]:
# Re-seed so data2 is reproducible regardless of how many draws earlier cells made.
rng = np.random.RandomState(seed=42)
df = pd.DataFrame(
    {
        'key': list('ABCABC'),
        'data1': range(6),
        'data2': rng.randint(low=0, high=10, size=6),
    },
    columns=['key', 'data1', 'data2'],
)
df

Unnamed: 0,key,data1,data2
0,A,0,6
1,B,1,3
2,C,2,7
3,A,3,4
4,B,4,6
5,C,5,9


In [24]:
# Per-group min / median / max of every data column.
# The reducers are given as string names so pandas uses its own groupby
# implementations; passing np.median or the builtin max as callables is
# deprecated in recent pandas (FutureWarning) and may change behavior later.
# The resulting column labels ('min', 'median', 'max') are unchanged.
df.groupby('key').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,4,5.0,6
B,1,2.5,4,3,4.5,6
C,2,3.5,5,7,8.0,9


In [25]:
# A dict maps each column to its own reducer: smallest data1, largest data2 per key.
df.groupby(by='key').agg({'data1': 'min', 'data2': 'max'})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,6
B,1,6
C,2,9


In [27]:
def filter_func(x):
    """Return whether the standard deviation of ``x['data2']`` is greater than 2.

    Intended as a predicate for ``GroupBy.filter``: ``x`` is one group's
    DataFrame, and groups for which this is falsy are dropped.
    """
    spread = x['data2'].std()
    return spread > 2

# Show the frame, the per-key standard deviations, and the rows of the groups
# that pass filter_func.
print(df)
print(df.groupby(by='key').std())
print(df.groupby(by='key').filter(filter_func))

  key  data1  data2
0   A      0      6
1   B      1      3
2   C      2      7
3   A      3      4
4   B      4      6
5   C      5      9
       data1     data2
key                   
A    2.12132  1.414214
B    2.12132  2.121320
C    2.12132  1.414214
  key  data1  data2
1   B      1      3
4   B      4      6


In [28]:
# Center each column within its group by subtracting that group's mean;
# transform returns a result with the same index as the input.
df.groupby(by='key').transform(
    lambda col: col - col.mean()
)

Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-1.5
2,-1.5,-1.0
3,1.5,-1.0
4,1.5,1.5
5,1.5,1.0


In [29]:
# apply() method
def app_func(x):
    """Return a copy of ``x`` with ``data1`` divided by the sum of ``data2``.

    Meant to be passed to ``GroupBy.apply``, where ``x`` is one group's
    DataFrame. The input is left untouched: pandas documents functions that
    mutate the object handed to ``apply`` as unsupported, so a new frame is
    built with ``assign`` instead of writing into ``x``.
    """
    return x.assign(data1=x['data1'] / x['data2'].sum())

# Show the original frame, then the result of applying app_func to each key group.
print(df)
print(df.groupby(by='key').apply(app_func))

  key  data1  data2
0   A      0      6
1   B      1      3
2   C      2      7
3   A      3      4
4   B      4      6
5   C      5      9
  key     data1  data2
0   A  0.000000      6
1   B  0.111111      3
2   C  0.125000      7
3   A  0.300000      4
4   B  0.444444      6
5   C  0.312500      9


In [30]:
# Group by an explicit list of labels, one label per row of df.
L = [0, 1, 0, 1, 2, 0]
print(df)
# 'key' is not the grouper here, so it is an ordinary string column. Older pandas
# silently dropped it from the sum; pandas >= 2.0 would sum (concatenate) the
# strings instead. numeric_only=True keeps the result to data1/data2 either way.
print(df.groupby(L).sum(numeric_only=True))

  key  data1  data2
0   A      0      6
1   B      1      3
2   C      2      7
3   A      3      4
4   B      4      6
5   C      5      9
   data1  data2
0      7     22
1      4      7
2      4      6


In [31]:
 print(df); print(df.groupby(df['key']).sum())

  key  data1  data2
0   A      0      6
1   B      1      3
2   C      2      7
3   A      3      4
4   B      4      6
5   C      5      9
     data1  data2
key              
A        3     10
B        5      9
C        7     16


In [32]:
# Move 'key' into the index so the dict below can map index labels to group names.
df2 = df.set_index(keys='key')
mapping = {
    'A': 'vowel',
    'B': 'consonant',
    'C': 'consonant',
}
print(df2)
print(df2.groupby(by=mapping).sum())

     data1  data2
key              
A        0      6
B        1      3
C        2      7
A        3      4
B        4      6
C        5      9
           data1  data2
key                    
consonant     12     25
vowel          3     10


In [33]:
# A callable grouper is applied to each index label; here labels are lower-cased.
print(df2)
print(df2.groupby(by=str.lower).mean())

     data1  data2
key              
A        0      6
B        1      3
C        2      7
A        3      4
B        4      6
C        5      9
     data1  data2
key              
a      1.5    5.0
b      2.5    4.5
c      3.5    8.0
