In [0]:
import numpy as np
import pandas as pd

In [42]:
# Example dataset: one employee per row, written row-wise and then pivoted
# into the column-oriented dict that pd.DataFrame consumes.
fields = ['Company', 'Name', 'Age', 'Degree']
rows = [
    ('Microsoft', 'Jason', 27, 'BA'),
    ('Alibaba', 'Wiki', 24, 'MA'),
    ('Microsoft', 'Sara', 22, 'MA'),
    ('Google', 'Lucy', 32, 'Phd'),
    ('Netflix', 'Tom', 33, 'BA'),
    ('Alibaba', 'Zach', 36, 'BA'),
    ('Google', 'Ted', 27, 'Phd'),
    ('Tecent', 'Nancy', 32, 'MA'),
]

# zip(*rows) transposes the records so each field maps to its list of values.
data = {field: list(values) for field, values in zip(fields, zip(*rows))}

df = pd.DataFrame(data)
df

Unnamed: 0,Company,Name,Age,Degree
0,Microsoft,Jason,27,BA
1,Alibaba,Wiki,24,MA
2,Microsoft,Sara,22,MA
3,Google,Lucy,32,Phd
4,Netflix,Tom,33,BA
5,Alibaba,Zach,36,BA
6,Google,Ted,27,Phd
7,Tecent,Nancy,32,MA


### 1. Grouping data with one key:

In [43]:
# Group the rows by company and show, for each company, the index labels of
# the rows that belong to it. (A bare `df.groupby('Company')` statement before
# this line would only build a GroupBy object and throw it away, so it is
# omitted.)
df.groupby('Company').groups

{'Alibaba': Int64Index([1, 5], dtype='int64'),
 'Google': Int64Index([3, 6], dtype='int64'),
 'Microsoft': Int64Index([0, 2], dtype='int64'),
 'Netflix': Int64Index([4], dtype='int64'),
 'Tecent': Int64Index([7], dtype='int64')}

In [44]:
# Display the first entry of every remaining column within each company group.
gf = df.groupby(by='Company')
gf.first()

Unnamed: 0_level_0,Name,Age,Degree
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alibaba,Wiki,24,MA
Google,Lucy,32,Phd
Microsoft,Jason,27,BA
Netflix,Tom,33,BA
Tecent,Nancy,32,MA


In [45]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; print each pair
# followed by a blank line.
grp = df.groupby('Company')
for key, members in grp:
    print(key, members, sep='\n', end='\n\n')

Alibaba
   Company  Name  Age Degree
1  Alibaba  Wiki   24     MA
5  Alibaba  Zach   36     BA

Google
  Company  Name  Age Degree
3  Google  Lucy   32    Phd
6  Google   Ted   27    Phd

Microsoft
     Company   Name  Age Degree
0  Microsoft  Jason   27     BA
2  Microsoft   Sara   22     MA

Netflix
   Company Name  Age Degree
4  Netflix  Tom   33     BA

Tecent
  Company   Name  Age Degree
7  Tecent  Nancy   32     MA



In [46]:
# Pull out the rows of a single group by its key.
grp = df.groupby(by='Company')
grp.get_group('Google')

Unnamed: 0,Company,Name,Age,Degree
3,Google,Lucy,32,Phd
6,Google,Ted,27,Phd


### 2. Grouping data with multiple keys:

In [47]:
# Using multiple keys in groupby(): each group is identified by a
# (Company, Degree) tuple, mapped here to the index labels of its rows.
# (A bare `df.groupby([...])` statement before this line would only build a
# GroupBy object and discard it, so it is omitted.)
df.groupby(['Company', 'Degree']).groups

{('Alibaba', 'BA'): Int64Index([5], dtype='int64'),
 ('Alibaba', 'MA'): Int64Index([1], dtype='int64'),
 ('Google', 'Phd'): Int64Index([3, 6], dtype='int64'),
 ('Microsoft', 'BA'): Int64Index([0], dtype='int64'),
 ('Microsoft', 'MA'): Int64Index([2], dtype='int64'),
 ('Netflix', 'BA'): Int64Index([4], dtype='int64'),
 ('Tecent', 'MA'): Int64Index([7], dtype='int64')}

In [48]:
# With several keys, each iteration yields a (Company, Degree) tuple together
# with the matching sub-frame; print both followed by a blank line.
grp = df.groupby(['Company', 'Degree'])
for key, members in grp:
    print(key, members, sep='\n', end='\n\n')

('Alibaba', 'BA')
   Company  Name  Age Degree
5  Alibaba  Zach   36     BA

('Alibaba', 'MA')
   Company  Name  Age Degree
1  Alibaba  Wiki   24     MA

('Google', 'Phd')
  Company  Name  Age Degree
3  Google  Lucy   32    Phd
6  Google   Ted   27    Phd

('Microsoft', 'BA')
     Company   Name  Age Degree
0  Microsoft  Jason   27     BA

('Microsoft', 'MA')
     Company  Name  Age Degree
2  Microsoft  Sara   22     MA

('Netflix', 'BA')
   Company Name  Age Degree
4  Netflix  Tom   33     BA

('Tecent', 'MA')
  Company   Name  Age Degree
7  Tecent  Nancy   32     MA



In [49]:
# For a multi-key grouping, a single group is addressed by a tuple of key values.
grp = df.groupby(by=['Company', 'Degree'])
wanted_key = ('Alibaba', 'MA')
grp.get_group(wanted_key)

Unnamed: 0,Company,Name,Age,Degree
1,Alibaba,Wiki,24,MA


### 3. Applying functions to groups
* Aggregation 
* Transformation
* Filtration 


In [50]:
# Example dataset with an extra Salary column: one employee per row, written
# row-wise and then pivoted into the column-oriented dict pd.DataFrame consumes.
fields = ['Company', 'Name', 'Age', 'Salary', 'Degree']
rows = [
    ('Microsoft', 'Jason', 27, 120, 'BA'),
    ('Alibaba', 'Wiki', 24, 80, 'MA'),
    ('Microsoft', 'Sara', 22, 100, 'MA'),
    ('Google', 'Lucy', 32, 150, 'Phd'),
    ('Netflix', 'Tom', 33, 120, 'BA'),
    ('Alibaba', 'Zach', 36, 90, 'BA'),
    ('Google', 'Ted', 27, 190, 'Phd'),
    ('Tecent', 'Nancy', 32, 100, 'MA'),
]

# zip(*rows) transposes the records so each field maps to its list of values.
data1 = {field: list(values) for field, values in zip(fields, zip(*rows))}

# NOTE: this rebinds `df`; every cell below works on this Salary-bearing frame.
df = pd.DataFrame(data1)
df

Unnamed: 0,Company,Name,Age,Salary,Degree
0,Microsoft,Jason,27,120,BA
1,Alibaba,Wiki,24,80,MA
2,Microsoft,Sara,22,100,MA
3,Google,Lucy,32,150,Phd
4,Netflix,Tom,33,120,BA
5,Alibaba,Zach,36,90,BA
6,Google,Ted,27,190,Phd
7,Tecent,Nancy,32,100,MA


In [51]:
# Performing aggregation with the aggregate method: total Age and Salary per
# company.
# The numeric columns are selected explicitly because `Name` and `Degree` are
# strings; recent pandas versions no longer drop such columns silently, so
# summing the whole frame would not give the Age/Salary-only table intended
# here. The 'sum' string alias is used instead of passing np.sum, which pandas
# deprecates as a callable alias.
grp1 = df.groupby('Company')
grp1[['Age', 'Salary']].aggregate('sum')


Unnamed: 0_level_0,Age,Salary
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Alibaba,60,170
Google,59,340
Microsoft,49,220
Netflix,33,120
Tecent,32,100


In [52]:
# Applying several aggregations at once by passing a list of function names;
# the result has one column per function (sum, mean, std).
# String names are used instead of np.sum / np.mean / np.std: pandas treats
# those NumPy callables as deprecated aliases of its own methods, and 'std'
# pins the sample standard deviation (ddof=1) shown in the output, whereas
# np.std called directly defaults to ddof=0.
# Companies with a single row have no defined sample std, hence NaN.
grp = df.groupby('Company')
grp['Salary'].agg(['sum', 'mean', 'std'])


Unnamed: 0_level_0,sum,mean,std
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alibaba,170,85,7.071068
Google,340,170,28.284271
Microsoft,220,110,14.142136
Netflix,120,120,
Tecent,100,100,


In [35]:
# A dict passed to agg maps each column to the aggregation it should receive:
# here the sum of Age and the standard deviation of Salary per company.
per_column_aggs = {'Age': 'sum', 'Salary': 'std'}
grp = df.groupby(by='Company')
grp.agg(per_column_aggs)


Unnamed: 0_level_0,Age,Salary
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Alibaba,60,7.071068
Google,59,28.284271
Microsoft,49,14.142136
Netflix,33,
Tecent,32,


In [36]:
# Performing aggregation on groups formed from multiple keys: total Age and
# Salary for every (Company, Degree) pair.
# The numeric columns are selected explicitly because `Name` is a string
# column; recent pandas versions no longer drop such columns silently. The
# 'sum' string alias replaces np.sum, which pandas deprecates as a callable
# alias.
grp1 = df.groupby(['Company', 'Degree'])
grp1[['Age', 'Salary']].aggregate('sum')


Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Salary
Company,Degree,Unnamed: 2_level_1,Unnamed: 3_level_1
Alibaba,BA,36,90
Alibaba,MA,24,80
Google,Phd,59,340
Microsoft,BA,27,120
Microsoft,MA,22,100
Netflix,BA,33,120
Tecent,MA,32,100


In [37]:
# Using the transform function: standardise Age and Salary within each company.
def sc(x):
    """Return the group-wise z-score of `x`, scaled by 10.

    `x` is one column of one group. A group with a single row has an
    undefined sample standard deviation, so its result is NaN.
    """
    return (x - x.mean()) / x.std() * 10


grp = df.groupby('Company')
# Only the numeric columns are transformed: subtracting a mean from the string
# columns (`Name`, `Degree`) is not defined and raises on recent pandas.
grp[['Age', 'Salary']].transform(sc)

Unnamed: 0,Age,Salary
0,7.071068,7.071068
1,-7.071068,-7.071068
2,-7.071068,-7.071068
3,7.071068,-7.071068
4,,
5,7.071068,7.071068
6,-7.071068,7.071068
7,,


In [40]:
# Filtering with GroupBy.filter: keep only the rows whose company appears
# more than once in the frame.
grp = df.groupby(by='Company')
grp.filter(lambda members: len(members) > 1)


Unnamed: 0,Company,Name,Age,Salary,Degree
0,Microsoft,Jason,27,120,BA
1,Alibaba,Wiki,24,80,MA
2,Microsoft,Sara,22,100,MA
3,Google,Lucy,32,150,Phd
5,Alibaba,Zach,36,90,BA
6,Google,Ted,27,190,Phd
