### groupby() function is used to split the data into groups based on some criteria.

Syntax: DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)

Parameters :

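by : mapping, function, str, or iterable. Determines how the rows are grouped: a column label (or list of labels) groups by the values in those columns, a function is called on each value of the index, and a dict or Series maps index values to group keys.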

axis : int, default 0. Split along rows (0) or columns (1).

level : If the axis is a MultiIndex (hierarchical), group by a particular level or levels

as_index : For aggregated output, return object with group labels as the index. Only relevant for DataFrame input. 

as_index=False is effectively “SQL-style” grouped output

sort : Sort group keys. Get better performance by turning this off. Note this does not influence the order of observations within each group. groupby preserves the order of rows within each group.

group_keys : When calling apply, add group keys to index to identify pieces

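squeeze : Reduce the dimensionality of the return type if possible, otherwise return a consistent type. Note: this parameter was deprecated in pandas 1.1.0 and removed in pandas 2.0, so omit it in new code.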

Returns : GroupBy object

In [2]:
# importing pandas as pd 
import pandas as pd 

In [3]:
# Load the NBA roster from "nba.csv" (resolved relative to the notebook's
# working directory) into a DataFrame.
# NOTE(review): Height values in the rendered output look like "2-Jun" /
# "6-Jun" -- possibly a spreadsheet date conversion of feet-inches values;
# confirm against the source CSV.
df = pd.read_csv(filepath_or_buffer="nba.csv")

# Leave the frame as the cell's last expression so Jupyter renders it.
df

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0,PG,25,2-Jun,180,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99,SF,25,6-Jun,235,Marquette,6796117.0
2,John Holland,Boston Celtics,30,SG,27,5-Jun,205,Boston University,
3,R.J. Hunter,Boston Celtics,28,SG,22,5-Jun,185,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8,PF,29,10-Jun,231,,5000000.0
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41,PF,20,10-Jun,234,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8,PG,26,3-Jun,203,Butler,2433333.0
454,Raul Neto,Utah Jazz,25,PG,24,1-Jun,179,,900000.0
455,Tibor Pleiss,Utah Jazz,21,C,26,3-Jul,256,,2900000.0


In [4]:
# Split the frame into one group per distinct value of the 'Team' column.
gk = df.groupby(by='Team')

# Show one summary row per team. first() works column by column and
# reports the first non-null value found in each group, so a result row
# is not necessarily one single original row.
gk.first()

Unnamed: 0_level_0,Name,Number,Position,Age,Height,Weight,College,Salary
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Atlanta Hawks,Kent Bazemore,24,SF,26,5-Jun,201,Old Dominion,2000000.0
Boston Celtics,Avery Bradley,0,PG,25,2-Jun,180,Texas,7730337.0
Brooklyn Nets,Bojan Bogdanovic,44,SG,27,8-Jun,216,Oklahoma State,3425510.0
Charlotte Hornets,Nicolas Batum,5,SG,27,8-Jun,200,Virginia Commonwealth,13125306.0
Chicago Bulls,Cameron Bairstow,41,PF,25,9-Jun,250,New Mexico,845059.0
Cleveland Cavaliers,Matthew Dellavedova,8,PG,25,4-Jun,198,Saint Mary's,1147276.0
Dallas Mavericks,Justin Anderson,1,SG,22,6-Jun,228,Virginia,1449000.0
Denver Nuggets,Darrell Arthur,0,PF,28,9-Jun,235,Kansas,2814000.0
Detroit Pistons,Joel Anthony,50,C,33,9-Jun,245,UNLV,2500000.0
Golden State Warriors,Leandro Barbosa,19,SG,33,3-Jun,194,North Carolina,2500000.0


In [5]:
# Pull out the sub-frame holding only the rows grouped under "Boston Celtics"
gk.get_group(name='Boston Celtics')

Unnamed: 0,Name,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,0,PG,25,2-Jun,180,Texas,7730337.0
1,Jae Crowder,99,SF,25,6-Jun,235,Marquette,6796117.0
2,John Holland,30,SG,27,5-Jun,205,Boston University,
3,R.J. Hunter,28,SG,22,5-Jun,185,Georgia State,1148640.0
4,Jonas Jerebko,8,PF,29,10-Jun,231,,5000000.0
5,Amir Johnson,90,PF,29,9-Jun,240,,12000000.0
6,Jordan Mickey,55,PF,21,8-Jun,235,LSU,1170960.0
7,Kelly Olynyk,41,C,25,Jul-00,238,Gonzaga,2165160.0
8,Terry Rozier,12,PG,22,2-Jun,190,Louisville,1824360.0
9,Marcus Smart,36,PG,22,4-Jun,220,Oklahoma State,3431040.0


In [6]:
# Group on two keys at once: rows are split by "Team" and, within each
# team, by "Position". The result is keyed by (Team, Position) pairs.
gkk = df.groupby(by=['Team', 'Position'])

# Show the first value of every column for each (Team, Position) group
gkk.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Number,Age,Height,Weight,College,Salary
Team,Position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Atlanta Hawks,C,Al Horford,15,30,10-Jun,245,Florida,12000000.0
Atlanta Hawks,PF,Kris Humphries,43,31,9-Jun,235,Minnesota,1000000.0
Atlanta Hawks,PG,Dennis Schroder,17,22,1-Jun,172,Wake Forest,1763400.0
Atlanta Hawks,SF,Kent Bazemore,24,26,5-Jun,201,Old Dominion,2000000.0
Atlanta Hawks,SG,Tim Hardaway Jr.,10,24,6-Jun,205,Michigan,1304520.0
...,...,...,...,...,...,...,...,...
Washington Wizards,C,Marcin Gortat,13,32,11-Jun,240,North Carolina State,11217391.0
Washington Wizards,PF,Drew Gooden,90,34,10-Jun,250,Kansas,3300000.0
Washington Wizards,PG,Ramon Sessions,7,30,3-Jun,190,Nevada,2170465.0
Washington Wizards,SF,Jared Dudley,1,30,7-Jun,225,Boston College,4375000.0


In [7]:
# Small example frame: two speed observations for each of two animals.
# This rebinds `df`, so from here on `df` no longer refers to the frame
# read from nba.csv.
df = pd.DataFrame(
    data={
        'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
        'Max Speed': [380., 370., 24., 26.],
    }
)

In [8]:
# Display the current frame (bare last expression -> rich notebook output).
df

Unnamed: 0,Animal,Max Speed
0,Falcon,380.0
1,Falcon,370.0
2,Parrot,24.0
3,Parrot,26.0


In [11]:
# Average every numeric column separately for each distinct 'Animal' value.
df.groupby(by=['Animal']).mean()

Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,375.0
Parrot,25.0


In [12]:
# Two parallel label lists: the first supplies the outer index level
# ('Animal'), the second the inner level ('Type').
arrays = [
    ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
    ['Captive', 'Wild', 'Captive', 'Wild'],
]

# Combine the label lists into a two-level (hierarchical) row index.
index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))

# One speed value per (Animal, Type) pair; display the frame as the cell output.
df = pd.DataFrame(data={'Max Speed': [390., 350., 30., 20.]}, index=index)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Max Speed
Animal,Type,Unnamed: 2_level_1
Falcon,Captive,390.0
Falcon,Wild,350.0
Parrot,Captive,30.0
Parrot,Wild,20.0


In [13]:
# Group on index level 0 (the first name given to the MultiIndex, 'Animal')
# and average each group.
df.groupby(level=0).mean()

Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,370.0
Parrot,25.0


In [14]:
# Same idea, but select the index level by its name ("Type") instead of by
# position, then average each group.
df.groupby(level="Type").mean()

Unnamed: 0_level_0,Max Speed
Type,Unnamed: 1_level_1
Captive,210.0
Wild,185.0


In [17]:
# Row-wise sample data. The None in the second row shows up as a missing
# value (NaN) in column "b", which is what the following group-bys explore.
l = [
    [1, 2, 3],
    [1, None, 4],
    [2, 1, 3],
    [1, 2, 2],
]
df = pd.DataFrame(data=l, columns=["a", "b", "c"])

df

Unnamed: 0,a,b,c
0,1,2.0,3
1,1,,4
2,2,1.0,3
3,1,2.0,2


In [18]:
# Sum the remaining columns per value of "b". By default the row whose "b"
# is missing is left out of every group.
df.groupby(["b"]).sum()

Unnamed: 0_level_0,a,c
b,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,2,3
2.0,2,5


In [22]:
# dropna=False asks groupby to keep the missing "b" key as its own group.
# The keyword was added in pandas 1.1.0; on older releases this call fails
# with "TypeError: groupby() got an unexpected keyword argument 'dropna'".
df.groupby(["b"], dropna=False).sum()

TypeError: groupby() got an unexpected keyword argument 'dropna'

In [23]:
# Sum the remaining columns per value of "a" (a single column label, not a list).
df.groupby("a").sum()

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.0,9
2,1.0,3


In [26]:
# Same grouping with dropna=False. Column "a" has no missing values in the
# sample data, so no extra NaN group is expected here. The keyword needs
# pandas >= 1.1.0; older releases raise TypeError for it.
df.groupby("a", dropna=False).sum()

TypeError: groupby() got an unexpected keyword argument 'dropna'