In [2]:
# 3.3: Grouping, Aggregation and Pivot Tables
import pandas as pd

In [14]:
# 3.3.1: Grouping and Aggregation

In [39]:
# 3.3.1.1: Grouping
# Small demo frame: one categorical key column and two numeric measures.
df = pd.DataFrame({
    'gender': ['m', 'm', 'f'],
    'rate': [5000, 5500, 6000],
    'claims': [4500, 5000, 5500],
})
print(df)
# The key is given as a list so that further grouping columns can be appended.
df_grouped = df.groupby(by=['gender'])
# A groupby is lazy: printing it shows only the object repr, not any data.
print(df_grouped)
print(type(df_grouped))

   claims gender  rate
0    4500      m  5000
1    5000      m  5500
2    5500      f  6000
<pandas.core.groupby.DataFrameGroupBy object at 0x10e4a6f60>
<class 'pandas.core.groupby.DataFrameGroupBy'>


In [40]:
# 3.3.1.2: Standard Aggregation - prefer these built-in reducers when possible, for performance
# Called on the whole group object, the reducer runs over every non-key column.
print('Mean:', df_grouped.mean(), sep='\n')
# Selecting a column first restricts the aggregation to that subset.
print('Sum:', df_grouped['claims'].sum(), sep='\n')
print('Count:', df_grouped.count(), sep='\n')

Mean:
        claims  rate
gender              
f         5500  6000
m         4750  5250
Sum:
gender
f    5500
m    9500
Name: claims, dtype: int64
Count:
        claims  rate
gender              
f            1     1
m            2     2


In [56]:
# 3.3.1.3: Custom Aggregation
# One row per billed line item: who was billed, where, how much, and under which revenue code.
charges = pd.DataFrame(
    {
        'patient': ['1234', '1234', '1235', '1235', '1235'],
        'hospital': ['memorial', 'memorial', 'community', 'community', 'community'],
        'charge': [100, 200, 500, 10, 600],
        'rev_code': ['0300-lab', '0350-CT', '0610-MRI', '0300-lab', '0450-ER'],
    }
)

In [57]:
# Find the revenue code with the most charges for each patient.
# Grouping here is by column; to group by index levels instead, pass the `level` argument.
charges_grouped = charges.groupby(by='patient')
# .groups is a dict mapping each unique group key to the row labels belonging to that group.
print(charges_grouped.groups)

{'1234': Int64Index([0, 1], dtype='int64'), '1235': Int64Index([2, 3, 4], dtype='int64')}


In [58]:
# Total charge per patient. The reducer is named as a string: recent pandas emits a
# FutureWarning when the Python builtin `sum` is passed, and 'sum' gives the same group sum.
charges_grouped.aggregate({'charge': 'sum'})  # .agg is a shorthand alias for .aggregate

Unnamed: 0_level_0,charge
patient,Unnamed: 1_level_1
1234,300
1235,1110


In [59]:
def largest_rev_code(x):
    """Return the revenue code of the highest-charge row within one group.

    Parameters
    ----------
    x : pandas.DataFrame
        Sub-frame containing only the rows of a single group value; it must
        have 'charge' and 'rev_code' columns.

    Returns
    -------
    The 'rev_code' value of the row whose 'charge' is largest. When several
    rows tie for the maximum, the first one is used.
    """
    # Series.index is an attribute holding the row labels, not a list-style
    # search method, so calling it raised TypeError. idxmax() returns the row
    # label of the maximum charge directly.
    row = x['charge'].idxmax()
    return x.loc[row, 'rev_code']

In [60]:
# transform broadcasts each group's result back onto the original rows, so the output
# keeps one row per input row. Text columns are "summed" by string concatenation.
# The reducer is named as a string because passing the builtin `sum` is deprecated in recent pandas.
charges_grouped.transform('sum')

Unnamed: 0,charge,hospital,rev_code
0,300,memorialmemorial,0300-lab0350-CT
1,300,memorialmemorial,0300-lab0350-CT
2,1110,communitycommunitycommunity,0610-MRI0300-lab0450-ER
3,1110,communitycommunitycommunity,0610-MRI0300-lab0450-ER
4,1110,communitycommunitycommunity,0610-MRI0300-lab0450-ER


In [67]:
# 3.3.2.1: Pivot Tables
# index and columns name the dimensions, values names the measure(s), and aggfunc lists the
# aggregation(s) applied to each measure.
# 'sum' is given as a string because passing the builtin `sum` is deprecated in recent pandas;
# `len` is kept as-is so the 'len' column label and its row counts are unchanged.
charges.pivot_table(index=['patient'], columns=['hospital'], values=['charge'], aggfunc=['sum', len])

Unnamed: 0_level_0,sum,sum,len,len
Unnamed: 0_level_1,charge,charge,charge,charge
hospital,community,memorial,community,memorial
patient,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
1234,,300.0,,2.0
1235,1110.0,,3.0,
