In [1]:
import pandas as pd
# Show every column when a frame is displayed (None = no limit).
# Use the canonical option name: "display.max.columns" only worked because
# pandas treats the key as a regex shorthand, which can become ambiguous
# if a future release adds a similarly named option.
pd.set_option("display.max_columns", None)

In [2]:
# Truncate displayed frames to at most 6 rows to keep outputs short.
# Canonical option name instead of the regex shorthand "display.max.rows".
pd.set_option("display.max_rows", 6)

In [3]:
# Render floating-point values with two digits after the decimal point.
pd.options.display.precision = 2

In [4]:
# Folder holding the input data, given relative to the working directory;
# the CSV file name is appended to it when the data is loaded.
path = "../data/"

In [5]:
# Load the avocado data set; the first CSV column is used as the row index.
df = pd.read_csv(f"{path}avocado.csv", index_col=0)

In [6]:
# Discard the index read from the file and number the rows 0..n-1 instead.
df = df.reset_index(drop=True)

In [7]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


# Understanding Pandas GroupBy Objects

In [8]:
# Grouping on its own yields a DataFrameGroupBy object (see the repr below),
# not a result table.
df.groupby('year')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D0224FD150>

A GroupBy object exposes the attribute `.ngroups`, which holds the number of groups available in that grouping:



In [9]:
# How many distinct groups does grouping by year produce?
print(df.groupby(by='year').ngroups)

4


In [10]:
# Distinct values of the year column, for comparison with the group count.
df['year'].unique()

array([2015, 2016, 2017, 2018], dtype=int64)

In [11]:
# Accessing the Groups in a GroupBy object:
# inspect the type of the `.groups` attribute (a dict-like mapping).
type(df.groupby('year').groups)

pandas.io.formats.printing.PrettyDict

In [12]:
# `.groups` maps each group key (year) to the row labels in that group.
print(df.groupby(by='year').groups)

{2015: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...], 2016: [2808, 2809, 2810, 2811, 2812, 2813, 2814, 2815, 2816, 2817, 2818, 2819, 2820, 2821, 2822, 2823, 2824, 2825, 2826, 2827, 2828, 2829, 2830, 2831, 2832, 2833, 2834, 2835, 2836, 2837, 2838, 2839, 2840, 2841, 2842, 2843, 2844, 2845, 2846, 2847, 2848, 2849, 2850, 2851, 2852, 2853, 2854, 2855, 2856, 2857, 2858, 2859, 2860, 2861, 2862, 2863, 2864, 2865, 2866, 2867, 2868, 2869, 2870, 2871, 2872, 2873, 2874, 2875, 2876, 2877, 2878, 2879, 2880, 2881, 2882, 2883, 2884, 2885, 2886, 2887, 2888, 2889, 2890, 2891, 2892, 2893, 2894, 2895, 2896, 2897, 2898, 2899, 2900, 2901, 2902, 2903, 2904, 2905, 2

If we only wanted to see the group names of our GroupBy object, we could simply return only the keys of this dictionary.

In [13]:

# `.groups` is dict-like, so the usual dictionary accessors are available.
type(df.groupby('year').groups)

pandas.io.formats.printing.PrettyDict

In [14]:

# Only the group names: the keys of the `.groups` mapping.
print(df.groupby(by='year').groups.keys())

dict_keys([2015, 2016, 2017, 2018])


In [15]:

# Look up the row labels that belong to the 2015 group.
df.groupby('year').groups[2015]

Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
       ...
       11923, 11924, 11925, 11926, 11927, 11928, 11929, 11930, 11931, 11932],
      dtype='int64', length=5615)

In [16]:
# Number of rows per year; these match the group sizes reported by groupby.
df['year'].value_counts()

year
2017    5722
2016    5616
2015    5615
2018    1296
Name: count, dtype: int64

###### Iterating through Groups

In [17]:
# Iterating a GroupBy yields (group key, sub-frame) pairs;
# report the shape of each yearly sub-frame.
grouped = df.groupby(by='year')

for name, group in grouped:
    print(f"{name} {group.shape}")

2015 (5615, 13)
2016 (5616, 13)
2017 (5722, 13)
2018 (1296, 13)


###### Selecting a Pandas GroupBy Group


In [18]:
# Selecting a Pandas GroupBy Group:
# get_group returns the rows of a single group, here the year 2018.
df.groupby('year').get_group(2018)

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
8478,2018-03-25,1.57,149396.50,16361.69,109045.03,65.45,23924.33,19273.80,4270.53,380.00,conventional,2018,Albany
8479,2018-03-18,1.35,105304.65,13234.86,61037.58,55.00,30977.21,26755.90,3721.31,500.00,conventional,2018,Albany
8480,2018-03-11,1.12,144648.75,15823.35,110950.68,70.00,17804.72,14480.52,3033.09,291.11,conventional,2018,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18246,2018-01-21,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.00,organic,2018,WestTexNewMexico
18247,2018-01-14,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.00,organic,2018,WestTexNewMexico
18248,2018-01-07,1.62,17489.58,2894.77,2356.13,224.53,12014.15,11988.14,26.01,0.00,organic,2018,WestTexNewMexico


In [19]:
# Same selection for the 2016 group.
df.groupby('year').get_group(2016)

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
2808,2016-12-25,1.52,73341.73,3202.39,58280.33,426.92,11432.09,11017.32,411.83,2.94,conventional,2016,Albany
2809,2016-12-18,1.53,68938.53,3345.36,55949.79,138.72,9504.66,8876.65,587.73,40.28,conventional,2016,Albany
2810,2016-12-11,1.49,71777.85,2323.39,56545.79,86.65,12822.02,12176.75,645.27,0.00,conventional,2016,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14738,2016-01-17,1.45,13237.48,1912.60,4779.17,25.72,6519.99,6447.25,72.74,0.00,organic,2016,WestTexNewMexico
14739,2016-01-10,1.37,12647.90,1591.60,4070.29,23.41,6962.60,6780.56,182.04,0.00,organic,2016,WestTexNewMexico
14740,2016-01-03,1.58,9667.05,1820.39,2783.11,16.41,5047.14,4651.16,395.98,0.00,organic,2016,WestTexNewMexico


In [20]:
# A selected group is an ordinary DataFrame.
type(df.groupby('year').get_group(2016))

pandas.core.frame.DataFrame

###### Group by with multiple columns 

In [21]:
# Grouping by two columns makes each group key a (year, type) tuple.
grouped2 = df.groupby(by=['year', 'type'])
grouped2.groups.keys()

dict_keys([(2015, 'conventional'), (2015, 'organic'), (2016, 'conventional'), (2016, 'organic'), (2017, 'conventional'), (2017, 'organic'), (2018, 'conventional'), (2018, 'organic')])

In [22]:
# Group by year and type; `grouped2` is reused by the following cells.
grouped2 = df.groupby(['year','type'])

In [23]:
# With multiple grouping columns, `name` is a (year, type) tuple and
# `group` is the matching sub-frame.
for name,group in grouped2:
   print(f"Group - {name}\tShape - {group.shape}")

Group - (2015, 'conventional')	Shape - (2808, 13)
Group - (2015, 'organic')	Shape - (2807, 13)
Group - (2016, 'conventional')	Shape - (2808, 13)
Group - (2016, 'organic')	Shape - (2808, 13)
Group - (2017, 'conventional')	Shape - (2862, 13)
Group - (2017, 'organic')	Shape - (2860, 13)
Group - (2018, 'conventional')	Shape - (648, 13)
Group - (2018, 'organic')	Shape - (648, 13)


###### Select a Group

In [24]:
# For a multi-column grouping, the group is selected by its key tuple.
grouped2.get_group((2015, 'conventional')).head()

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


# Understanding Pandas GroupBy Split-Apply-Combine


###### List of Aggregation Functions(aggfunc) for GroupBy in Pandas
<ul>
<li><code>count</code> / <code>nunique</code> – number of non-null values / number of unique values</li>
<li><code>min</code> / <code>max</code> – minimum/maximum</li>
<li><code>first</code> / <code>last</code> - return first or last value per group</li>
<li><code>unique</code> - all unique values from the group</li>
<li><code>std</code> – standard deviation</li>
<li><code>sum</code> – sum of values</li>
<li><code>mean</code> / <code>median</code> / <code>mode</code> – mean/median/mode</li>
<li><code>var</code> - unbiased variance</li>
<li><code>mad</code> - mean absolute deviation (deprecated in pandas 1.5 and removed in pandas 2.0)</li>
<li><code>skew</code> - unbiased skew</li>
<li><code>sem</code> - standard error of the mean</li>
<li><code>quantile</code></li>
</ul>

In [25]:
# Sum each numeric column per year; numeric_only=True leaves out the
# non-numeric columns (Date, type, region).
df.groupby('year').sum(numeric_only=True)

Unnamed: 0_level_0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015,7723.94,4390000000.0,1710000000.0,1760000000.0,143000000.0,772000000.0,635000000.0,132000000.0,5440000.0
2016,7517.8,4820000000.0,1530000000.0,1670000000.0,160000000.0,1460000000.0,1110000000.0,337000000.0,20000000.0
2017,8669.56,4930000000.0,1650000000.0,1540000000.0,91200000.0,1650000000.0,1220000000.0,399000000.0,24000000.0
2018,1746.4,1380000000.0,460000000.0,408000000.0,22900000.0,492000000.0,361000000.0,124000000.0,7210000.0


In [26]:
# Number of non-null entries in every column, per year.
df.groupby('year').count()

Unnamed: 0_level_0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,region
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015,5615,5615,5615,5615,5615,5615,5615,5615,5615,5615,5615,5615
2016,5616,5616,5616,5616,5616,5616,5616,5616,5616,5616,5616,5616
2017,5722,5722,5722,5722,5722,5722,5722,5722,5722,5722,5722,5722
2018,1296,1296,1296,1296,1296,1296,1296,1296,1296,1296,1296,1296


In [27]:
# Number of distinct values in every column, per year.
df.groupby('year').nunique()

Unnamed: 0_level_0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,region
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015,52,203,5615,5526,5575,3789,5524,5112,4196,1310,2,54
2016,52,225,5612,5462,5581,3931,5611,5485,4826,1856,2,54
2017,53,253,5721,5607,5700,3871,5721,5701,5099,2135,2,54
2018,12,154,1296,1285,1296,863,1296,1296,1173,568,2,54


###### GroupBy on Single Column Python Pandas 

GroupBy can be used to summarize a single column by `year` using the three methods listed below:

In [28]:
# Method 1: select one column after grouping; the result is a Series
# indexed by year.
df.groupby('year')['Total Volume'].sum()

year
2015    4.39e+09
2016    4.82e+09
2017    4.93e+09
2018    1.38e+09
Name: Total Volume, dtype: float64

In [29]:
# Confirm that aggregating a single selected column gives a Series.
type(df.groupby('year')['Total Volume'].sum())

pandas.core.series.Series

In [30]:
# Method 2: reset_index() moves `year` out of the index, turning the
# Series into a two-column DataFrame.
df.groupby('year')['Total Volume'].sum().reset_index()

Unnamed: 0,year,Total Volume
0,2015,4390000000.0
1,2016,4820000000.0
2,2017,4930000000.0
3,2018,1380000000.0


In [31]:
# Confirm that the reset_index() result is a DataFrame.
type(df.groupby('year')['Total Volume'].sum().reset_index())

pandas.core.frame.DataFrame

In [32]:
# Method 3: as_index=False keeps `year` as a regular column, giving the
# same table without a separate reset_index() call.
df.groupby('year', as_index=False)['Total Volume'].sum()

Unnamed: 0,year,Total Volume
0,2015,4390000000.0
1,2016,4820000000.0
2,2017,4930000000.0
3,2018,1380000000.0


###### Perform various aggregation operations on a group of data

In [33]:
# Minimum of every column per year.
# The aggregation is passed by name: handing the Python builtin `min` to agg
# is deprecated (FutureWarning since pandas 2.1), and the string form is the
# documented way to keep using the GroupBy implementation.
df.groupby('year', as_index=False).agg('min')

Unnamed: 0,year,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,region
0,2015,2015-01-04,0.49,84.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,conventional,Albany
1,2016,2016-01-03,0.51,385.55,0.0,0.0,0.0,7.02,0.0,0.0,0.0,conventional,Albany
2,2017,2017-01-01,0.44,515.01,0.0,0.0,0.0,106.45,0.0,0.0,0.0,conventional,Albany
3,2018,2018-01-07,0.56,2064.9,0.0,6.6,0.0,988.45,284.43,0.0,0.0,conventional,Albany


In [34]:
# `min` is a Python builtin, so the bare name refers to an existing object.
type(min)

builtin_function_or_method

In [35]:
# Unlike `min`, there is no builtin called `count`, so the bare name fails.
# Only the demonstrated NameError is caught, so any other error would still
# surface instead of being printed as if it were expected.
try:
    type(count)
except NameError as e:
    print("An error occurred:", type(e).__name__, "–", e)  

An error occurred: NameError – name 'count' is not defined


In [36]:
# Passing the bare name `count` to agg fails before pandas is even called,
# because the name is undefined. Catch only that NameError so real problems
# (e.g. a missing column) are not hidden.
try:
    df.groupby('year', as_index=False).agg(count)
except NameError as e:
    print("An error occurred:", type(e).__name__, "–", e)  

An error occurred: NameError – name 'count' is not defined


In [37]:
# Naming the aggregation as a string works where the bare name `count` did not.
df.groupby('year', as_index=False).agg('count')

Unnamed: 0,year,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,region
0,2015,5615,5615,5615,5615,5615,5615,5615,5615,5615,5615,5615,5615
1,2016,5616,5616,5616,5616,5616,5616,5616,5616,5616,5616,5616,5616
2,2017,5722,5722,5722,5722,5722,5722,5722,5722,5722,5722,5722,5722
3,2018,1296,1296,1296,1296,1296,1296,1296,1296,1296,1296,1296,1296


In [38]:
# Several aggregations at once produce a two-level column header
# (original column, aggregation name). Both are given as strings: passing
# the builtin `max` to agg is deprecated (FutureWarning since pandas 2.1).
df.groupby('year', as_index=False).agg(['max', 'count'])

Unnamed: 0_level_0,Date,Date,AveragePrice,AveragePrice,Total Volume,Total Volume,4046,4046,4225,4225,4770,4770,Total Bags,Total Bags,Small Bags,Small Bags,Large Bags,Large Bags,XLarge Bags,XLarge Bags,type,type,region,region
Unnamed: 0_level_1,max,count,max,count,max,count,max,count,max,count,max,count,max,count,max,count,max,count,max,count,max,count,max,count
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
2015,2015-12-27,5615,2.79,5615,44700000.0,5615,18900000.0,5615,19000000.0,5615,1610000.0,5615,6740000.0,5615,5890000.0,5615,1370000.0,5615,199305.12,5615,organic,5615,WestTexNewMexico,5615
2016,2016-12-25,5616,3.25,5616,52300000.0,5616,16600000.0,5616,20500000.0,5616,2550000.0,5616,12700000.0,5616,9970000.0,5616,3370000.0,5616,551693.65,5616,organic,5616,WestTexNewMexico,5616
2017,2017-12-31,5722,3.17,5722,61000000.0,5722,22700000.0,5722,20300000.0,5722,1660000.0,5722,16300000.0,5722,12600000.0,5722,4320000.0,5722,377661.06,5722,organic,5722,WestTexNewMexico,5722
2018,2018-03-25,1296,2.3,1296,62500000.0,1296,21600000.0,1296,20400000.0,1296,1070000.0,1296,19400000.0,1296,13400000.0,1296,5720000.0,1296,309467.53,1296,organic,1296,WestTexNewMexico,1296


In [39]:
# Raise the display row limit so more of the transposed summary is shown.
# Canonical option name instead of the regex shorthand "display.max.rows";
# note that this setting stays in effect for the cells that follow.
pd.set_option("display.max_rows", 30)
# Aggregations passed by name: the builtin `max` is deprecated as an agg
# argument (FutureWarning since pandas 2.1).
df.groupby('year', as_index=False).agg(['max', 'count']).T

Unnamed: 0,year,2015,2016,2017,2018
Date,max,2015-12-27,2016-12-25,2017-12-31,2018-03-25
Date,count,5615,5616,5722,1296
AveragePrice,max,2.79,3.25,3.17,2.3
AveragePrice,count,5615,5616,5722,1296
Total Volume,max,44655461.51,52288697.89,61034457.1,62505646.52
Total Volume,count,5615,5616,5722,1296
4046,max,18933038.04,16573573.78,22743616.17,21620180.9
4046,count,5615,5616,5722,1296
4225,max,18956479.74,20470572.61,20328161.55,20445501.03
4225,count,5615,5616,5722,1296


In [40]:
# Row index of the aggregated frame: one entry per year.
# ('max' is passed by name; the builtin `max` is deprecated as an agg argument.)
df.groupby('year').agg(['max', 'count']).index

Index([2015, 2016, 2017, 2018], dtype='int64', name='year')

In [41]:
# Columns of the aggregated frame: a MultiIndex of (column, aggregation) pairs.
# ('max' is passed by name; the builtin `max` is deprecated as an agg argument.)
df.groupby('year').agg(['max', 'count']).columns

MultiIndex([(        'Date',   'max'),
            (        'Date', 'count'),
            ('AveragePrice',   'max'),
            ('AveragePrice', 'count'),
            ('Total Volume',   'max'),
            ('Total Volume', 'count'),
            (        '4046',   'max'),
            (        '4046', 'count'),
            (        '4225',   'max'),
            (        '4225', 'count'),
            (        '4770',   'max'),
            (        '4770', 'count'),
            (  'Total Bags',   'max'),
            (  'Total Bags', 'count'),
            (  'Small Bags',   'max'),
            (  'Small Bags', 'count'),
            (  'Large Bags',   'max'),
            (  'Large Bags', 'count'),
            ( 'XLarge Bags',   'max'),
            ( 'XLarge Bags', 'count'),
            (        'type',   'max'),
            (        'type', 'count'),
            (      'region',   'max'),
            (      'region', 'count')],
           )

In [42]:
# Several aggregations on one selected column give a flat header, because
# only the aggregation names are needed to tell the result columns apart.
# ('max' is passed by name; the builtin `max` is deprecated as an agg argument.)
df.groupby('year', as_index=False)['Total Volume'].agg(['max', 'count'])

Unnamed: 0,year,max,count
0,2015,44700000.0,5615
1,2016,52300000.0,5616
2,2017,61000000.0,5722
3,2018,62500000.0,1296


In [43]:
def value_range(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` is any object exposing ``max()`` and ``min()`` methods (for example
    the per-group Series that ``agg`` passes in). The function name is used
    by pandas as the label of the resulting column.
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

In [44]:
# Built-in aggregations (by name) can be mixed with a user-defined function;
# the custom function's name becomes the column label.
# The builtins `max`/`min` are replaced by their string names because passing
# them as callables is deprecated (FutureWarning since pandas 2.1).
df.groupby('year')[['AveragePrice', 'Total Volume']].agg(['max', 'min', value_range])

Unnamed: 0_level_0,AveragePrice,AveragePrice,AveragePrice,Total Volume,Total Volume,Total Volume
Unnamed: 0_level_1,max,min,value_range,max,min,value_range
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2015,2.79,0.49,2.3,44700000.0,84.56,44700000.0
2016,3.25,0.51,2.74,52300000.0,385.55,52300000.0
2017,3.17,0.44,2.73,61000000.0,515.01,61000000.0
2018,2.3,0.56,1.74,62500000.0,2064.9,62500000.0


In [45]:
# A dict maps each column to its own list of aggregations.
# The builtins `max`/`min`/`sum` are given as strings: passing them as
# callables is deprecated (FutureWarning since pandas 2.1).
df.groupby('year')[['AveragePrice', 'Total Volume']].agg({
    'AveragePrice': ['max', 'min', value_range],
    'Total Volume': ['sum', 'mean'],
})

Unnamed: 0_level_0,AveragePrice,AveragePrice,AveragePrice,Total Volume,Total Volume
Unnamed: 0_level_1,max,min,value_range,sum,mean
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2015,2.79,0.49,2.3,4390000000.0,781000.0
2016,3.25,0.51,2.74,4820000000.0,858000.0
2017,3.17,0.44,2.73,4930000000.0,862000.0
2018,2.3,0.56,1.74,1380000000.0,1070000.0


In [46]:
# Keep the per-column aggregation result so its column header can be
# flattened in the following cells.
# The builtins `max`/`min`/`sum` are given as strings: passing them as
# callables is deprecated (FutureWarning since pandas 2.1).
grp = df.groupby('year')[['AveragePrice', 'Total Volume']].agg({
    'AveragePrice': ['max', 'min', value_range],
    'Total Volume': ['sum', 'mean'],
})

In [47]:
# The aggregated frame has a two-level (column, aggregation) header.
grp.columns

MultiIndex([('AveragePrice',         'max'),
            ('AveragePrice',         'min'),
            ('AveragePrice', 'value_range'),
            ('Total Volume',         'sum'),
            ('Total Volume',        'mean')],
           )

In [48]:
# Flatten the two-level (column, aggregation) header into single names such
# as "AveragePrice_max". The MultiIndex guard makes the cell safe to re-run:
# once the columns are plain strings, joining them again would split every
# name into its characters ("A_v_e_r_...").
if isinstance(grp.columns, pd.MultiIndex):
    grp.columns = ['_'.join(col) for col in grp.columns]

In [49]:
# After flattening, the header is a plain Index of joined names.
grp.columns

Index(['AveragePrice_max', 'AveragePrice_min', 'AveragePrice_value_range',
       'Total Volume_sum', 'Total Volume_mean'],
      dtype='object')

In [50]:
# Display the aggregated frame with its flattened column names.
grp

Unnamed: 0_level_0,AveragePrice_max,AveragePrice_min,AveragePrice_value_range,Total Volume_sum,Total Volume_mean
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015,2.79,0.49,2.3,4390000000.0,781000.0
2016,3.25,0.51,2.74,4820000000.0,858000.0
2017,3.17,0.44,2.73,4930000000.0,862000.0
2018,2.3,0.56,1.74,1380000000.0,1070000.0


In [51]:
# Move `year` from the index into a regular column. The guard keeps the cell
# idempotent: running reset_index() a second time would otherwise add a
# stray "index" column.
if grp.index.name == 'year':
    grp = grp.reset_index()
grp

Unnamed: 0,year,AveragePrice_max,AveragePrice_min,AveragePrice_value_range,Total Volume_sum,Total Volume_mean
0,2015,2.79,0.49,2.3,4390000000.0,781000.0
1,2016,3.25,0.51,2.74,4820000000.0,858000.0
2,2017,3.17,0.44,2.73,4930000000.0,862000.0
3,2018,2.3,0.56,1.74,1380000000.0,1070000.0


###### Named aggregation: specify the output column names when you aggregate a groupby

In [52]:
# Named aggregation: each keyword is the output column name, and NamedAgg
# states which input column it is computed from and with which function.
# The builtins `max`/`min`/`sum` are given as strings: passing them as
# callables is deprecated (FutureWarning since pandas 2.1).
df.groupby('year', as_index=False)[['AveragePrice', 'Total Volume']].agg(
    AveragePrice_max=pd.NamedAgg(column='AveragePrice', aggfunc='max'),
    AveragePrice_min=pd.NamedAgg(column='AveragePrice', aggfunc='min'),
    AveragePrice_value_range=pd.NamedAgg(column='AveragePrice', aggfunc=value_range),
    Total_volume_sum=pd.NamedAgg(column='Total Volume', aggfunc='sum'),
)

Unnamed: 0,year,AveragePrice_max,AveragePrice_min,AveragePrice_value_range,Total_volume_sum
0,2015,2.79,0.49,2.3,4390000000.0
1,2016,3.25,0.51,2.74,4820000000.0
2,2017,3.17,0.44,2.73,4930000000.0
3,2018,2.3,0.56,1.74,1380000000.0


# Filtering Data with Pandas GroupBy


In [53]:
# Total volume per region; these totals are what the filter below tests.
df.groupby('region',as_index=False)['Total Volume'].sum()

Unnamed: 0,region,Total Volume
0,Albany,1.61e+07
1,Atlanta,8.86e+07
2,BaltimoreWashington,1.35e+08
3,Boise,1.44e+07
4,Boston,9.73e+07
...,...,...
49,Syracuse,1.09e+07
50,Tampa,6.60e+07
51,TotalUS,5.86e+09
52,West,1.09e+09


In [54]:
# Keep the original rows of every region whose summed 'Total Volume'
# is below 20,000,000; rows of all other regions are dropped.
df_filtered = (
    df.groupby('region', as_index=False)
      .filter(lambda region_rows: region_rows['Total Volume'].sum() < 20_000_000)
)

In [55]:
# filter() returns rows, not aggregates: the column count is unchanged.
df_filtered.shape

(2028, 13)

In [56]:
# Regions that passed the volume filter.
df_filtered['region'].unique()

array(['Albany', 'Boise', 'Louisville', 'Pittsburgh', 'Spokane',
       'Syracuse'], dtype=object)

In [57]:
# Peek at the first two surviving rows.
df_filtered.head(2)

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany


# Transforming Data with Pandas GroupBy 

In [58]:
# Aggregated view: one total per region (first five regions shown).
df.groupby('region', as_index=False)['Total Volume'].sum().head()

Unnamed: 0,region,Total Volume
0,Albany,16100000.0
1,Atlanta,88600000.0
2,BaltimoreWashington,135000000.0
3,Boise,14400000.0
4,Boston,97300000.0


For region Albany, the total volume is 1.61e+07.

In [59]:
# transform('sum') returns one value per original row (the total of that
# row's region), so the result can be attached directly as a new column.
df['Regional total Sales'] = df.groupby(by='region')['Total Volume'].transform('sum')
df.head()

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,Regional total Sales
0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany,16100000.0
1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany,16100000.0
2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany,16100000.0
3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany,16100000.0
4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany,16100000.0


In [60]:
# The last rows belong to a different region and carry that region's total.
df.tail(2)

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,Regional total Sales
18247,2018-01-14,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.0,0.0,organic,2018,WestTexNewMexico,145000000.0
18248,2018-01-07,1.62,17489.58,2894.77,2356.13,224.53,12014.15,11988.14,26.01,0.0,organic,2018,WestTexNewMexico,145000000.0


In the next example, we’ll calculate what percentage of each region’s total sales is represented by each sale.

In [61]:

# Share of its region's total volume contributed by each row, in percent.
df['Percent Of Region Sales'] = (
    df['Total Volume']
    .mul(100)
    .div(df.groupby('region')['Total Volume'].transform('sum'))
)

df.head()

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,Regional total Sales,Percent Of Region Sales
0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany,16100000.0,0.4
1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany,16100000.0,0.34
2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany,16100000.0,0.74
3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany,16100000.0,0.49
4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany,16100000.0,0.32
