# Data Grouping and Aggregation In Pandas

## Data Grouping: 'groupby'

#### Pivot tables for reporting and data visualization analyze data according to certain grouping mechanics; the pandas 'groupby' method serves this purpose and is in high demand.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# IPython/Jupyter help query: the trailing '?' displays the documentation for DataFrame.groupby.
pd.DataFrame.groupby?

In [None]:
# Demo frame used throughout: two string key columns and two integer data columns.
marks = pd.DataFrame(
    data={
        'key1': list('aabbaa'),
        'key2': ['one', 'two', 'one', 'two', 'one', 'one'],
        'data1': np.arange(start=10, stop=16),
        'data2': np.arange(start=16, stop=22),
    },
)
marks

In [None]:
# Select the 'data1' column on its own.
marks['data1']

In [None]:
# Group the 'data1' values by the matching 'key1' labels.
grouped = marks['data1'].groupby(by=marks['key1'])
# The cell output is the groupby object itself, not an aggregated result.
grouped

In [None]:
# Mean of data1 within each key1 group.
grouped.mean()

In [None]:
# Hand check of the group means: key1 == 'a' covers data1 values 10, 11, 14, 15
# and key1 == 'b' covers 12, 13.
print('a mean:', sum((10, 11, 14, 15)) / 4)
print('b mean:', sum((12, 13)) / 2)

In [None]:
# Sum of data1 within each key1 group.
grouped.sum()

In [None]:
# Group data1 by two keys at once: the (key1, key2) pair of each row.
group_tk = marks['data1'].groupby(by=[marks['key1'], marks['key2']])
group_tk

In [None]:
# Show the raw rows next to the per-(key1, key2) means of data1 for comparison.
print(marks)
print()
print('result:', group_tk.mean())

In [None]:
# Hand check for the ('a', 'one') group, whose data1 values are 10, 14 and 15.
print(sum((10, 14, 15)) / 3)

In [None]:
# Pivot the inner index level (key2) of the grouped means into columns.
group_tk.mean().unstack()

In [None]:
# Group keys need not be columns of the frame: here the six data1 values are
# grouped by the label pairs formed from two standalone arrays of length six.
feedb = np.array(['good', 'avg', 'good', 'avg', 'good', 'avg'])
actual = np.array(['good', 'med', 'good', 'med', 'good', 'med'])
mean1 = marks['data1'].groupby(by=[feedb, actual]).mean()
mean1

In [None]:
# Mean of the numeric columns (data1, data2) within each key1 group.
# numeric_only=True is needed because marks also holds the string column
# key2: on pandas >= 2.0 mean() no longer drops such columns silently and
# raises a TypeError instead.
mean_df = marks.groupby(by=['key1']).mean(numeric_only=True)
mean_df

In [None]:
print(marks)
print()
# Number of rows in each (key1, key2) group.
marks.groupby(by=['key1', 'key2']).size()

#### How To Iterate Over Groups?

In [None]:
# Display the frame again for reference before iterating over its groups.
marks

In [None]:
# Build (and display) a groupby object keyed on key1.
marks.groupby(by='key1')

In [None]:
# Iterating a groupby yields (group key, sub-frame) pairs; print each pair
# together with the types of its two parts.
for key_name, group_name in marks.groupby(by='key1'):
    print(key_name)
    print(group_name)
    print(type(key_name))
    print(type(group_name))

In [None]:
# With two grouping columns the group key is a pair, unpacked here into its two parts.
for (k1_name, k2_name), group_name in marks.groupby(by=['key1', 'key2']):
    print(k1_name, k2_name)
    print(group_name)

In [None]:
# Same pattern keyed on data1 and data2. Every (data1, data2) pair in marks
# is distinct, so each group holds a single row.
for (k1_name, k2_name), group_name in marks.groupby(by=['data1', 'data2']):
    print(k1_name, k2_name)
    print(group_name)

In [None]:
# Data type of each column.
marks.dtypes

In [None]:
# Group the columns of marks by their dtype. The axis=1 argument of
# DataFrame.groupby is deprecated (pandas 2.1), so the rows of the transposed
# frame are grouped instead; marks.dtypes is indexed by column name and
# therefore lines up with the transposed index.
grouped = marks.T.groupby(marks.dtypes)
for datatype, group in grouped:
    print(datatype)
    # group.index holds the column names of this dtype; select them from the
    # original frame so the printed columns keep their dtypes.
    print(marks.loc[:, group.index])

#### Column Selection For Aggregation via 'groupby'

In [None]:
# Print a single-column selection, then a selection made with a list of column names.
print(marks['data1'])
print(marks[['data1', 'data2']])


In [None]:
# Select the column(s) first, then group: data1 by key1, and data1 + data2 by key2.
sk_g = marks['data1'].groupby(by=marks['key1'])
dk_g = marks[['data1', 'data2']].groupby(by=marks['key2'])
# Only the last bare expression of a cell is displayed, so print both
# objects explicitly instead of leaving sk_g as a discarded expression.
print(sk_g)
print(dk_g)

In [None]:
# Per-group sums for both groupby objects.
print(sk_g.sum())
print(dk_g.sum())

In [None]:
# Alternative spelling: group the whole frame first, then pick the column(s)
# to aggregate by indexing the groupby object.
sk_g = marks.groupby(by='key1')['data1'] 
dk_g = marks.groupby(by='key2')[['data1', 'data2']]   
print(sk_g.sum())
print(dk_g.sum())

#### How To Group With Dictionaries and Series?

In [None]:
# 4 x 5 frame of random normal draws with rows labelled 'one'..'four' and
# columns 'a'..'e'. No seed is set, so the values differ on every run.
rmlist = pd.DataFrame(np.random.randn(4, 5),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['one', 'two', 'three', 'four'])
rmlist

In [None]:
# Column label -> group name. Note that 'f' has no matching column in rmlist.
dic_map = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}
dic_map

In [None]:
g_column = rmlist.groupby(by=dic_map, axis=1)
g_column

In [None]:
g_column.sum()

In [None]:
# The same mapping as a Series: index = column label, value = group name.
s_map = pd.Series(dic_map)

In [None]:
# Group the columns of rmlist through the Series mapping. The axis=1 argument
# of DataFrame.groupby is deprecated (pandas 2.1), so group the rows of the
# transposed frame and transpose the sums back to the original orientation.
g_column = rmlist.T.groupby(by=s_map)
g_column.sum().T

#### How To Group With Functions?

In [None]:
print(rmlist)
# A function given as the key is called on each index label; len therefore
# groups the rows by the length of their label.
rmlist.groupby(len).sum()

In [None]:
# A function and a plain list can be combined as keys: rows are grouped by
# the pair (label length, matching key_list entry).
key_list = ['one', 'one', 'one', 'two']
print(rmlist)
rmlist.groupby([len, key_list]).sum() 

In [None]:
# Same combined keys, but key_list now has a distinct entry for each row, so
# every group contains exactly one row.
key_list = ['one', 'two', 'three', 'four']
print(rmlist)
rmlist.groupby([len, key_list]).sum() 

#### How To Group by Index Level?

In [None]:
# Two-level column index: level 'city' (US, US, UK, RS, RS) and level 'tenor' (1, 3, 5, 1, 3).
columns = pd.MultiIndex.from_arrays([['US', 'US', 'UK', 'RS', 'RS'],
                                    [1, 3, 5, 1, 3]],
                                    names=['city', 'tenor'])

# 4 x 5 frame of random normal draws under those hierarchical columns (unseeded).
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

In [None]:
# Count, row by row, the non-null entries under each value of the 'city'
# column level. The axis=1 argument of DataFrame.groupby is deprecated
# (pandas 2.1), so group the transposed frame by its 'city' index level and
# transpose the counts back.
hier_df.T.groupby(level='city').count().T

## Data Aggregation

#### Aggregations refer to any data transformation that produces scalar values from arrays.
#### Some common aggregation methods are
* count 
* sum 
* mean 
* median 
* std, var 
* min, max 
* prod 
* first, last

#### There are many more aggregation methods; the ones above are just a few for illustration.

In [None]:
# Load the books dataset from a path relative to the working directory,
# decoding the file with the 'latin' codec.
book = pd.read_csv(r'dataset/books_discount.csv', encoding='latin')
# Preview the first rows.
book.head() 

In [None]:
# Lowest and highest value in the price column.
print(book['price'].min())
print(book['price'].max())

In [None]:
# Smallest value in each of the two discount columns.
# NOTE(review): the price cell prints a min and a max; if a range was intended
# here as well, the second call should probably be max_dis.max() - confirm.
print(book['min_dis'].min())
print(book['max_dis'].min())

In [None]:
# Group the books by (feedback, author). Rows are grouped by default, so the
# redundant axis=0 argument, deprecated in pandas 2.1, is dropped.
grouped = book.groupby(by=['feedback', 'author'])

In [None]:
def max_min(arr):
    """Return the largest and smallest values of ``arr`` as a ``(max, min)`` tuple.

    ``arr`` may be any object that provides ``max()`` and ``min()`` methods.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest, lowest

# Aggregate each (feedback, author) group with the custom max_min function.
grouped.agg(max_min)

In [None]:
# Summary statistics for each group.
grouped.describe()

#### How To Aggregate Column-wise and with Multiple Functions?

In [None]:
# Group the books by (feedback, author) for the column-wise aggregations that follow.
grouped = book.groupby(by=['feedback', 'author'])

In [None]:
# Lowest price in each group; the aggregation is named by string.
grouped['price'].agg('min')

In [None]:
# Highest price in each group.
grouped['price'].agg('max')

In [None]:
# Mean price in each group.
grouped['price'].agg('mean')

In [None]:
# A list of function names applies all of them at once, one output column per function.
grouped['price'].agg(['min', 'max', 'mean', 'std'])

In [None]:
# A (name, function) tuple sets the output column name; bare strings keep the function name.
grouped['price'].agg([('min_value', 'min'), ('max_value', 'max'), 'mean', 'std'])

In [None]:
# Apply the same set of (optionally renamed) aggregations to two columns.
functions = [('min_value', 'min'), ('max_value', 'max'), 'mean', 'std']
# Several columns must be selected with a list: the tuple form
# grouped['price', 'max_dis'] raises on pandas >= 2.0.
result = grouped[['price', 'max_dis']].agg(functions)
result

In [None]:
# Select the aggregates that belong to the price column.
result['price']

In [None]:
# Output name -> aggregation function.
functions = {'min_value': 'min', 'max_value': 'max', 'mean_value': 'mean', 'std_value': 'std'}
# A dict passed straight to agg() is read as column -> function, so the
# keys above would be looked up as column names and fail. Pass the entries as
# (name, function) pairs so every selected column gets all four renamed
# aggregates. The columns are selected with a list because the tuple form
# grouped['price', 'max_dis'] raises on pandas >= 2.0.
result = grouped[['price', 'max_dis']].agg(list(functions.items()))
result

In [None]:
# Per-group minimum with as_index=False, so feedback and author stay ordinary
# columns instead of becoming the index.
# Note: this rebinds `grouped` from a groupby object to the aggregated result.
grouped = book.groupby(by=['feedback', 'author'], as_index=False).min()
grouped