# Chapter 10: Data Aggregation and Group Operations

In [None]:
# Categorizing a dataset and applying a function to each group
# Aggregation / Transformation.

In [None]:
import os

# Change the working directory to where the datasets live.
# NOTE: the path is relative, so this only succeeds when the current
# directory contains 'pydata-book/'; re-running the cell after a
# successful chdir will look for a nested 'pydata-book/' instead.
os.chdir('pydata-book/')
# Echo the new working directory to confirm the change.
os.getcwd()

In [1]:
# pandas' groupby lets you slice, dice, and summarize a
# dataset in an easy way. You can perform complex group
# operations by using any function that accepts a pandas
# object or NumPy array.

# Learn how to:
# 1. Split a pandas object into pieces (by keys)
# 2. Calculate group summary statistics (count, mean, sd, f)
# 3. Apply within group transformation / manipulation
#    such as normalization, linear regression, rank, subset
# 4. Compute pivot table and cross tabulations
# 5. Perform quantile analysis and statistical group analysis

In [2]:
# Aggregation of time series data - is known as RESAMPLING
# see chapter 11 next.

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Think of group operation as split-apply-combine

# 1. Data are split into groups by one/more keys
#    It can be done on a particular axis (rows, axis=0)
#    or (columns, axis=1)

# 2. Then a function (such as sum) is applied to each group
#    and it produces a new value

# 3. Finally the results are combined into a result object

In [5]:
# Small example table: two label columns to group by (key1, key2)
# and two numeric columns filled with standard-normal draws.
# No seed is set here, so data1/data2 change on every run.
df = pd.DataFrame(
    dict(
        key1='a a b b a'.split(),
        key2='one two one two one'.split(),
        data1=np.random.randn(5),
        data2=np.random.randn(5),
    )
)

In [6]:
# Display the DataFrame; as the last expression in the cell, its value
# is rendered as the cell output.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.001746,-0.66698
1,a,two,1.413556,1.404884
2,b,one,1.535305,1.439479
3,b,two,-1.284124,-0.969941
4,a,one,-0.87501,-1.800434


In [12]:
# Goal: the average of data1 within each distinct key1 label.
#
# Select the data1 column and group it by the key1 column. Nothing is
# computed yet; `grouped` is a groupby object that an aggregation such
# as mean() can later be applied to.
grouped = df['data1'].groupby(by=df['key1'])

In [13]:
# Average data1 within each key1 group; the result is a Series indexed
# by the unique key1 labels.
grouped.mean()

key1
a    0.180098
b    0.125591
Name: data1, dtype: float64