# Chapter 10: Data Aggregation and Group Operations

In [1]:
# Categorizing a dataset and applying a function to each group
# Aggregation / Transformation.

In [2]:
import os

# Move into the directory where the dataset lives, but only when we are
# not already inside it: an unconditional relative chdir raises
# FileNotFoundError as soon as this cell is run a second time in the
# same kernel, because 'pydata-book/' no longer exists relative to the
# new working directory.
if os.path.basename(os.getcwd()) != 'pydata-book':
    os.chdir('pydata-book/')
os.getcwd()

'/Users/Study/pydata-book'

In [3]:
# pandas's groupby lets you slice, dice, and summarize a
# dataset in an easy way. You can perform complex group
# operations by using any function that accepts a pandas
# object or NumPy array.

# Learn how to:
# 1. Split a pandas object into pieces (by keys)
# 2. Calculate group summary statistics (count, mean, standard
#    deviation, or a user-defined function)
# 3. Apply within-group transformations / manipulations
#    such as normalization, linear regression, rank, subset
# 4. Compute pivot tables and cross-tabulations
# 5. Perform quantile analysis and statistical group analysis

In [4]:
# Aggregation of time series data - is known as RESAMPLING
# see chapter 11 next.

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Think of group operation as split-apply-combine

# 1. Data are split into groups by one/more keys
#    It can be done on a particular axis (rows, axis=0)
#    or (columns, axis=1)

# 2. Then a function (such as sum) is applied to each group
#    and it produces a new value

# 3. Finally the results are combined into a result object

In [7]:
# To start, here is a small tabular dataset (DataFrame).
# The two random columns are drawn from a dedicated, fixed-seed
# generator so that re-running this cell (or restarting the kernel)
# rebuilds exactly the same frame, without touching NumPy's global
# random state. Saved outputs produced by an earlier unseeded run will
# show different numbers until the notebook is executed again.
rng = np.random.RandomState(12345)
df = pd.DataFrame({
    'key1': 'a a b b a'.split(),
    'key2': 'one two one two one'.split(),
    # data1 is drawn before data2, so the order of these two entries
    # determines which values land in which column.
    'data1': rng.randn(5),
    'data2': rng.randn(5)
})

In [8]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.232329,-1.458341
1,a,two,-1.328359,0.451496
2,b,one,-0.459066,-0.129273
3,b,two,0.620973,0.800309
4,a,one,0.981402,0.289108


In [9]:
# Suppose you want to compute the mean (average) of data1
# using labels from key1.

# grouped is a groupby object
grouped = df['data1'].groupby(df['key1'])

In [10]:
grouped.mean()

key1
a   -0.526428
b    0.080954
Name: data1, dtype: float64

In [13]:
# Passing multiple arrays as a list
means = df['data1'].groupby([df['key1'], df['key2']]).mean()

In [14]:
means

key1  key2
a     one    -0.125463
      two    -1.328359
b     one    -0.459066
      two     0.620973
Name: data1, dtype: float64

In [15]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.125463,-1.328359
b,-0.459066,0.620973


In [21]:
# Group keys do not have to be columns of the frame: any arrays with
# one entry per row will do. Here, one year and one state label per row.
years = np.array([2005, 2005, 2006, 2005, 2006])
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])

In [22]:
df['data1'].groupby([states, years]).mean()

California  2005   -1.328359
            2006   -0.459066
Ohio        2005   -0.305678
            2006    0.981402
Name: data1, dtype: float64

In [23]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.232329,-1.458341
1,a,two,-1.328359,0.451496
2,b,one,-0.459066,-0.129273
3,b,two,0.620973,0.800309
4,a,one,0.981402,0.289108


In [25]:
# Per-group averages of data1 and data2 for the 'a' and 'b' groups.
# numeric_only=True keeps the string column key2 out of the
# aggregation: older pandas dropped such columns silently, but recent
# releases (2.0+) raise a TypeError when asked to average them.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.526428,-0.239246
b,0.080954,0.335518


In [26]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.125463,-0.584616
a,two,-1.328359,0.451496
b,one,-0.459066,-0.129273
b,two,0.620973,0.800309


In [27]:
# A generally useful groupby method
# Return the group sizes
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [28]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.232329,-1.458341
1,a,two,-1.328359,0.451496
2,b,one,-0.459066,-0.129273
3,b,two,0.620973,0.800309
4,a,one,0.981402,0.289108


In [30]:
# Iterating over a GroupBy yields (group name, group chunk) pairs,
# which is a handy way to print every group to the terminal / user.
# The key is passed as a plain string rather than a one-element list:
# with a list, newer pandas returns 1-tuples such as ('a',) as the
# group name instead of the bare label 'a'.
for name, group in df.groupby('key1'):
    print('name:', name)
    print(group)

name: a
  key1 key2     data1     data2
0    a  one -1.232329 -1.458341
1    a  two -1.328359  0.451496
4    a  one  0.981402  0.289108
name: b
  key1 key2     data1     data2
2    b  one -0.459066 -0.129273
3    b  two  0.620973  0.800309


In [31]:
# With several grouping keys, the first item of each yielded pair is a
# tuple holding one value per key; it is unpacked here into k1 and k2.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    # One print call, newline-separated: key tuple first, then the chunk.
    print((k1, k2), group, sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one -1.232329 -1.458341
4    a  one  0.981402  0.289108
('a', 'two')
  key1 key2     data1     data2
1    a  two -1.328359  0.451496
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.459066 -0.129273
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.620973  0.800309


In [32]:
list(df.groupby('key1'))

[('a',   key1 key2     data1     data2
  0    a  one -1.232329 -1.458341
  1    a  two -1.328359  0.451496
  4    a  one  0.981402  0.289108), ('b',   key1 key2     data1     data2
  2    b  one -0.459066 -0.129273
  3    b  two  0.620973  0.800309)]

In [36]:
# Build a {group name: group DataFrame} lookup from the GroupBy pairs,
# then pull out the chunk belonging to group 'b'.
pieces = {label: chunk for label, chunk in df.groupby('key1')}
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,-0.459066,-0.129273
3,b,two,0.620973,0.800309


In [38]:
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [41]:
# Group by column example: axis=1 groups the columns (instead of the
# rows), keyed here by each column's dtype.
# NOTE(review): groupby(..., axis=1) appears to be deprecated in recent
# pandas (2.1+) in favour of grouping the transposed frame - confirm
# against the installed version before upgrading.
grouped = df.groupby(df.dtypes, axis=1)
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11efa8be0>

In [44]:
# Each piece yielded by the column-wise grouping is itself a DataFrame
# holding the columns that share one dtype. Print the dtype, the type of
# the piece, and the piece, each on its own line.
for name, group in grouped:
    print(name, type(group), group, sep='\n')

float64
<class 'pandas.core.frame.DataFrame'>
      data1     data2
0 -1.232329 -1.458341
1 -1.328359  0.451496
2 -0.459066 -0.129273
3  0.620973  0.800309
4  0.981402  0.289108
object
<class 'pandas.core.frame.DataFrame'>
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [47]:
# Selecting a Column or Subset of Columns
# Indexing a GroupBy with a single column name gives a SeriesGroupBy;
# indexing it with a list of names gives a DataFrameGroupBy.
# Only the last expression of a cell is displayed, so the result of the
# first line below is not shown.
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11e9a71d0>

In [48]:
# The same as this; the difference is only syntactic sugar
# (the code looks different but does the same thing)
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11efa88d0>

In [50]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.584616
a,two,0.451496
b,one,-0.129273
b,two,0.800309


In [51]:
# My practice
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one    -0.584616
      two     0.451496
b     one    -0.129273
      two     0.800309
Name: data2, dtype: float64

In [52]:
s_grouped = df.groupby(['key1', 'key2'])['data2']

In [53]:
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x11efc2160>

In [54]:
s_grouped.mean()

key1  key2
a     one    -0.584616
      two     0.451496
b     one    -0.129273
      two     0.800309
Name: data2, dtype: float64