# <b>Python for Data Analysis</b>
# 10. Data Aggregation and Group Operations

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels as sm
# Display limits so wide or long frames stay readable in rendered output.
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
# Print NumPy floats with 4 decimals and without scientific notation.
np.set_printoptions(precision = 4, suppress = True)
# Seed NumPy's global RNG so the randomly generated example frames are the
# same on every Restart & Run All. Outputs recorded before this seed was
# added will differ from a fresh run.
np.random.seed(12345)
from pandas import Series, DataFrame
%matplotlib inline

## 10.1 How to Think About Group Operations

In [4]:
# Toy frame used throughout this section: two key columns that both contain
# missing values (key1 holds strings/None, key2 is a nullable Int64 column)
# plus two columns of random normal data.
df = pd.DataFrame({
    "key1": ["a", "a", None, "b", "b", "a", None],
    "key2": pd.array([1, 2, 1, 2, 1, pd.NA, 1], dtype = "Int64"),
    "data1": np.random.standard_normal(size = 7),
    "data2": np.random.standard_normal(size = 7),
})
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,1.631905,0.30431
1,a,2.0,1.248543,-0.011803
2,,1.0,1.255946,0.163522
3,b,2.0,-0.005433,0.629722
4,b,1.0,-0.559055,1.482669
5,a,,0.023074,-0.528604
6,,1.0,0.32888,1.126454


In [5]:
# Group the data1 column by the values of key1. Nothing is computed yet:
# `grouped` is a SeriesGroupBy object that an aggregation is applied to later.
grouped = df["data1"].groupby(df["key1"])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000157C14E3F10>

In [6]:
# Mean of data1 within each key1 group; rows whose key1 is missing are
# left out of the result.
grouped.mean()

key1
a    0.967840
b   -0.282244
Name: data1, dtype: float64

In [7]:
# Grouping by two key Series gives a result indexed by the unique
# (key1, key2) pairs that were observed.
means = df["data1"].groupby([df["key1"], df["key2"]]).mean()
means

key1  key2
a     1       1.631905
      2       1.248543
b     1      -0.559055
      2      -0.005433
Name: data1, dtype: float64

In [8]:
# Pivot the inner index level (key2) into columns for a key1 x key2 table.
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.631905,1.248543
b,-0.559055,-0.005433


In [9]:
# Group keys that live outside the frame: one state and one year per row
# of df (7 entries each).
states = np.array(["OH", "CA", "CA", "OH", "OH", "CA", "OH"])
years = [2005, 2005, 2006, 2005, 2006, 2005, 2006]

In [10]:
# Inspect the state key array.
states

array(['OH', 'CA', 'CA', 'OH', 'OH', 'CA', 'OH'], dtype='<U2')

In [11]:
# Inspect the year key list.
years

[2005, 2005, 2006, 2005, 2006, 2005, 2006]

In [13]:
# Keys do not have to be columns of df: here an array and a plain list are
# used, matched to the rows by position.
df["data1"].groupby([states, years]).mean()

CA  2005    0.635808
    2006    1.255946
OH  2005    0.813236
    2006   -0.115087
Name: data1, dtype: float64

In [14]:
# Redisplay df for reference before grouping by column name.
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,1.631905,0.30431
1,a,2.0,1.248543,-0.011803
2,,1.0,1.255946,0.163522
3,b,2.0,-0.005433,0.629722
4,b,1.0,-0.559055,1.482669
5,a,,0.023074,-0.528604
6,,1.0,0.32888,1.126454


In [15]:
# Pass a column name as the key: the mean of every remaining column
# (key2, data1, data2) is computed for each key1 group.
df.groupby("key1").mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,0.96784,-0.078699
b,1.5,-0.282244,1.056195


In [16]:
# numeric_only = True restricts the aggregation to numeric columns, so the
# string column key1 is left out of the result.
df.groupby("key2").mean(numeric_only = True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.664419,0.769239
2,0.621555,0.308959


In [17]:
# Group by both key columns; the result carries a (key1, key2) MultiIndex.
df.groupby(["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1.631905,0.30431
a,2,1.248543,-0.011803
b,1,-0.559055,1.482669
b,2,-0.005433,0.629722


In [20]:
# Group sizes, keeping groups whose key is missing (dropna = False).
# NOTE(review): this cell is repeated verbatim two cells further down and
# its execution count (20) is out of order - consider removing one copy.
df.groupby(["key1", "key2"], dropna = False).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [19]:
# Number of rows per key1 group; dropna = False adds a group for the rows
# whose key1 is missing.
df.groupby("key1", dropna = False).size()

key1
a      3
b      2
NaN    2
dtype: int64

In [20]:
# Same with two keys: missing values in either key form their own groups.
df.groupby(["key1", "key2"], dropna = False).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [23]:
# count() reports the number of non-null values in each non-key column per
# group (size() above reports rows per group).
df.groupby(["key1", "key2"], dropna = False).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.0,1,1
a,2.0,1,1
a,,1,1
b,1.0,1,1
b,2.0,1,1
,1.0,2,2


In [23]:
# Non-null value counts per (key1, key2) group, missing keys included.
# NOTE(review): verbatim repeat of the preceding cell - consider removing.
df.groupby(["key1", "key2"], dropna = False).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.0,1,1
a,2.0,1,1
a,,1,1
b,1.0,1,1
b,2.0,1,1
,1.0,2,2


In [23]:
# Non-null value counts per (key1, key2) group, missing keys included.
# NOTE(review): verbatim repeat of the preceding cell - consider removing.
df.groupby(["key1", "key2"], dropna = False).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.0,1,1
a,2.0,1,1
a,,1,1
b,1.0,1,1
b,2.0,1,1
,1.0,2,2


### Iterating over Groups

In [24]:
# Iterating over a GroupBy yields (group name, sub-frame) pairs, one per
# group; rows with a missing key1 do not appear.
for name, group in df.groupby("key1"):
    print(name)
    print(group)

a
  key1  key2     data1     data2
0    a     1  1.631905  0.304310
1    a     2  1.248543 -0.011803
5    a  <NA>  0.023074 -0.528604
b
  key1  key2     data1     data2
3    b     2 -0.005433  0.629722
4    b     1 -0.559055  1.482669


In [25]:
# With several keys the group name is a tuple of key values, unpacked here
# into k1 and k2.
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2))
    print(group)

('a', 1)
  key1  key2     data1    data2
0    a     1  1.631905  0.30431
('a', 2)
  key1  key2     data1     data2
1    a     2  1.248543 -0.011803
('b', 1)
  key1  key2     data1     data2
4    b     1 -0.559055  1.482669
('b', 2)
  key1  key2     data1     data2
3    b     2 -0.005433  0.629722


In [26]:
# Materialize the groups as a {key1 value: sub-frame} dict, then look up
# the piece for key1 == "b".
pieces = dict(list(df.groupby("key1")))
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
3,b,2,-0.005433,0.629722
4,b,1,-0.559055,1.482669


In [27]:
# Group the *columns* instead of the rows: each column label is mapped to a
# group name (key1/key2 -> "key", data1/data2 -> "data").
# NOTE(review): groupby(axis=...) is deprecated as of pandas 2.1; the
# suggested replacement is to transpose and group the rows
# (df.T.groupby(mapping)). Changing it here also changes what the following
# iteration cell prints, so it is left as is - confirm the target pandas version.
grouped = df.groupby({"key1": "key", "key2": "key",
                      "data1": "data", "data2": "data"},
                     axis = "columns")

In [28]:
# Each iteration yields a group label and the sub-frame made up of the
# columns assigned to that label.
for group_key, group_values in grouped:
    print(group_key)
    print(group_values)

data
      data1     data2
0  1.631905  0.304310
1  1.248543 -0.011803
2  1.255946  0.163522
3 -0.005433  0.629722
4 -0.559055  1.482669
5  0.023074 -0.528604
6  0.328880  1.126454
key
   key1  key2
0     a     1
1     a     2
2  None     1
3     b     2
4     b     1
5     a  <NA>
6  None     1


### Selecting a Column or Subset of Columns

In [29]:
# Indexing a GroupBy with a column name or a list of column names limits
# the aggregation to those columns. Only the last expression of a cell is
# displayed, so the first line's result is not shown.
df.groupby("key1")["data1"]
df.groupby("key1")[["data2"]]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000157C6EE8EB0>

In [30]:
# Longhand spelling of the previous cell: select the column(s) first, then
# group by the key1 Series. Again only the last expression is displayed.
df["data1"].groupby(df["key1"])
df[["data2"]].groupby(df["key1"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000157C6EE8F40>

In [31]:
# Selecting with a list ([["data2"]]) keeps the result a DataFrame with a
# single data2 column.
df.groupby(["key1", "key2"])[["data2"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,0.30431
a,2,-0.011803
b,1,1.482669
b,2,0.629722


In [32]:
# Selecting with a single column name gives a SeriesGroupBy instead of a
# DataFrameGroupBy.
s_grouped = df.groupby(["key1", "key2"])["data2"]
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000157C70495B0>

In [33]:
# Aggregating the SeriesGroupBy returns a Series indexed by (key1, key2).
s_grouped.mean()

key1  key2
a     1       0.304310
      2      -0.011803
b     1       1.482669
      2       0.629722
Name: data2, dtype: float64

### Grouping with Dictionaries and Series

In [35]:
# 5 x 5 frame of random normal values: one row per person, columns a-e.
people = pd.DataFrame(
    np.random.standard_normal(size = (5, 5)),
    index = ["Joe", "Steve", "Wanda", "Jill", "Trey"],
    columns = list("abcde"),
)
people

Unnamed: 0,a,b,c,d,e
Joe,-1.988791,0.738247,0.911159,0.30613,1.137284
Steve,0.037886,0.515261,0.548994,1.173018,0.742976
Wanda,0.598955,-0.752222,1.567979,1.718285,0.237434
Jill,-0.197546,0.665066,-0.773768,0.652881,-0.292093
Trey,-0.918161,-0.574675,0.640393,1.308219,-0.084637


In [36]:
# Introduce missing values: row position 2 (Wanda), column positions 1 and 2
# (b and c) are set to NaN. This modifies people in place.
people.iloc[2:3, [1, 2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-1.988791,0.738247,0.911159,0.30613,1.137284
Steve,0.037886,0.515261,0.548994,1.173018,0.742976
Wanda,0.598955,,,1.718285,0.237434
Jill,-0.197546,0.665066,-0.773768,0.652881,-0.292093
Trey,-0.918161,-0.574675,0.640393,1.308219,-0.084637


In [40]:
# Column label -> group name. "f" matches no column of people; it is
# included to show that unused keys in the mapping are harmless.
mapping = {"a": "red", "b": "red", "c": "blue",
           "d": "blue", "e": "red", "f" : "orange"}
mapping

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [41]:
# Sum each person's values within the color groups defined by `mapping`.
# groupby(..., axis="columns") is deprecated (pandas >= 2.1), so the frame
# is transposed, its rows (the original columns) are grouped, and the
# aggregated result is transposed back to one row per person.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,1.217288,-0.11326
Steve,1.722012,1.296123
Wanda,1.718285,0.836389
Jill,-0.120887,0.175427
Trey,1.948612,-1.577473


In [42]:
# The same label -> group mapping expressed as a Series (index = column
# label, value = group name).
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [44]:
# Count the non-null values per person within each color group. The columns
# are grouped by transposing, grouping rows, and transposing back, because
# groupby(..., axis = 1) is deprecated (pandas >= 2.1).
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wanda,1,2
Jill,2,3
Trey,2,3


### Grouping with Functions

In [45]:
# Redisplay people for reference before grouping with functions.
people

Unnamed: 0,a,b,c,d,e
Joe,-1.988791,0.738247,0.911159,0.30613,1.137284
Steve,0.037886,0.515261,0.548994,1.173018,0.742976
Wanda,0.598955,,,1.718285,0.237434
Jill,-0.197546,0.665066,-0.773768,0.652881,-0.292093
Trey,-0.918161,-0.574675,0.640393,1.308219,-0.084637


In [46]:
# A function used as a key is called on each index label; here rows are
# grouped by the length of the person's name (3, 4 or 5 characters).
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-1.988791,0.738247,0.911159,0.30613,1.137284
4,-1.115707,0.09039,-0.133375,1.9611,-0.37673
5,0.636841,0.515261,0.548994,2.891303,0.98041


In [47]:
# One extra group label per row of people, matched by position.
key_list = ["one", "one", "one", "two", "two"]

In [48]:
# Functions can be mixed with lists as keys: group by (name length,
# key_list label) and take the column-wise minimum of each group.
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.988791,0.738247,0.911159,0.30613,1.137284
4,two,-0.918161,-0.574675,-0.773768,0.652881,-0.292093
5,one,0.037886,0.515261,0.548994,1.173018,0.237434


### Grouping by Index Levels

In [49]:
# Two-level column index: country ("cty") on the outer level and "tenor" on
# the inner level.
columns = pd.MultiIndex.from_arrays([["US", "US", "US", "JP", "JP"],
                                     [1, 3, 5, 1, 3]],
                                    names = ["cty", "tenor"])
columns

MultiIndex([('US', 1),
            ('US', 3),
            ('US', 5),
            ('JP', 1),
            ('JP', 3)],
           names=['cty', 'tenor'])

In [51]:
# 4 x 5 frame of random normal values under the hierarchical (cty, tenor)
# column index built above.
hier_df = pd.DataFrame(
    data = np.random.standard_normal(size = (4, 5)),
    columns = columns,
)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.053065,0.504816,-0.923734,2.13703,1.060918
1,1.001431,-1.139409,-1.479037,0.828989,-1.393544
2,0.205064,-0.64418,1.890185,1.058631,1.546093
3,-0.320384,1.97403,0.973298,-0.453654,0.941031


In [52]:
# Count the non-null values per row within each country, grouping on the
# "cty" level of the column index. The frame is transposed so the column
# levels become row levels, grouped, and transposed back, because
# groupby(..., axis = 1) is deprecated (pandas >= 2.1).
hier_df.T.groupby(level = "cty").count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## 10.2 Data Aggregation