- Split a pandas object into pieces using one or more keys (in the form of functions, arrays, or DataFrame column names)

- Calculate group summary statistics, like count, mean, or standard deviation, or a user-defined function

- Apply within-group transformations or other manipulations, like normalization, linear regression, rank, or subset selection

- Compute pivot tables and cross-tabulations

- Perform quantile analysis and other statistical group analyses

In [3]:
import pandas as pd
import numpy as np

# Two-level column index: the outer level is named "cty", the inner one "tenor".
columns = pd.MultiIndex.from_arrays(
    [
        ["US", "US", "US", "JP", "JP"],
        [1, 3, 5, 1, 3],
    ],
    names=["cty", "tenor"],
)

# 4 rows of standard-normal draws, one column per (cty, tenor) pair.
hief_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)

hief_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.034852,1.679543,-1.826537,2.004086,-1.311023
1,-0.433576,-1.773994,-0.278428,-0.8042,-0.138707
2,-0.687512,-1.316228,0.138486,-1.063413,-1.242186
3,2.528902,-0.407576,0.758973,1.27106,-0.199388


In [4]:
# Count the non-null entries per row for each value of the "cty" column level.
# groupby(..., axis=1) is deprecated since pandas 2.1, so group the transposed
# frame along its rows and transpose the result back to the original layout.
hief_df.T.groupby(level="cty").count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [5]:
# Two columns of 1000 standard-normal samples each.
frame = pd.DataFrame(
    {
        "data1": np.random.randn(1000),
        "data2": np.random.randn(1000),
    }
)

# Passing an integer to pd.cut splits the range of data1 into that many
# equal-width bins (equal width, not equal population, despite the name).
quartiles = pd.cut(frame["data1"], bins=4)

quartiles.head(10)

0     (0.171, 1.841]
1     (0.171, 1.841]
2    (-1.499, 0.171]
3    (-1.499, 0.171]
4     (0.171, 1.841]
5    (-1.499, 0.171]
6    (-1.499, 0.171]
7     (0.171, 1.841]
8    (-1.499, 0.171]
9     (0.171, 1.841]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.176, -1.499] < (-1.499, 0.171] < (0.171, 1.841] < (1.841, 3.512]]

In [10]:
def get_stats(group):
    """Return the min, max, count and mean of ``group`` as a dict keyed by statistic name."""
    stats = {}
    stats["min"] = group.min()
    stats["max"] = group.max()
    stats["count"] = group.count()
    stats["mean"] = group.mean()
    return stats

# Group data2 by the bin that the matching data1 value falls into.
# `quartiles` is categorical; observed=False is spelled out so the result keeps
# one row per bin regardless of the pandas default (which is being changed)
# and no FutureWarning is raised on recent versions.
grouped = frame.data2.groupby(quartiles, observed=False)

# apply() yields one dict of statistics per bin; unstack() pivots the
# statistic names into columns.
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.176, -1.499]",57.0,2.934282,-0.210856,-2.57519
"(-1.499, 0.171]",488.0,2.659131,0.063489,-2.627577
"(0.171, 1.841]",418.0,2.924808,0.019954,-2.445204
"(1.841, 3.512]",37.0,1.752489,-0.101989,-2.437732


In [11]:
# labels=False makes pd.cut return the integer bin number (0-9) for each
# value instead of an interval category.
grouping = pd.cut(frame["data1"], bins=10, labels=False)

# Group data2 by the bin number of the corresponding data1 value.
grouped = frame["data2"].groupby(grouping)

# One row per bin, one column per statistic.
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,3.0,0.752727,-0.410399,-2.301245
1,25.0,1.945392,-0.087562,-2.144291
2,69.0,2.934282,-0.231488,-2.57519
3,193.0,2.659131,0.100234,-2.627577
4,255.0,2.233886,0.074555,-2.487339
5,242.0,2.674451,-0.052123,-2.355731
6,142.0,2.924808,0.083643,-2.445204
7,50.0,1.868294,0.10243,-1.499299
8,18.0,1.752489,0.173798,-2.037353
9,3.0,0.967973,-0.982019,-2.437732


In [13]:
# Six standard-normal samples on the default integer index.
s = pd.Series(np.random.randn(6))

# Blank out every second element, starting with the first (positions 0, 2, 4).
s.iloc[::2] = np.nan

s

0         NaN
1    1.036027
2         NaN
3    0.835081
4         NaN
5   -0.680273
dtype: float64

In [14]:
# Replace each missing entry with the mean of the non-missing entries
# (returns a new Series; `s` itself is left unchanged).
s.fillna(value=s.mean())

0    0.396945
1    1.036027
2    0.396945
3    0.835081
4    0.396945
5   -0.680273
dtype: float64

In [15]:
states = [
    "Ohio",
    "New York",
    "Vermont",
    "Florida",
    "Oregon",
    "Nevada",
    "California",
    "Idaho",
]

# One region label per state, in the same order as `states`:
# the first four are "East", the last four are "West".
group_key = ["East"] * 4 + ["West"] * 4

# Eight standard-normal samples indexed by state name.
data = pd.Series(np.random.randn(8), index=states)

data

Ohio         -0.358950
New York      0.980727
Vermont      -0.131643
Florida       1.737017
Oregon       -1.146479
Nevada        1.554613
California   -0.234942
Idaho        -1.158991
dtype: float64

In [16]:
# Display the list of region labels built above (one label per entry of `data`).
group_key

['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']

In [17]:
# Mark three states as missing, selecting them by label.
# Note: this modifies `data` in place.
data.loc[["Vermont", "Nevada", "Idaho"]] = np.nan

data

Ohio         -0.358950
New York      0.980727
Vermont            NaN
Florida       1.737017
Oregon       -1.146479
Nevada             NaN
California   -0.234942
Idaho              NaN
dtype: float64

In [18]:
# Mean of the non-missing values within each region.
data.groupby(by=group_key).mean()

East    0.786264
West   -0.690711
dtype: float64

In [19]:
def fill_mean(g):
    """Return group ``g`` with its missing values replaced by the group's own mean."""
    return g.fillna(g.mean())


# group_keys=False keeps the result indexed by state name only. Since pandas 2.0
# apply() would otherwise prepend the group label as an extra index level.
data.groupby(group_key, group_keys=False).apply(fill_mean)

Ohio         -0.358950
New York      0.980727
Vermont       0.786264
Florida       1.737017
Oregon       -1.146479
Nevada       -0.690711
California   -0.234942
Idaho        -0.690711
dtype: float64

In [21]:
# Preset fill value for each group label.
fill_values = {"East": 0.5, "West": -1}


def fill_func(g):
    """Return group ``g`` with its missing values replaced by the preset value for its group."""
    # groupby.apply sets g.name to the key of the group being processed,
    # which is used here to look up that group's fill value.
    return g.fillna(fill_values[g.name])


# group_keys=False keeps the result indexed by state name only. Since pandas 2.0
# apply() would otherwise prepend the group label as an extra index level.
data.groupby(group_key, group_keys=False).apply(fill_func)

Ohio         -0.358950
New York      0.980727
Vermont       0.500000
Florida       1.737017
Oregon       -1.146479
Nevada       -1.000000
California   -0.234942
Idaho        -1.000000
dtype: float64

In [23]:
suits = ["H", "S", "C", "D"]

# Values for one suit are 1..10 followed by three 10s (for the three
# non-numeric names at the end of base_names), repeated once per suit.
card_val = (list(range(1, 11)) + [10] * 3) * 4

base_names = ["A"] + list(range(2, 11)) + ["J", "K", "Q"]
cards = []

# Iterate over `suits` instead of repeating the literal list, so the suit
# letters are defined in exactly one place. Card labels are name + suit,
# e.g. "AH", "10S".
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)

# Series mapping each card label to its value.
deck = pd.Series(card_val, index=cards)


In [28]:
def draw(deck, n=5, random_state=None):
    """Draw ``n`` entries at random from ``deck`` without replacement.

    Parameters
    ----------
    deck : pandas.Series
        The collection to draw from.
    n : int, default 5
        Number of entries to draw.
    random_state : int, numpy Generator/RandomState or None, default None
        Forwarded to ``deck.sample``; pass a fixed value to make the draw
        reproducible. ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    pandas.Series
        The ``n`` sampled entries, keeping their original index labels.
    """
    return deck.sample(n, random_state=random_state)

In [30]:
# Draw one random hand from the deck using draw's default size (n=5).
draw(deck)

4S     4
9C     9
7S     7
8C     8
QS    10
dtype: int64