- Split a pandas object into pieces using one or more keys (in the form of functions, arrays, or DataFrame column names)

- Calculate group summary statistics, like count, mean, or standard deviation, or a user-defined function

- Apply within-group transformations or other manipulations, like normalization, linear regression, rank, or subset selection

- Compute pivot tables and cross-tabulations

- Perform quantile analysis and other statistical group analyses

In [1]:
import pandas as pd
import numpy as np

columns=pd.MultiIndex.from_arrays([["US","US","US","JP","JP"],[1,3,5,1,3]],names=["cty","tenor"])

hief_df=pd.DataFrame(np.random.randn(4,5),columns=columns)

hief_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.30696,0.983776,0.028804,-0.853934,1.810119
1,-0.672569,0.2113,0.813884,0.539932,1.717877
2,0.069272,1.115117,-0.864187,0.291381,0.883679
3,0.905884,0.667191,-0.839634,-0.238647,1.529546


In [2]:
hief_df.groupby(level="cty",axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [3]:
frame=pd.DataFrame({"data1":np.random.randn(1000),
                   "data2":np.random.randn(1000)})

quartiles=pd.cut(frame.data1,4)

quartiles[:10]

0        (1.55, 3.178]
1    (-1.706, -0.0779]
2      (-0.0779, 1.55]
3      (-0.0779, 1.55]
4        (1.55, 3.178]
5      (-0.0779, 1.55]
6      (-0.0779, 1.55]
7    (-1.706, -0.0779]
8    (-1.706, -0.0779]
9      (-0.0779, 1.55]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.34, -1.706] < (-1.706, -0.0779] < (-0.0779, 1.55] < (1.55, 3.178]]

In [4]:
def get_stats(group):
    return {"min":group.min(),"max":group.max(),
           "count":group.count(),"mean":group.mean()}

grouped=frame.data2.groupby(quartiles)


grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.34, -1.706]",35.0,1.70265,-0.043932,-2.547749
"(-1.706, -0.0779]",455.0,2.769459,0.06382,-2.868367
"(-0.0779, 1.55]",455.0,3.450067,-0.02337,-3.351633
"(1.55, 3.178]",55.0,3.575772,0.196132,-2.509256


In [5]:
grouping=pd.cut(frame.data1,10,labels=False)

grouped=frame.data2.groupby(grouping)

grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,6.0,0.969593,0.189055,-0.723794
1,9.0,0.198752,-0.398892,-1.589243
2,57.0,1.70265,0.101608,-2.547749
3,197.0,2.496927,-0.002877,-2.868367
4,221.0,2.769459,0.111908,-2.738247
5,245.0,3.275601,-0.01079,-2.387747
6,162.0,3.450067,-0.012959,-3.351633
7,76.0,3.575772,-0.011891,-2.107632
8,22.0,1.308911,0.070041,-2.509256
9,5.0,2.3819,0.851943,-0.516965


In [6]:
s=pd.Series(np.random.randn(6))

s[::2]=np.nan

s

0         NaN
1   -0.192871
2         NaN
3    1.219973
4         NaN
5   -1.511377
dtype: float64

In [7]:
s.fillna(s.mean())

0   -0.161425
1   -0.192871
2   -0.161425
3    1.219973
4   -0.161425
5   -1.511377
dtype: float64

In [8]:
states=["Ohio","New York","Vermont","Florida","Oregon","Nevada","California","Idaho"]

group_key=["East"]*4+["West"]*4

data=pd.Series(np.random.randn(8),index=states)

data

Ohio          0.769611
New York      0.685061
Vermont      -0.999411
Florida       0.160398
Oregon       -0.634871
Nevada       -0.037831
California   -0.599503
Idaho        -0.264724
dtype: float64

In [9]:
group_key

['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']

In [10]:
data[["Vermont","Nevada","Idaho"]]=np.nan

data

Ohio          0.769611
New York      0.685061
Vermont            NaN
Florida       0.160398
Oregon       -0.634871
Nevada             NaN
California   -0.599503
Idaho              NaN
dtype: float64

In [11]:
data.groupby(group_key).mean()

East    0.538357
West   -0.617187
dtype: float64

In [12]:
fill_mean=lambda g:g.fillna(g.mean())

data.groupby(group_key).apply(fill_mean)

Ohio          0.769611
New York      0.685061
Vermont       0.538357
Florida       0.160398
Oregon       -0.634871
Nevada       -0.617187
California   -0.599503
Idaho        -0.617187
dtype: float64

In [13]:
fill_values={"East":0.5,"West":-1}


fill_func=lambda g:g.fillna(fill_values[g.name])


data.groupby(group_key).apply(fill_func)

Ohio          0.769611
New York      0.685061
Vermont       0.500000
Florida       0.160398
Oregon       -0.634871
Nevada       -1.000000
California   -0.599503
Idaho        -1.000000
dtype: float64

In [14]:
suits=["H","S","C","D"]
card_val=(list(range(1,11))+[10]*3)*4

base_names=["A"]+list(range(2,11))+["J","K","Q"]
cards=[]

for suit in ["H","S","C","D"]:
    cards.extend(str(num)+suit for num in base_names)

deck=pd.Series(card_val,index=cards)


In [15]:
def draw(deck,n=5):
    return deck.sample(n)

In [16]:
draw(deck)

10C    10
5H      5
8D      8
6H      6
10H    10
dtype: int64

In [17]:
get_suit=lambda card:card[-1]

In [18]:
deck.groupby(get_suit).apply(draw,n=2)

C  9C     9
   QC    10
D  2D     2
   KD    10
H  4H     4
   QH    10
S  2S     2
   QS    10
dtype: int64

In [19]:
deck.groupby(get_suit,group_keys=False).apply(draw,n=2)

AC    1
7C    7
2D    2
5D    5
6H    6
8H    8
3S    3
8S    8
dtype: int64

In [20]:
df=pd.DataFrame({"category":["a","a","a","a","b","b","b","b"],
                "data":np.random.randn(8),
                "weights":np.random.rand(8)})
df

Unnamed: 0,category,data,weights
0,a,-0.53438,0.641894
1,a,0.139894,0.112854
2,a,0.412425,0.262338
3,a,1.417842,0.948226
4,b,-0.35907,0.581653
5,b,1.037794,0.095024
6,b,-0.37966,0.836094
7,b,-0.646748,0.394461


In [21]:
grouped=df.groupby("category")

get_wavg=lambda g:np.average(g["data"],weights=g["weights"])

grouped.apply(get_wavg)

category
a    0.572633
b   -0.357999
dtype: float64

In [25]:
close_px=pd.read_csv("C;/Users/Administrator/pydata-book/ch09/stock_px.csv",parse_dates=True,index_col=0)

close_px.info()

FileNotFoundError: File b'C;//Users//Administrator//pydata-book//ch09//stock_px.csv' does not exist