In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

### GroupBy技术

In [2]:
df = DataFrame({"key1": ["a", "a", "b", "b", "a"],
               "key2": ["one", "two", "one", "two", "one"],
               "data1": np.random.randn(5),
               "data2": np.random.randn(5)})

In [3]:
df

Unnamed: 0,data1,data2,key1,key2
0,-0.68715,-1.774263,a,one
1,-0.434054,-0.405102,a,two
2,1.398337,-0.782773,b,one
3,-0.931734,-1.292521,b,two
4,-0.684119,-2.272806,a,one


In [4]:
# 根据key1分组，并计算data1的平均值
grouped = df["data1"].groupby(df["key1"])

In [5]:
grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x1083a94e0>

In [6]:
# 计算算术平均值
grouped.mean()

key1
a   -0.601775
b    0.233301
Name: data1, dtype: float64

In [7]:
means= df["data1"].groupby([df["key1"], df["key2"]]).mean()

In [8]:
means

key1  key2
a     one    -0.685635
      two    -0.434054
b     one     1.398337
      two    -0.931734
Name: data1, dtype: float64

In [9]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.685635,-0.434054
b,1.398337,-0.931734


In [10]:
# 分区键可以是任何长度适当的数组
states = np.array(["Ohio", "California", "California", "Ohio", "Ohio"])

In [11]:
years = np.array([2005, 2005, 2006, 2005, 2006])

In [12]:
df["data1"].groupby([states, years]).mean()

California  2005   -0.434054
            2006    1.398337
Ohio        2005   -0.809442
            2006   -0.684119
Name: data1, dtype: float64

In [13]:
# 将列名（可以是字符串、数字或其他Python对象）用作分组键
df.groupby("key1").mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.601775,-1.484057
b,0.233301,-1.037647


In [14]:
df.groupby(["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.685635,-2.023534
a,two,-0.434054,-0.405102
b,one,1.398337,-0.782773
b,two,-0.931734,-1.292521


In [15]:
df.groupby(["key1", "key2"]).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

#### 1.对分组进行迭代

In [16]:
# GroupBy对象支持迭代，可以产生一组二元元组（由分组名和数据库组成）
for name, group in df.groupby("key1"):
    print(name)
    print(group)

a
      data1     data2 key1 key2
0 -0.687150 -1.774263    a  one
1 -0.434054 -0.405102    a  two
4 -0.684119 -2.272806    a  one
b
      data1     data2 key1 key2
2  1.398337 -0.782773    b  one
3 -0.931734 -1.292521    b  two


In [17]:
# 对于多重键的情况，元组的第一个元素将会是由键值组成的元组
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print(k1, k2)
    print(group)

a one
      data1     data2 key1 key2
0 -0.687150 -1.774263    a  one
4 -0.684119 -2.272806    a  one
a two
      data1     data2 key1 key2
1 -0.434054 -0.405102    a  two
b one
      data1     data2 key1 key2
2  1.398337 -0.782773    b  one
b two
      data1     data2 key1 key2
3 -0.931734 -1.292521    b  two


In [18]:
# 将数据片段做成一个字典
pieces = dict(list(df.groupby("key1")))

In [19]:
pieces["b"]

Unnamed: 0,data1,data2,key1,key2
2,1.398337,-0.782773,b,one
3,-0.931734,-1.292521,b,two


In [20]:
# groupby默认是在axis=0上进行分组的，通过设置也可以在其他任何轴上进行分组
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [21]:
grouped = df.groupby(df.dtypes, axis=1)

In [22]:
dict(list(grouped))

{dtype('float64'):       data1     data2
 0 -0.687150 -1.774263
 1 -0.434054 -0.405102
 2  1.398337 -0.782773
 3 -0.931734 -1.292521
 4 -0.684119 -2.272806, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

#### 2.选取一个或一组列

In [23]:
df.groupby("key1")["data1"]
df.groupby("key1")[["data2"]]

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x1083dca90>

In [24]:
# 代码的语法糖
df["data1"].groupby(df["key1"])
df[["data2"]].groupby(df["key1"])

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x1083dcdd8>

In [25]:
# 计算data2列的平均值并以DataFrame形式得到结果
df.groupby(["key1", "key2"])[["data2"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-2.023534
a,two,-0.405102
b,one,-0.782773
b,two,-1.292521


In [26]:
s_grouped = df.groupby(["key1", "key2"])["data2"]

In [27]:
s_grouped

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x1083dcc50>

In [28]:
s_grouped.mean()

key1  key2
a     one    -2.023534
      two    -0.405102
b     one    -0.782773
      two    -1.292521
Name: data2, dtype: float64

#### 3.通过字典或Series进行分组

In [29]:
people = DataFrame(np.random.randn(5, 5),
                  columns=["a", "b", "c", "d", "e"],
                  index=["Joe", "Steve", "Wes", "Jim", "Travis"])

In [30]:
# 添加几个NA值
people.loc[2:3, ["b", "c"]] = np.nan

In [31]:
people

Unnamed: 0,a,b,c,d,e
Joe,1.891973,-0.700203,-2.878941,-0.749868,-0.563877
Steve,-0.589142,-1.080077,0.817692,0.96449,-1.200756
Wes,-0.349806,,,-1.397491,1.620202
Jim,0.948535,-0.294875,-0.514823,-0.373288,0.360737
Travis,0.021419,0.396398,-0.818838,-0.533214,-0.616636


In [32]:
mapping = {"a": "red", "b": "red", "c": "blue",
          "d": "blue", "e": "red", "f": "orange"}

In [33]:
by_column = people.groupby(mapping, axis=1)

In [34]:
by_column.sum()

Unnamed: 0,blue,red
Joe,-3.628809,0.627893
Steve,1.782182,-2.869975
Wes,-1.397491,1.270396
Jim,-0.888111,1.014397
Travis,-1.352052,-0.198819


In [35]:
map_series = Series(mapping)

In [36]:
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [37]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


#### 4.通过函数进行分组

In [38]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,2.490702,-0.995078,-3.393764,-2.520648,1.417062
5,-0.589142,-1.080077,0.817692,0.96449,-1.200756
6,0.021419,0.396398,-0.818838,-0.533214,-0.616636


In [39]:
# 将函数跟数组、列表、字典、Series混合使用也不是问题，因为任何东西最终都会被转换为数组
key_list = ["one", "one", "one", "two", "two"]

In [40]:
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.349806,-0.700203,-2.878941,-1.397491,-0.563877
3,two,0.948535,-0.294875,-0.514823,-0.373288,0.360737
5,one,-0.589142,-1.080077,0.817692,0.96449,-1.200756
6,two,0.021419,0.396398,-0.818838,-0.533214,-0.616636


#### 5.根据索引级别分组

In [41]:
columns = pd.MultiIndex.from_arrays([["US", "US", "US", "JP", "JP"],
                                    [1, 3, 5, 1, 3]], names=["city", "tenor"])

In [42]:
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)

In [43]:
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.365825,-1.02316,-1.95938,-0.286463,-1.013048
1,1.309183,0.601422,0.634056,-1.065313,-0.44558
2,-0.69103,1.889217,0.390313,0.584518,1.603914
3,0.043819,-0.190291,0.915312,0.368626,0.968169


In [44]:
hier_df.groupby(level="city", axis=1).count()

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


### 数据聚合

In [45]:
df

Unnamed: 0,data1,data2,key1,key2
0,-0.68715,-1.774263,a,one
1,-0.434054,-0.405102,a,two
2,1.398337,-0.782773,b,one
3,-0.931734,-1.292521,b,two
4,-0.684119,-2.272806,a,one


In [46]:
grouped = df.groupby("key1")

In [47]:
# GroupBy会高效地对Series进行切片，然后对各片调用piece.quantitle(0.9)，最后将这些结果组装成最终结果。
grouped["data1"].quantile(0.9)

key1
a   -0.484067
b    1.165330
Name: data1, dtype: float64

In [48]:
def peak_to_peak(arr):
    return arr.max() - arr.min()

In [49]:
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.253097,1.867704
b,2.330071,0.509749


In [50]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.601775,0.145258,-0.68715,-0.685635,-0.684119,-0.559087,-0.434054,3.0,-1.484057,0.96708,-2.272806,-2.023534,-1.774263,-1.089682,-0.405102
b,2.0,0.233301,1.647609,-0.931734,-0.349216,0.233301,0.815819,1.398337,2.0,-1.037647,0.360447,-1.292521,-1.165084,-1.037647,-0.91021,-0.782773


In [52]:
tips = pd.read_csv("tips.csv")

In [53]:
# 添加“小费占总金额百分比”的列
tips["tip_pct"] = tips["tip"] / tips["total_bill"]

In [54]:
tips[:6]

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.18624


#### 1.面向列的多函数应用

In [56]:
grouped = tips.groupby(["day", "smoker"])

In [58]:
grouped_pct = grouped["tip_pct"]

In [59]:
grouped_pct.agg("mean")

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [60]:
grouped_pct.agg(["mean", "std", peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [61]:
grouped_pct.agg([("foo", "mean"), ("bar", np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [62]:
functions = ["count", "mean", "max"]

In [63]:
result = grouped["tip_pct", "total_bill"].agg(functions)

In [64]:
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [65]:
result["tip_pct"]

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [66]:
# 传入自定义名称的元组列表
ftuples = [("Durchschnitt", "mean"), ("Abweichung", np.var)]

In [67]:
grouped["tip_pct", "total_bill"].agg(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [68]:
# 对不同的列应用不同的函数
grouped.agg({"tip": np.max, "size": "sum"})

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,9,3.5
Fri,Yes,31,4.73
Sat,No,115,9.0
Sat,Yes,104,10.0
Sun,No,167,6.0
Sun,Yes,49,6.5
Thur,No,112,6.7
Thur,Yes,40,5.0


In [69]:
grouped.agg({"tip_pct": ["min", "max", "mean", "std"],
            "size": "sum"})

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip_pct,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,min,max,mean,std
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Fri,No,9,0.120385,0.187735,0.15165,0.028123
Fri,Yes,31,0.103555,0.26348,0.174783,0.051293
Sat,No,115,0.056797,0.29199,0.158048,0.039767
Sat,Yes,104,0.035638,0.325733,0.147906,0.061375
Sun,No,167,0.059447,0.252672,0.160113,0.042347
Sun,Yes,49,0.06566,0.710345,0.18725,0.154134
Thur,No,112,0.072961,0.266312,0.160298,0.038774
Thur,Yes,40,0.090014,0.241255,0.163863,0.039389


#### 2.以“无索引”的形式返回聚合数据

In [70]:
tips.groupby(["day", "smoker"], as_index=False).mean()

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Fri,No,18.42,2.8125,2.25,0.15165
1,Fri,Yes,16.813333,2.714,2.066667,0.174783
2,Sat,No,19.661778,3.102889,2.555556,0.158048
3,Sat,Yes,21.276667,2.875476,2.47619,0.147906
4,Sun,No,20.506667,3.167895,2.929825,0.160113
5,Sun,Yes,24.12,3.516842,2.578947,0.18725
6,Thur,No,17.113111,2.673778,2.488889,0.160298
7,Thur,Yes,19.190588,3.03,2.352941,0.163863


### 分组运算和转换

In [71]:
df

Unnamed: 0,data1,data2,key1,key2
0,-0.68715,-1.774263,a,one
1,-0.434054,-0.405102,a,two
2,1.398337,-0.782773,b,one
3,-0.931734,-1.292521,b,two
4,-0.684119,-2.272806,a,one


In [72]:
k1_means = df.groupby("key1").mean().add_prefix("mean_")

In [73]:
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.601775,-1.484057
b,0.233301,-1.037647


In [74]:
pd.merge(df, k1_means, left_on="key1", right_index=True)

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,-0.68715,-1.774263,a,one,-0.601775,-1.484057
1,-0.434054,-0.405102,a,two,-0.601775,-1.484057
4,-0.684119,-2.272806,a,one,-0.601775,-1.484057
2,1.398337,-0.782773,b,one,0.233301,-1.037647
3,-0.931734,-1.292521,b,two,0.233301,-1.037647


In [75]:
key = ["one", "two", "one", "two", "one"]

In [76]:
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,0.521195,-0.151902,-1.848889,-0.893525,0.146563
two,0.179697,-0.687476,0.151435,0.295601,-0.42001


In [77]:
people.groupby(key).transform(np.mean)

Unnamed: 0,a,b,c,d,e
Joe,0.521195,-0.151902,-1.848889,-0.893525,0.146563
Steve,0.179697,-0.687476,0.151435,0.295601,-0.42001
Wes,0.521195,-0.151902,-1.848889,-0.893525,0.146563
Jim,0.179697,-0.687476,0.151435,0.295601,-0.42001
Travis,0.521195,-0.151902,-1.848889,-0.893525,0.146563


In [78]:
def demean(arr):
    return arr - arr.mean()

In [79]:
demeaned = people.groupby(key).transform(demean)

In [80]:
demeaned

Unnamed: 0,a,b,c,d,e
Joe,1.370778,-0.5483,-1.030051,0.143656,-0.71044
Steve,-0.768839,-0.392601,0.666258,0.668889,-0.780746
Wes,-0.871001,,,-0.503966,1.473639
Jim,0.768839,0.392601,-0.666258,-0.668889,0.780746
Travis,-0.499776,0.5483,1.030051,0.36031,-0.763199


In [81]:
demeaned.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,3.700743e-17,0.0,0.0,-3.700743e-17,3.700743e-17
two,0.0,2.775558e-17,0.0,0.0,0.0


#### 1.apply：一般性的“拆分-应用-合并”

In [84]:
def top(df, n=5, column="tip_pct"):
    return df.sort_values(by=column)[-n:]

In [85]:
top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [86]:
tips.groupby("smoker").apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [87]:
tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


In [88]:
result = tips.groupby("smoker")["tip_pct"].describe()

In [89]:
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [90]:
result.unstack("smoker")

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

#### 2.禁止分组键

In [91]:
# 将group_keys=False传入groupby即可禁止分组键
tips.groupby("smoker", group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


#### 3.分位数和桶分析

In [92]:
frame = DataFrame({"data1": np.random.randn(1000),
                  "data2": np.random.randn(1000)})

In [93]:
factor = pd.cut(frame.data1, 4)

In [94]:
factor[:10]

0     (0.0352, 1.796]
1    (-1.726, 0.0352]
2    (-3.493, -1.726]
3    (-1.726, 0.0352]
4     (0.0352, 1.796]
5    (-1.726, 0.0352]
6    (-1.726, 0.0352]
7    (-1.726, 0.0352]
8     (0.0352, 1.796]
9     (0.0352, 1.796]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.493, -1.726] < (-1.726, 0.0352] < (0.0352, 1.796] < (1.796, 3.557]]

In [95]:
def get_stats(group):
    return {"min": group.min(), "max": group.max(),
           "count": group.count(), "mean": group.mean()}

In [96]:
grouped = frame.data2.groupby(factor)

In [97]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.493, -1.726]",33.0,2.347357,-0.209854,-2.145654
"(-1.726, 0.0352]",476.0,3.468942,0.010259,-3.097345
"(0.0352, 1.796]",461.0,2.904927,0.014811,-3.271628
"(1.796, 3.557]",30.0,2.760595,0.384066,-1.377826


In [98]:
# 返回分位数编号
grouping = pd.qcut(frame.data1, 10, labels=False)

In [99]:
grouped = frame.data2.groupby(grouping)

In [100]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,2.433572,0.052637,-3.097345
1,100.0,2.156158,-0.192324,-2.241774
2,100.0,3.468942,0.025494,-2.036837
3,100.0,3.032014,0.012069,-1.832242
4,100.0,2.657701,0.059396,-2.037194
5,100.0,2.369096,0.036094,-2.344102
6,100.0,2.162911,0.08481,-2.169351
7,100.0,2.904927,-0.107136,-3.122783
8,100.0,2.104799,-0.020859,-3.271628
9,100.0,2.833465,0.212898,-2.193501


#### 4.示例：用特定分组的值填充缺失值

In [101]:
s = Series(np.random.randn(6))

In [102]:
s[::2] = np.nan

In [103]:
s

0         NaN
1    0.513820
2         NaN
3    1.544453
4         NaN
5    0.537672
dtype: float64

In [105]:
# 用平均值填充NA值
s.fillna(s.mean())

0    0.865315
1    0.513820
2    0.865315
3    1.544453
4    0.865315
5    0.537672
dtype: float64

In [106]:
# 对不同的分组填充不同的值
states = ["Ohio", "New York", "Vermont", "Florida",
         "Oregon", "Nevada", "California", "Idaho"]
group_key = ["East"] * 4 + ["West"] * 4

In [107]:
data = Series(np.random.randn(8), index=states)

In [108]:
data[["Vermont", "Nevada", "Idaho"]] = np.nan

In [109]:
data

Ohio         -0.520150
New York      0.984346
Vermont            NaN
Florida      -2.709375
Oregon       -0.250345
Nevada             NaN
California   -0.075430
Idaho              NaN
dtype: float64

In [110]:
data.groupby(group_key).mean()

East   -0.748393
West   -0.162888
dtype: float64

In [112]:
# 用平均值去填充NA值
fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)

Ohio         -0.520150
New York      0.984346
Vermont      -0.748393
Florida      -2.709375
Oregon       -0.250345
Nevada       -0.162888
California   -0.075430
Idaho        -0.162888
dtype: float64

In [113]:
fill_values = {"East": 0.5, "West": -1}

In [114]:
fill_func = lambda g: g.fillna(fill_values[g.name])

In [115]:
data.groupby(group_key).apply(fill_func)

Ohio         -0.520150
New York      0.984346
Vermont       0.500000
Florida      -2.709375
Oregon       -0.250345
Nevada       -1.000000
California   -0.075430
Idaho        -1.000000
dtype: float64

#### 5.示例：随机采样和排列

In [119]:
# 红桃（Hearts）、黑桃（Spades）、梅花（Clubs）、方片（Diamonds）
suits = ["H", "S", "C", "D"]
card_val = ([i for i in range(1, 11)] + [10]*3) * 4
base_names = ["A"] + [i for i in range(2, 11)] + ["J", "K", "Q"]
cards = []
for suit in ["H", "S", "C", "D"]:
    cards.extend(str(num) + suit for num in base_names)
deck = Series(card_val, index=cards)

In [120]:
deck[:13]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [121]:
# 从整副牌中抽取5张
def draw(deck, n=5):
    return deck.take(np.random.permutation(len(deck))[:n])

In [122]:
draw(deck)

QD     10
10C    10
JH     10
7D      7
JD     10
dtype: int64

In [123]:
# 从每种花色中随机抽取两张牌
get_suit = lambda card: card[-1]
deck.groupby(get_suit).apply(draw, 2)

C  6C      6
   8C      8
D  JD     10
   8D      8
H  3H      3
   10H    10
S  JS     10
   KS     10
dtype: int64

In [124]:
# 从每种花色中随机抽取两张牌的另一种实现方法
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

3C     3
9C     9
9D     9
KD    10
2H     2
9H     9
QS    10
2S     2
dtype: int64

#### 6.示例：分组加权平均数和相关系数

In [125]:
df = DataFrame({"category": ["a", "a", "a", "a", "b", "b", "b", "b"],
                "data": np.random.randn(8),
                "weights": np.random.rand(8)})

In [126]:
df

Unnamed: 0,category,data,weights
0,a,-0.026247,0.534807
1,a,-0.776326,0.585366
2,a,-1.792602,0.352003
3,a,-0.768951,0.219337
4,b,1.238022,0.82244
5,b,0.141418,0.506482
6,b,0.302332,0.439672
7,b,-0.6479,0.258767


In [127]:
# 利用category计算分组加权平均数
grouped = df.groupby("category")

In [128]:
get_wavg = lambda g: np.average(g["data"], weights=g["weights"])

In [129]:
grouped.apply(get_wavg)

category
a   -0.749703
b    0.520429
dtype: float64

In [130]:
# 查看标准普尔500指数（SPX字段）
close_px = pd.read_csv("stock_px.csv", parse_dates=True, index_col=0)

In [131]:
close_px

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990-02-01,4.98,7.86,2.87,16.79,4.27,0.51,6.04,328.79,6.12
1990-02-02,5.04,8.00,2.87,16.89,4.37,0.51,6.09,330.92,6.24
1990-02-05,5.07,8.18,2.87,17.32,4.34,0.51,6.05,331.85,6.25
1990-02-06,5.01,8.12,2.88,17.56,4.32,0.51,6.15,329.66,6.23
1990-02-07,5.04,7.77,2.91,17.93,4.38,0.51,6.17,333.75,6.33
1990-02-08,5.04,7.71,2.92,17.86,4.46,0.51,6.22,332.96,6.35
1990-02-09,5.06,8.00,2.94,17.82,4.49,0.52,6.24,333.62,6.37
1990-02-12,4.96,7.94,2.89,17.58,4.46,0.52,6.23,330.08,6.22
1990-02-13,4.91,8.06,2.88,17.95,4.43,0.52,6.09,331.02,6.23
1990-02-14,4.94,8.00,2.89,18.04,4.47,0.52,6.10,332.01,6.20


In [132]:
close_px[-4:]

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
2011-10-11,10.3,400.29,16.14,185.0,63.96,27.0,60.95,1195.54,76.27
2011-10-12,10.05,402.19,16.4,186.12,64.33,26.96,62.7,1207.25,77.16
2011-10-13,10.1,408.43,16.22,186.82,64.23,27.18,62.36,1203.66,76.37
2011-10-14,10.26,422.0,16.6,190.53,64.72,27.27,62.24,1224.58,78.11


In [133]:
# 计算一个由日收益率（通过百分数变化计算）与SPX之间的年度相关系数组成的DataFrame。
rets = close_px.pct_change().dropna()

In [134]:
spx_corr = lambda x: x.corrwith(x["SPX"])

In [135]:
by_year = rets.groupby(lambda x: x.year)

In [136]:
by_year.apply(spx_corr)

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990,0.595024,0.545067,0.752187,0.738361,0.801145,0.586691,0.783168,1.0,0.517586
1991,0.453574,0.365315,0.759607,0.557046,0.646401,0.524225,0.641775,1.0,0.569335
1992,0.39818,0.498732,0.632685,0.262232,0.51574,0.492345,0.473871,1.0,0.318408
1993,0.259069,0.238578,0.447257,0.211269,0.451503,0.425377,0.385089,1.0,0.318952
1994,0.428549,0.26842,0.572996,0.385162,0.372962,0.436585,0.450516,1.0,0.395078
1995,0.291532,0.161829,0.519126,0.41639,0.315733,0.45366,0.413144,1.0,0.368752
1996,0.292344,0.191482,0.750724,0.388497,0.569232,0.564015,0.421477,1.0,0.538736
1997,0.564427,0.211435,0.827512,0.646823,0.703538,0.606171,0.509344,1.0,0.695653
1998,0.533802,0.379883,0.815243,0.623982,0.591988,0.698773,0.494213,1.0,0.369264
1999,0.099033,0.425584,0.710928,0.486167,0.517061,0.631315,0.336593,1.0,0.315383


In [138]:
by_year.apply(lambda g: g["AAPL"].corr(g["MSFT"]))

1990    0.408271
1991    0.266807
1992    0.450592
1993    0.236917
1994    0.361638
1995    0.258642
1996    0.147539
1997    0.196144
1998    0.364106
1999    0.329484
2000    0.275298
2001    0.563156
2002    0.571435
2003    0.486262
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

#### 7.示例：面向分组的线性回归

In [145]:
# 对各数据块执行普通最小二乘法（Ordinary Least Squares, OLS）回归
import statsmodels.api as sm
def regress(data, yvar, xvars):
    Y = data[yvar]
    X = data[xvars]
    X["interecpt"] = 1
    result = sm.OLS(Y, X).fit()
    return result.params

In [146]:
by_year.apply(regress, "AAPL", ["SPX"])

Unnamed: 0,SPX,interecpt
1990,1.512772,0.001395
1991,1.187351,0.000396
1992,1.832427,0.000164
1993,1.39047,-0.002657
1994,1.190277,0.001617
1995,0.858818,-0.001423
1996,0.829389,-0.001791
1997,0.749928,-0.001901
1998,1.164582,0.004075
1999,1.384989,0.003273


### 透视表和交叉表

In [148]:
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.50,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.139780
4,24.59,3.61,No,Sun,Dinner,4,0.146808
5,25.29,4.71,No,Sun,Dinner,4,0.186240
6,8.77,2.00,No,Sun,Dinner,2,0.228050
7,26.88,3.12,No,Sun,Dinner,4,0.116071
8,15.04,1.96,No,Sun,Dinner,2,0.130319
9,14.78,3.23,No,Sun,Dinner,2,0.218539


In [154]:
# 透视表
tips.pivot_table(columns=["smoker"])

smoker,No,Yes
size,2.668874,2.408602
tip,2.991854,3.00871
tip_pct,0.159328,0.163196
total_bill,19.188278,20.756344


### 示例：2012联邦选举委员会数据库


In [155]:
fec = pd.read_csv("./datasets/fec/P00000001-ALL.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [156]:
fec

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num
0,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,3.6601e+08,RETIRED,RETIRED,250.0,20-JUN-11,,,,SA17A,736166
1,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,3.6601e+08,RETIRED,RETIRED,50.0,23-JUN-11,,,,SA17A,736166
2,C00410118,P20002978,"Bachmann, Michelle","SMITH, LANIER",LANETT,AL,3.68633e+08,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,05-JUL-11,,,,SA17A,749073
3,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,7.24548e+08,NONE,RETIRED,250.0,01-AUG-11,,,,SA17A,749073
4,C00410118,P20002978,"Bachmann, Michelle","WARDENBURG, HAROLD",HOT SPRINGS NATION,AR,7.19016e+08,NONE,RETIRED,300.0,20-JUN-11,,,,SA17A,736166
5,C00410118,P20002978,"Bachmann, Michelle","BECKMAN, JAMES",SPRINGDALE,AR,7.27647e+08,NONE,RETIRED,500.0,23-JUN-11,,,,SA17A,736166
6,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,7.24548e+08,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,21-JUN-11,,,,SA17A,736166
7,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,7.24548e+08,NONE,RETIRED,250.0,05-JUL-11,,,,SA17A,749073
8,C00410118,P20002978,"Bachmann, Michelle","COLLINS, SARAH",MESA,AZ,8.52107e+08,ST. JOSEPH HOSPITAL,RN,250.0,21-JUN-11,,,,SA17A,736166
9,C00410118,P20002978,"Bachmann, Michelle","COLEMAN, RONALD",TUCSON,AZ,8.57499e+08,RAYTHEON,ELECTRICAL ENGINEER,250.0,20-JUN-11,,,,SA17A,736166


In [157]:
fec.loc[123456]

cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                          50
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object

In [158]:
unique_cands = fec.cand_nm.unique()

In [159]:
unique_cands

array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
       "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
       'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
       'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
       'Huntsman, Jon', 'Perry, Rick'], dtype=object)

In [160]:
unique_cands[2]

'Obama, Barack'

In [161]:
# 利用字典说明党派关系
parties = {
    "Bachmann, Michelle": "Republican",
    "Obama, Barack": "Democrat"
}

In [162]:
fec.cand_nm[123456:123461]

123456    Obama, Barack
123457    Obama, Barack
123458    Obama, Barack
123459    Obama, Barack
123460    Obama, Barack
Name: cand_nm, dtype: object

In [163]:
fec.cand_nm[123456:123461].map(parties)

123456    Democrat
123457    Democrat
123458    Democrat
123459    Democrat
123460    Democrat
Name: cand_nm, dtype: object

In [164]:
# 将其添加为一个新列
fec["party"] = fec.cand_nm.map(parties)

In [165]:
fec["party"].value_counts()

Democrat      593746
Republican     13140
Name: party, dtype: int64

In [166]:
(fec.contb_receipt_amt > 0).value_counts()

True     991475
False     10256
Name: contb_receipt_amt, dtype: int64

In [167]:
# 数据集只能有正的出资额
fec = fec[fec.contb_receipt_amt > 0]

In [168]:
fec

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num,party
0,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,3.6601e+08,RETIRED,RETIRED,250.0,20-JUN-11,,,,SA17A,736166,Republican
1,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,3.6601e+08,RETIRED,RETIRED,50.0,23-JUN-11,,,,SA17A,736166,Republican
2,C00410118,P20002978,"Bachmann, Michelle","SMITH, LANIER",LANETT,AL,3.68633e+08,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,05-JUL-11,,,,SA17A,749073,Republican
3,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,7.24548e+08,NONE,RETIRED,250.0,01-AUG-11,,,,SA17A,749073,Republican
4,C00410118,P20002978,"Bachmann, Michelle","WARDENBURG, HAROLD",HOT SPRINGS NATION,AR,7.19016e+08,NONE,RETIRED,300.0,20-JUN-11,,,,SA17A,736166,Republican
5,C00410118,P20002978,"Bachmann, Michelle","BECKMAN, JAMES",SPRINGDALE,AR,7.27647e+08,NONE,RETIRED,500.0,23-JUN-11,,,,SA17A,736166,Republican
6,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,7.24548e+08,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,21-JUN-11,,,,SA17A,736166,Republican
7,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,7.24548e+08,NONE,RETIRED,250.0,05-JUL-11,,,,SA17A,749073,Republican
8,C00410118,P20002978,"Bachmann, Michelle","COLLINS, SARAH",MESA,AZ,8.52107e+08,ST. JOSEPH HOSPITAL,RN,250.0,21-JUN-11,,,,SA17A,736166,Republican
9,C00410118,P20002978,"Bachmann, Michelle","COLEMAN, RONALD",TUCSON,AZ,8.57499e+08,RAYTHEON,ELECTRICAL ENGINEER,250.0,20-JUN-11,,,,SA17A,736166,Republican


In [169]:
fec_mrbo = fec[fec.cand_nm.isin(["Obama, Barack", "Romney, Mitt"])]

In [170]:
fec_mrbo

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num,party
411,C00431171,P80003353,"Romney, Mitt","ELDERBAUM, WILLIAM",DPO,AA,3.4023e+08,US GOVERNMENT,FOREIGN SERVICE OFFICER,25.0,01-FEB-12,,,,SA17A,780124,
412,C00431171,P80003353,"Romney, Mitt","ELDERBAUM, WILLIAM",DPO,AA,3.4023e+08,US GOVERNMENT,FOREIGN SERVICE OFFICER,110.0,01-FEB-12,,,,SA17A,780124,
413,C00431171,P80003353,"Romney, Mitt","CARLSEN, RICHARD",APO,AE,9.128e+07,DEFENSE INTELLIGENCE AGENCY,INTELLIGENCE ANALYST,250.0,13-APR-12,,,,SA17A,785689,
414,C00431171,P80003353,"Romney, Mitt","DELUCA, PIERRE",APO,AE,9.128e+07,CISCO,ENGINEER,30.0,21-AUG-11,,,,SA17A,760261,
415,C00431171,P80003353,"Romney, Mitt","SARGENT, MICHAEL",APO,AE,9.01201e+07,RAYTHEON TECHNICAL SERVICES CORP,COMPUTER SYSTEMS ENGINEER,100.0,07-MAR-12,,,,SA17A,780128,
416,C00431171,P80003353,"Romney, Mitt","WILSON, ANDREW C. MR.",DPO,AE,9.87e+07,US DEPT OF STATE,FOREIGN SERVICE OFFICER,50.0,17-MAR-12,,,,SA17A,780128,
417,C00431171,P80003353,"Romney, Mitt","GRIFFIS, JOHN",APO,AE,9.128e+07,US ARMY,MILITARY OFFICER,250.0,20-MAR-12,,,,SA17A,780128,
418,C00431171,P80003353,"Romney, Mitt","SARGENT, MICHAEL",APO,AE,9.01201e+07,RAYTHEON TECHNICAL SERVICES CORP,COMPUTER SYSTEMS ENGINEER,100.0,09-APR-12,,,,SA17A,785689,
419,C00431171,P80003353,"Romney, Mitt","GRIFFIS, JOHN",APO,AE,9.128e+07,US ARMY,MILITARY OFFICER,250.0,28-JAN-12,,,,SA17A,771933,
420,C00431171,P80003353,"Romney, Mitt","DELUCA, PIERRE MR.",APO,AE,9.128e+07,US ARMY,ENGINEER,50.0,01-FEB-12,,,,SA17A,780124,


#### 1.根据职业和雇主统计赞助信息

In [171]:
# 基于职业的赞助信息统计是另一种经常被研究的统计任务。
fec.contbr_occupation.value_counts()[:10]

RETIRED                                   233990
INFORMATION REQUESTED                      35107
ATTORNEY                                   34286
HOMEMAKER                                  29931
PHYSICIAN                                  23432
INFORMATION REQUESTED PER BEST EFFORTS     21138
ENGINEER                                   14334
TEACHER                                    13990
CONSULTANT                                 13273
PROFESSOR                                  12555
Name: contbr_occupation, dtype: int64

In [173]:
occ_mapping = {
    "INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
    "INFORMATION REQUESTED": "NOT PROVIDED",
    "INFORMATION REQUESTED (BEST EFFORTS)":  "NOT PROVIDED",
    "C.E.O.": "CEO"
}

In [174]:
# 如果没有提供相关映射，则返回x
f = lambda x: occ_mapping.get(x, x)
fec.contbr_occupation = fec.contbr_occupation.map(f)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [175]:
emp_mapping = {
    "INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
    "INFORMATION REQUESTED": "NOT PROVIDED",
    "SELF": "SELF-EMPLOYED",
    "SELF EMPLOYED": "SELF_EMPLOYED",
}

In [176]:
f = lambda x: emp_mapping.get(x, x)
fec.contbr_employer = fec.contbr_employer.map(f)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
