In [5]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

split-apply-combine模式：
1. 数据集按照key的方式分成小的数据片
2. 对每一数据片进行操作
3. 将结果在组合起来

![](./img/sac.jpg)
以上模式实际和MapReduce很象，MapReduce是把在单一计算机无法处理的大数据集变成分布到多个系统上小数据集，然后在每个系统进行计算，之后每个分布系统上结果在组合起来。

[The Split-Apply-Combine Strategy for Data Analysis](http://www.jstatsoft.org/v40/i01/paper)

+ 分组
+ 分组数据统计
+ Matplotlib以及pandas数据可视化初步



### groupby 基础

In [3]:
url="https://en.wikipedia.org/wiki/List_of_European_Cup_and_UEFA_Champions_League_finals"
eu_champions=pd.read_html(url)

In [6]:
eu_champions

[                                      0                               1
 0  European Cup/Champions League trophy                             NaN
 1                               Founded                            1955
 2                                Region                   Europe (UEFA)
 3                       Number of teams  32 (group stage) 2 (finalists)
 4                     Current champions        Real Madrid (11th title)
 5               Most successful club(s)         Real Madrid (11 titles)
 6         2016–17 UEFA Champions League                             NaN,
      0                                     1
 0  NaN       Match was won during extra time
 1    *  Match was won on a penalty shoot-out
 2    &          Match was won after a replay,
           0       1            2        3                4       5  \
 0    Season  Nation      Winners    Score       Runners-up  Nation   
 1   1955–56   Spain  Real Madrid      4–3   Stade de Reims  France   
 2   1956–57   Sp

In [4]:
eu_champions[2]

Unnamed: 0,0,1,2,3,4,5,6,7
0,Season,Nation,Winners,Score,Runners-up,Nation,Venue,Attendance[11]
1,1955–56,Spain,Real Madrid,4–3,Stade de Reims,France,"Parc des Princes, Paris",38239
2,1956–57,Spain,Real Madrid,2–0,Fiorentina,Italy,"Santiago Bernabéu Stadium, Madrid",124000
3,1957–58,Spain,Real Madrid,3–2,Milan,Italy,"Heysel Stadium, Brussels",67000
4,1958–59,Spain,Real Madrid,2–0,Stade de Reims,France,"Neckarstadion, Stuttgart",72000
...,...,...,...,...,...,...,...,...
59,2013–14,Spain,Real Madrid,4–1,Atlético Madrid,Spain,"Estádio da Luz, Lisbon",60976
60,2014–15,Spain,Barcelona,3–1,Juventus,Italy,"Olympiastadion, Berlin",70442
61,2015–16,Spain,Real Madrid,1–1*[L],Atlético Madrid,Spain,"San Siro, Milan",71942
62,2016–17,,,–,,,"Millennium Stadium, Cardiff",


In [7]:
eu_champions=eu_champions[2]

In [8]:
eu_champions.columns=eu_champions.loc[0]

In [9]:
eu_champions.head()

Unnamed: 0,Season,Nation,Winners,Score,Runners-up,Nation.1,Venue,Attendance[11]
0,Season,Nation,Winners,Score,Runners-up,Nation,Venue,Attendance[11]
1,1955–56,Spain,Real Madrid,4–3,Stade de Reims,France,"Parc des Princes, Paris",38239
2,1956–57,Spain,Real Madrid,2–0,Fiorentina,Italy,"Santiago Bernabéu Stadium, Madrid",124000
3,1957–58,Spain,Real Madrid,3–2,Milan,Italy,"Heysel Stadium, Brussels",67000
4,1958–59,Spain,Real Madrid,2–0,Stade de Reims,France,"Neckarstadion, Stuttgart",72000


In [10]:
eu_champions.drop(0,inplace=True)

In [11]:
eu_champions.head()

Unnamed: 0,Season,Nation,Winners,Score,Runners-up,Nation.1,Venue,Attendance[11]
1,1955–56,Spain,Real Madrid,4–3,Stade de Reims,France,"Parc des Princes, Paris",38239
2,1956–57,Spain,Real Madrid,2–0,Fiorentina,Italy,"Santiago Bernabéu Stadium, Madrid",124000
3,1957–58,Spain,Real Madrid,3–2,Milan,Italy,"Heysel Stadium, Brussels",67000
4,1958–59,Spain,Real Madrid,2–0,Stade de Reims,France,"Neckarstadion, Stuttgart",72000
5,1959–60,Spain,Real Madrid,7–3,Eintracht Frankfurt,West Germany,"Hampden Park, Glasgow",127621


In [12]:
#eu_champions.rename(columns={"Runners-up":"Runners_up","Attendance[12]":"Attendance"},inplace=True)
eu_champions.columns=['Season', 'Nation', 'Winners', 'Score', 'Runners_up', 'Runners_up_Nation', 'Venue','Attendance']

In [13]:
eu_champions.head()

Unnamed: 0,Season,Nation,Winners,Score,Runners_up,Runners_up_Nation,Venue,Attendance
1,1955–56,Spain,Real Madrid,4–3,Stade de Reims,France,"Parc des Princes, Paris",38239
2,1956–57,Spain,Real Madrid,2–0,Fiorentina,Italy,"Santiago Bernabéu Stadium, Madrid",124000
3,1957–58,Spain,Real Madrid,3–2,Milan,Italy,"Heysel Stadium, Brussels",67000
4,1958–59,Spain,Real Madrid,2–0,Stade de Reims,France,"Neckarstadion, Stuttgart",72000
5,1959–60,Spain,Real Madrid,7–3,Eintracht Frankfurt,West Germany,"Hampden Park, Glasgow",127621


In [14]:
eu_champions.drop([62,63],inplace=True)

In [15]:
eu_champions

Unnamed: 0,Season,Nation,Winners,Score,Runners_up,Runners_up_Nation,Venue,Attendance
1,1955–56,Spain,Real Madrid,4–3,Stade de Reims,France,"Parc des Princes, Paris",38239
2,1956–57,Spain,Real Madrid,2–0,Fiorentina,Italy,"Santiago Bernabéu Stadium, Madrid",124000
3,1957–58,Spain,Real Madrid,3–2,Milan,Italy,"Heysel Stadium, Brussels",67000
4,1958–59,Spain,Real Madrid,2–0,Stade de Reims,France,"Neckarstadion, Stuttgart",72000
5,1959–60,Spain,Real Madrid,7–3,Eintracht Frankfurt,West Germany,"Hampden Park, Glasgow",127621
...,...,...,...,...,...,...,...,...
57,2011–12,England,Chelsea,1–1*[K],Bayern Munich,Germany,"Allianz Arena, Munich",62500
58,2012–13,Germany,Bayern Munich,2–1,Borussia Dortmund,Germany,"Wembley Stadium, London",86298
59,2013–14,Spain,Real Madrid,4–1,Atlético Madrid,Spain,"Estádio da Luz, Lisbon",60976
60,2014–15,Spain,Barcelona,3–1,Juventus,Italy,"Olympiastadion, Berlin",70442


In [16]:
eu_championsGrp=eu_champions.groupby('Nation')

In [17]:
type(eu_championsGrp)

pandas.core.groupby.DataFrameGroupBy

In [18]:
eu_championsGrp.groups

{'England': [13, 22, 23, 24, 25, 26, 27, 29, 44, 50, 53, 57],
 'France': [38],
 'Germany': [42, 46, 58],
 'Italy': [8, 9, 10, 14, 30, 34, 35, 39, 41, 48, 52, 55],
 'Netherlands': [15, 16, 17, 18, 33, 40],
 'Portugal': [6, 7, 32, 49],
 'Romania': [31],
 'Scotland': [12],
 'Spain': [1, 2, 3, 4, 5, 11, 37, 43, 45, 47, 51, 54, 56, 59, 60, 61],
 'West Germany': [19, 20, 21, 28],
 'Yugoslavia': [36]}

In [19]:
len(eu_championsGrp.groups)

11

In [20]:
nation_Wins=eu_championsGrp.size() #size是一个aggregrate 函数

In [21]:
nation_Wins

Nation
England         12
France           1
Germany          3
Italy           12
Netherlands      6
                ..
Romania          1
Scotland         1
Spain           16
West Germany     4
Yugoslavia       1
dtype: int64

In [51]:
nation_Wins.sort_values(ascending=False)

Nation
Spain           16
Italy           12
England         12
Netherlands      6
West Germany     4
                ..
Germany          3
Yugoslavia       1
Scotland         1
Romania          1
France           1
dtype: int64

In [22]:
winners_Grp=eu_champions.groupby(['Nation','Winners'])
club_Wins=winners_Grp.size()
club_Wins

Nation        Winners          
England       Aston Villa           1
              Chelsea               1
              Liverpool             5
              Manchester United     3
              Nottingham Forest     2
                                   ..
Spain         Barcelona             5
              Real Madrid          11
West Germany  Bayern Munich         3
              Hamburg               1
Yugoslavia    Red Star Belgrade     1
dtype: int64

In [23]:
club_Wins.sort_values(ascending=False)

Nation       Winners         
Spain        Real Madrid         11
Italy        Milan                7
Spain        Barcelona            5
England      Liverpool            5
Netherlands  Ajax                 4
                                 ..
             Feyenoord            1
             PSV Eindhoven        1
Romania      Steaua București     1
Scotland     Celtic               1
England      Aston Villa          1
dtype: int64

### groupby进阶

In [34]:
goals_league2012=pd.read_csv('../data/goal_stats_euro_leagues_2012-13.csv')
goals_league2012.head()

Unnamed: 0,Month,Stat,EPL,La Liga,Serie A,Bundesliga
0,08/01/2012,MatchesPlayed,20.0,20,10.0,10.0
1,09/01/2012,MatchesPlayed,38.0,39,50.0,44.0
2,10/01/2012,MatchesPlayed,31.0,31,39.0,27.0
3,11/01/2012,MatchesPlayed,50.0,41,42.0,46.0
4,12/01/2012,MatchesPlayed,59.0,39,39.0,26.0


In [35]:
goals_league2012=goals_league2012.set_index('Month')
goals_league2012.head()

Unnamed: 0_level_0,Stat,EPL,La Liga,Serie A,Bundesliga
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
08/01/2012,MatchesPlayed,20.0,20,10.0,10.0
09/01/2012,MatchesPlayed,38.0,39,50.0,44.0
10/01/2012,MatchesPlayed,31.0,31,39.0,27.0
11/01/2012,MatchesPlayed,50.0,41,42.0,46.0
12/01/2012,MatchesPlayed,59.0,39,39.0,26.0


In [36]:
goals_league2012.tail()

Unnamed: 0_level_0,Stat,EPL,La Liga,Serie A,Bundesliga
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
02/01/2013,GoalsScored,87.0,110,100.0,101.0
03/01/2013,GoalsScored,91.0,101,99.0,106.0
04/01/2013,GoalsScored,105.0,127,102.0,104.0
05/01/2013,GoalsScored,96.0,109,102.0,92.0
06/01/2013,GoalsScored,,80,,


In [37]:
goals_league2012_Grp=goals_league2012.groupby(lambda Month: Month.split('/')[2])  #lambda后面讲

In [38]:
for name,group in goals_league2012_Grp:
    print(name)
    print(group)

2012
                     Stat    EPL  La Liga  Serie A  Bundesliga
Month                                                         
08/01/2012  MatchesPlayed   20.0       20     10.0        10.0
09/01/2012  MatchesPlayed   38.0       39     50.0        44.0
10/01/2012  MatchesPlayed   31.0       31     39.0        27.0
11/01/2012  MatchesPlayed   50.0       41     42.0        46.0
12/01/2012  MatchesPlayed   59.0       39     39.0        26.0
08/01/2012    GoalsScored   57.0       60     21.0        23.0
09/01/2012    GoalsScored  111.0      112    133.0       135.0
10/01/2012    GoalsScored   95.0       88     97.0        77.0
11/01/2012    GoalsScored  121.0      116    120.0       137.0
12/01/2012    GoalsScored  183.0      109    125.0        72.0
2013
                     Stat    EPL  La Liga  Serie A  Bundesliga
Month                                                         
01/01/2013  MatchesPlayed   42.0       40     40.0        18.0
02/01/2013  MatchesPlayed   30.0       40    

In [39]:
goals_league2012_Grp_month=goals_league2012.groupby(level=0)  # by row index

In [40]:
for name,group in goals_league2012_Grp_month:
    print(name)
    print(group)

01/01/2013
                     Stat    EPL  La Liga  Serie A  Bundesliga
Month                                                         
01/01/2013  MatchesPlayed   42.0       40     40.0        18.0
01/01/2013    GoalsScored  117.0      121    104.0        51.0
02/01/2013
                     Stat   EPL  La Liga  Serie A  Bundesliga
Month                                                        
02/01/2013  MatchesPlayed  30.0       40     40.0        36.0
02/01/2013    GoalsScored  87.0      110    100.0       101.0
03/01/2013
                     Stat   EPL  La Liga  Serie A  Bundesliga
Month                                                        
03/01/2013  MatchesPlayed  35.0       38     39.0        36.0
03/01/2013    GoalsScored  91.0      101     99.0       106.0
04/01/2013
                     Stat    EPL  La Liga  Serie A  Bundesliga
Month                                                         
04/01/2013  MatchesPlayed   42.0       42     41.0        36.0
04/01/2013    Goals

In [41]:
goals_league2012=goals_league2012.reset_index()
goals_league2012.head(2)

Unnamed: 0,Month,Stat,EPL,La Liga,Serie A,Bundesliga
0,08/01/2012,MatchesPlayed,20.0,20,10.0,10.0
1,09/01/2012,MatchesPlayed,38.0,39,50.0,44.0


In [42]:
goals_league2012=goals_league2012.set_index(['Month','Stat'])
goals_league2012.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,EPL,La Liga,Serie A,Bundesliga
Month,Stat,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
08/01/2012,MatchesPlayed,20.0,20,10.0,10.0
09/01/2012,MatchesPlayed,38.0,39,50.0,44.0


In [43]:
months_stat_Grp=goals_league2012.groupby(level=['Month','Stat'])

In [44]:
for name,group in months_stat_Grp:
    print(name)
    print(group)

('01/01/2013', 'GoalsScored')
                          EPL  La Liga  Serie A  Bundesliga
Month      Stat                                            
01/01/2013 GoalsScored  117.0      121    104.0        51.0
('01/01/2013', 'MatchesPlayed')
                           EPL  La Liga  Serie A  Bundesliga
Month      Stat                                             
01/01/2013 MatchesPlayed  42.0       40     40.0        18.0
('02/01/2013', 'GoalsScored')
                         EPL  La Liga  Serie A  Bundesliga
Month      Stat                                           
02/01/2013 GoalsScored  87.0      110    100.0       101.0
('02/01/2013', 'MatchesPlayed')
                           EPL  La Liga  Serie A  Bundesliga
Month      Stat                                             
02/01/2013 MatchesPlayed  30.0       40     40.0        36.0
('03/01/2013', 'GoalsScored')
                         EPL  La Liga  Serie A  Bundesliga
Month      Stat                                           
03/01

### 使用multiple index进行统计

In [45]:
stat_Grp=goals_league2012.groupby(level='Stat')
stat_Grp.sum()

Unnamed: 0_level_0,EPL,La Liga,Serie A,Bundesliga
Stat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GoalsScored,1063.0,1133,1003.0,898.0
MatchesPlayed,380.0,380,380.0,306.0


In [46]:
goals_league2012.sum(level='Stat')

Unnamed: 0_level_0,EPL,La Liga,Serie A,Bundesliga
Stat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GoalsScored,1063.0,1133,1003.0,898.0
MatchesPlayed,380.0,380,380.0,306.0


In [47]:
totals=stat_Grp.sum()
totals

Unnamed: 0_level_0,EPL,La Liga,Serie A,Bundesliga
Stat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GoalsScored,1063.0,1133,1003.0,898.0
MatchesPlayed,380.0,380,380.0,306.0


In [71]:
#平均每场进球
totals.loc['GoalsScored']/totals.loc['MatchesPlayed']

EPL           2.797368
La Liga       2.981579
Serie A       2.639474
Bundesliga    2.934641
dtype: float64

In [48]:
totals.ix['GoalsScored']/totals.ix['MatchesPlayed']

EPL           2.797368
La Liga       2.981579
Serie A       2.639474
Bundesliga    2.934641
dtype: float64

### aggregate函数

In [51]:
stat_Grp.aggregate(np.sum)

Unnamed: 0_level_0,EPL,La Liga,Serie A,Bundesliga
Stat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GoalsScored,1063.0,1133,1003.0,898.0
MatchesPlayed,380.0,380,380.0,306.0


In [52]:
stat_Grp.agg([np.sum,np.mean,np.size])

Unnamed: 0_level_0,EPL,EPL,EPL,La Liga,La Liga,...,Serie A,Serie A,Bundesliga,Bundesliga,Bundesliga
Unnamed: 0_level_1,sum,mean,size,sum,mean,...,mean,size,sum,mean,size
Stat,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
GoalsScored,1063.0,106.3,11.0,1133,103.0,...,100.3,11.0,898.0,89.8,11.0
MatchesPlayed,380.0,38.0,11.0,380,34.545455,...,38.0,11.0,306.0,30.6,11.0


In [53]:
eu_champions.dtypes

Season               object
Nation               object
Winners              object
Score                object
Runners_up           object
Runners_up_Nation    object
Venue                object
Attendance           object
dtype: object

In [54]:
eu_champions['Attendance']=eu_champions['Attendance'].astype(int)

In [55]:
eu_champions.dtypes

Season               object
Nation               object
Winners              object
Score                object
Runners_up           object
Runners_up_Nation    object
Venue                object
Attendance            int32
dtype: object

In [87]:
eu_championsGrp=eu_champions.groupby('Nation')

In [88]:
eu_championsGrp['Attendance'].agg({'total':np.sum,'average':np.mean,'deviation':np.std})

Unnamed: 0_level_0,deviation,average,total
Nation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
England,17091.309877,66534.250000,798411
France,,64400.000000,64400
Germany,13665.111342,72266.000000,216798
Italy,17441.027188,65766.083333,789193
Netherlands,16048.580972,67489.000000,404934
...,...,...,...
Romania,,70000.000000,70000
Scotland,,45000.000000,45000
Spain,24761.024716,72410.187500,1158563
West Germany,12511.708180,62196.250000,248785


In [50]:
np.mean?