# Chapter 7: 集約、フィルタ、変換のためのグループ分け

## レシピ
* [集約の定義](#レシピ53-集約の定義)
* [複数のカラムと関数のグループ分けと集約](#レシピ54-複数のカラムと関数のグループ分けと集約)
* [グループ分けの後でMultiIndex解消](#レシピ55-グループ分けの後でMultiIndex解消)
* [集約関数のカスタマイズ](#レシピ56-集約関数のカスタマイズ)
* [集約関数の*argsと**kwargsをカスタマイズ](#レシピ57-集約関数の*argsと**kwargsをカスタマイズ)
* [groupbyオブジェクトの検討](#レシピ58-groupbyオブジェクトの検討)
* [マイノリティが多数派の米国州をフィルタリング](#レシピ59-マイノリティが多数派の米国州をフィルタリング)
* [減量の勝負でtransform](#レシピ60-減量の勝負でtransform)
* [SATの加重平均点を州ごとにapplyで計算](#レシピ61-SATの加重平均点を州ごとにapplyで計算)
* [連続変数でグループ分け](#レシピ62-連続変数でグループ分け)
* [都市間の航空便の総数](#レシピ63-都市間の航空便の総数)
* [定時運行便の最長ストリーク](#レシピ64-定時運行便の最長ストリーク)

In [1]:
import pandas as pd
import numpy as np

# レシピ53 集約の定義

#### 1つのグループ分けカラム、1つの集約カラム、1つの集約関数を行う

In [2]:
# 航空便のデータセットを読み込む
flights = pd.read_csv('data/flights.csv')
flights.head()

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,DEST_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,LAX,SLC,1625,58.0,94.0,590,1905,65.0,0,0
1,1,1,4,UA,DEN,IAD,823,7.0,154.0,1452,1333,-13.0,0,0
2,1,1,4,MQ,DFW,VPS,1305,36.0,85.0,641,1453,35.0,0,0
3,1,1,4,AA,DFW,DCA,1555,7.0,126.0,1192,1935,-7.0,0,0
4,1,1,4,WN,LAX,MCI,1720,48.0,166.0,1363,2225,39.0,0,0


In [3]:
# groupbyメソッドにグループ分けカラムを渡し、aggメソッドには集約カラムと集約関数を対にした辞書を渡す
flights.groupby('AIRLINE').agg({'ARR_DELAY':'mean'}).head()

Unnamed: 0_level_0,ARR_DELAY
AIRLINE,Unnamed: 1_level_1
AA,5.542661
AS,-0.833333
B6,8.692593
DL,0.339691
EV,7.03458


In [4]:
# 集約カラムをインデックス演算子に渡し、集約関数を文字列でaggに渡す
flights.groupby('AIRLINE')['ARR_DELAY'].agg('mean').head()

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [5]:
# NumPyのmean関数を直接aggメソッドに渡す
flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.mean).head()

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [6]:
# aggメソッドを省略してmeanメソッドを直接つかってもよい
flights.groupby('AIRLINE')['ARR_DELAY'].mean().head()

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

# レシピ54 複数のカラムと関数のグループ分けと集約

- 曜日ごとに全航空会社でキャンセル便数を求める
- 曜日ごとに全航空会社でキャンセル便と行先変更便の数とパーセントを求める
- 出発及び到着飛行場について、便の総数、キャンセル便の数とパーセント、飛行時間の平均と分散を求める

In [7]:
# 曜日ごとに全航空会社でキャンセル便数を求める
flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED'].agg('sum').head()

AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
Name: CANCELLED, dtype: int64

In [8]:
# 曜日ごとに全航空会社でキャンセル便と行先変更便の数とパーセントを求める
# Select the aggregated columns with a list (double brackets): tuple-style
# selection on a groupby was deprecated in pandas 1.0 and removed in 2.0.
flights.groupby(['AIRLINE', 'WEEKDAY'])[['CANCELLED', 'DIVERTED']].agg(['sum', 'mean']).head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,DIVERTED,DIVERTED
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,41,0.032106,6,0.004699
AA,2,9,0.007341,2,0.001631
AA,3,16,0.011949,2,0.001494
AA,4,20,0.015004,5,0.003751
AA,5,18,0.014151,1,0.000786
AA,6,21,0.018667,9,0.008
AA,7,29,0.021837,1,0.000753


In [9]:
# aggメソッドの辞書でマップして第3の質問に答える
group_cols = ['ORG_AIR', 'DEST_AIR']
agg_dict = {'CANCELLED':['sum', 'mean', 'size'],
           'AIR_TIME':['mean', 'var']}
flights.groupby(group_cols).agg(agg_dict).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,0,0.0,31,96.387097,45.778495
ATL,ABQ,0,0.0,16,170.5,87.866667
ATL,ABY,0,0.0,19,28.578947,6.590643
ATL,ACY,0,0.0,6,91.333333,11.466667
ATL,AEX,0,0.0,40,78.725,47.332692


# レシピ55 グループ分けの後でMultiIndex解消

In [10]:
# 航空会社別に曜日ごとに飛行距離の平均と総計、到着遅延時間の最大と最小を求める
airline_info = flights.groupby(['AIRLINE', 'WEEKDAY']).agg({'DIST':['sum', 'mean'],
                                                           'ARR_DELAY':['min', 'max']}).astype(int)
airline_info.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST,DIST,ARR_DELAY,ARR_DELAY
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
AA,6,1265340,1124,-50,858
AA,7,1461906,1100,-49,626


In [11]:
# 2階層のカラムを１つにまとめる
level0 = airline_info.columns.get_level_values(0)
level0

Index(['DIST', 'DIST', 'ARR_DELAY', 'ARR_DELAY'], dtype='object')

In [12]:
level1 = airline_info.columns.get_level_values(1)
level1

Index(['sum', 'mean', 'min', 'max'], dtype='object')

In [13]:
airline_info.columns = level0 + '_' + level1
airline_info.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
AIRLINE,WEEKDAY,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
AA,6,1265340,1124,-50,858
AA,7,1461906,1100,-49,626


In [14]:
# reset_indexで行ラベルを1階層にする
airline_info.reset_index().head(7)

Unnamed: 0,AIRLINE,WEEKDAY,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
0,AA,1,1455386,1139,-60,551
1,AA,2,1358256,1107,-52,725
2,AA,3,1496665,1117,-45,473
3,AA,4,1452394,1089,-46,349
4,AA,5,1427749,1122,-41,732
5,AA,6,1265340,1124,-50,858
6,AA,7,1461906,1100,-49,626


# レシピ56 集約関数のカスタマイズ

#### collegeデータセットを使い、州ごとの学部学生数の平均と標準偏差を求め、学生数の最大偏差値を州ごとに求める

In [15]:
# 州ごとに学部学生の平均と標準偏差を求める
college = pd.read_csv('data/college.csv')
college.groupby('STABBR')['UGDS'].agg(['mean', 'std']).round(0).head()

Unnamed: 0_level_0,mean,std
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,2493.0,4052.0
AL,2790.0,4658.0
AR,1644.0,3143.0
AS,1276.0,
AZ,4130.0,14894.0


In [16]:
# 大学の偏差値の最大を求める関数を作成する
def max_deviation(s):
    """Return the largest absolute standardized score found in ``s``.

    Every value is centered on the mean of ``s`` and divided by ``s.std()``;
    the maximum of the absolute scores is returned. When ``s.std()`` is NaN
    (for example, a single observation) the result is NaN as well.
    """
    centered = s - s.mean()
    z_scores = centered / s.std()
    return z_scores.abs().max()

In [17]:
# 作成した関数をaggメソッドに渡す
college.groupby('STABBR')['UGDS'].agg(max_deviation).round(1).head()

STABBR
AK    2.6
AL    5.8
AR    6.3
AS    NaN
AZ    9.9
Name: UGDS, dtype: float64

In [18]:
# pandasのデフォルトでは(標本数-1)で割るので値が１つしかない場合はNaNになる
# Select the columns with a list (double brackets): tuple-style selection
# on a groupby was deprecated in pandas 1.0 and removed in 2.0.
college.groupby('STABBR')[['UGDS', 'SATVRMID', 'SATMTMID']].agg(max_deviation).round(1).head()

Unnamed: 0_level_0,UGDS,SATVRMID,SATMTMID
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,2.6,,
AL,5.8,1.6,1.8
AR,6.3,2.2,2.3
AS,,,
AZ,9.9,1.9,1.4


In [19]:
# 自作関数と組み込み関数を同時に使用可能
# Select the columns with a list (double brackets): tuple-style selection
# on a groupby was deprecated in pandas 1.0 and removed in 2.0.
college.groupby(['STABBR', 'RELAFFIL'])[['UGDS', 'SATVRMID', 'SATMTMID']] \
        .agg([max_deviation, 'mean', 'std']).round(1).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATVRMID,SATVRMID,SATVRMID,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,max_deviation,mean,std,max_deviation,mean,std,max_deviation,mean,std
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,,,,,
AK,1,1.1,123.3,132.9,,555.0,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,514.9,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,498.0,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,481.1,37.9,2.0,503.6,39.0


In [20]:
max_deviation.__name__

'max_deviation'

In [21]:
# 特殊属性__name__を変更してカラム名を直接変える
max_deviation.__name__ = 'Max Deviation'
# Select the columns with a list (double brackets): tuple-style selection
# on a groupby was deprecated in pandas 1.0 and removed in 2.0.
college.groupby(['STABBR', 'RELAFFIL'])[['UGDS', 'SATVRMID', 'SATMTMID']] \
        .agg([max_deviation, 'mean', 'std']).round(1).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATVRMID,SATVRMID,SATVRMID,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,Max Deviation,mean,std,Max Deviation,mean,std,Max Deviation,mean,std
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,,,,,
AK,1,1.1,123.3,132.9,,555.0,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,514.9,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,498.0,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,481.1,37.9,2.0,503.6,39.0


# レシピ57 集約関数の*argsと**kwargsをカスタマイズ

In [25]:
# aggメソッドの機能をinspectモジュールで確認する
college = pd.read_csv('data/college.csv')
grouped = college.groupby(['STABBR', 'RELAFFIL'])

import inspect
inspect.signature(grouped.agg)

<Signature (arg=None, *args, **kwargs)>

#### 学部学生数が2つの値の間で、州立と宗教系立かどうかによるグループ分けでの大学の割合を求める

In [26]:
# 学部学生が1000から3000の間の大学のパーセントを返す関数を作成する
def pct_between_1_3k(s):
    """Return the fraction of values in ``s`` that lie between 1000 and 3000.

    The bounds are checked with ``Series.between`` and the resulting boolean
    mask is averaged, so the result is a proportion in the range 0 to 1.
    """
    in_range = s.between(1000, 3000)
    return in_range.mean()

In [27]:
# 上記関数を州および宗教系かどうかのグループ分けで計算する
college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between_1_3k).head(9)

STABBR  RELAFFIL
AK      0           0.142857
        1           0.000000
AL      0           0.236111
        1           0.333333
AR      0           0.279412
        1           0.111111
AS      0           1.000000
AZ      0           0.096774
        1           0.000000
Name: UGDS, dtype: float64

In [28]:
# ユーザが上限や下限を定義できる新たな関数を作成する
def pct_between(s, low, high):
    """Return the fraction of values in ``s`` that lie between ``low`` and ``high``.

    Parameters
    ----------
    s : pandas.Series
        Values to test.
    low, high
        Lower and upper bounds handed to ``Series.between``.

    Returns
    -------
    The mean of the boolean mask, i.e. a proportion in the range 0 to 1.
    """
    within_bounds = s.between(low, high)
    return within_bounds.mean()

In [29]:
# 上記関数に下限、上限を渡す
college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between, 1000, 10000).head(9)

STABBR  RELAFFIL
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: UGDS, dtype: float64

In [30]:
# 明示的にパラメータ名をつかうことも出来る
college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between, high=10000, low=1000).head(9)

STABBR  RELAFFIL
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: UGDS, dtype: float64

In [33]:
# クロージャを使って入れ子関数を作れば、複数の関数で引数が使える
def make_agg_func(func, name, *args, **kwargs):
    """Build a one-argument function that calls ``func`` with preset arguments.

    Parameters
    ----------
    func : callable
        Function invoked as ``func(x, *args, **kwargs)``.
    name : str
        Value assigned to the returned function's ``__name__``.
    *args, **kwargs
        Extra arguments captured by the closure and forwarded on every call.

    Returns
    -------
    callable
        A function taking a single argument and returning the result of
        ``func`` for that argument plus the captured arguments.
    """
    def wrapper(values):
        # The captured positional and keyword arguments ride along each call.
        return func(values, *args, **kwargs)

    wrapper.__name__ = name
    return wrapper

my_agg1 = make_agg_func(pct_between, 'pct_1_3k', low=1000, high=3000)
my_agg2 = make_agg_func(pct_between, 'pct_10_30k', 10000, 30000)

college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(['mean', my_agg1, my_agg2]).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,pct_1_3k,pct_10_30k
STABBR,RELAFFIL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,0,3508.857143,0.142857,0.142857
AK,1,123.333333,0.0,0.0
AL,0,3248.774648,0.236111,0.083333
AL,1,979.722222,0.333333,0.0
AR,0,1793.691176,0.279412,0.014706


# レシピ58 groupbyオブジェクトの検討

#### groupbyオブジェクトそのものをメソッド呼び出しやそのグループでイテレーションとして調べる

In [34]:
# 型を調べる
college = pd.read_csv('data/college.csv')
grouped = college.groupby(['STABBR', 'RELAFFIL'])
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [36]:
# dir関数を使って、利用可能な機能を確認する
print([attr for attr in dir(grouped) if not attr.startswith('_')])

['CITY', 'CURROPER', 'DISTANCEONLY', 'GRAD_DEBT_MDN_SUPP', 'HBCU', 'INSTNM', 'MD_EARN_WNE_P10', 'MENONLY', 'PCTFLOAN', 'PCTPELL', 'PPTUG_EF', 'RELAFFIL', 'SATMTMID', 'SATVRMID', 'STABBR', 'UG25ABV', 'UGDS', 'UGDS_2MOR', 'UGDS_AIAN', 'UGDS_ASIAN', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_NHPI', 'UGDS_NRA', 'UGDS_UNKN', 'UGDS_WHITE', 'WOMENONLY', 'agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pct_change', 'pipe', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'tshift', 'var']


In [37]:
# ngroups属性でグループの個数を知る
grouped.ngroups

112

In [38]:
# 各グループで識別可能なラベルを調べるため、indexラベルにマップした辞書をもつgroups属性の中を調べる
groups = list(grouped.groups.keys())
groups[:6]

[('AK', 0), ('AK', 1), ('AL', 0), ('AL', 1), ('AR', 0), ('AR', 1)]

In [39]:
# get_groupメソッドに正確なグループラベルのタプルを渡してグループを取り出す
# フロリダ州の宗教系大学全てを取得する
grouped.get_group(('FL', 1)).head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
712,The Baptist College of Florida,Graceville,FL,0.0,0.0,0.0,1,545.0,465.0,0.0,...,0.0308,0.0,0.0507,0.2291,1,0.5878,0.5602,0.3531,30800.0,20052
713,Barry University,Miami,FL,0.0,0.0,0.0,1,470.0,462.0,0.0,...,0.0164,0.0741,0.0841,0.1518,1,0.5045,0.6733,0.4361,44100.0,28250
714,Gooding Institute of Nurse Anesthesia,Panama City,FL,0.0,0.0,0.0,1,,,0.0,...,,,,,0,,,,,PrivacySuppressed
715,Bethune-Cookman University,Daytona Beach,FL,1.0,0.0,0.0,1,405.0,395.0,0.0,...,0.0198,0.0205,0.019,0.0523,1,0.7758,0.8867,0.0647,29400.0,36250
724,Johnson University Florida,Kissimmee,FL,0.0,0.0,0.0,1,480.0,470.0,0.0,...,0.0045,0.0045,0.0136,0.1636,1,0.6689,0.7384,0.2185,26300.0,20199


In [42]:
# 各グループを確認するのに、groupbyオブジェクトでイテレーションする
from IPython.display import display

i = 0
for name, group in grouped:
    print(name)
    display(group.head(2))
    i += 1
    if i == 5:
        break

('AK', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
60,University of Alaska Anchorage,Anchorage,AK,0.0,0.0,0.0,0,,,0.0,...,0.098,0.0181,0.0457,0.4539,1,0.2385,0.2647,0.4386,42500,19449.5
62,University of Alaska Fairbanks,Fairbanks,AK,0.0,0.0,0.0,0,,,0.0,...,0.0401,0.011,0.306,0.3887,1,0.2263,0.255,0.4519,36200,19355.0


('AK', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
61,Alaska Bible College,Palmer,AK,0.0,0.0,0.0,1,,,0.0,...,0.037,0.0,0.0,0.1481,1,0.3571,0.2857,0.4286,,PrivacySuppressed
64,Alaska Pacific University,Anchorage,AK,0.0,0.0,0.0,1,555.0,503.0,0.0,...,0.0945,0.0,0.0873,0.3745,1,0.3152,0.5297,0.491,47000.0,23250


('AL', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5


('AL', 1)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370
10,Birmingham Southern College,Birmingham,AL,0.0,0.0,0.0,1,560.0,560.0,0.0,...,0.0051,0.0,0.0051,0.0017,1,0.192,0.4809,0.0152,44200,27000


('AR', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
128,University of Arkansas at Little Rock,Little Rock,AR,0.0,0.0,0.0,0,470.0,510.0,0.0,...,0.0755,0.0283,0.0003,0.4126,1,0.3941,0.4775,0.4062,33900,21736
129,University of Arkansas for Medical Sciences,Little Rock,AR,0.0,0.0,0.0,0,,,0.0,...,0.0281,0.007,0.0169,0.2433,1,0.3944,0.6144,0.5133,61400,12500


In [47]:
# groupbyオブジェクトにheadメソッド呼び出しで各グループの先頭の数行を１つのDataFrameにする
grouped.head(2).head(6)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
10,Birmingham Southern College,Birmingham,AL,0.0,0.0,0.0,1,560.0,560.0,0.0,...,0.0051,0.0,0.0051,0.0017,1,0.192,0.4809,0.0152,44200,27000.0
43,Prince Institute-Southeast,Elmhurst,IL,0.0,0.0,0.0,0,,,0.0,...,0.0,0.0,0.0,0.0,1,0.7857,0.9375,0.6569,PrivacySuppressed,20992.0
60,University of Alaska Anchorage,Anchorage,AK,0.0,0.0,0.0,0,,,0.0,...,0.098,0.0181,0.0457,0.4539,1,0.2385,0.2647,0.4386,42500,19449.5


In [51]:
# nthメソッドは整数のリストにより各グループの指定行を選ぶ
# 位置は0始まりなので、この例（[1, -1]）は各グループの2番目と末尾の行を選ぶ
grouped.nth([1, -1]).head(8)

Unnamed: 0_level_0,Unnamed: 1_level_0,INSTNM,CITY,HBCU,MENONLY,WOMENONLY,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,UGDS_WHITE,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
STABBR,RELAFFIL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AK,0,University of Alaska Fairbanks,Fairbanks,0.0,0.0,0.0,,,0.0,5536.0,0.4259,...,0.0401,0.011,0.306,0.3887,1,0.2263,0.255,0.4519,36200.0,19355
AK,0,Ilisagvik College,Barrow,0.0,0.0,0.0,,,0.0,109.0,0.1376,...,0.0,0.0183,0.0,0.6239,1,0.1323,0.0,0.6498,24900.0,PrivacySuppressed
AK,1,Alaska Pacific University,Anchorage,0.0,0.0,0.0,555.0,503.0,0.0,275.0,0.5309,...,0.0945,0.0,0.0873,0.3745,1,0.3152,0.5297,0.491,47000.0,23250
AK,1,Alaska Christian College,Soldotna,0.0,0.0,0.0,,,0.0,68.0,0.0588,...,0.0147,0.0,0.1324,0.0735,1,0.8868,0.6792,0.2264,,PrivacySuppressed
AL,0,University of Alabama at Birmingham,Birmingham,0.0,0.0,0.0,570.0,565.0,0.0,11383.0,0.5922,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700.0,21941.5
AL,0,Alabama College of Osteopathic Medicine,Dothan,0.0,0.0,0.0,,,0.0,,,...,,,,,1,,,,,PrivacySuppressed
AL,1,Birmingham Southern College,Birmingham,0.0,0.0,0.0,560.0,560.0,0.0,1180.0,0.7983,...,0.0051,0.0,0.0051,0.0017,1,0.192,0.4809,0.0152,44200.0,27000
AL,1,Strayer University-Huntsville Campus,Huntsville,,,,,,,,,...,,,,,1,,,,49200.0,36173.5


# レシピ59 マイノリティが多数派の米国州をフィルタリング

#### 学部学生で非白人の方が白人より多い州を求める

In [52]:
# collegeデータを読み込み州ごとにグループ分けして、グループの総数を表示する
college = pd.read_csv('data/college.csv', index_col = 'INSTNM')
grouped = college.groupby('STABBR')
grouped.ngroups

59

In [53]:
college['STABBR'].nunique() # 同じ数だと確認

59

In [54]:
# groupbyオブジェクトにfilterメソッドがあり、ユーザ関数を渡す事ができる
def check_minority(df, threshold):
    """Tell whether the non-white share of undergraduates exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one group; the 'UGDS' and 'UGDS_WHITE' columns are read.
    threshold : float
        Share that the enrollment-weighted non-white proportion must exceed.

    Returns
    -------
    bool
        True when (sum of UGDS * (1 - UGDS_WHITE)) / (sum of UGDS) is
        strictly greater than ``threshold``.
    """
    enrollment = df['UGDS']
    non_white_share = 1 - df['UGDS_WHITE']
    # Weight each row's non-white share by its enrollment before totalling.
    non_white_students = (enrollment * non_white_share).sum()
    return non_white_students / enrollment.sum() > threshold

In [55]:
# しきい値50%にしてcheck_minority関数をfilterメソッドに渡す
college_filtered = grouped.filter(check_minority, threshold=.5)
college_filtered.head()

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Everest College-Phoenix,Phoenix,AZ,0.0,0.0,0.0,1,,,0.0,4102.0,...,0.0373,0.0,0.1026,0.4749,0,0.8291,0.7151,0.67,28600,9500
Collins College,Phoenix,AZ,0.0,0.0,0.0,0,,,0.0,83.0,...,0.0241,0.0,0.3855,0.3373,0,0.7205,0.8228,0.4764,25700,47000
Empire Beauty School-Paradise Valley,Phoenix,AZ,0.0,0.0,0.0,1,,,0.0,25.0,...,0.04,0.0,0.0,0.16,0,0.6349,0.5873,0.4651,17800,9588
Empire Beauty School-Tucson,Tucson,AZ,0.0,0.0,0.0,0,,,0.0,126.0,...,0.0,0.0,0.0079,0.2222,1,0.7962,0.6615,0.4229,18200,9833
Thunderbird School of Global Management,Glendale,AZ,0.0,0.0,0.0,0,,,0.0,1.0,...,0.0,0.0,0.0,1.0,0,0.0,0.0,0.0,118900,PrivacySuppressed


In [56]:
# 数の確認
college.shape

(7535, 26)

In [58]:
college_filtered.shape

(3028, 26)

In [60]:
college_filtered['STABBR'].nunique()

20

# レシピ60 減量の勝負でtransform

#### シミュレーションデータを使い、2人が4か月間減量したパーセントを追跡し、月末に減量パーセントの高い人を勝者と宣言

In [61]:
# weight_lossデータを読み、AmyとBobの最初の月を調べる
weight_loss = pd.read_csv('data/weight_loss.csv')
weight_loss.query('Month == "Jan"')

Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,291
1,Amy,Jan,Week 1,197
2,Bob,Jan,Week 2,288
3,Amy,Jan,Week 2,189
4,Bob,Jan,Week 3,283
5,Amy,Jan,Week 3,189
6,Bob,Jan,Week 4,283
7,Amy,Jan,Week 4,190


In [64]:
weight_loss[weight_loss['Month']== 'Jan']

Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,291
1,Amy,Jan,Week 1,197
2,Bob,Jan,Week 2,288
3,Amy,Jan,Week 2,189
4,Bob,Jan,Week 3,283
5,Amy,Jan,Week 3,189
6,Bob,Jan,Week 4,283
7,Amy,Jan,Week 4,190


In [68]:
# 毎週の減量幅を計算する関数を作成
def find_perc_loss(s):
    """Return every value's relative change from the first element of ``s``.

    The first element (taken by position) is the baseline, so the first entry
    of the result is always 0 and negative entries mean a drop below it.
    """
    baseline = s.iloc[0]
    return (s - baseline) / baseline

In [69]:
# 上記の関数を1月のBobについてテストする
# 週ごとの減量幅が表示される
bob_jan = weight_loss.query('Name== "Bob" and Month=="Jan"')
find_perc_loss(bob_jan['Weight'])

0    0.000000
2   -0.010309
4   -0.027491
6   -0.027491
Name: Weight, dtype: float64

In [71]:
# 人と月の各組合せでグループ分けして適用し、月の第1週と比較した減量結果を得る
pcnt_loss = weight_loss.groupby(['Name', 'Month'])['Weight'].transform(find_perc_loss)

pcnt_loss.head(8)

0    0.000000
1    0.000000
2   -0.010309
3   -0.040609
4   -0.027491
5   -0.040609
6   -0.027491
7   -0.035533
Name: Weight, dtype: float64

In [72]:
# 上記結果を元のDataFrameに追加する
weight_loss['Perc Weight Loss'] = pcnt_loss.round(3)
weight_loss.query('Name=="Bob" and Month in ["Jan", "Feb"]')

Unnamed: 0,Name,Month,Week,Weight,Perc Weight Loss
0,Bob,Jan,Week 1,291,0.0
2,Bob,Jan,Week 2,288,-0.01
4,Bob,Jan,Week 3,283,-0.027
6,Bob,Jan,Week 4,283,-0.027
8,Bob,Feb,Week 1,283,0.0
10,Bob,Feb,Week 2,275,-0.028
12,Bob,Feb,Week 3,268,-0.053
14,Bob,Feb,Week 4,268,-0.053


In [74]:
# 第4週を選択する
week4 = weight_loss.query('Week == "Week 4"')
week4

Unnamed: 0,Name,Month,Week,Weight,Perc Weight Loss
6,Bob,Jan,Week 4,283,-0.027
7,Amy,Jan,Week 4,190,-0.036
14,Bob,Feb,Week 4,268,-0.053
15,Amy,Feb,Week 4,173,-0.089
22,Bob,Mar,Week 4,261,-0.026
23,Amy,Mar,Week 4,170,-0.017
30,Bob,Apr,Week 4,250,-0.042
31,Amy,Apr,Week 4,161,-0.053


In [75]:
# pivotメソッドで変形し、BobとAmyの減量パーセントを各月で直接比較できるようにする
winner = week4.pivot(index='Month', columns='Name', values='Perc Weight Loss')
winner

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Apr,-0.053,-0.042
Feb,-0.089,-0.053
Jan,-0.036,-0.027
Mar,-0.017,-0.026


In [76]:
# Numpyのwhereを使って勝者の名前カラムを作り、各月の勝ったパーセントをハイライトする
winner['Winner'] = np.where(winner['Amy'] < winner['Bob'], 'Amy', 'Bob')
winner.style.highlight_min(axis=1, color='green')

Name,Amy,Bob,Winner
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apr,-0.053,-0.042,Amy
Feb,-0.089,-0.053,Amy
Jan,-0.036,-0.027,Amy
Mar,-0.017,-0.026,Bob


In [77]:
# value_countsメソッドを使い最終的に勝った回数を返す
winner.Winner.value_counts()

Amy    3
Bob    1
Name: Winner, dtype: int64

In [78]:
# 月の順序をカテゴリ変数にして並び替える
week4a = week4.copy()
month_chron = week4a['Month'].unique() # 又はdrop_duplicatesを使う
month_chron

array(['Jan', 'Feb', 'Mar', 'Apr'], dtype=object)

In [80]:
week4a['Monht'] = pd.Categorical(week4a['Month'],
                                categories=month_chron,
                                ordered=True)

week4a.pivot(index='Month', columns='Name', values='Perc Weight Loss')

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Apr,-0.053,-0.042
Feb,-0.089,-0.053
Jan,-0.036,-0.027
Mar,-0.017,-0.026


# レシピ61 SATの加重平均点を州ごとにapplyで計算

#### 数学と言語能力のSAT点数の加重平均を州ごとに求める

In [85]:
# UGDS, SATMTMID, SATVRMIDカラムのどれかに欠損値がある行を削除する
college = pd.read_csv('data/college.csv')
subset = ['UGDS', 'SATMTMID', 'SATVRMID']
college2 = college.dropna(subset=subset)
college.shape,college2.shape

((7535, 27), (1184, 27))

In [88]:
# SATの数学点だけの加重平均をとる関数をユーザ定義する
def weighted_math_average(df):
    """Return the enrollment-weighted mean of 'SATMTMID', truncated to an int.

    Each row's 'SATMTMID' is weighted by its 'UGDS'; the weighted total is
    divided by the sum of 'UGDS' and converted with ``int`` (truncation, not
    rounding).
    """
    enrollment = df['UGDS']
    weighted_total = (enrollment * df['SATMTMID']).sum()
    return int(weighted_total / enrollment.sum())

In [90]:
# 州でグループ分けしてapplyメソッドにこの関数を渡す
college2.groupby('STABBR').apply(weighted_math_average).head()

STABBR
AK    503
AL    536
AR    529
AZ    569
CA    564
dtype: int64

In [91]:
# aggメソッドに同じ関数を渡して確認する
college2.groupby('STABBR').agg(weighted_math_average).head()

Unnamed: 0_level_0,INSTNM,CITY,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AK,503,503,503,503,503,503,503,503,503,503,...,503,503,503,503,503,503,503,503,503,503
AL,536,536,536,536,536,536,536,536,536,536,...,536,536,536,536,536,536,536,536,536,536
AR,529,529,529,529,529,529,529,529,529,529,...,529,529,529,529,529,529,529,529,529,529
AZ,569,569,569,569,569,569,569,569,569,569,...,569,569,569,569,569,569,569,569,569,569
CA,564,564,564,564,564,564,564,564,564,564,...,564,564,564,564,564,564,564,564,564,564


In [92]:
# カラムをSATMTMIDだけに制限しようとするとUGDSにアクセスできないのでエラーになる
college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)

KeyError: 'UGDS'

In [107]:
# 2つのSAT点の加重及び算術平均を各グループの大学数ともに計算する関数を作成
from collections import OrderedDict
def weighted_average(df):
    """Summarize the SAT math and verbal scores of one group as a Series.

    The returned Series has these entries, in this order:

    - 'weighted_math_avg': 'SATMTMID' averaged with 'UGDS' as weights
    - 'weighted_verbal_avg': 'SATVRMID' averaged with 'UGDS' as weights
    - 'math_avg': plain mean of 'SATMTMID'
    - 'verbal_avg': plain mean of 'SATVRMID'
    - 'count': number of rows in ``df``
    """
    enrollment = df['UGDS']
    math_scores = df['SATMTMID']
    verbal_scores = df['SATVRMID']
    total_enrollment = enrollment.sum()

    # An ordered mapping keeps the entries of the Series in a fixed order.
    summary = OrderedDict([
        ('weighted_math_avg', (enrollment * math_scores).sum() / total_enrollment),
        ('weighted_verbal_avg', (enrollment * verbal_scores).sum() / total_enrollment),
        ('math_avg', math_scores.mean()),
        ('verbal_avg', verbal_scores.mean()),
        ('count', len(df)),
    ])
    return pd.Series(summary)

college2.groupby('STABBR').apply(weighted_average).head(10)


Unnamed: 0_level_0,weighted_math_avg,weighted_verbal_avg,math_avg,verbal_avg,count
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AK,503.0,555.0,503.0,555.0,1.0
AL,536.137917,533.383387,504.285714,508.47619,21.0
AR,529.112332,504.876157,515.9375,491.875,16.0
AZ,569.313985,557.30335,536.666667,538.333333,6.0
CA,564.94542,539.316605,562.902778,549.083333,72.0
CO,553.12382,547.033996,540.214286,537.714286,14.0
CT,545.341834,533.417563,522.5,517.857143,14.0
DC,621.905104,623.514036,588.333333,589.166667,6.0
DE,569.954949,553.53456,495.0,486.666667,3.0
FL,565.324731,565.815873,521.842105,529.289474,38.0


In [108]:
# 各グループで複数の行とカラムをDataFrameで返す
# 仕事量軽減の為、加重平均はNumpyのaverage関数、SciPyのgmeanとhmean関数で幾何及び調和平均を計算する
from scipy.stats import gmean, hmean

def calculate_means(df):
    """Build a table of four kinds of mean for the two SAT score columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group. The columns 'SATMTMID', 'SATVRMID' and
        'UGDS' (used as weights) are read.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Arithmetic', 'Weighted', 'Geometric', 'Harmonic', with
        columns 'SATMTMID', 'SATVRMID' and 'count' (the number of rows in
        ``df``, repeated on every row). All values are cast to ``int``.
    """
    mean_kinds = ['Arithmetic', 'Weighted', 'Geometric', 'Harmonic']
    weights = df['UGDS']

    # One list per score column, ordered to line up with `mean_kinds`.
    per_column = {
        name: [
            df[name].mean(),
            np.average(df[name], weights=weights),
            gmean(df[name]),
            hmean(df[name]),
        ]
        for name in ('SATMTMID', 'SATVRMID')
    }

    table = pd.DataFrame(per_column, index=mean_kinds)
    table['count'] = len(df)
    return table.astype(int)

college2.groupby('STABBR').apply(calculate_means).head(12)

Unnamed: 0_level_0,Unnamed: 1_level_0,SATMTMID,SATVRMID,count
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,Arithmetic,503,555,1
AK,Weighted,503,555,1
AK,Geometric,503,555,1
AK,Harmonic,503,555,1
AL,Arithmetic,504,508,21
AL,Weighted,536,533,21
AL,Geometric,500,505,21
AL,Harmonic,497,502,21
AR,Arithmetic,515,491,16
AR,Weighted,529,504,16


# レシピ62 連続変数でグループ分け

#### pandasのcut関数を用いて飛行距離を離散化し、500~1000マイル間を飛行する便数の最も多い航空会社を見つける

In [109]:
# flightsデータセットを読み込み、先頭5行を表示
flights = pd.read_csv('data/flights.csv')
flights.head()

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,DEST_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,LAX,SLC,1625,58.0,94.0,590,1905,65.0,0,0
1,1,1,4,UA,DEN,IAD,823,7.0,154.0,1452,1333,-13.0,0,0
2,1,1,4,MQ,DFW,VPS,1305,36.0,85.0,641,1453,35.0,0,0
3,1,1,4,AA,DFW,DCA,1555,7.0,126.0,1192,1935,-7.0,0,0
4,1,1,4,WN,LAX,MCI,1720,48.0,166.0,1363,2225,39.0,0,0


In [115]:
# 飛行距離の分布作成のため、DISTカラムを離散化する
bins = [-np.inf, 200, 500, 1000, 2000, np.inf]
cuts = pd.cut(flights['DIST'], bins=bins)
cuts.head()

0     (500.0, 1000.0]
1    (1000.0, 2000.0]
2     (500.0, 1000.0]
3    (1000.0, 2000.0]
4    (1000.0, 2000.0]
Name: DIST, dtype: category
Categories (5, interval[float64]): [(-inf, 200.0] < (200.0, 500.0] < (500.0, 1000.0] < (1000.0, 2000.0] < (2000.0, inf]]

In [116]:
# 各カテゴリの値を数える
cuts.value_counts()

(500.0, 1000.0]     20659
(200.0, 500.0]      15874
(1000.0, 2000.0]    14186
(2000.0, inf]        4054
(-inf, 200.0]        3719
Name: DIST, dtype: int64

In [117]:
# cutsをgroupbyメソッドに渡し、AIRLINEカラムでvalue_countsメソッドを呼び、各距離グループの分布を知る
flights.groupby(cuts)['AIRLINE'].value_counts(normalize=True).round(3).head(15)

DIST            AIRLINE
(-inf, 200.0]   OO         0.326
                EV         0.289
                MQ         0.211
                DL         0.086
                AA         0.052
                UA         0.027
                WN         0.009
(200.0, 500.0]  WN         0.194
                DL         0.189
                OO         0.159
                EV         0.156
                MQ         0.100
                AA         0.071
                UA         0.062
                VX         0.028
Name: AIRLINE, dtype: float64

In [128]:
# 各距離グループの飛行時間(分)の75パーセンタイルを求め、60で割って時間単位にする
flights.groupby(cuts)['AIR_TIME'].quantile(.75).div(60).round(2)

DIST
(-inf, 200.0]       0.57
(200.0, 500.0]      1.05
(500.0, 1000.0]     1.92
(1000.0, 2000.0]    3.40
(2000.0, inf]       5.03
Name: AIR_TIME, dtype: float64

In [130]:
labels=['Under an Hour', '1 Hour', '1-2 Hours', '2-4 Hours', '4+ Hours']
cuts2 = pd.cut(flights['DIST'], bins=bins, labels= labels)
flights.groupby(cuts2)['AIRLINE'].value_counts(normalize=True)\
                                 .round(3)\
                                 .unstack()\
                                 .style.highlight_max(axis=1, color='green')


AIRLINE,AA,AS,B6,DL,EV,F9,HA,MQ,NK,OO,UA,US,VX,WN
DIST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Under an Hour,0.052,,,0.086,0.289,,,0.211,,0.326,0.027,,,0.009
1 Hour,0.071,0.001,0.007,0.189,0.156,0.005,,0.1,0.012,0.159,0.062,0.016,0.028,0.194
1-2 Hours,0.144,0.023,0.003,0.206,0.101,0.038,,0.051,0.03,0.106,0.131,0.025,0.004,0.138
2-4 Hours,0.264,0.016,0.003,0.165,0.016,0.031,,0.003,0.045,0.046,0.199,0.04,0.012,0.16
4+ Hours,0.212,0.012,0.08,0.171,,0.004,0.028,,0.019,,0.289,0.065,0.074,0.046


# レシピ63 都市間の航空便の総数

#### 都市間の全便数を数える
- 発着空港を英字順にソート
- 2空港の組み合わせが常に同じ順になるようにする
- 新たなカラム配置を使ってグループ分けして数える

In [7]:
# 発着空港ごとに全便数を数える
flights = pd.read_csv('data/flights.csv')
flights_ct = flights.groupby(['ORG_AIR', 'DEST_AIR']).size()
flights_ct.head()

ORG_AIR  DEST_AIR
ATL      ABE         31
         ABQ         16
         ABY         19
         ACY          6
         AEX         40
dtype: int64

In [8]:
# ヒューストン(IAH)とアトランタ(ATL)間の全便数を両方向選ぶ
flights_ct.loc[[('ATL', 'IAH'), ('IAH', 'ATL')]]

ORG_AIR  DEST_AIR
ATL      IAH         121
IAH      ATL         148
dtype: int64

In [17]:
# 発着地それぞれを各行で独立に英字順にソートする
flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)
flights_sort = pd.DataFrame(list(flights_sort),columns=['ORG_AIR', 'DEST_AIR'])
flights_sort.head()

Unnamed: 0,ORG_AIR,DEST_AIR
0,LAX,SLC
1,DEN,IAD
2,DFW,VPS
3,DCA,DFW
4,LAX,MCI


In [20]:
# 各カラムを一般的な名前に変えて都市間の全便数を求める
rename_dict = {'ORG_AIR':'AIR1','DEST_AIR':'AIR2'}
flights_sort = flights_sort.rename(columns=rename_dict)
flights_ct2 = flights_sort.groupby(['AIR1', 'AIR2']).size()
flights_ct2.head()

AIR1  AIR2
ABE   ATL     31
      ORD     24
ABI   DFW     74
ABQ   ATL     16
      DEN     46
dtype: int64

In [21]:
# アトランタとヒューストンの全便数を選び確認する
flights_ct2.loc[('ATL', 'IAH')]

269

In [22]:
# ヒューストンからアトランタの便を選ぼうとするとエラーになる
flights_ct2.loc[('IAH', 'ATL')]

KeyError: ('IAH', 'ATL')

In [23]:
sorted(flights.loc[0, ['ORG_AIR', 'DEST_AIR']])

['LAX', 'SLC']

In [24]:
# NumPyのsort関数を検討する
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
data_sorted[:10]

array([['LAX', 'SLC'],
       ['DEN', 'IAD'],
       ['DFW', 'VPS'],
       ['DCA', 'DFW'],
       ['LAX', 'MCI'],
       ['IAH', 'SAN'],
       ['DFW', 'MSY'],
       ['PHX', 'SFO'],
       ['ORD', 'STL'],
       ['IAH', 'SJC']], dtype=object)

In [26]:
# flights_sort DataFrameと等しいかチェックする
flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
fs_orig = flights_sort.rename(columns={'ORG_AIR': 'AIR1', 'DEST_AIR': 'AIR2'})
flights_sort2.equals(fs_orig)

True

In [27]:
# それぞれのsortでの時間計測
%timeit flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)

4.1 s ± 43.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
%%timeit
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])

9.4 ms ± 106 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# レシピ64 定時運行便の最長ストリーク

#### 定時運行便の最長連続ストリークを航空会社ごと、出発飛行場ごとに計算する

In [29]:
# 小さなSeriesで１のストリークを数える
s = pd.Series([0, 1, 1, 0, 1, 1, 1,0])
s

0    0
1    1
2    1
3    0
4    1
5    1
6    1
7    0
dtype: int64

In [31]:
# cumsumメソッドを使う
s1 = s.cumsum()
s1

0    0
1    1
2    2
3    2
4    3
5    4
6    5
7    5
dtype: int64

In [33]:
# このSeriesと元のを掛け合わせる
s.mul(s1)

0    0
1    1
2    2
3    0
4    3
5    4
6    5
7    0
dtype: int64

In [34]:
# 先頭を累積和のままでなく1にしたい。現在の値から前の値を引くdiffメソッドをチェイニングする
s.mul(s1).diff()

0    NaN
1    1.0
2    1.0
3   -2.0
4    3.0
5    1.0
6    1.0
7   -5.0
dtype: float64

In [35]:
# マイナス値がストリークの終わりを表す。whereメソッドでプラス値を欠損値にする
s.mul(s1).diff().where(lambda x: x<0)

0    NaN
1    NaN
2    NaN
3   -2.0
4    NaN
5    NaN
6    NaN
7   -5.0
dtype: float64

In [37]:
# これらの値をffillメソッドで下に伝播する
s.mul(s1).diff().where(lambda x: x < 0).ffill()

0    NaN
1    NaN
2    NaN
3   -2.0
4   -2.0
5   -2.0
6   -2.0
7   -5.0
dtype: float64

In [39]:
# このSeriesをs1に足し戻して、余分な累積和を解消する
s.mul(s1).diff().where(lambda x: x <0).ffill().add(s1, fill_value=0)

0    0.0
1    1.0
2    2.0
3    0.0
4    1.0
5    2.0
6    3.0
7    0.0
dtype: float64

In [42]:
# 航空会社と出発飛行場について最長ストリークをみつける
# ltはless thanで<
# Re-read the dataset into `flights`, the name the following lines use.
# (The original bound the result to the misspelled name `filghts`.)
flights = pd.read_csv('data/flights.csv')
# ON_TIME is 1 where ARR_DELAY is less than 15, otherwise 0.
flights['ON_TIME'] = flights['ARR_DELAY'].lt(15).astype(int)
flights[['AIRLINE', 'ORG_AIR', 'ON_TIME']].head(10)

Unnamed: 0,AIRLINE,ORG_AIR,ON_TIME
0,WN,LAX,0
1,UA,DEN,1
2,MQ,DFW,0
3,AA,DFW,1
4,WN,LAX,0
5,UA,IAH,1
6,AA,DFW,0
7,F9,SFO,1
8,AA,ORD,1
9,UA,IAH,1


In [43]:
# 対象Seriesで１の最長ストリークを返す関数を定義する
def max_streak(s):
    """Return the length of the longest run of consecutive 1s in ``s``.

    Parameters
    ----------
    s : pandas.Series
        Expected to hold 0/1 values (e.g. an on-time indicator).

    Returns
    -------
    scalar
        The maximum of the per-position run-length counter.
    """
    running_total = s.cumsum()
    # Keep the running total where s is 1 and zero it where s is 0.
    masked_total = s.mul(running_total)
    step = masked_total.diff()
    # Negative steps mark the end of a run; carry them forward so they
    # cancel the total accumulated before the following run.
    carried_drop = step.where(step < 0).ffill()
    run_length = carried_drop.add(running_total, fill_value=0)
    return run_length.max()

In [44]:
# 航空会社と出発飛行場ごとに、定時到着の最長ストリークを、全便数と定時到着パーセントとともに計算する
# 最初に1年での日と予定出発時刻でソートする
flights.sort_values(['MONTH', 'DAY', 'SCHED_DEP'])\
        .groupby(['AIRLINE', 'ORG_AIR'])['ON_TIME']\
        .agg(['mean', 'size', max_streak]).round(2).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size,max_streak
AIRLINE,ORG_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,ATL,0.82,233,15
AA,DEN,0.74,219,17
AA,DFW,0.78,4006,64
AA,IAH,0.8,196,24
AA,LAS,0.79,374,29


In [45]:
# 遅延到着便の最長ストリークを見つける

def max_delay_streak(df):
    """Locate the longest run of consecutive delayed flights in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one group, already in the order the streak is measured in.
        The columns 'ON_TIME' (0/1), 'MONTH' and 'DAY' are read.

    Returns
    -------
    pandas.DataFrame
        Two rows indexed 'first' and 'last' (index name 'type') with the
        'MONTH' and 'DAY' of the first and last flight of the longest
        streak, plus a 'streak' column holding its length. If the group
        has no delayed flight, 'streak' is 0 and 'MONTH'/'DAY' are NaN.
    """
    # Positional 0..n-1 labels make the index arithmetic below valid.
    df = df.reset_index(drop=True)
    s = 1 - df['ON_TIME']  # 1 marks a delayed flight
    s1 = s.cumsum()
    streak = s.mul(s1).diff().where(lambda x: x < 0).ffill().add(s1, fill_value=0)
    longest = streak.max()

    if longest > 0:
        # `streak` is float (diff introduces NaN), so cast before using
        # the values as integer row labels.
        last_idx = int(streak.idxmax())
        first_idx = last_idx - int(longest) + 1
        df_return = df.loc[[first_idx, last_idx], ['MONTH', 'DAY']]
    else:
        # No delayed flight: there is no streak to date. The original
        # arithmetic gave first_idx = last_idx + 1 here, which selects an
        # unrelated row or a label that does not exist (KeyError on
        # current pandas).
        df_return = pd.DataFrame(float('nan'), index=[0, 1],
                                 columns=['MONTH', 'DAY'])

    df_return['streak'] = longest
    df_return.index = ['first', 'last']
    df_return.index.name = 'type'
    return df_return

In [47]:
flights.sort_values(['MONTH', 'DAY', 'SCHED_DEP'])\
        .groupby(['AIRLINE', 'ORG_AIR'])\
        .apply(max_delay_streak)\
        .sort_values('streak', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,MONTH,DAY,streak
AIRLINE,ORG_AIR,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,DFW,first,2.0,26.0,38.0
AA,DFW,last,3.0,1.0,38.0
MQ,ORD,last,1.0,12.0,28.0
MQ,ORD,first,1.0,6.0,28.0
MQ,DFW,last,2.0,26.0,25.0
MQ,DFW,first,2.0,21.0,25.0
NK,ORD,first,6.0,7.0,15.0
NK,ORD,last,6.0,18.0,15.0
DL,ATL,last,12.0,24.0,14.0
DL,ATL,first,12.0,23.0,14.0
