In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# filter()
- `DataFrameGroupBy.filter(func, dropna=True, *args, **kwargs)`
- 특정 집계 조건을 만족하는 Group의 행들만 조회한다.
    1. DataFrameGroupBy의 group로 DataFrame을 함수에 전달한다.
    2. 함수는 받은 DataFrame을 이용해 집계한 값의 조건을 비교해서 반환한다.(반환타입: Bool) 
    3. 반환값이 True인 Group들의 모든 행들로 구성된 DataFrame을 반환한다.
- 매개변수
    - func: filtering 조건을 구현한 함수
        - 첫번째 매개변수로 Group으로 묶인 DataFrame을 받는다.
    - dropna=True
        - 필터를 통과하지 못한 group의 DataFrame의 값들을 drop시킨다. False로 설정하면 NA 처리해서 반환한다.
    - \*args, \*\*kwargs: filter 함수의 매개변수에 전달할 전달인자값.

In [None]:
import numpy as np
import pandas as pd

In [131]:
# cnt1 - 사과: 10대, 귤: 20대, 배: 단단위, 딸기 30이상
data = dict(fruits=['사과', '사과','사과', '사과','사과','귤','귤','귤','귤','귤','배','배','배','배','배','딸기','딸기','딸기','딸기','딸기']
            ,cnt1=[10, 12, 13, 11, 12, 21, 22, 27, 24, 26, 7, 7, 8, 3, 2, 30, 35, 37, 41, 28]
            ,cnt2=[100,  103, 107, 107,  101,  51,  57, 58,  57, 51,  9, 9,  5,  7,  7,  208, 217, 213, 206, 204]
           )
df = pd.DataFrame(data)
df

Unnamed: 0,fruits,cnt1,cnt2
0,사과,10,100
1,사과,12,103
2,사과,13,107
3,사과,11,107
4,사과,12,101
5,귤,21,51
6,귤,22,57
7,귤,27,58
8,귤,24,57
9,귤,26,51


#### 과일중 cnt1의 평균이 20 이상인 과일들만 보기
- 집계결과가 아니라 조건을 만족하는 행들을 다 출력

In [132]:
r = df.groupby('fruits')['cnt1'].mean()
r[r >= 20]

fruits
귤     24.0
딸기    34.2
Name: cnt1, dtype: float64

In [133]:
def check_cnt1_mean(x):
    """
    [매개변수]
        x: DataFrame (Group별로 나눈 DataFrame)
    [반환값]
        bool: x['cnt1']의 평균이 20이상인지 여부
    """
    return x['cnt1'].mean() >= 20

In [134]:
df.groupby('fruits').filter(check_cnt1_mean)

Unnamed: 0,fruits,cnt1,cnt2
5,귤,21,51
6,귤,22,57
7,귤,27,58
8,귤,24,57
9,귤,26,51
15,딸기,30,208
16,딸기,35,217
17,딸기,37,213
18,딸기,41,206
19,딸기,28,204


In [21]:
# 람다 표현식 - 간단한 함수를 만드는 식
# lambda [매개변수]: 처리 식
# 처리 식의 결과를 알아서 return 시켜준다. (return을 식에 넣으면 안됨)

df.groupby('fruits').filter(lambda x: x['cnt1'].mean() >= 20, dropna = False)  # dropna는 일반적으로 True(Default)
# dropna를 False로 주면, 조건이 False인 행들도 NaN으로 출력된다.

Unnamed: 0,fruits,cnt1,cnt2
0,,,
1,,,
2,,,
3,,,
4,,,
5,귤,21.0,51.0
6,귤,22.0,57.0
7,귤,27.0,58.0
8,귤,24.0,57.0
9,귤,26.0,51.0


#### 매개변수 있는 filter함수

In [22]:
def check_mean(x, col, threshold):
    """
    [매개변수]
        x: DataFrame
        col: str 평균을 구할 칼럼명
        threshold: int 비교대상 값
    """
    return x[col].mean() >= threshold

In [25]:
df.groupby('fruits').filter(check_mean, col = 'cnt2', threshold = 20)

Unnamed: 0,fruits,cnt1,cnt2
0,사과,10,100
1,사과,12,103
2,사과,13,107
3,사과,11,107
4,사과,12,101
5,귤,21,51
6,귤,22,57
7,귤,27,58
8,귤,24,57
9,귤,26,51


In [27]:
# 람다식을 사용하는 경우에는 추가 매개변수는 코드 상에 직접 값으로 입력하고 작성하면 된다.
# (항상 동일한 값이 전달되므로)

df.groupby('fruits').filter(lambda x, col, thres: x[col].mean() >= thres,
                            col = 'cnt2',
                            thres = 20)

Unnamed: 0,fruits,cnt1,cnt2
0,사과,10,100
1,사과,12,103
2,사과,13,107
3,사과,11,107
4,사과,12,101
5,귤,21,51
6,귤,22,57
7,귤,27,58
8,귤,24,57
9,귤,26,51


# transform
함수에 의해 처리된 값(반환값)으로 원래 값들을 변경(tranform) 해서 반환    
DataFrame에 Group 단위 통계량을 추가할 때 유용하다.
- `DataFrameGroupBy.transform(func, *args)`, `SeriesGroupBy..transform(func, *args)`
    - func: 매개변수로 그룹별로 Series를 받아 Series의 값들을 변환하여 (Series로)반환하는 함수객체
        - DataFrameGroupBy은 모든 컬럼의 값들을 group 별 Series로 전달한다.
    - *args: 함수에 전달할 추가 인자값이 있으면 매개변수 순서에 맞게 값을 전달한다. (위치기반 argument)
- transform() 함수를 groupby() 와 사용하면 컬럼의 각 원소들을 자신이 속한 그룹의 통계량으로 변환된 데이터셋을 생성할 수 있다.
- 컬럼의 값과 통계값을 비교해서 보거나 결측치 처리등에 사용할 수있다.

In [28]:
df.groupby('fruits').transform('mean')

Unnamed: 0,cnt1,cnt2
0,11.6,103.6
1,11.6,103.6
2,11.6,103.6
3,11.6,103.6
4,11.6,103.6
5,24.0,54.8
6,24.0,54.8
7,24.0,54.8
8,24.0,54.8
9,24.0,54.8


In [29]:
def func(x):
    """
    [매개변수]
        x: Series - 그룹별 각 컬럼들을 Series로 전달
    [반환값]
        int: max - min
    """
    return x.max() - x.min()

In [30]:
df.groupby('fruits').transform(func)

Unnamed: 0,cnt1,cnt2
0,3,7
1,3,7
2,3,7
3,3,7
4,3,7
5,6,7
6,6,7
7,6,7
8,6,7
9,6,7


In [31]:
df.groupby('fruits').transform(lambda x: x.max() - x.min())

Unnamed: 0,cnt1,cnt2
0,3,7
1,3,7
2,3,7
3,3,7
4,3,7
5,6,7
6,6,7
7,6,7
8,6,7
9,6,7


In [33]:
# 컬럼 단위로 처리 (cnt1에 대해서만 수행하는 예시)
df.groupby('fruits')['cnt1'].transform('mean')

0     11.6
1     11.6
2     11.6
3     11.6
4     11.6
5     24.0
6     24.0
7     24.0
8     24.0
9     24.0
10     5.4
11     5.4
12     5.4
13     5.4
14     5.4
15    34.2
16    34.2
17    34.2
18    34.2
19    34.2
Name: cnt1, dtype: float64

#### 원본에 통계치 붙여서 비교하기

In [36]:
# 그룹별 평균을 transform을 통해 조회

cnt1_group_mean = df.groupby('fruits')['cnt1'].transform('mean')
cnt1_group_mean

0     11.6
1     11.6
2     11.6
3     11.6
4     11.6
5     24.0
6     24.0
7     24.0
8     24.0
9     24.0
10     5.4
11     5.4
12     5.4
13     5.4
14     5.4
15    34.2
16    34.2
17    34.2
18    34.2
19    34.2
Name: cnt1, dtype: float64

In [40]:
cnt2_group_median = df.groupby('fruits')['cnt2'].transform('median')
cnt2_group_median

0     103
1     103
2     103
3     103
4     103
5      57
6      57
7      57
8      57
9      57
10      7
11      7
12      7
13      7
14      7
15    208
16    208
17    208
18    208
19    208
Name: cnt2, dtype: int64

In [38]:
# insert(): 새로운 컬럼을 중간에 넣는다.

df2 = df.copy()
df2.insert(2, 'cnt1 평균', cnt1_group_mean)  #insert(어느 열에 넣을지, 이름, 넣고자 하는 값)
df2

Unnamed: 0,fruits,cnt1,cnt1 평균,cnt2
0,사과,10,11.6,100
1,사과,12,11.6,103
2,사과,13,11.6,107
3,사과,11,11.6,107
4,사과,12,11.6,101
5,귤,21,24.0,51
6,귤,22,24.0,57
7,귤,27,24.0,58
8,귤,24,24.0,57
9,귤,26,24.0,51


In [41]:
df2['cnt2 중앙값'] = cnt2_group_median  # 값을 맨 뒤에 추가할 땐, 이런식으로 해도 가능하다.
df2

Unnamed: 0,fruits,cnt1,cnt1 평균,cnt2,cnt2 중앙값
0,사과,10,11.6,100,103
1,사과,12,11.6,103,103
2,사과,13,11.6,107,103
3,사과,11,11.6,107,103
4,사과,12,11.6,101,103
5,귤,21,24.0,51,57
6,귤,22,24.0,57,57
7,귤,27,24.0,58,57
8,귤,24,24.0,57,57
9,귤,26,24.0,51,57


In [45]:
# 행을 섞기

#df.sample(frac = 1)  # frac: 원본에서 이 비율만큼의 양만을 사용하겠다.
df3 = df.sample(frac = 1).reset_index(drop = True)
df3

Unnamed: 0,fruits,cnt1,cnt2
0,사과,11,107
1,배,3,7
2,사과,12,101
3,딸기,35,217
4,귤,22,57
5,사과,13,107
6,배,7,9
7,딸기,37,213
8,귤,21,51
9,귤,26,51


In [47]:
c1 = df3.groupby('fruits')['cnt2'].transform('mean')
c1

0     103.6
1       7.4
2     103.6
3     209.6
4      54.8
5     103.6
6       7.4
7     209.6
8      54.8
9      54.8
10     54.8
11    103.6
12      7.4
13    209.6
14      7.4
15     54.8
16    209.6
17    209.6
18      7.4
19    103.6
Name: cnt2, dtype: float64

In [48]:
c2 = df3.groupby('fruits')['cnt2'].transform('mean')
df3['cnt2 평균'] = c2
df3

Unnamed: 0,fruits,cnt1,cnt2,cnt2 평균
0,사과,11,107,103.6
1,배,3,7,7.4
2,사과,12,101,103.6
3,딸기,35,217,209.6
4,귤,22,57,54.8
5,사과,13,107,103.6
6,배,7,9,7.4
7,딸기,37,213,209.6
8,귤,21,51,54.8
9,귤,26,51,54.8


#### 결측치 처리
- transform이용해서 여기선 결측치를 같은 과일그룹의 평균값으로 변환
    - 전체 평균보다 좀더 정확할 수 있다.

In [52]:
# 결측치 변경(대체)
# series.fillna(대체할 값)
# 대체값: scalar - 동일한 값으로 대체
#        배열형태(리스트, 시리즈, ndarray) - NA의 index와 동일한 index에 있는 값으로 대체

s = pd.Series([1, np.nan, 3, np.nan, 5])
rv = pd.Series([10, 20, 30, 40, 50])

In [54]:
s.fillna(rv)  # s와 rv의 size가 동일해야한다.

0     1.0
1    20.0
2     3.0
3    40.0
4     5.0
dtype: float64

In [58]:
df4 = df.copy()
df4.loc[[0, 1, 5, 6, 10, 11, 16], 'cnt2'] = np.nan
df4

Unnamed: 0,fruits,cnt1,cnt2
0,사과,10,
1,사과,12,
2,사과,13,107.0
3,사과,11,107.0
4,사과,12,101.0
5,귤,21,
6,귤,22,
7,귤,27,58.0
8,귤,24,57.0
9,귤,26,51.0


In [60]:
# 결측치 제거
df4.dropna()

Unnamed: 0,fruits,cnt1,cnt2
2,사과,13,107.0
3,사과,11,107.0
4,사과,12,101.0
7,귤,27,58.0
8,귤,24,57.0
9,귤,26,51.0
12,배,8,5.0
13,배,3,7.0
14,배,2,7.0
15,딸기,30,208.0


In [63]:
# 결측치 대체
print(df4['cnt2'].mean())  # cnt2 전체의 평균값
df4['cnt2'].fillna(df4['cnt2'].mean())

102.38461538461539


0     102.384615
1     102.384615
2     107.000000
3     107.000000
4     101.000000
5     102.384615
6     102.384615
7      58.000000
8      57.000000
9      51.000000
10    102.384615
11    102.384615
12      5.000000
13      7.000000
14      7.000000
15    208.000000
16    102.384615
17    213.000000
18    206.000000
19    204.000000
Name: cnt2, dtype: float64

In [68]:
# 결측치 대체 (과일별로 계산 후)
c2_mean = df4.groupby('fruits')['cnt2'].transform('mean')
print(c2_mean)
print(df4['cnt2'])
df4['cnt2'].fillna(c2_mean)

0     105.000000
1     105.000000
2     105.000000
3     105.000000
4     105.000000
5      55.333333
6      55.333333
7      55.333333
8      55.333333
9      55.333333
10      6.333333
11      6.333333
12      6.333333
13      6.333333
14      6.333333
15    207.750000
16    207.750000
17    207.750000
18    207.750000
19    207.750000
Name: cnt2, dtype: float64
0       NaN
1       NaN
2     107.0
3     107.0
4     101.0
5       NaN
6       NaN
7      58.0
8      57.0
9      51.0
10      NaN
11      NaN
12      5.0
13      7.0
14      7.0
15    208.0
16      NaN
17    213.0
18    206.0
19    204.0
Name: cnt2, dtype: float64


0     105.000000
1     105.000000
2     107.000000
3     107.000000
4     101.000000
5      55.333333
6      55.333333
7      58.000000
8      57.000000
9      51.000000
10      6.333333
11      6.333333
12      5.000000
13      7.000000
14      7.000000
15    208.000000
16    207.750000
17    213.000000
18    206.000000
19    204.000000
Name: cnt2, dtype: float64

In [69]:
df4['cnt2'] = df4['cnt2'].fillna(c2)
df4

Unnamed: 0,fruits,cnt1,cnt2
0,사과,10,103.6
1,사과,12,7.4
2,사과,13,107.0
3,사과,11,107.0
4,사과,12,101.0
5,귤,21,103.6
6,귤,22,7.4
7,귤,27,58.0
8,귤,24,57.0
9,귤,26,51.0


# TODO 
- data/diamonds.csv 조회

In [142]:
# cut 별 평균 가격이 4000 이상인 diamond 데이터들 조회 
dia = pd.read_csv('data/diamonds.csv')
pr = dia.groupby('cut')['price'].mean()
pr[pr >= 4000]

cut
Fair       4358.757764
Premium    4584.257704
Name: price, dtype: float64

In [143]:
dia.groupby('cut').filter(lambda x: x['price'].mean() >= 4000)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
8,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
12,0.22,Premium,F,SI1,60.4,61.0,342,3.88,3.84,2.33
14,0.20,Premium,E,SI2,60.2,62.0,345,3.79,3.75,2.27
...,...,...,...,...,...,...,...,...,...,...
53928,0.79,Premium,E,SI2,61.4,58.0,2756,6.03,5.96,3.68
53930,0.71,Premium,E,SI1,60.5,55.0,2756,5.79,5.74,3.49
53931,0.71,Premium,F,SI1,59.8,62.0,2756,5.74,5.73,3.43
53934,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58


In [144]:
# color 별 carat의 최대값과 최소값의 차이가 2이상 3미만인 diamond 데이터들 조회
dia.groupby('color').filter(lambda x: 2 <= x['carat'].max() - x['carat'].min() < 3)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
8,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
12,0.22,Premium,F,SI1,60.4,61.0,342,3.88,3.84,2.33
...,...,...,...,...,...,...,...,...,...,...
53929,0.71,Ideal,G,VS1,61.4,56.0,2756,5.76,5.73,3.53
53930,0.71,Premium,E,SI1,60.5,55.0,2756,5.79,5.74,3.49
53931,0.71,Premium,F,SI1,59.8,62.0,2756,5.74,5.73,3.43
53932,0.70,Very Good,E,VS2,60.5,59.0,2757,5.71,5.76,3.47


In [148]:
# clarity 별 평균 가격 컬럼을 DataFrame에 추가.
df_dia = dia.copy()
df_dia['clarity_mean'] = df_dia.groupby('clarity')['price'].transform('mean')
df_dia

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,clarity_mean
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,5063.028606
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,3996.001148
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,3839.455391
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,3924.989395
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,5063.028606
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,3996.001148
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,3996.001148
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,3996.001148
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,5063.028606


# pivot_table()
엑셀의 pivot table 기능을 제공하는 메소드.    
분류별 집계(Group으로 묶어 집계)를 처리하는 함수로 group으로 묶고자 하는 컬럼을 행과 열로 위치시키고 집계값을 값으로 보여준다.    
역할은 groupby() 를 이용한 집계와 같다.

> pivot() 함수와 역할이 다르다.   
> pivot() 은 index와 column의 형태를 바꾸는 reshape 함수.

- `DataFrame.pivot_table(values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All')`
- **매개변수**
    - index
        - 문자열 또는 리스트. index로 올 컬럼들 => groupby였으면 묶었을 컬럼
    - columns
        - 문자열 또는 리스트. column으로 올 컬럼들 => groupby였으면 묶었을 컬럼 (index/columns가 묶여서 groupby에 묶을 컬럼들이 된다.)
    - values
        - 문자열 또는 리스트. 집계할 대상 컬럼들
    - aggfunc
        - 집계함수 지정. 함수, 함수이름문자열, 함수리스트(함수이름 문자열/함수객체), dict: 집계할 함수
        - 기본(생략시): 평균을 구한다. (mean이 기본값)
    - fill_value, dropna
        - fill_value: 집계시 NA가 나올경우 채울 값
        - dropna: boolean. 컬럼의 전체값이 NA인 경우 그 컬럼 제거(기본: True)
    - margins/margins_name
        - margins: boolean(기본: False). 총집계결과를 만들지 여부.
        - margins_name: margin의 이름 문자열로 지정 (생략시 All)

In [70]:
flights = pd.read_csv('data/flights.csv')
flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58492 entries, 0 to 58491
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MONTH      58492 non-null  int64  
 1   DAY        58492 non-null  int64  
 2   WEEKDAY    58492 non-null  int64  
 3   AIRLINE    58492 non-null  object 
 4   ORG_AIR    58492 non-null  object 
 5   DEST_AIR   58492 non-null  object 
 6   SCHED_DEP  58492 non-null  int64  
 7   DEP_DELAY  57659 non-null  float64
 8   AIR_TIME   57474 non-null  float64
 9   DIST       58492 non-null  int64  
 10  SCHED_ARR  58492 non-null  int64  
 11  ARR_DELAY  57474 non-null  float64
 12  DIVERTED   58492 non-null  int64  
 13  CANCELLED  58492 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 6.2+ MB


In [71]:
flights.groupby(['AIRLINE', 'WEEKDAY'])['AIR_TIME'].mean()

AIRLINE  WEEKDAY
AA       1          147.610569
         2          143.851852
         3          144.514005
         4          141.124618
         5          145.430966
                       ...    
WN       3          104.219920
         4          107.200800
         5          107.893635
         6          109.247433
         7          107.602273
Name: AIR_TIME, Length: 98, dtype: float64

### 1개의 컬럼을 grouping 해서 집계
- 항공사별 비행시간의 평균 
- 사용컬럼
    - grouping할 컬럼
        - AIRLINE: 항공사
    - 집계대상컬럼
        - AIR_TIME
- 집계: mean

In [72]:
flights.groupby('AIRLINE')['AIR_TIME'].mean()

AIRLINE
AA    144.259404
AS    147.845052
B6    209.412963
DL    115.334187
EV     68.964016
F9    127.592337
HA    338.288288
MQ     61.318346
NK    135.736878
OO     76.010272
UA    155.650521
US    147.686755
VX    154.864097
WN    107.005897
Name: AIR_TIME, dtype: float64

In [78]:
#flights.pivot_table(values = 'AIR_TIME', index = 'AIRLINE', aggfunc = 'mean')
#flights.pivot_table(values = 'AIR_TIME', columns = 'AIRLINE', aggfunc = 'mean')
#flights.pivot_table(values = 'AIR_TIME', index = 'AIRLINE', aggfunc = 'mean', margins = True)  # 총평균 (All)
flights.pivot_table(values = 'AIR_TIME', index = 'AIRLINE', aggfunc = 'mean'
                    , margins = True, margins_name = 'total mean')  # 총평균 (total mean)

Unnamed: 0_level_0,AIR_TIME
AIRLINE,Unnamed: 1_level_1
AA,144.259404
AS,147.845052
B6,209.412963
DL,115.334187
EV,68.964016
F9,127.592337
HA,338.288288
MQ,61.318346
NK,135.736878
OO,76.010272


In [79]:
flights.pivot_table(values = 'AIR_TIME', index = 'AIRLINE', aggfunc = ['count', 'mean', 'sum'])

Unnamed: 0_level_0,count,mean,sum
Unnamed: 0_level_1,AIR_TIME,AIR_TIME,AIR_TIME
AIRLINE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
AA,8720,144.259404,1257942.0
AS,768,147.845052,113545.0
B6,540,209.412963,113083.0
DL,10539,115.334187,1215507.0
EV,5697,68.964016,392888.0
F9,1305,127.592337,166508.0
HA,111,338.288288,37550.0
MQ,3314,61.318346,203209.0
NK,1486,135.736878,201705.0
OO,6425,76.010272,488366.0


### 두개의 컬럼을 grouping 해서 집계
- 항공사/출발공항코드 별 취소 총수 (1이 취소이므로 합계를 구한다.)
- 사용컬럼
    - grouping할 컬럼
        - AIRLINE: 항공사
        - ORG_AIR: 출발 공항코드
    - 집계대상컬럼
        - CANCELLED: 취소여부 - 1:취소, 0: 취소안됨
- 집계: sum

In [82]:
flights.groupby(['AIRLINE', 'ORG_AIR'])['CANCELLED'].sum()

AIRLINE  ORG_AIR
AA       ATL         3
         DEN         4
         DFW        86
         IAH         3
         LAS         3
                    ..
WN       LAS         7
         LAX        32
         MSP         1
         PHX         6
         SFO        25
Name: CANCELLED, Length: 114, dtype: int64

In [86]:
flights.pivot_table(values = 'CANCELLED', 
                    index = 'AIRLINE', 
                    columns = 'ORG_AIR',
                    aggfunc = 'sum', 
                    fill_value = -1  # NaN을 -1로 대체 (fill_value)
                   )  
# 결측치(NaN)이 결과에 나오는 것은 (행, 열)그룹으로 묶인 데이터가 1개도 없기 때문이다.

ORG_AIR,ATL,DEN,DFW,IAH,LAS,LAX,MSP,ORD,PHX,SFO
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AA,3,4,86,3,3,11,3,35,4,2
AS,0,0,0,0,0,0,0,0,0,0
B6,-1,0,0,-1,0,0,-1,0,0,1
DL,28,1,0,0,1,1,4,0,1,2
EV,18,6,27,36,-1,-1,6,53,0,-1
F9,0,2,1,0,1,1,1,4,0,0
HA,-1,-1,-1,-1,0,0,-1,-1,0,0
MQ,5,-1,62,0,-1,0,0,85,-1,-1
NK,1,1,6,0,1,1,3,10,2,-1
OO,3,25,2,10,0,15,4,41,9,33


In [88]:
flights.pivot_table(values = ['CANCELLED', 'AIR_TIME'],  # value를 2개 이상으로 하면 옆으로 붙는다.
                    index = 'AIRLINE',
                    columns = 'ORG_AIR',
                    aggfunc = 'sum',
                    margins = True,  # All 행, 열이 추가
                    margins_name = 'total_sum'  # All을 total_sum으로 변경
                   )

Unnamed: 0_level_0,AIR_TIME,AIR_TIME,AIR_TIME,AIR_TIME,AIR_TIME,AIR_TIME,AIR_TIME,AIR_TIME,AIR_TIME,AIR_TIME,...,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED
ORG_AIR,ATL,DEN,DFW,IAH,LAS,LAX,MSP,ORD,PHX,SFO,...,DEN,DFW,IAH,LAS,LAX,MSP,ORD,PHX,SFO,total_sum
AIRLINE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AA,27074.0,24832.0,499575.0,18740.0,54814.0,205473.0,17267.0,211361.0,127960.0,70846.0,...,4.0,86.0,3.0,3.0,11.0,3.0,35.0,4.0,2.0,0
AS,3841.0,7306.0,10426.0,1837.0,15787.0,36277.0,2979.0,13270.0,9948.0,11874.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
B6,,5065.0,3922.0,,19996.0,36468.0,,9325.0,4678.0,33629.0,...,0.0,0.0,,0.0,0.0,,0.0,0.0,1.0,0
DL,617926.0,30265.0,14886.0,9833.0,51365.0,147066.0,232860.0,16318.0,37556.0,57432.0,...,1.0,0.0,0.0,1.0,1.0,4.0,0.0,1.0,2.0,0
EV,103710.0,13953.0,40386.0,127934.0,,,17328.0,89444.0,133.0,,...,6.0,27.0,36.0,,,6.0,53.0,0.0,,0
F9,17138.0,76089.0,4142.0,6421.0,16785.0,5800.0,4737.0,21787.0,6331.0,7278.0,...,2.0,1.0,0.0,1.0,1.0,1.0,4.0,0.0,0.0,0
HA,,,,,7846.0,15952.0,,,7275.0,6477.0,...,,,,0.0,0.0,,,0.0,0.0,0
MQ,5244.0,,99593.0,91.0,,631.0,235.0,97415.0,,,...,,62.0,0.0,,0.0,0.0,85.0,,,0
NK,12023.0,13317.0,35950.0,18000.0,48125.0,20761.0,11830.0,37433.0,4266.0,,...,1.0,6.0,0.0,1.0,1.0,3.0,10.0,2.0,,0
OO,12915.0,106038.0,13134.0,45807.0,5896.0,74904.0,52957.0,71114.0,34590.0,71011.0,...,25.0,2.0,10.0,0.0,15.0,4.0,41.0,9.0,33.0,0


### 3개 이상의 컬럼을 grouping해서 집계
- 항공사/월/출발공항코드 별 취소 총수 
- grouping할 컬럼
    - AIRLINE:항공사
    - MONTH:월
    - ORG_AIR: 출발지 공항
- 집계 대상컬럼
    - CANCELLED: 취소여부
- 집계 : sum    

In [89]:
flights.groupby(['AIRLINE', 'MONTH', 'ORG_AIR'])["CANCELLED"].sum()

AIRLINE  MONTH  ORG_AIR
AA       1      ATL        0
                DEN        0
                DFW        8
                IAH        0
                LAS        0
                          ..
WN       12     LAS        1
                LAX        2
                MSP        0
                PHX        0
                SFO        0
Name: CANCELLED, Length: 1133, dtype: int64

In [94]:
result = flights.pivot_table(values = 'CANCELLED',
                            index = ['AIRLINE', 'ORG_AIR'],
                            columns = 'MONTH',
                            aggfunc = 'sum',
                            margins = True,
                            margins_name = 'total sum'
                            )
result.to_excel('flights_stat.xlsx')

In [95]:
result

Unnamed: 0_level_0,MONTH,1,2,3,4,5,6,7,8,9,11,12,total sum
AIRLINE,ORG_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AA,ATL,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
AA,DEN,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,4
AA,DFW,8.0,33.0,13.0,4.0,8.0,7.0,1.0,2.0,1.0,3.0,6.0,86
AA,IAH,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3
AA,LAS,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
WN,LAX,3.0,2.0,3.0,2.0,1.0,0.0,9.0,4.0,3.0,3.0,2.0,32
WN,MSP,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
WN,PHX,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,6
WN,SFO,4.0,5.0,0.0,2.0,2.0,6.0,0.0,2.0,2.0,2.0,0.0,25


### 3개 이상의 컬럼을 grouping해서 집계 2
- 항공사/월/출발공항코드 별 최대/최소 연착시간
- grouping할 컬럼
    - AIRLINE:항공사
    - MONTH:월
    - ORG_AIR: 출발지 공항
- 집계 대상컬럼
    - ARR_DELAY: 연착시간
- 집계 : min, max    

In [96]:
flights.pivot_table(values = 'ARR_DELAY',
                    index = ['AIRLINE', 'ORG_AIR'],
                    columns = 'MONTH',
                    aggfunc = ['min', 'max'],
                    margins = True,
                    margins_name = 'total_sum'
                   )

Unnamed: 0_level_0,Unnamed: 1_level_0,min,min,min,min,min,min,min,min,min,min,...,max,max,max,max,max,max,max,max,max,max
Unnamed: 0_level_1,MONTH,1,2,3,4,5,6,7,8,9,11,...,3,4,5,6,7,8,9,11,12,total_sum
AIRLINE,ORG_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
AA,ATL,-27.0,-26.0,-32.0,-30.0,-26.0,-23.0,-27.0,-32.0,-33.0,-33.0,...,25.0,115.0,25.0,159.0,319.0,84.0,196.0,255.0,203.0,319.0
AA,DEN,-13.0,-17.0,-19.0,-28.0,-20.0,-13.0,-30.0,-27.0,-27.0,-27.0,...,53.0,105.0,330.0,10.0,67.0,257.0,152.0,146.0,106.0,330.0
AA,DFW,-39.0,-29.0,-29.0,-37.0,-36.0,-33.0,-32.0,-32.0,-33.0,-45.0,...,234.0,275.0,285.0,602.0,203.0,268.0,241.0,349.0,293.0,602.0
AA,IAH,-23.0,-27.0,-13.0,-27.0,-19.0,-13.0,-19.0,-30.0,-31.0,-14.0,...,97.0,127.0,131.0,456.0,858.0,95.0,73.0,98.0,103.0,858.0
AA,LAS,-32.0,-25.0,-24.0,-19.0,-31.0,-27.0,-40.0,-28.0,-32.0,-31.0,...,111.0,626.0,54.0,206.0,157.0,157.0,36.0,89.0,219.0,732.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WN,LAX,-25.0,-27.0,-25.0,-37.0,-26.0,-30.0,-25.0,-20.0,-30.0,-28.0,...,104.0,165.0,244.0,244.0,248.0,221.0,157.0,87.0,493.0,493.0
WN,MSP,-38.0,-32.0,-30.0,-33.0,-24.0,-24.0,-28.0,-21.0,-23.0,-29.0,...,43.0,64.0,13.0,88.0,84.0,45.0,23.0,114.0,90.0,128.0
WN,PHX,-38.0,-45.0,-24.0,-43.0,-27.0,-25.0,-29.0,-24.0,-25.0,-41.0,...,78.0,130.0,203.0,168.0,171.0,161.0,131.0,244.0,254.0,254.0
WN,SFO,-19.0,-24.0,-29.0,-29.0,-22.0,-26.0,-30.0,-22.0,-33.0,-23.0,...,237.0,72.0,133.0,205.0,51.0,65.0,69.0,143.0,215.0,237.0


# apply() - Series, DataFrame의 데이터 일괄 처리

데이터프레임의 행들과 열들 또는 Series의 원소들에 공통된 처리를 할 때 apply 함수를 이용하면 반복문을 사용하지 않고 일괄 처리가 가능하다.

- DataFrame.apply(함수, axis=0, args=())
    - 인수로 행이나 열을 받는 함수를 apply 메서드의 인수로 넣으면 데이터프레임의 행이나 열들을 하나씩 함수에 전달한다.
    - 매개변수
        - 함수: DataFrame의 행들 또는 열들을 전달할 함수
        - axis: **0-열을 전달, 1-행을 전달 (기본값 0)** G: 0이 행이다...
        - args: 행/열 이외에 전달할 매개변수를 위치기반(순서대로) 튜플로 전달
- Series.apply(함수, args=())
    - 인수로 Series의 원소들을 받는 함수를 apply 메소드의 인수로 넣으면  Series의 원소들을 하나씩 함수로 전달한다.
    - 매개변수
        - 함수: Series의 원소들을 전달할 함수
        - args: 원소 이외에 전달할 매개변수를 위치기반(순서대로) 튜플로 전달

In [97]:
df = pd.DataFrame(np.arange(24).reshape(6, 4), columns = ['NO1', 'NO2', 'NO3', 'NO4'])
df

Unnamed: 0,NO1,NO2,NO3,NO4
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
5,20,21,22,23


In [98]:
def func(x):
    """
    apply()에 적용할 함수
    [매개변수]
        x: DataFrame에 적용 시 - Series / Series에 적용 시 - int Scalar
    [반환값]
        x를 처리한 결과를 반환 
    """
    print(type(x))
    return x**2

In [105]:
df.apply(func)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


Unnamed: 0,NO1,NO2,NO3,NO4
0,0,1,4,9
1,16,25,36,49
2,64,81,100,121
3,144,169,196,225
4,256,289,324,361
5,400,441,484,529


In [100]:
# Series
df['NO1'].apply(func)

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>


0      0
1     16
2     64
3    144
4    256
5    400
Name: NO1, dtype: int64

In [101]:
df.apply(lambda x: x**2)

Unnamed: 0,NO1,NO2,NO3,NO4
0,0,1,4,9
1,16,25,36,49
2,64,81,100,121
3,144,169,196,225
4,256,289,324,361
5,400,441,484,529


In [104]:
df.apply(func, axis = 1)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


Unnamed: 0,NO1,NO2,NO3,NO4
0,0,1,4,9
1,16,25,36,49
2,64,81,100,121
3,144,169,196,225
4,256,289,324,361
5,400,441,484,529


In [106]:
df.apply(lambda x: x.mean())

NO1    10.0
NO2    11.0
NO3    12.0
NO4    13.0
dtype: float64

In [108]:
df['NO4'] = df['NO4'].apply(lambda x: f'{x}원')
df

Unnamed: 0,NO1,NO2,NO3,NO4
0,0,1,2,3원
1,4,5,6,7원
2,8,9,10,11원
3,12,13,14,15원
4,16,17,18,19원
5,20,21,22,23원


# cut()/qcut() - 연속형(실수)을 범주형으로 변환
- cut() : 지정한 값을 기준으로 구간을 나눠 그룹으로 묶는다.
    - `pd.cut(x, bins,right=True, labels=None)`
    - 매개변수
        - x: 나눌 대상. 1차원 배열형태의 자료구조
        - bins: 나누는 기준값(구간경계)을 리스트로 전달
        - right: 구간경계의 오른쪽(True)을 포함할지 왼쪽(False)을 포함할지
        - labels: 각 구간의 label을 리스트로 전달
- qcut() : 데이터를 오름차순으로 정렬한 뒤 데이터 개수가 같도록 지정한 개수만큼의 구간으로 나눈다.
    - `pd.qcut(x, q, labels)`
    - 매개변수
        - x: 나눌 대상. 1차원 배열형태의 자료구조
        - q: 나눌 개수

In [110]:
ages = pd.Series(np.random.randint(50, size = 30))
ages.shape

(30,)

In [111]:
ages.value_counts()

5     2
40    2
22    2
20    2
9     2
18    2
10    2
14    1
46    1
2     1
6     1
44    1
45    1
16    1
12    1
30    1
17    1
42    1
25    1
27    1
28    1
29    1
33    1
dtype: int64

In [120]:
bins = [-1, 10, 20, 30, 40, 51]  # 시작과 끝범주가 빠지기 때문에, 작은값과 큰값도 설정해주어야 한다.
pd.cut(ages, bins = bins, right = False)

# (): 포함 X / []: 포함 O
# ex) (10, 20] -> 10 초과 20 이하 (right = True)
# ex) [10, 20) -> 10 이상 20 미만 (right = False)

0     [-1, 10)
1     [-1, 10)
2     [30, 40)
3     [20, 30)
4     [20, 30)
5     [10, 20)
6     [40, 51)
7     [10, 20)
8     [20, 30)
9     [10, 20)
10    [40, 51)
11    [-1, 10)
12    [10, 20)
13    [40, 51)
14    [10, 20)
15    [10, 20)
16    [30, 40)
17    [-1, 10)
18    [20, 30)
19    [20, 30)
20    [20, 30)
21    [40, 51)
22    [-1, 10)
23    [40, 51)
24    [10, 20)
25    [20, 30)
26    [-1, 10)
27    [40, 51)
28    [20, 30)
29    [10, 20)
dtype: category
Categories (5, interval[int64]): [[-1, 10) < [10, 20) < [20, 30) < [30, 40) < [40, 51)]

In [122]:
bins = [-1, 10, 20, 30, 40, 51] 
age_cate = pd.cut(ages, bins = bins, right = False)
print(age_cate.shape)
print(age_cate.value_counts())

(30,)
[20, 30)    8
[10, 20)    8
[40, 51)    6
[-1, 10)    6
[30, 40)    2
dtype: int64


In [125]:
label = ['10대 이하', '10대', '20대', '30대', '40대']  # 각 범주명 지정
age_cate = pd.cut(ages, bins = bins, right = False, labels = label)
age_cate

0     10대 이하
1     10대 이하
2        30대
3        20대
4        20대
5        10대
6        40대
7        10대
8        20대
9        10대
10       40대
11    10대 이하
12       10대
13       40대
14       10대
15       10대
16       30대
17    10대 이하
18       20대
19       20대
20       20대
21       40대
22    10대 이하
23       40대
24       10대
25       20대
26    10대 이하
27       40대
28       20대
29       10대
dtype: category
Categories (5, object): [10대 이하 < 10대 < 20대 < 30대 < 40대]

In [129]:
df_age = pd.DataFrame({'나이': ages, '나이대': age_cate})
df_age

Unnamed: 0,나이,나이대
0,6,10대 이하
1,9,10대 이하
2,30,30대
3,20,20대
4,22,20대
5,18,10대
6,40,40대
7,17,10대
8,20,20대
9,10,10대


In [130]:
df_age.groupby('나이대')['나이'].mean()

나이대
10대 이하     6.000000
10대       14.375000
20대       24.125000
30대       31.500000
40대       42.833333
Name: 나이, dtype: float64

In [127]:
pd.qcut(ages, 3, labels = ['범주1', '범주2', '범주3'])  # 3개로 범주를 나눈다.

0     범주1
1     범주1
2     범주3
3     범주2
4     범주2
5     범주2
6     범주3
7     범주2
8     범주2
9     범주1
10    범주3
11    범주1
12    범주2
13    범주3
14    범주1
15    범주1
16    범주3
17    범주1
18    범주2
19    범주3
20    범주3
21    범주3
22    범주1
23    범주3
24    범주2
25    범주2
26    범주1
27    범주3
28    범주2
29    범주1
dtype: category
Categories (3, object): [범주1 < 범주2 < 범주3]

In [1]:
import pandas as pd
dia_df = pd.read_csv('data/diamonds.csv')
dia_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [4]:
print(dia_df.price.min(), dia_df.price.max())
dia_df.price.value_counts()

326 18823


605      132
802      127
625      126
828      125
776      124
        ... 
13550      1
13014      1
6811       1
5354       1
11600      1
Name: price, Length: 11602, dtype: int64

In [16]:
# 가격: 저가, 중가, 고가

bins = [300, 1000, 5000, 20000]
price_cate = pd.cut(dia_df.price, bins = bins, labels = ['저가', '중가', '고가'])
price_cate

0        저가
1        저가
2        저가
3        저가
4        저가
         ..
53935    중가
53936    중가
53937    중가
53938    중가
53939    중가
Name: price, Length: 53940, dtype: category
Categories (3, object): [저가 < 중가 < 고가]

In [9]:
price_cate2 = pd.qcut(dia_df.price, 3, labels = ['저가', '중가', '고가'], retbins = True)
# retbins를 통해 어느 구간에서 저가, 중가, 고가로 나뉘었는지를 반환
price_cate2

(0        저가
 1        저가
 2        저가
 3        저가
 4        저가
          ..
 53935    중가
 53936    중가
 53937    중가
 53938    중가
 53939    중가
 Name: price, Length: 53940, dtype: category
 Categories (3, object): [저가 < 중가 < 고가],
 array([  326.        ,  1240.        ,  4287.33333333, 18823.        ]))

In [10]:
dia_df['price_cate'] = price_cate
dia_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_cate
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,저가
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,저가
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,저가
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,저가
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,저가
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,중가
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,중가
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,중가
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,중가


In [11]:
# 가격대별 집계
dia_df.groupby('price_cate')['carat'].mean()

price_cate
저가    0.334937
중가    0.712814
고가    1.397875
Name: carat, dtype: float64

In [13]:
# 가격대, cut 별 가격의 평균
dia_df.pivot_table(values = 'price', index = 'cut', columns = 'price_cate',
                   aggfunc = 'mean', margins = True)

price_cate,저가,중가,고가,All
cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fair,776.720721,2845.605976,9074.158879,4358.757764
Good,644.526079,2915.155503,8957.215149,3928.864452
Ideal,733.002773,2362.558209,9335.894082,3457.54197
Premium,734.418851,2729.378705,9507.518974,4584.257704
Very Good,650.268946,2793.512835,9169.034999,3981.759891
All,708.38488,2623.613796,9313.451611,3932.799722


In [14]:
dia_df.groupby(['cut', 'price_cate'])['price'].mean()

cut        price_cate
Fair       저가             776.720721
           중가            2845.605976
           고가            9074.158879
Good       저가             644.526079
           중가            2915.155503
           고가            8957.215149
Ideal      저가             733.002773
           중가            2362.558209
           고가            9335.894082
Premium    저가             734.418851
           중가            2729.378705
           고가            9507.518974
Very Good  저가             650.268946
           중가            2793.512835
           고가            9169.034999
Name: price, dtype: float64