## 1.그룹 객체 만들기(분할)

### 1-1.1개 열을 기준으로 그룹화

In [1]:
import pandas as pd
import seaborn as sns

# Load the Titanic sample data and keep only the columns used in this section.
titanic = sns.load_dataset('titanic')
df = titanic.loc[:, ['age', 'sex', 'class', 'fare', 'survived']]
print(f"승객수: {len(df)}\n{df.head()}\n")

# Split on the class column.
# A scalar key (instead of a one-element list) keeps the group keys scalar:
# with ['class'], pandas 2.x yields 1-tuples such as ('First',) on iteration
# and deprecates get_group('Third') with a scalar.
grouped = df.groupby('class')
print(grouped,'\n')

# Iterate over the GroupBy object: each item is a (key, sub-DataFrame) pair.
for key, group in grouped:
    print(f"key: {key}\nnumber: {len(group)}")
    print(group.head(),'\n')

# Per-group mean.
# 'sex' holds strings, so the reduction is restricted to numeric columns;
# pandas 2.x raises TypeError here instead of silently dropping the column.
average = grouped.mean(numeric_only=True)
print(f"그룹별 평균:\n{average}\n")

# Select the third-class group.
group3 = grouped.get_group('Third')
print(f"group3:\n{group3.head()}")

승객수: 891
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f2d1e2096d0> 

key: First
number: 216
     age     sex  class     fare  survived
1   38.0  female  First  71.2833         1
3   35.0  female  First  53.1000         1
6   54.0    male  First  51.8625         0
11  58.0  female  First  26.5500         1
23  28.0    male  First  35.5000         1 

key: Second
number: 184
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1 

key: Third
number: 491
    age     sex  class     fare  survived
0  22.0    male  Third   7

### 1-2.여러 열을 기준으로 그룹화

In [2]:
import pandas as pd
import seaborn as sns

# Reload the Titanic data and narrow it to the five columns of interest.
titanic = sns.load_dataset('titanic')
df = titanic[['age', 'sex', 'class', 'fare', 'survived']]
print(f"승객수: {len(df)}\n{df.head()}\n")

# Split on the (class, sex) pair; every group key is a 2-tuple.
grouped_two = df.groupby(['class', 'sex'])
print(grouped_two,'\n')

# Walk the groups, showing each key, its size and its first rows.
for pair_key, members in grouped_two:
    print(f"key: {pair_key}\nnumber: {len(members)}")
    print(members.head(),'\n')

# Mean of the remaining columns within each (class, sex) group.
average_two = grouped_two.mean()
print(f"그룹별 평균:\n{average_two}\n")

# Pick a single group by passing its key as a tuple: class 'Third', sex 'female'.
group3f = grouped_two.get_group(('Third', 'female'))
print(f"group3f:\n{group3f.head()}")

승객수: 891
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f2d10798710> 

key: ('First', 'female')
number: 94
     age     sex  class      fare  survived
1   38.0  female  First   71.2833         1
3   35.0  female  First   53.1000         1
11  58.0  female  First   26.5500         1
31   NaN  female  First  146.5208         1
52  49.0  female  First   76.7292         1 

key: ('First', 'male')
number: 122
     age   sex  class      fare  survived
6   54.0  male  First   51.8625         0
23  28.0  male  First   35.5000         1
27  19.0  male  First  263.0000         0
30  40.0  male  First   27.7208         0
34  28.0  male  First   82.1708         0 

key: ('Second', 'female')
number: 76
     age     sex   class     fare  s

## 2.그룹 연산 메소드(적용-결합)

### 2-1.데이터 집계


In [3]:
import pandas as pd
import seaborn as sns

# Load the Titanic sample data and keep only the columns used in this section.
titanic = sns.load_dataset('titanic')
df = titanic.loc[:, ['age', 'sex', 'class', 'fare', 'survived']]
print(f"승객수: {len(df)}\n{df.head()}\n")

# Split on the class column (scalar key so group keys stay scalar).
grouped = df.groupby('class')

# Per-group standard deviation.
# 'sex' holds strings and has no standard deviation, so only the numeric
# columns are selected; pandas 2.x raises TypeError rather than dropping it.
std_all = grouped[['age', 'fare', 'survived']].std()
print(f"std_all:\n{std_all}\n")

# Standard deviation of the fare column only (returns a Series).
fare_std = grouped['fare'].std()
print(f"fare_std:\n{fare_std}\n")

# User-defined aggregation function to be passed to agg().
def min_max(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` must provide ``max()`` and ``min()`` methods whose results can be
    subtracted (for example a pandas Series or DataFrame).
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

# Apply the user-defined range function per group.
# min_max subtracts values, which is undefined for the string column 'sex',
# so only the numeric columns are aggregated.
agg_minmax = grouped[['age', 'fare', 'survived']].agg(min_max)
print(f"agg_minmax:\n{agg_minmax}\n")

# Apply the same list of functions to every column
# (min/max are also defined for the string column 'sex').
agg_all = grouped.agg(['min', 'max'])
print(f"agg_all:\n{agg_all}\n")

# Apply different functions to different columns via a dict.
agg_sep = grouped.agg({'fare':'mean', 'age':['min', 'max']})
print(f"agg_sep:\n{agg_sep}")

승객수: 891
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0

std_all:
              age       fare  survived
class                                 
First   14.802856  78.380373  0.484026
Second  14.001077  13.417399  0.500623
Third   12.495398  11.778142  0.428949

fare_std:
class
First     78.380373
Second    13.417399
Third     11.778142
Name: fare, dtype: float64

aff_minmax:
          age      fare  survived
class                            
First   79.08  512.3292         1
Second  69.33   73.5000         1
Third   73.58   69.5500         1

agg_all:
         age           sex       fare           survived    
         min   max     min   max  min       max      min max
class                                                       
First   0.92  80.0  female  male  0.0  512.3292       

### 2-2.그룹 연산 데이터 변환

In [4]:
import pandas as pd
import seaborn as sns

# Load the Titanic sample data and keep only the columns used in this section.
titanic = sns.load_dataset('titanic')
df = titanic.loc[:, ['age', 'sex', 'class', 'fare', 'survived']]
print(f"승객수: {len(df)}\n{df.head()}\n")

# Split on the class column.
# A scalar key keeps the iteration keys scalar ('First', ...), which the
# lookups age_mean[key] / age_std[key] below rely on; a one-element list
# yields 1-tuples on pandas 2.x.
grouped = df.groupby('class')

# Mean of the age column per group.
age_mean = grouped.age.mean()
print(f"age_mean:\n{age_mean}\n")

# Standard deviation of the age column per group.
age_std = grouped.age.std()
print(f"age_std:\n{age_std}\n")

# Compute the z-score group by group.
# The result gets its own name so it does not collide with the z_score
# function defined later in this cell.
for key, group in grouped.age:
    print(f"key: {key}")
    group_z = (group - age_mean[key])/age_std[key]
    print(f"z-score:\n{group_z.head()}\n")

# User-defined function: z-score.
def z_score(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation.

    ``x`` must provide ``mean()`` and ``std()`` and support arithmetic with
    their results (for example a pandas Series).
    """
    centered = x - x.mean()
    return centered / x.std()

# Use transform: the result is aligned to the original row index order.
# The result is stored under its own name; assigning it to `z_score` would
# overwrite the function and break any later call that passes z_score.
age_zscore = grouped.age.transform(z_score)
print(f"z-score:\n{age_zscore.head()}")

승객수: 891
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0

age_mean:
class
First     38.233441
Second    29.877630
Third     25.140620
Name: age, dtype: float64

age_std:
class
First     14.802856
Second    14.001077
Third     12.495398
Name: age, dtype: float64

key: First
z-score:
1    -0.015770
3    -0.218434
6     1.065103
11    1.335321
23   -0.691315
Name: age, dtype: float64

key: Second
z-score:
9    -1.134029
15    1.794317
17         NaN
20    0.365855
21    0.294432
Name: age, dtype: float64

key: Third
z-score:
0   -0.251342
2    0.068776
4    0.789041
5         NaN
7   -1.851931
Name: age, dtype: float64

z-score:
0   -0.251342
1   -0.015770
2    0.068776
3   -0.218434
4    0.789041
Name: age, dtype: float64


### 2-3.그룹 객체 필터링

In [5]:
import pandas as pd
import seaborn as sns

# Load the Titanic sample data and keep only the columns used in this section.
titanic = sns.load_dataset('titanic')
df = titanic.loc[:, ['age', 'sex', 'class', 'fare', 'survived']]
print(f"승객수: {len(df)}\n{df.head()}\n")

# Split on the class column (scalar key so group keys stay scalar).
grouped = df.groupby('class')

# Keep only the rows of groups that contain 200 or more records.
# The printed label now says "이상" (or more) to match the >= condition.
grouped_filter = grouped.filter(lambda x:len(x) >= 200)
print(f"그룹별 개수가 200개 이상:\n{grouped_filter.head()}\ntype: {type(grouped_filter)}\n")

# Keep only the rows of groups whose mean age is 30 or below.
age_filter = grouped.filter(lambda x:x.age.mean() <= 30)
print(f"그룹별 평균이 30이하:\n{age_filter.tail()}\ntype: {type(age_filter)}\n")

승객수: 891
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0

그룹별 개수가 200개 이하:
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0
type: <class 'pandas.core.frame.DataFrame'>

그룹별 평균이 30이하:
      age     sex   class    fare  survived
884  25.0    male   Third   7.050         0
885  39.0  female   Third  29.125         0
886  27.0    male  Second  13.000         0
888   NaN  female   Third  23.450         0
890  32.0    male   Third   7.750         0
type: <class 'pandas.core.frame.DataFrame'>



### 2-4.그룹 객체에 함수 매핑

In [6]:
import pandas as pd
import seaborn as sns

# Load the Titanic sample data and keep only the columns used in this section.
titanic = sns.load_dataset('titanic')
df = titanic.loc[:, ['age', 'sex', 'class', 'fare', 'survived']]
print(f"승객수: {len(df)}\n{df.head()}\n")

# Split on the class column.
# `grouped` is reused by the following cells, which look groups up with
# scalar keys (get_group(idx)); grouping by a scalar key rather than a
# one-element list keeps those lookups valid on pandas 2.x.
grouped = df.groupby('class')

# Map describe() over every group; the result is indexed by (class, statistic).
grouped_describe = grouped.apply(lambda x:x.describe())
print(f"grouped_describe:\n{grouped_describe}")

승객수: 891
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0

grouped_describe:
                     age        fare    survived
class                                           
First  count  186.000000  216.000000  216.000000
       mean    38.233441   84.154687    0.629630
       std     14.802856   78.380373    0.484026
       min      0.920000    0.000000    0.000000
       25%     27.000000   30.923950    0.000000
       50%     37.000000   60.287500    1.000000
       75%     49.000000   93.500000    1.000000
       max     80.000000  512.329200    1.000000
Second count  173.000000  184.000000  184.000000
       mean    29.877630   20.662183    0.472826
       std     14.001077   13.417399    0.500623
       min      0.670000    0.000000    0.000000
       25%     23.000000   13.0000

In [7]:
# User-defined function: z-score.
def z_score(x):
    """Standardize ``x``: subtract its mean, then divide by its standard deviation.

    ``x`` must provide ``mean()`` and ``std()`` and support arithmetic with
    their results (for example a pandas Series).
    """
    deviation = x - x.mean()
    return deviation / x.std()

# Map the z-score function over the age values of each class group.
age_zscore = grouped['age'].apply(z_score)
print(f"age_zscore:\n{age_zscore.head()}")

age_zscore:
0   -0.251342
1   -0.015770
2    0.068776
3   -0.218434
4    0.789041
Name: age, dtype: float64


In [8]:
# Flag each group whose mean age is 30 or below, then print the flagged groups.
# The comparison is inclusive (<=) to match the stated "30 or below" criterion
# and the filter() example earlier in the notebook.
age_filter = grouped.apply(lambda x:x.age.mean() <= 30)
print(f"age_filter:\n{age_filter}\n")

# age_filter is a boolean Series indexed by group key; walk (key, flag) pairs
# and fetch the rows of every group whose flag is set.
for idx, keep in age_filter.items():
    if keep:
        age_filter_df = grouped.get_group(idx)
        print(f"group:{idx}\n{age_filter_df.head()}\n")

age_filter:
class
First     False
Second     True
Third      True
dtype: bool

group:Second
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1

group:Third
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0

