### https://pandas.pydata.org/

In [1]:
import numpy as np # NumPy
import pandas as pd # Pandas

# Adjust the pandas display options used by IPython (Jupyter notebook)
# Upper limit on the number of columns to print
pd.set_option( 'display.max_columns', 20 )
# Width of each printed column
pd.set_option( 'display.max_colwidth', 20 )
# Adjust column widths for East Asian (wide) Unicode characters
pd.set_option( 'display.unicode.east_asian_width', True )

# Suppress warning messages
# NOTE: 'ignore' with no category/module filter silences every warning,
# including deprecation notices from pandas itself.
import warnings
warnings.filterwarnings( 'ignore' )

## 데이터 프레임 연산
### 데이터 프레임 vs 숫자

In [2]:
# Column definitions for the sample frame: five columns of three values each
d = dict(
    c0=[1, 2, 3],
    c1=[4, 5, 6],
    c2=[7, 8, 9],
    c3=[10, 11, 12],
    c4=[13, 14, 15],
)
# Label the rows r0..r2 instead of using the default integer index
df = pd.DataFrame(data=d, index=[f'r{n}' for n in range(3)])
df

Unnamed: 0,c0,c1,c2,c3,c4
r0,1,4,7,10,13
r1,2,5,8,11,14
r2,3,6,9,12,15


In [3]:
add = df + 100
add

Unnamed: 0,c0,c1,c2,c3,c4
r0,101,104,107,110,113
r1,102,105,108,111,114
r2,103,106,109,112,115


### 데이터프레임 vs 데이터프레임

In [4]:
subtract = add - df
subtract

Unnamed: 0,c0,c1,c2,c3,c4
r0,100,100,100,100,100
r1,100,100,100,100,100
r2,100,100,100,100,100


### 외부 파일 읽기

In [8]:
# NOTE: no header argument is passed here, so the first data row of the file
# ends up as the column labels (visible in the output below).
df = pd.read_csv('./data/auto-mpg.csv')
# reference = https://archive.ics.uci.edu/ml/datasets/auto+mpg
df.head(10)
# Columns: mpg, cylinders, displacement, horsepower, weight, acceleration,
# model year, origin (named 'origin' when columns are assigned later in this notebook), car name

Unnamed: 0,18.0,8,307.0,130.0,3504.,12.0,70,1,chevrolet chevelle malibu
0,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
1,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
2,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
3,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
4,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500
5,14.0,8,454.0,220.0,4354.0,9.0,70,1,chevrolet impala
6,14.0,8,440.0,215.0,4312.0,8.5,70,1,plymouth fury iii
7,14.0,8,455.0,225.0,4425.0,10.0,70,1,pontiac catalina
8,15.0,8,390.0,190.0,3850.0,8.5,70,1,amc ambassador dpl
9,15.0,8,383.0,170.0,3563.0,10.0,70,1,dodge challenger se


### header 인자

In [9]:
df = pd.read_csv('./data/auto-mpg.csv', header = 1)
df.head(10)

Unnamed: 0,15.0,8,350.0,165.0,3693.,11.5,70,1,buick skylark 320
0,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
1,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
2,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
3,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500
4,14.0,8,454.0,220.0,4354.0,9.0,70,1,chevrolet impala
5,14.0,8,440.0,215.0,4312.0,8.5,70,1,plymouth fury iii
6,14.0,8,455.0,225.0,4425.0,10.0,70,1,pontiac catalina
7,15.0,8,390.0,190.0,3850.0,8.5,70,1,amc ambassador dpl
8,15.0,8,383.0,170.0,3563.0,10.0,70,1,dodge challenger se
9,14.0,8,340.0,160.0,3609.0,8.0,70,1,plymouth 'cuda 340


### header = None

In [18]:
df = pd.read_csv('./data/auto-mpg.csv', header = None)

df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevel...
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


### columns 입력

In [14]:
df = pd.read_csv('./data/auto-mpg.csv', header = None)
df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower',
             'weight','acceleration','model_year','origin','name']
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevel...
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


### index_col = 0    0번 열을 인덱스로 사용

In [17]:
df = pd.read_csv('./data/auto-mpg.csv', header = None, index_col = 0)
df.columns = ['cylinders', 'displacement', 'horsepower',
             'weight','acceleration','model_year','origin','name']
df.head()

Unnamed: 0_level_0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevel...
15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


### index_col = False    첫 번째 열을 인덱스로 사용하지 않고 기본 정수 인덱스를 사용

In [22]:
df = pd.read_csv('./data/auto-mpg.csv', header = None, index_col = False)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevel...
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


### sep = ','    구분 문자 지정 (CSV는 기본 구분 문자가 ','이므로 보통 생략한다.)

In [21]:
df = pd.read_csv('./data/auto-mpg.csv', header = None, sep = ',')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevel...
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [25]:
df = pd.read_csv('../NumPy/president_heights.csv')
df.head()

Unnamed: 0,order,name,height(cm)
0,1,George Washington,189
1,2,John Adams,170
2,3,Thomas Jefferson,189
3,4,James Madison,163
4,5,James Monroe,183


In [24]:
df = pd.read_csv('../NumPy/president_heights.csv', index_col = 0)
df.head()

Unnamed: 0_level_0,name,height(cm)
order,Unnamed: 1_level_1,Unnamed: 2_level_1
1,George Washington,189
2,John Adams,170
3,Thomas Jefferson,189
4,James Madison,163
5,James Monroe,183


In [27]:
df = pd.read_csv('../NumPy/Floating_Population_2004.csv', sep = ',', encoding = 'utf-8')
df.head()

Unnamed: 0,일자,시간(1시간단위),연령대(10세단위),성별,시,군구,유동인구수
0,20200401,0,20,남성,서울,도봉구,21490
1,20200401,0,30,여성,서울,동대문구,27260
2,20200401,0,50,남성,서울,구로구,35670
3,20200401,1,30,여성,서울,강남구,52380
4,20200401,1,40,남성,서울,동대문구,28420


## 데이터 둘러보기
### 1. 데이터를 읽어서 데이터프레임 생성

In [99]:
# Load the header-less auto-mpg file and label its nine columns in the same call
df = pd.read_csv(
    './data/auto-mpg.csv',
    header=None,
    names=['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
           'acceleration', 'model_year', 'origin', 'name'],
)

### 2. 생성된 데이터 프레임 확인

In [100]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevel...
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.00,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.00,2625.0,18.6,82,1,ford ranger


In [101]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevel...
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [102]:
df.tail()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger
397,31.0,4,119.0,82.0,2720.0,19.4,82,1,chevy s-10


In [103]:
df.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevel...
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
5,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220.0,4354.0,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215.0,4312.0,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225.0,4425.0,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190.0,3850.0,8.5,70,1,amc ambassador dpl


In [104]:
df.tail(8)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
390,32.0,4,144.0,96.0,2665.0,13.9,82,3,toyota celica gt
391,36.0,4,135.0,84.0,2370.0,13.0,82,1,dodge charger 2.2
392,27.0,4,151.0,90.0,2950.0,17.3,82,1,chevrolet camaro
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger
397,31.0,4,119.0,82.0,2720.0,19.4,82,1,chevy s-10


### 3. 데이터프레임 구조 확인

In [105]:
df.shape

(398, 9)

### 4. 데이터프레임 기본 정보 확인

In [106]:
df.info()
# "Non-Null Count" = number of non-null entries; null == NaN (missing value, i.e. no data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


### 5. 데이터프레임 기술 통계 확인

In [107]:
df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


In [108]:
df.describe(include = 'all')

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398
unique,,,,94.0,,,,,305
top,,,,150.0,,,,,ford pinto
freq,,,,22.0,,,,,6
mean,23.514573,5.454774,193.425879,,2970.424623,15.56809,76.01005,1.572864,
std,7.815984,1.701004,104.269838,,846.841774,2.757689,3.697627,0.802055,
min,9.0,3.0,68.0,,1613.0,8.0,70.0,1.0,
25%,17.5,4.0,104.25,,2223.75,13.825,73.0,1.0,
50%,23.0,4.0,148.5,,2803.5,15.5,76.0,1.0,
75%,29.0,8.0,262.0,,3608.0,17.175,79.0,2.0,


### 6. 데이터 개수 / 고유값 계산

In [109]:
df.count()

mpg             398
cylinders       398
displacement    398
horsepower      398
weight          398
acceleration    398
model_year      398
origin          398
name            398
dtype: int64

In [110]:
type(df.count())

pandas.core.series.Series

In [111]:
cnt = df.count()
cnt

mpg             398
cylinders       398
displacement    398
horsepower      398
weight          398
acceleration    398
model_year      398
origin          398
name            398
dtype: int64

In [113]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model_year        int64
origin            int64
name             object
dtype: object

In [114]:
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'name'],
      dtype='object')

In [40]:
df.index

RangeIndex(start=0, stop=398, step=1)

In [41]:
unique_value = df['origin'].value_counts()
unique_value

1    249
3     79
2     70
Name: origin, dtype: int64

In [42]:
df['cylinders'].value_counts()

4    204
8    103
6     84
3      4
5      3
Name: cylinders, dtype: int64

In [43]:
df['model_year'].value_counts()

73    40
78    36
76    34
82    31
75    30
81    29
80    29
79    29
70    29
77    28
72    28
71    28
74    27
Name: model_year, dtype: int64

### 7. 통계 함수(메서드)를 이용한 데이터 이해

In [115]:
# Column-wise mean of the numeric columns only. 'horsepower' and 'name' are
# object dtype and cannot be averaged; pandas >= 2.0 raises an error for them
# instead of silently dropping them, so the restriction is made explicit.
df.mean(numeric_only=True) # mean

mpg               23.514573
cylinders          5.454774
displacement     193.425879
weight          2970.424623
acceleration      15.568090
model_year        76.010050
origin             1.572864
dtype: float64

In [116]:
df['mpg'].mean()

23.514572864321615

In [117]:
df[['mpg','weight']].mean()

mpg         23.514573
weight    2970.424623
dtype: float64

In [118]:
# Column-wise median of the numeric columns only; the object-dtype columns
# ('horsepower', 'name') are excluded explicitly because pandas >= 2.0 no
# longer drops them automatically.
df.median(numeric_only=True) # median

mpg               23.0
cylinders          4.0
displacement     148.5
weight          2803.5
acceleration      15.5
model_year        76.0
origin             1.0
dtype: float64

In [44]:
# NOTE(review): 'horsepower' is object dtype, so the '100.0' reported for it
# looks like the result of a string comparison, not a numeric one — confirm
# before relying on it.
df.min() # minimum value

mpg                               9
cylinders                         3
displacement                     68
horsepower                    100.0
weight                         1613
acceleration                      8
model_year                       70
origin                            1
name            amc ambassador b...
dtype: object

In [45]:
# NOTE(review): 'horsepower' is object dtype and its reported maximum is '?',
# which looks like a placeholder string in the data rather than a real
# horsepower value — confirm and clean before relying on it.
df.max() # maximum value

mpg                         46.6
cylinders                      8
displacement                 455
horsepower                     ?
weight                      5140
acceleration                24.8
model_year                    82
origin                         3
name            vw rabbit custom
dtype: object

In [46]:
# Column-wise standard deviation of the numeric columns only; the object-dtype
# columns ('horsepower', 'name') are excluded explicitly because pandas >= 2.0
# no longer drops them automatically.
df.std(numeric_only=True) # standard deviation

mpg               7.815984
cylinders         1.701004
displacement    104.269838
weight          846.841774
acceleration      2.757689
model_year        3.697627
origin            0.802055
dtype: float64

In [47]:
# Column-wise variance of the numeric columns only; the object-dtype columns
# ('horsepower', 'name') are excluded explicitly because pandas >= 2.0 no
# longer drops them automatically.
df.var(numeric_only=True) # variance

mpg                 61.089611
cylinders            2.893415
displacement     10872.199152
weight          717140.990526
acceleration         7.604848
model_year          13.672443
origin               0.643292
dtype: float64

In [48]:
# Pairwise correlation between the numeric columns. The object-dtype columns
# ('horsepower', 'name') are filtered out first because pandas >= 2.0 no longer
# drops them automatically; select_dtypes is used so the cell also runs on
# older pandas versions whose corr() has no numeric_only argument.
# Coefficients range from -1 to 1: close to -1 means a negative correlation,
# close to 1 means a positive correlation.
df.select_dtypes(include='number').corr() # correlation coefficients

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
mpg,1.0,-0.775396,-0.804203,-0.831741,0.420289,0.579267,0.56345
cylinders,-0.775396,1.0,0.950721,0.896017,-0.505419,-0.348746,-0.562543
displacement,-0.804203,0.950721,1.0,0.932824,-0.543684,-0.370164,-0.609409
weight,-0.831741,0.896017,0.932824,1.0,-0.417457,-0.306564,-0.581024
acceleration,0.420289,-0.505419,-0.543684,-0.417457,1.0,0.288137,0.205873
model_year,0.579267,-0.348746,-0.370164,-0.306564,0.288137,1.0,0.180662
origin,0.56345,-0.562543,-0.609409,-0.581024,0.205873,0.180662,1.0


In [50]:
df[['mpg','weight']].corr()

Unnamed: 0,mpg,weight
mpg,1.0,-0.831741
weight,-0.831741,1.0


### EDA 실습
### 문제 정의 : 미국 역대 대통령의 신장에 대한 EDA
    * 데이터셋 구성
        1. 미국 대통령 순번
        2. 대통령 이름
        3. 대통령 신장

## 1. 수집한 데이터셋 데이터 프레임 생성

In [121]:
df = pd.read_csv('../NumPy/president_heights.csv')

## 2. 데이터 프레임 확인

In [122]:
df.head()

Unnamed: 0,order,name,height(cm)
0,1,George Washington,189
1,2,John Adams,170
2,3,Thomas Jefferson,189
3,4,James Madison,163
4,5,James Monroe,183


In [54]:
df.tail()

Unnamed: 0,order,name,height(cm)
37,40,Ronald Reagan,185
38,41,George H. W. Bush,188
39,42,Bill Clinton,188
40,43,George W. Bush,182
41,44,Barack Obama,185


## 3. 데이터프레임 구조 확인

In [56]:
df.shape

(42, 3)

## 4. 데이터프레임 기본 정보 확인

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   order       42 non-null     int64 
 1   name        42 non-null     object
 2   height(cm)  42 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ KB


In [58]:
df.dtypes

order          int64
name          object
height(cm)     int64
dtype: object

In [59]:
df.columns

Index(['order', 'name', 'height(cm)'], dtype='object')

In [60]:
df.index

RangeIndex(start=0, stop=42, step=1)

## 5. 변수 이름 변경

In [124]:
df.rename(columns={'height(cm)':'height'}, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   order   42 non-null     int64 
 1   name    42 non-null     object
 2   height  42 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ KB


In [125]:
df.columns

Index(['order', 'name', 'height'], dtype='object')

## 6. 기초 통계 분석

### 6.1 데이터 요약 통계

In [69]:
df.describe().round(2)

Unnamed: 0,order,height
count,42.0,42.0
mean,22.48,179.74
std,13.15,7.02
min,1.0,163.0
25%,11.25,174.25
50%,22.0,182.0
75%,33.75,183.0
max,44.0,193.0


In [70]:
# Print the headline statistics of the height column, one formatted row each.
# Every row is (label, value, unit); the standard deviation is printed without a unit.
stat_rows = [
    ('평균 신장             : ', df.height.mean(), 'cm'),
    ('최소 신장             : ', df.height.min(), 'cm'),
    ('최고 신장             : ', df.height.max(), 'cm'),
    ('신장에 대한 표준편차  : ', df.height.std(), ''),
]
# Quartiles 1-4 are the 0.25, 0.5, 0.75 and 1.0 quantiles
stat_rows += [
    (f'신장에 대한 {q}사분위수 : ', df.height.quantile(q / 4), 'cm')
    for q in range(1, 5)
]
for label, value, unit in stat_rows:
    print(f'{label}{value:6.2f}{unit}')

평균 신장             : 179.74cm
최소 신장             : 163.00cm
최고 신장             : 193.00cm
신장에 대한 표준편차  :   7.02
신장에 대한 1사분위수 : 174.25cm
신장에 대한 2사분위수 : 182.00cm
신장에 대한 3사분위수 : 183.00cm
신장에 대한 4사분위수 : 193.00cm


## 7. 탐색적 데이터 분석(EDA)
### 7.1 신장순 정렬

In [71]:
df.sort_values(by='height', ascending = False)

Unnamed: 0,order,name,height
15,16,Abraham Lincoln,193
33,36,Lyndon B. Johnson,193
0,1,George Washington,189
2,3,Thomas Jefferson,189
39,42,Bill Clinton,188
38,41,George H. W. Bush,188
29,32,Franklin D. Roos...,188
37,40,Ronald Reagan,185
41,44,Barack Obama,185
6,7,Andrew Jackson,185


In [72]:
df.sort_values(by='height', ascending = False).head()

Unnamed: 0,order,name,height
15,16,Abraham Lincoln,193
33,36,Lyndon B. Johnson,193
0,1,George Washington,189
2,3,Thomas Jefferson,189
39,42,Bill Clinton,188


In [73]:
df.sort_values(by='height', ascending = False).tail()

Unnamed: 0,order,name,height
22,25,William McKinley,170
1,2,John Adams,170
7,8,Martin Van Buren,168
21,23,Benjamin Harrison,168
3,4,James Madison,163


### 7.2 신장별 인원수

In [126]:
height_count = df.height.value_counts()
height_count

183    8
173    4
182    4
178    4
185    3
188    3
189    2
168    2
170    2
193    2
175    2
174    1
171    1
179    1
180    1
163    1
177    1
Name: height, dtype: int64

In [138]:
# Report, for each distinct height, how many presidents have it and what
# percentage of all presidents that is.
count = df.height.count() # number of non-null height entries
heights = list( height_count.index ) # distinct heights, in value_counts order
heights_count = list( height_count.values ) # number of presidents per height
# Walk both lists in lockstep instead of indexing them by position
for height, n_people in zip( heights, heights_count ):
    percent = n_people / count * 100
    print( f'신장 [ {height}cm ] : {n_people}명( {percent:5.2f}%)' )

신장 [ 183cm ] : 8명( 19.05%)
신장 [ 173cm ] : 4명(  9.52%)
신장 [ 182cm ] : 4명(  9.52%)
신장 [ 178cm ] : 4명(  9.52%)
신장 [ 185cm ] : 3명(  7.14%)
신장 [ 188cm ] : 3명(  7.14%)
신장 [ 189cm ] : 2명(  4.76%)
신장 [ 168cm ] : 2명(  4.76%)
신장 [ 170cm ] : 2명(  4.76%)
신장 [ 193cm ] : 2명(  4.76%)
신장 [ 175cm ] : 2명(  4.76%)
신장 [ 174cm ] : 1명(  2.38%)
신장 [ 171cm ] : 1명(  2.38%)
신장 [ 179cm ] : 1명(  2.38%)
신장 [ 180cm ] : 1명(  2.38%)
신장 [ 163cm ] : 1명(  2.38%)
신장 [ 177cm ] : 1명(  2.38%)


### 7.3 평균 신장 이상 / 미만 대통령

In [131]:
# Split the presidents into two groups around the mean height.
mean_height_count = df[ 'height' ].mean() # mean height (a value, despite the "_count" suffix)

mask_mean_above = df[ 'height' ].ge( mean_height_count ) # at or above the mean
mask_mean_under = df[ 'height' ].lt( mean_height_count ) # strictly below the mean

# Boolean indexing keeps only the rows where the mask is True
mean_height_above = df[ mask_mean_above ]
mean_height_under = df[ mask_mean_under ]

In [132]:
mean_height_above

Unnamed: 0,order,name,height
0,1,George Washington,189
2,3,Thomas Jefferson,189
4,5,James Monroe,183
6,7,Andrew Jackson,185
9,10,John Tyler,183
14,15,James Buchanan,183
15,16,Abraham Lincoln,193
19,20,James A. Garfield,183
20,21,Chester A. Arthur,183
24,27,William Howard Taft,182


In [135]:
print( '평균 신장 {:.2f}cm 이상 대통령은 {}명'.format( df.height.mean(), mean_height_above[ 'height' ].count() ) )

평균 신장 179.74cm 이상 대통령은 23명


### 평균 신장 미만 대통령

In [136]:
mean_height_under

Unnamed: 0,order,name,height
1,2,John Adams,170
3,4,James Madison,163
5,6,John Quincy Adams,171
7,8,Martin Van Buren,168
8,9,William Henry Ha...,173
10,11,James K. Polk,173
11,12,Zachary Taylor,173
12,13,Millard Fillmore,175
13,14,Franklin Pierce,178
16,17,Andrew Johnson,178


In [137]:
print( '평균 신장 {:.2f}cm 미만 대통령은 {}명'.format( df.height.mean(), mean_height_under[ 'height' ].count() ) )

평균 신장 179.74cm 미만 대통령은 19명


### 7.4 최고 신장 대통령 정보

In [87]:
# Select every row whose height reaches the column maximum (ties are all kept)
mask = df[ 'height' ].ge( df[ 'height' ].max() )
max_heights = df[ mask ]
max_heights

Unnamed: 0,order,name,height
15,16,Abraham Lincoln,193
33,36,Lyndon B. Johnson,193


### 7.5 최저 신장 대통령 정보

In [88]:
# Select every row whose height is at the column minimum (ties are all kept)
mask = df[ 'height' ].le( df[ 'height' ].min() )
min_heights = df[ mask ]
min_heights

Unnamed: 0,order,name,height
3,4,James Madison,163
