# Pandas 1

In [1]:
import pandas as pd

## Series

In [3]:
data = ['2017', '2018', '2019', '2020']
se = pd.Series(data)
se

0    2017
1    2018
2    2019
3    2020
dtype: object

In [4]:
type(se)

pandas.core.series.Series

### index, values

In [5]:
se.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
se.values

array(['2017', '2018', '2019', '2020'], dtype=object)

In [7]:
list(se.values)

['2017', '2018', '2019', '2020']

### 특정값 추출 - 인덱싱

In [8]:
se

0    2017
1    2018
2    2019
3    2020
dtype: object

In [9]:
se[0]

'2017'

In [10]:
se[3]

'2020'

### 특정 범위 추출 - 슬라이싱

In [11]:
se

0    2017
1    2018
2    2019
3    2020
dtype: object

In [12]:
se[0:3]

0    2017
1    2018
2    2019
dtype: object

In [13]:
se[:]

0    2017
1    2018
2    2019
3    2020
dtype: object

### Series 이름과 index 이름 지정하기 : name

In [14]:
data = ['2017', '2018', '2019', '2020']
se = pd.Series(data) # 인덱스가 자동으로 부여된다.
se

0    2017
1    2018
2    2019
3    2020
dtype: object

In [15]:
se.name = 'Year'
se

0    2017
1    2018
2    2019
3    2020
Name: Year, dtype: object

In [21]:
# 인덱스 이름 지정
se.index.name = 'No'
se

No
0    2017
1    2018
2    2019
3    2020
Name: Year, dtype: object

### Series 생성시 index 지정하기

In [22]:
data = ['2017', '2018', '2019', '2020']
se = pd.Series(data, index = ['a', 'b', 'c', 'd'])
se

a    2017
b    2018
c    2019
d    2020
dtype: object

In [23]:
se['a']

'2017'

In [24]:
se['a':'d']

a    2017
b    2018
c    2019
d    2020
dtype: object

In [25]:
se.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [26]:
se.values

array(['2017', '2018', '2019', '2020'], dtype=object)

In [27]:
data = ['2017', '2018', '2019', '2020']
se = pd.Series(data)
se

0    2017
1    2018
2    2019
3    2020
dtype: object

In [28]:
se.index = ['a', 'b', 'c', 'd'] 
se

a    2017
b    2018
c    2019
d    2020
dtype: object

In [29]:
se['b']

'2018'

### masking

In [32]:
data = ['2017', '2018', '2019', '2020']
se = pd.Series(data, index = ['a', 'b', 'c', 'd'])
se

a    2017
b    2018
c    2019
d    2020
dtype: object

In [33]:
se > '2018'

a    False
b    False
c     True
d     True
dtype: bool

In [34]:
# 참인 인덱스만 출력됨.
se[se > '2018']

c    2019
d    2020
dtype: object

In [42]:
# Keep only the entries whose mask value is False.
# Boolean masks are inverted with `~` (logical not); unary minus is not
# defined for boolean arrays and raises TypeError on current NumPy/pandas.
se[~(se > '2018')]

a    2017
b    2018
dtype: object

### Series를 DataFrame으로 변환
- to_frame() 메소드 이용

In [35]:
data = ['2017', '2018', '2019', '2020']
se = pd.Series(data, index = ['a', 'b', 'c', 'd'])
se.to_frame()

Unnamed: 0,0
a,2017
b,2018
c,2019
d,2020


## DataFrame
### DataFrame 생성

In [36]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
       'score':[100, 95, 80, 85],
        'grade':['A', 'A', 'B', 'B']
       }
df = pd.DataFrame(data)
df

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,80,B
3,Choi,85,B


In [37]:
type(df)

pandas.core.frame.DataFrame

### 컬럼 순서 변경하기

In [38]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
       'score':[100, 95, 80, 85],
        'grade':['A', 'A', 'B', 'B']
       }
df = pd.DataFrame(data, columns=['name', 'grade', 'score'])  
df

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,80
3,Choi,B,85


### index, columns, values

In [39]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [40]:
df.columns

Index(['name', 'grade', 'score'], dtype='object')

In [41]:
df.values

array([['Lee', 'A', 100],
       ['Hwang', 'A', 95],
       ['Kim', 'B', 80],
       ['Choi', 'B', 85]], dtype=object)

In [42]:
type(df.values)

numpy.ndarray

In [43]:
print(df.shape, df.ndim, df.size)

(4, 3) 2 12


### 특정 컬럼 값 추출

In [44]:
df

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,80
3,Choi,B,85


In [45]:
df['name'].values

array(['Lee', 'Hwang', 'Kim', 'Choi'], dtype=object)

In [46]:
df['score'].values

array([100,  95,  80,  85], dtype=int64)

In [47]:
df.loc[1, 'name']

'Hwang'

In [48]:
df.name

0      Lee
1    Hwang
2      Kim
3     Choi
Name: name, dtype: object

### index 이름과 columns 이름 지정하기

In [49]:
df.index.name = 'No'
df.columns.name = 'Info'
df

Info,name,grade,score
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,80
3,Choi,B,85


## DataFrame의 index 설정 

- 1) DataFrame 생성시 index 설정

In [50]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
       'score':[100, 95, 80, 85],
        'grade':['A', 'A', 'B', 'B']
       }
df = pd.DataFrame(data, index = ['Lee', 'Hwang', 'Kim', 'Choi'])
df

Unnamed: 0,name,score,grade
Lee,Lee,100,A
Hwang,Hwang,95,A
Kim,Kim,80,B
Choi,Choi,85,B


- 2) df.index 속성에 값을 대입하여 index 설정

In [51]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
       'score':[100, 95, 80, 85],
        'grade':['A', 'A', 'B', 'B']
       }
df = pd.DataFrame(data)
df

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,80,B
3,Choi,85,B


In [52]:
df.index = ['Lee', 'Hwang', 'Kim', 'Choi']
df

Unnamed: 0,name,score,grade
Lee,Lee,100,A
Hwang,Hwang,95,A
Kim,Kim,80,B
Choi,Choi,85,B


In [53]:
df.drop('name', axis=1)

Unnamed: 0,score,grade
Lee,100,A
Hwang,95,A
Kim,80,B
Choi,85,B


- 3) set_index() 사용하여 index 설정

In [54]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
       'score':[100, 95, 80, 85],
        'grade':['A', 'A', 'B', 'B']
       }
df = pd.DataFrame(data)
df

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,80,B
3,Choi,85,B


In [55]:
df = df.set_index('name')
df

Unnamed: 0_level_0,score,grade
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Lee,100,A
Hwang,95,A
Kim,80,B
Choi,85,B


## DataFrame 생성과 상호 변환
- DataFrame과 리스트, 딕셔너리, 넘파이 ndarray 간의 상호 변환

### 1.1) 리스트를 이용한 DataFrame 생성

In [56]:
data = [['Lee', 100, 'A'],
       ['Hwang',95, 'A'],
       ['Kim', 80, 'B'],
       ['Choi', 85, 'B']]
df = pd.DataFrame(data, columns=['name', 'score', 'grade'])
df

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,80,B
3,Choi,85,B


### 1.2) DataFrame을 리스트로 변환

In [57]:
df.values.tolist()

[['Lee', 100, 'A'], ['Hwang', 95, 'A'], ['Kim', 80, 'B'], ['Choi', 85, 'B']]

### 2.1) 딕셔너리를 이용한 DataFrame 생성

In [58]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
             'score':[100, 95, 90, 85],
             'grade':['A', 'A', 'B', 'B']}
df = pd.DataFrame(data)
df

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,90,B
3,Choi,85,B


### 2.2) DataFrame을 딕셔너리로 변환

In [93]:
df.to_dict()

{'name': {0: 'Lee', 1: 'Hwang', 2: 'Kim', 3: 'Choi'},
 'score': {0: 100, 1: 95, 2: 90, 3: 85},
 'grade': {0: 'A', 1: 'A', 2: 'B', 3: 'B'}}

### 3.1) ndarray를 이용한 DataFrame 생성

In [94]:
import numpy as np

In [95]:
array = np.array([['Lee', 100, 'A'],
       ['Hwang',95, 'A'],
       ['Kim', 90, 'B'],
       ['Choi', 85, 'B']])
array.shape

(4, 3)

In [96]:
df = pd.DataFrame(array, columns=['name', 'score', 'grade'])
df

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,90,B
3,Choi,85,B


### 3.2) DataFrame을 ndarray로 변환

In [98]:
arr = df.values

In [100]:
arr

array([['Lee', '100', 'A'],
       ['Hwang', '95', 'A'],
       ['Kim', '90', 'B'],
       ['Choi', '85', 'B']], dtype=object)

In [99]:
type(arr)

numpy.ndarray

### 컬럼값이 없는 경우 -> NaN 처리

In [101]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
             'score':[100, 95, 90, 85],
             'grade':['A', 'A', 'B', 'B']}
c = ['name', 'grade', 'score', 'etc']
df2 = pd.DataFrame(data, columns=c)
df2

Unnamed: 0,name,grade,score,etc
0,Lee,A,100,
1,Hwang,A,95,
2,Kim,B,90,
3,Choi,B,85,


## 데이터 셀렉션 및 필터링

In [59]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
             'score':[100, 95, 90, 85],
             'grade':['A', 'A', 'B', 'B']}

columns = ['name', 'grade', 'score']
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90
3,Choi,B,85


### 1)  [ ] - df.컬럼명, df[‘컬럼명‘]

- 하나의 컬럼 검색

In [60]:
df.name

0      Lee
1    Hwang
2      Kim
3     Choi
Name: name, dtype: object

In [61]:
df['name']

0      Lee
1    Hwang
2      Kim
3     Choi
Name: name, dtype: object

- 2개 이상의 컬럼 검색

In [62]:
df[['name', 'grade']]

Unnamed: 0,name,grade
0,Lee,A
1,Hwang,A
2,Kim,B
3,Choi,B


In [63]:
df[['name', 'score', 'grade']]

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,90,B
3,Choi,85,B


### 2) loc[인덱스명, 컬럼명] - 명칭(label) 기반 인덱싱

In [64]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
             'score':[100, 95, 90, 85],
             'grade':['A', 'A', 'B', 'B']}

columns = ['name', 'grade', 'score']
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90
3,Choi,B,85


- 하나의 행 검색

In [65]:
df.loc[0]

name     Lee
grade      A
score    100
Name: 0, dtype: object

- 2개 이상의 행 검색

In [113]:
df.loc[0:2]

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90


In [115]:
df.loc[[0, 2]]

Unnamed: 0,name,grade,score
0,Lee,A,100
2,Kim,B,90


- 여러 행과 여러 열 검색 

In [42]:
df

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90
3,Choi,B,85


In [44]:
# Label-based .loc slicing includes the end label, so 0:2 selects index labels 0, 1 and 2.
df_new = df.loc[0:2, ['score', 'grade']]
df_new

Unnamed: 0,score,grade
0,100,A
1,95,A
2,90,B


### 3) iloc[인덱스, 컬럼인덱스] - 위치(position) 기반 인덱싱

In [58]:
df

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90
3,Choi,B,85


- 하나의 행 검색

In [55]:
df.iloc[3]

name     Choi
grade       B
score      85
Name: 3, dtype: object

- 여러 행 검색

In [121]:
# Position-based .iloc slicing excludes the end position, so 0:3 selects positions 0, 1 and 2.
df.iloc[0:3]

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90


- 여러 행과 여러 열 검색

In [59]:
df

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90
3,Choi,B,85


In [124]:
df.iloc[0:3, 0:2]

Unnamed: 0,name,grade
0,Lee,A
1,Hwang,A
2,Kim,B


### 4) 불린(Boolean) 인덱싱

In [60]:
df

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90
3,Choi,B,85


In [61]:
df['score'] > 90

0     True
1     True
2    False
3    False
Name: score, dtype: bool

In [20]:
df[df['score'] > 90]

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95


In [21]:
df.loc[df['score'] > 90]

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95


In [135]:
df

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90
3,Choi,B,85


In [22]:
df[df['grade'] == 'B']

Unnamed: 0,name,grade,score
2,Kim,B,90
3,Choi,B,85


In [23]:
df.loc[df['grade'] == 'B']

Unnamed: 0,name,grade,score
2,Kim,B,90
3,Choi,B,85


- 2가지 이상의 조건 불린 인덱싱

In [62]:
(df['score'] > 95) & (df['grade'] == 'A')

0     True
1    False
2    False
3    False
dtype: bool

In [63]:
df[(df['score'] > 95) & (df['grade'] == 'A')]  # & - and

Unnamed: 0,name,grade,score
0,Lee,A,100


In [64]:
df[(df['score'] > 95) | (df['grade'] == 'A')] # | - or

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95


### 5) filter() 함수를 이용한 검색

- filter() 함수

## ****유용*****

In [65]:
df

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90
3,Choi,B,85


In [141]:
# 필터링된 컬럼만 표시된다.
df.filter(items=['name', 'score'])

Unnamed: 0,name,score
0,Lee,100
1,Hwang,95
2,Kim,90
3,Choi,85


In [66]:
df.filter(like='g', axis=1) # 컬럼명에 'g'가 포함된 컬럼 출력

Unnamed: 0,grade
0,A
1,A
2,B
3,B


In [143]:
df.filter(like='a', axis=1) # 컬럼명에 'a'가 포함된 컬럼 출력

Unnamed: 0,name,grade
0,Lee,A
1,Hwang,A
2,Kim,B
3,Choi,B


In [67]:
df.filter? # 도움말 출력

- 정규식을 이용한 filter() 함수

In [144]:
df

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90
3,Choi,B,85


In [145]:
df.filter(regex='^n', axis=1)

Unnamed: 0,name
0,Lee
1,Hwang
2,Kim
3,Choi


In [147]:
df.filter(regex='e$', axis=1)

Unnamed: 0,name,grade,score
0,Lee,A,100
1,Hwang,A,95
2,Kim,B,90
3,Choi,B,85


## DataFrame 데이터 추가

In [66]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
      'score':[100, 95, 80, 85],
      'grade':['A', 'A', 'B', 'B']}
df = pd.DataFrame(data)
df

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,80,B
3,Choi,85,B


### 1) Series 객체를 이용하여 열 데이터 추가

In [67]:
points = pd.Series([1.5, 1.7, 2.4, 3.0], index=[0, 2, 1, 3])
points

0    1.5
2    1.7
1    2.4
3    3.0
dtype: float64

In [68]:
df['points'] = points
df

Unnamed: 0,name,score,grade,points
0,Lee,100,A,1.5
1,Hwang,95,A,2.4
2,Kim,80,B,1.7
3,Choi,85,B,3.0


In [69]:
points2 = pd.Series([1.5, 1.7, 2.4], index=[3,1,0])
points2

3    1.5
1    1.7
0    2.4
dtype: float64

In [70]:
df['points2'] = points2
df

Unnamed: 0,name,score,grade,points,points2
0,Lee,100,A,1.5,2.4
1,Hwang,95,A,2.4,1.7
2,Kim,80,B,1.7,
3,Choi,85,B,3.0,1.5


### 2) numpy 함수를 이용하여 열 데이터 추가

In [71]:
df

Unnamed: 0,name,score,grade,points,points2
0,Lee,100,A,1.5,2.4
1,Hwang,95,A,2.4,1.7
2,Kim,80,B,1.7,
3,Choi,85,B,3.0,1.5


In [72]:
import numpy as np

In [183]:
np.zeros(4).astype(int)

array([0, 0, 0, 0])

In [74]:
df['etc'] = np.zeros(4).astype(int)
df

Unnamed: 0,name,score,grade,points,points2,etc
0,Lee,100,A,1.5,2.4,0
1,Hwang,95,A,2.4,1.7,0
2,Kim,80,B,1.7,,0
3,Choi,85,B,3.0,1.5,0


### 3) 연산을 통해 열 데이터 추가

In [75]:
df

Unnamed: 0,name,score,grade,points,points2,etc
0,Lee,100,A,1.5,2.4,0
1,Hwang,95,A,2.4,1.7,0
2,Kim,80,B,1.7,,0
3,Choi,85,B,3.0,1.5,0


In [76]:
df['score'] > 90

0     True
1     True
2    False
3    False
Name: score, dtype: bool

In [77]:
df['pass'] = df['score'] > 90
df

Unnamed: 0,name,score,grade,points,points2,etc,pass
0,Lee,100,A,1.5,2.4,0,True
1,Hwang,95,A,2.4,1.7,0,True
2,Kim,80,B,1.7,,0,False
3,Choi,85,B,3.0,1.5,0,False


### 4) loc 인덱서를 이용하여 행 데이터 추가

In [78]:
df

Unnamed: 0,name,score,grade,points,points2,etc,pass
0,Lee,100,A,1.5,2.4,0,True
1,Hwang,95,A,2.4,1.7,0,True
2,Kim,80,B,1.7,,0,False
3,Choi,85,B,3.0,1.5,0,False


In [79]:
# Add a new row under index label 5 by assigning through .loc
# (setting with enlargement). The list supplies one value per column.
# np.nan is used for the missing 'points2' value: the np.NaN alias was
# removed in NumPy 2.0.
df.loc[5, :] = ['Park', 70, 'C', 1.0, np.nan, 0, False]
df

Unnamed: 0,name,score,grade,points,points2,etc,pass
0,Lee,100.0,A,1.5,2.4,0.0,True
1,Hwang,95.0,A,2.4,1.7,0.0,True
2,Kim,80.0,B,1.7,,0.0,False
3,Choi,85.0,B,3.0,1.5,0.0,False
5,Park,70.0,C,1.0,,0.0,False


In [80]:
df = df.reset_index(drop=True)
df

Unnamed: 0,name,score,grade,points,points2,etc,pass
0,Lee,100.0,A,1.5,2.4,0.0,True
1,Hwang,95.0,A,2.4,1.7,0.0,True
2,Kim,80.0,B,1.7,,0.0,False
3,Choi,85.0,B,3.0,1.5,0.0,False
4,Park,70.0,C,1.0,,0.0,False


## DataFrame 데이터 삭제 
### 1) drop
- DataFrame.drop(labels=None, axis=0, index=None, columns=None, inplace=False)

In [81]:
df

Unnamed: 0,name,score,grade,points,points2,etc,pass
0,Lee,100.0,A,1.5,2.4,0.0,True
1,Hwang,95.0,A,2.4,1.7,0.0,True
2,Kim,80.0,B,1.7,,0.0,False
3,Choi,85.0,B,3.0,1.5,0.0,False
4,Park,70.0,C,1.0,,0.0,False


In [82]:
df.drop(index=4) # 실제로 삭제 X

Unnamed: 0,name,score,grade,points,points2,etc,pass
0,Lee,100.0,A,1.5,2.4,0.0,True
1,Hwang,95.0,A,2.4,1.7,0.0,True
2,Kim,80.0,B,1.7,,0.0,False
3,Choi,85.0,B,3.0,1.5,0.0,False


In [83]:
df

Unnamed: 0,name,score,grade,points,points2,etc,pass
0,Lee,100.0,A,1.5,2.4,0.0,True
1,Hwang,95.0,A,2.4,1.7,0.0,True
2,Kim,80.0,B,1.7,,0.0,False
3,Choi,85.0,B,3.0,1.5,0.0,False
4,Park,70.0,C,1.0,,0.0,False


In [84]:
df.drop(index=4, inplace=True ) # 실제로 삭제

In [85]:
df

Unnamed: 0,name,score,grade,points,points2,etc,pass
0,Lee,100.0,A,1.5,2.4,0.0,True
1,Hwang,95.0,A,2.4,1.7,0.0,True
2,Kim,80.0,B,1.7,,0.0,False
3,Choi,85.0,B,3.0,1.5,0.0,False


In [200]:
df.drop('etc', axis=1)

Unnamed: 0,name,score,grade,points,points2,pass
0,Lee,100.0,A,1.5,2.4,True
1,Hwang,95.0,A,2.4,1.7,True
2,Kim,80.0,B,1.7,,False
3,Choi,85.0,B,3.0,1.5,False


In [86]:
df.drop('etc', axis=1, inplace=True)
df

Unnamed: 0,name,score,grade,points,points2,pass
0,Lee,100.0,A,1.5,2.4,True
1,Hwang,95.0,A,2.4,1.7,True
2,Kim,80.0,B,1.7,,False
3,Choi,85.0,B,3.0,1.5,False


In [87]:
df

Unnamed: 0,name,score,grade,points,points2,pass
0,Lee,100.0,A,1.5,2.4,True
1,Hwang,95.0,A,2.4,1.7,True
2,Kim,80.0,B,1.7,,False
3,Choi,85.0,B,3.0,1.5,False


In [203]:
df2 = df.drop('points2', axis=1)

In [204]:
df

Unnamed: 0,name,score,grade,points,points2,pass
0,Lee,100.0,A,1.5,2.4,True
1,Hwang,95.0,A,2.4,1.7,True
2,Kim,80.0,B,1.7,,False
3,Choi,85.0,B,3.0,1.5,False


In [205]:
df2

Unnamed: 0,name,score,grade,points,pass
0,Lee,100.0,A,1.5,True
1,Hwang,95.0,A,2.4,True
2,Kim,80.0,B,1.7,False
3,Choi,85.0,B,3.0,False


### 2) del
- del df['컬럼명']

In [88]:
df

Unnamed: 0,name,score,grade,points,points2,pass
0,Lee,100.0,A,1.5,2.4,True
1,Hwang,95.0,A,2.4,1.7,True
2,Kim,80.0,B,1.7,,False
3,Choi,85.0,B,3.0,1.5,False


In [89]:
del df['points2'] # 주의 : 실제로 삭제
df

Unnamed: 0,name,score,grade,points,pass
0,Lee,100.0,A,1.5,True
1,Hwang,95.0,A,2.4,True
2,Kim,80.0,B,1.7,False
3,Choi,85.0,B,3.0,False


## 집계(Aggregation) 함수
- sum(), max(), min(), count() 등의 집계(Aggregation) 함수로 연산 수행

In [213]:
import pandas as pd

In [214]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
      'score':[100, 95, 80, 85],
      'point':[3.4, 2.3, 4.0, 1.9],
      'grade':['A', 'A', 'B', 'B']}

df = pd.DataFrame(data)
df

Unnamed: 0,name,score,point,grade
0,Lee,100,3.4,A
1,Hwang,95,2.3,A
2,Kim,80,4.0,B
3,Choi,85,1.9,B


In [215]:
df.sum()

name     LeeHwangKimChoi
score                360
point               11.6
grade               AABB
dtype: object

In [216]:
# Average only the numeric columns (score, point). The string columns
# (name, grade) cannot be averaged, and pandas >= 2.0 raises TypeError
# instead of silently skipping them unless numeric_only=True is given.
df.mean(numeric_only=True)

score    90.0
point     2.9
dtype: float64

In [217]:
df.max()

name     Lee
score    100
point      4
grade      B
dtype: object

In [218]:
df.min()

name     Choi
score      80
point     1.9
grade       A
dtype: object

In [219]:
## ****유용*****
df.describe()

Unnamed: 0,score,point
count,4.0,4.0
mean,90.0,2.9
std,9.128709,0.969536
min,80.0,1.9
25%,83.75,2.2
50%,90.0,2.85
75%,96.25,3.55
max,100.0,4.0


### 하나의 컬럼에 집계 함수 적용

In [221]:
df

Unnamed: 0,name,score,point,grade
0,Lee,100,3.4,A
1,Hwang,95,2.3,A
2,Kim,80,4.0,B
3,Choi,85,1.9,B


In [222]:
df.score

0    100
1     95
2     80
3     85
Name: score, dtype: int64

In [223]:
df.score.mean()

90.0

In [224]:
df['score'].mean()

90.0

In [225]:
df.loc[:, 'score'].mean()

90.0

In [226]:
df.iloc[:, 1].mean()

90.0

### 여러 컬럼에 집계 함수 적용

In [227]:
df[['score', 'point']]

Unnamed: 0,score,point
0,100,3.4
1,95,2.3
2,80,4.0
3,85,1.9


In [228]:
df[['score', 'point']].sum(axis=0)

score    360.0
point     11.6
dtype: float64

In [229]:
df

Unnamed: 0,name,score,point,grade
0,Lee,100,3.4,A
1,Hwang,95,2.3,A
2,Kim,80,4.0,B
3,Choi,85,1.9,B


In [230]:
df[['score', 'point']].sum(axis=1)

0    103.4
1     97.3
2     84.0
3     86.9
dtype: float64

## DataFrame 정렬

In [90]:
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
      'score':[100, 95, 80, 85],
      'grade':['A', 'A', 'B', 'B']}
df = pd.DataFrame(data)
df

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,80,B
3,Choi,85,B


### 1) index 순으로 정렬 : sort_index()

In [91]:
df.sort_index(axis=0) # 인덱스 순으로 정렬

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,80,B
3,Choi,85,B


In [92]:
df.sort_index(axis=0, ascending=False) # 인덱스 순으로 정렬

Unnamed: 0,name,score,grade
3,Choi,85,B
2,Kim,80,B
1,Hwang,95,A
0,Lee,100,A


In [93]:
df.sort_index(axis=1) # 컬럼순(오름차순)으로 정렬

Unnamed: 0,grade,name,score
0,A,Lee,100
1,A,Hwang,95
2,B,Kim,80
3,B,Choi,85


In [94]:
df.sort_index(axis=1, ascending=False) # 컬럼순(내림차순)으로 정렬

Unnamed: 0,score,name,grade
0,100,Lee,A
1,95,Hwang,A
2,80,Kim,B
3,85,Choi,B


### 2) 지정된 컬럼의 컬럼값 순으로 정렬 : sort_values(by=['컬럼명', ...])   

In [95]:
df

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,80,B
3,Choi,85,B


In [96]:
# 오름차순 정렬
df.sort_values(by=['score'])

Unnamed: 0,name,score,grade
2,Kim,80,B
3,Choi,85,B
1,Hwang,95,A
0,Lee,100,A


In [97]:
# 내림차순 정렬
df.sort_values(by=['score'], ascending=False)

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
3,Choi,85,B
2,Kim,80,B


In [98]:
# 2개 이상의 컬럼값으로 정렬
df.sort_values(by=['grade','score']) 

Unnamed: 0,name,score,grade
1,Hwang,95,A
0,Lee,100,A
2,Kim,80,B
3,Choi,85,B


In [244]:
# 2개 이상의 컬럼값으로 정렬
df.sort_values(by=['grade','score'], ascending=[True, False]) 

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
3,Choi,85,B
2,Kim,80,B


## 데이터의 개수 세기 : count(), value_counts()
- count() : 데이터의 개수 세기, NaN 값은 제외하고 센다.
- value_counts() : 정수, 문자열, 카테고리 값인 경우에는 value_counts 메서드로 각각의 값이 나온 횟수를 셀 수 있다.
    - DataFrame.value_counts()는 pandas 1.1부터 제공되며 행(컬럼 값 조합) 단위로 센다. 컬럼별 빈도를 보려면 각 컬럼(Series)에 적용한다.

In [99]:
import numpy as np

In [100]:
# Build a small frame whose last 'grade' entry is missing, used below to
# show how count() and value_counts() treat NaN.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
data = {'name':['Lee', 'Hwang', 'Kim', 'Choi'],
      'score':[100, 95, 80, 85],
      'grade':['A', 'A', 'B', np.nan]}
df = pd.DataFrame(data)
df

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,80,B
3,Choi,85,


In [101]:
df.count()

name     4
score    4
grade    3
dtype: int64

In [102]:
df['grade'].count() # NaN은 제외한 전체 개수

3

In [103]:
df['grade'].value_counts() # 카테고리 컬럼의 데이타별 개수

A    2
B    1
Name: grade, dtype: int64

## 결손 데이타(Missing Data) 처리
- 1) isna() 또는 isnull() : 주어진 컬럼 값들이 NaN인지 확인
- 2) fillna() : NaN 데이터를 찾아 value로 지정된 값으로 변환 
- 3) dropna() : 행 데이터를 기준으로 NaN 값이 포함된 데이터를 제거

In [104]:
df

Unnamed: 0,name,score,grade
0,Lee,100,A
1,Hwang,95,A
2,Kim,80,B
3,Choi,85,


### NaN 값을 가진 컬럼 추가

In [105]:
df['point'] = np.nan
df

Unnamed: 0,name,score,grade,point
0,Lee,100,A,
1,Hwang,95,A,
2,Kim,80,B,
3,Choi,85,,


### 1) NaN 값 체크 : df.isnull(), df.isna()

In [106]:
df.isna()

Unnamed: 0,name,score,grade,point
0,False,False,False,True
1,False,False,False,True
2,False,False,False,True
3,False,False,True,True


In [107]:
df.isnull()

Unnamed: 0,name,score,grade,point
0,False,False,False,True
1,False,False,False,True
2,False,False,False,True
3,False,False,True,True


### 2) df.fillna()

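- 모든 NaN을 0으로 채우기 (주의: 컬럼을 지정하지 않았으므로 point 컬럼뿐 아니라 grade 컬럼의 NaN도 0으로 바뀐다. point 컬럼만 채우려면 `df.fillna(value={'point': 0})` 사용)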

In [108]:
df.fillna(value = 0, inplace=True)
df

Unnamed: 0,name,score,grade,point
0,Lee,100,A,0.0
1,Hwang,95,A,0.0
2,Kim,80,B,0.0
3,Choi,85,0,0.0


- point2 컬럼 추가

In [109]:
df['point2'] = [50, 100, 70, np.nan]
df

Unnamed: 0,name,score,grade,point,point2
0,Lee,100,A,0.0,50.0
1,Hwang,95,A,0.0,100.0
2,Kim,80,B,0.0,70.0
3,Choi,85,0,0.0,


- point2 컬럼의 NaN에 평균값으로 채우기

In [110]:
df['point2'].mean()

73.33333333333333

In [112]:
# Fill NaN only in the 'point2' column with that column's mean.
# A dict passed as `value` maps column name -> fill value, so NaN in any
# other column is left untouched (a scalar would fill every column).
# Returns a new DataFrame; df itself is not modified.
df.fillna(value={'point2': df['point2'].mean()})

Unnamed: 0,name,score,grade,point,point2
0,Lee,100,A,0.0,50.0
1,Hwang,95,A,0.0,100.0
2,Kim,80,B,0.0,70.0
3,Choi,85,0,0.0,73.333333


In [255]:
df

Unnamed: 0,name,score,grade,point,point2
0,Lee,100,A,0.0,50.0
1,Hwang,95,A,0.0,100.0
2,Kim,80,B,0.0,70.0
3,Choi,85,0,0.0,


### 3) df.dropna()

In [113]:
df

Unnamed: 0,name,score,grade,point,point2
0,Lee,100,A,0.0,50.0
1,Hwang,95,A,0.0,100.0
2,Kim,80,B,0.0,70.0
3,Choi,85,0,0.0,


In [114]:
df.dropna(how='any')

Unnamed: 0,name,score,grade,point,point2
0,Lee,100,A,0.0,50.0
1,Hwang,95,A,0.0,100.0
2,Kim,80,B,0.0,70.0


In [115]:
df

Unnamed: 0,name,score,grade,point,point2
0,Lee,100,A,0.0,50.0
1,Hwang,95,A,0.0,100.0
2,Kim,80,B,0.0,70.0
3,Choi,85,0,0.0,


In [116]:
df.dropna(how='all')

Unnamed: 0,name,score,grade,point,point2
0,Lee,100,A,0.0,50.0
1,Hwang,95,A,0.0,100.0
2,Kim,80,B,0.0,70.0
3,Choi,85,0,0.0,
