# 데이터 전처리 학습

- pandas를 이용한 데이터 전처리 문법

- 1 query() : 행 추출
- 2 df[] : 열 추출
- 3 sort_values() : 정렬
- 4 groupby() : 집단별로 나누기
- 5 assign() : 변수 추가
- 6 agg() : 통계치 구하기
- 7 merge() : 데이터 합치기(열)
- 8 concat() : 데이터 합치기(행)

## 1. query() : 행 추출

In [1]:
import pandas as pd

exam = pd.read_csv('../exam.csv')
exam

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
2,3,1,45,86,78
3,4,1,30,98,58
4,5,2,25,80,65
5,6,2,50,89,98
6,7,2,80,90,45
7,8,2,90,78,25
8,9,3,20,98,15
9,10,3,50,98,45


In [2]:
exam.query('nclass==1') # 1반인 행 출력

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
2,3,1,45,86,78
3,4,1,30,98,58


In [3]:
exam.query('math==50')

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
5,6,2,50,89,98
9,10,3,50,98,45


In [4]:
exam.query('nclass!=1') # 1반이 아닌 행 출력

Unnamed: 0,id,nclass,math,english,science
4,5,2,25,80,65
5,6,2,50,89,98
6,7,2,80,90,45
7,8,2,90,78,25
8,9,3,20,98,15
9,10,3,50,98,45
10,11,3,65,65,65
11,12,3,45,85,32
12,13,4,46,98,65
13,14,4,48,87,12


In [5]:
# 비교 연산자 이용 가능 + 이상/이하
exam.query('math>50')

Unnamed: 0,id,nclass,math,english,science
1,2,1,60,97,60
6,7,2,80,90,45
7,8,2,90,78,25
10,11,3,65,65,65
14,15,4,75,56,78
15,16,4,58,98,65
16,17,5,65,68,98
17,18,5,80,78,90
18,19,5,89,68,87
19,20,5,78,83,58


In [6]:
# 논리 연산자 이용 가능
exam.query('nclass==1 & math<=50')

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
2,3,1,45,86,78
3,4,1,30,98,58


In [7]:
exam.query('nclass==1 | math<=50')

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
2,3,1,45,86,78
3,4,1,30,98,58
4,5,2,25,80,65
5,6,2,50,89,98
8,9,3,20,98,15
9,10,3,50,98,45
11,12,3,45,85,32
12,13,4,46,98,65


In [8]:
exam.query('nclass==1 | nclass==2 | nclass==3') # 다중 조건

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
2,3,1,45,86,78
3,4,1,30,98,58
4,5,2,25,80,65
5,6,2,50,89,98
6,7,2,80,90,45
7,8,2,90,78,25
8,9,3,20,98,15
9,10,3,50,98,45


In [9]:
# list 안에 원하는 값을 넣고 in으로 출력
exam.query('nclass in [1,2,3]')

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
2,3,1,45,86,78
3,4,1,30,98,58
4,5,2,25,80,65
5,6,2,50,89,98
6,7,2,80,90,45
7,8,2,90,78,25
8,9,3,20,98,15
9,10,3,50,98,45


In [10]:
# Keep only the rows whose nclass is 1, 2 or 3.
df_123 = exam.query('nclass in [1,2,3]')
# NOTE: a notebook cell only renders the value of its last expression, so this
# sum() result is computed but not displayed.
df_123.sum()
# Column-wise mean of the filtered rows; this is the value the cell displays.
df_123.mean()

id          6.500000
nclass      2.000000
math       50.833333
english    88.500000
science    53.000000
dtype: float64

### 문자열도 가능

In [27]:
# Small sample frame with a string column (name) and an integer column (hakbun),
# used to show that query() can also compare string values.
df = pd.DataFrame(
    {
        'name': ['A', 'B', 'C', 'D'],
        'hakbun': [1, 2, 3, 4],
    }
)

In [29]:
df.query('name=="A" & hakbun==1')

Unnamed: 0,name,hakbun
0,A,1


### 외부 변수를 이용한 추출
- `query()` 문자열 안에서 변수 이름 앞에 `@`를 붙이면, 파이썬 쪽에 정의해 둔 외부 변수의 값을 조건식에 사용할 수 있다.

In [31]:
var_df=1
exam.query('nclass==@var_df')

Unnamed: 0,id,nclass,math,english,science,total_sum,mean
0,1,1,50,98,50,198,66.0
1,2,1,60,97,60,217,72.333333
2,3,1,45,86,78,209,69.666667
3,4,1,30,98,58,186,62.0


## 참고: 주피터 노트북에서 모든 행과 열 출력

In [34]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## 2. df[] : 열 추출

### 변수 추출

In [35]:
exam['math']

0     50
1     60
2     45
3     30
4     25
5     50
6     80
7     90
8     20
9     50
10    65
11    45
12    46
13    48
14    75
15    58
16    65
17    80
18    89
19    78
Name: math, dtype: int64

In [37]:
exam[['math']] # 데이터프레임으로 출력(1)

Unnamed: 0,math
0,50
1,60
2,45
3,30
4,25
5,50
6,80
7,90
8,20
9,50


In [38]:
exam['math'].to_frame() # 데이터프레임으로 출력(2)

Unnamed: 0,math
0,50
1,60
2,45
3,30
4,25
5,50
6,80
7,90
8,20
9,50


In [36]:
# 여러 개의 변수 -> [ ] 안에
exam[['nclass', 'math']]

Unnamed: 0,nclass,math
0,1,50
1,1,60
2,1,45
3,1,30
4,2,25
5,2,50
6,2,80
7,2,90
8,3,20
9,3,50


### 변수 제거

In [39]:
# 1 : 원본 반영X
exam.drop(columns='math')

Unnamed: 0,id,nclass,english,science,total_sum,mean
0,1,1,98,50,198,66.0
1,2,1,97,60,217,72.333333
2,3,1,86,78,209,69.666667
3,4,1,98,58,186,62.0
4,5,2,80,65,170,56.666667
5,6,2,89,98,237,79.0
6,7,2,90,45,215,71.666667
7,8,2,78,25,193,64.333333
8,9,3,98,15,133,44.333333
9,10,3,98,45,193,64.333333


In [40]:
# 2 : 원본 반영
exam.drop(columns='math', inplace=True)

In [42]:
exam = pd.read_csv('../exam.csv')
exam

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
1,2,1,60,97,60
2,3,1,45,86,78
3,4,1,30,98,58
4,5,2,25,80,65
5,6,2,50,89,98
6,7,2,80,90,45
7,8,2,90,78,25
8,9,3,20,98,15
9,10,3,50,98,45


In [43]:
# 3 : 새로운 변수에 반영
df_new = exam.drop(columns='math')
df_new

Unnamed: 0,id,nclass,english,science
0,1,1,98,50
1,2,1,97,60
2,3,1,86,78
3,4,1,98,58
4,5,2,80,65
5,6,2,89,98
6,7,2,90,45
7,8,2,78,25
8,9,3,98,15
9,10,3,98,45


In [44]:
# 2열 이상 삭제 [ ]
exam.drop(columns=['math', 'science'])

Unnamed: 0,id,nclass,english
0,1,1,98
1,2,1,97
2,3,1,86
3,4,1,98
4,5,2,80
5,6,2,89
6,7,2,90
7,8,2,78
8,9,3,98
9,10,3,98


### query + [ ] 이용한 조합

In [48]:
exam.query('nclass==1')['math']

0    50
1    60
2    45
3    30
Name: math, dtype: int64

In [53]:
exam.query('math>50')[['id','nclass']]

Unnamed: 0,id,nclass
1,2,1
6,7,2
7,8,2
10,11,3
14,15,4
15,16,4
16,17,5
17,18,5
18,19,5
19,20,5


In [54]:
exam.query('math>50')[['id','nclass']].head()

Unnamed: 0,id,nclass
1,2,1
6,7,2
7,8,2
10,11,3
14,15,4


In [56]:
exam.query('math>50')[['id','nclass']].tail().sum()

id        90
nclass    24
dtype: int64

## 3. sort_values() : 정렬

In [11]:
# default는 오름차순
exam.sort_values('english')

Unnamed: 0,id,nclass,math,english,science
14,15,4,75,56,78
10,11,3,65,65,65
16,17,5,65,68,98
18,19,5,89,68,87
17,18,5,80,78,90
7,8,2,90,78,25
4,5,2,25,80,65
19,20,5,78,83,58
11,12,3,45,85,32
2,3,1,45,86,78


In [12]:
# 내림차순
exam.sort_values('english', ascending=False)

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
12,13,4,46,98,65
3,4,1,30,98,58
15,16,4,58,98,65
8,9,3,20,98,15
9,10,3,50,98,45
1,2,1,60,97,60
6,7,2,80,90,45
5,6,2,50,89,98
13,14,4,48,87,12


In [13]:
# Sort by several columns at once: pass the column names as a list.
exam.sort_values(['nclass', 'science']) # the first listed column is the primary sort key
# With two or more sort columns, the names go inside [ ].

Unnamed: 0,id,nclass,math,english,science
0,1,1,50,98,50
3,4,1,30,98,58
1,2,1,60,97,60
2,3,1,45,86,78
7,8,2,90,78,25
6,7,2,80,90,45
4,5,2,25,80,65
5,6,2,50,89,98
8,9,3,20,98,15
11,12,3,45,85,32


In [14]:
# 여러 개의 칼럼일 때 오름차순/내림차순 정하기
exam.sort_values(['nclass', 'science'],ascending=[True, False])

Unnamed: 0,id,nclass,math,english,science
2,3,1,45,86,78
1,2,1,60,97,60
3,4,1,30,98,58
0,1,1,50,98,50
5,6,2,50,89,98
4,5,2,25,80,65
6,7,2,80,90,45
7,8,2,90,78,25
10,11,3,65,65,65
9,10,3,50,98,45


## 4. groupby() : 집단별로 나누기

## 5. assign() : 변수 추가

### 파생 변수 만들기

In [15]:
# 1   .assign()
exam_new = exam.assign(tot_sum=exam['math']+exam['science']+exam['english'])

In [16]:
exam_new

Unnamed: 0,id,nclass,math,english,science,tot_sum
0,1,1,50,98,50,198
1,2,1,60,97,60,217
2,3,1,45,86,78,209
3,4,1,30,98,58,186
4,5,2,25,80,65,170
5,6,2,50,89,98,237
6,7,2,80,90,45,215
7,8,2,90,78,25,193
8,9,3,20,98,15,133
9,10,3,50,98,45,193


In [17]:
# 2 바로 반영
exam['total_sum'] = exam['math']+exam['science']+exam['english']

In [18]:
exam

Unnamed: 0,id,nclass,math,english,science,total_sum
0,1,1,50,98,50,198
1,2,1,60,97,60,217
2,3,1,45,86,78,209
3,4,1,30,98,58,186
4,5,2,25,80,65,170
5,6,2,50,89,98,237
6,7,2,80,90,45,215
7,8,2,90,78,25,193
8,9,3,20,98,15,133
9,10,3,50,98,45,193


In [19]:
exam['mean'] = (exam['math']+exam['science']+exam['english'])/3

In [20]:
exam

Unnamed: 0,id,nclass,math,english,science,total_sum,mean
0,1,1,50,98,50,198,66.0
1,2,1,60,97,60,217,72.333333
2,3,1,45,86,78,209,69.666667
3,4,1,30,98,58,186,62.0
4,5,2,25,80,65,170,56.666667
5,6,2,50,89,98,237,79.0
6,7,2,80,90,45,215,71.666667
7,8,2,90,78,25,193,64.333333
8,9,3,20,98,15,133,44.333333
9,10,3,50,98,45,193,64.333333


### numpy where()로 조건에 따른 값을 부여한 변수 생성

In [21]:
import numpy as np

exam.assign(test=np.where(exam['science']>=50, 'pass', 'fail'))

Unnamed: 0,id,nclass,math,english,science,total_sum,mean,test
0,1,1,50,98,50,198,66.0,pass
1,2,1,60,97,60,217,72.333333,pass
2,3,1,45,86,78,209,69.666667,pass
3,4,1,30,98,58,186,62.0,pass
4,5,2,25,80,65,170,56.666667,pass
5,6,2,50,89,98,237,79.0,pass
6,7,2,80,90,45,215,71.666667,fail
7,8,2,90,78,25,193,64.333333,fail
8,9,3,20,98,15,133,44.333333,fail
9,10,3,50,98,45,193,64.333333,fail


In [22]:
np.where(exam['science']>=50) # with only a condition, returns a 1-tuple holding the array of positions where it is True

(array([ 0,  1,  2,  3,  4,  5, 10, 12, 14, 15, 16, 17, 18, 19],
       dtype=int64),)

### lambda 식을 이용한 파생변수 만들기

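- `assign()`에 lambda(호출 가능한 객체)를 넘기면 데이터프레임 자체가 인자 `x`로 전달된다. 따라서 `exam`이라는 이름을 직접 쓰지 않고도 `x['math']`처럼 열을 참조해 새 변수를 만들 수 있다.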

In [23]:
exam.assign(new= lambda x : x['math']+x['science']+x['english'])

Unnamed: 0,id,nclass,math,english,science,total_sum,mean,new
0,1,1,50,98,50,198,66.0,198
1,2,1,60,97,60,217,72.333333,217
2,3,1,45,86,78,209,69.666667,209
3,4,1,30,98,58,186,62.0,186
4,5,2,25,80,65,170,56.666667,170
5,6,2,50,89,98,237,79.0,237
6,7,2,80,90,45,215,71.666667,215
7,8,2,90,78,25,193,64.333333,193
8,9,3,20,98,15,133,44.333333,133
9,10,3,50,98,45,193,64.333333,193


## 6. agg() : 통계치 구하기

## 7. merge() : 데이터 합치기(열)

## 8. concat() : 데이터 합치기(행)