In [1]:
import pandas as pd

## pd.concat - 데이터프레임 합치기

In [64]:
# Three 4-row frames with the same columns A, B, C. Cell values are the column
# letter followed by a running number (A0..A3, A4..A7, A8..A11, and so on).
# df1 and df3 share the row labels 0-3; df2 uses 3-6, overlapping df1 only at 3.
df1 = pd.DataFrame(
    {col: [col + str(n) for n in range(0, 4)] for col in 'ABC'},
    index=[0, 1, 2, 3],
)
df2 = pd.DataFrame(
    {col: [col + str(n) for n in range(4, 8)] for col in 'ABC'},
    index=[3, 4, 5, 6],
)
df3 = pd.DataFrame(
    {col: [col + str(n) for n in range(8, 12)] for col in 'ABC'},
    index=[0, 1, 2, 3],
)

In [72]:
# Display df1 (row labels 0-3).
df1

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3


In [66]:
# Display df2 (row labels 3-6; only label 3 also appears in df1).
df2

Unnamed: 0,A,B,C
3,A4,B4,C4
4,A5,B5,C5
5,A6,B6,C6
6,A7,B7,C7


In [67]:
# Display df3 (row labels 0-3, the same labels as df1).
df3

Unnamed: 0,A,B,C
0,A8,B8,C8
1,A9,B9,C9
2,A10,B10,C10
3,A11,B11,C11


#### 데이터 프레임들을 항상 리스트로 만들어 입력
#### 기본은 row 방향(axis=0)으로 이어붙이며, axis=1로 설정하면 컬럼 방향으로 이어붙일 수 있다

In [78]:
# axis=0 is the default: frames are matched up by column name and their rows
# are stacked one after another. The original row labels are kept, so
# duplicate labels can appear in the result.
pd.concat(objs=[df1, df2, df3], axis=0)

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3
3,A4,B4,C4
4,A5,B5,C5
5,A6,B6,C6
6,A7,B7,C7
0,A8,B8,C8
1,A9,B9,C9


In [79]:
# axis=1: frames are matched up by row label and their columns are placed
# side by side. A row label missing from a frame yields NaN in that frame's columns.
pd.concat(
    objs=[df1, df2, df3],
    axis=1,
)

Unnamed: 0,A,B,C,A.1,B.1,C.1,A.2,B.2,C.2
0,A0,B0,C0,,,,A8,B8,C8
1,A1,B1,C1,,,,A9,B9,C9
2,A2,B2,C2,,,,A10,B10,C10
3,A3,B3,C3,A4,B4,C4,A11,B11,C11
4,,,,A5,B5,C5,,,
5,,,,A6,B6,C6,,,
6,,,,A7,B7,C7,,,


#### ignore_index : 기존 index를 무시하고 0부터 새로 번호를 매긴다.

In [77]:
# ignore_index=True drops the original row labels and numbers the
# stacked rows consecutively starting from 0.
pd.concat(objs=[df1, df2, df3], ignore_index=True)

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3
4,A4,B4,C4
5,A5,B5,C5
6,A6,B6,C6
7,A7,B7,C7
8,A8,B8,C8
9,A9,B9,C9


#### join : 'inner' - 모든 데이터프레임에 공통으로 존재하는 컬럼만 남기고 합친다

In [87]:
# Rename df3's column C to Z so that df3 no longer has a column named C.
df3 = df3.rename(columns={'C': 'Z'})
# join='inner' keeps only the columns present in every frame (A and B here);
# C (absent from df3) and Z (absent from df1/df2) are left out of the result.
pd.concat(objs=[df1, df2, df3], join='inner')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
3,A4,B4
4,A5,B5
5,A6,B6
6,A7,B7
0,A8,B8
1,A9,B9


#### 참고 : [ DataFrame ].append - 빈 데이터프레임에도 가능 (단, pandas 1.4에서 deprecated 되었고 2.0에서 제거됨 → pd.concat 사용 권장)

In [85]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so df1.append(df2, ignore_index=True) raises AttributeError on current versions.
# pd.concat over the two frames produces the same row-wise result;
# ignore_index=True renumbers the combined rows from 0.
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3
4,A4,B4,C4
5,A5,B5,C5
6,A6,B6,C6
7,A7,B7,C7


## pd.merge - 데이터베이스 JOIN을 구현

In [90]:
# Score records: one dict per (name, subject) pair.
data1 = [
    {'name': 'kevin', 'score': 70, 'subject': 'math'},
    {'name': 'terry', 'score': 80, 'subject': 'english'},
    {'name': 'kevin', 'score': 90, 'subject': 'english'},
    {'name': 'terry', 'score': 60, 'subject': 'math'},
]
# Age records: one dict per name. 'john' has no matching record in data1.
data2 = [
    {'name': 'kevin', 'age': 23},
    {'name': 'terry', 'age': 24},
    {'name': 'john', 'age': 30},
]

In [91]:
# Build the score table from the list of per-row records (one dict per row).
# This rebinds df1: from here on it is no longer the A/B/C frame created earlier.
df1 = pd.DataFrame(data1)
df1

Unnamed: 0,name,score,subject
0,kevin,70,math
1,terry,80,english
2,kevin,90,english
3,terry,60,math


In [92]:
# Build the age table from the list of per-row records (one dict per row).
# This rebinds df2: from here on it is no longer the A/B/C frame created earlier.
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,age,name
0,23,kevin
1,24,terry
2,30,john


### how : JOIN 방법 ( left, right, inner, outer ) , on :  기준 컬럼

#### 기본적으로 컬럼명이 같을 경우

In [37]:
# Inner join on the shared 'name' column: only names that occur in both
# frames are kept, so 'john' (present only in df2) is dropped.
pd.merge(left=df1, right=df2, on='name', how='inner')

Unnamed: 0,name,score,subject,age
0,kevin,70,math,23
1,kevin,90,english,23
2,terry,80,english,24
3,terry,60,math,24


In [45]:
# Right join on 'name': every row of df2 is kept. For a name with no match
# in df1 ('john'), the columns coming from df1 are filled with NaN.
pd.merge(left=df1, right=df2, on='name', how='right')

Unnamed: 0,name,score,subject,age
0,kevin,70.0,math,23
1,kevin,90.0,english,23
2,terry,80.0,english,24
3,terry,60.0,math,24
4,john,,,30


In [55]:
# Full outer join: rows from both frames are kept.
# No `on` is given, so merge joins on the columns the two frames have in
# common (only 'name' here).
pd.merge(left=df1, right=df2, how='outer')

Unnamed: 0,name,score,subject,age
0,kevin,70.0,math,23
1,kevin,90.0,english,23
2,terry,80.0,english,24
3,terry,60.0,math,24
4,john,,,30


#### value의 의미는 같지만 컬럼명이 다른 경우

In [93]:
# Rename df2's 'name' column to 'id' so the two frames no longer share a
# key column name.
df2 = df2.rename(columns={'name': 'id'})
df2

Unnamed: 0,age,id
0,23,kevin
1,24,terry
2,30,john


In [95]:
# The key columns are named differently in each frame, so name them per side:
# left_on is the key column in df1, right_on is the key column in df2.
# Both key columns ('name' and 'id') are kept in the result.
pd.merge(
    left=df1,
    right=df2,
    how='inner',
    left_on='name',
    right_on='id',
)

Unnamed: 0,name,score,subject,age,id
0,kevin,70,math,23,kevin
1,kevin,90,english,23,kevin
2,terry,80,english,24,terry
3,terry,60,math,24,terry
