## pandas DataFrame : concat

In [1]:
import pandas as pd
import numpy as np

#### concat 함수 사용하여 dataframe 병합하기
 - pandas.concat 함수
 - 축을 따라 dataframe을 병합 가능
   - 기본 axis = 0 -> 행단위 병합

* column명이 같은 경우

In [2]:
# Seed NumPy's global generator so that Restart & Run All reproduces the same
# random values in every cell that draws from np.random.* from here on.
np.random.seed(0)

# Two frames with identical column names, used by the concat examples below.
df1 = pd.DataFrame({'key1': np.arange(10), 'value1': np.random.randn(10)})
df2 = pd.DataFrame({'key1': np.arange(10), 'value1': np.random.randn(10)})

In [3]:
# Display df2 so its contents can be inspected.
df2

Unnamed: 0,key1,value1
0,0,-1.526023
1,1,0.005561
2,2,0.808193
3,3,0.771036
4,4,0.562787
5,5,-0.86115
6,6,-1.798224
7,7,-0.914244
8,8,-1.04657
9,9,-0.268861


In [4]:
# Stack df2 underneath df1 (row-wise, axis=0). ignore_index=True renumbers the
# result from 0 instead of keeping each frame's own index labels.
pd.concat([df1, df2], axis=0, ignore_index=True)

Unnamed: 0,key1,value1
0,0,0.597299
1,1,0.910137
2,2,-1.142114
3,3,-0.713986
4,4,-0.112986
5,5,-0.737238
6,6,0.828065
7,7,0.086722
8,8,-0.222503
9,9,-0.191225


In [5]:
# Place df1 and df2 side by side (column-wise); rows are aligned on the index.
pd.concat([df1, df2], axis='columns')

Unnamed: 0,key1,value1,key1.1,value1.1
0,0,0.597299,0,-1.526023
1,1,0.910137,1,0.005561
2,2,-1.142114,2,0.808193
3,3,-0.713986,3,0.771036
4,4,-0.112986,4,0.562787
5,5,-0.737238,5,-0.86115
6,6,0.828065,6,-1.798224
7,7,0.086722,7,-0.914244
8,8,-0.222503,8,-1.04657
9,9,-0.191225,9,-0.268861


* column 명이 다른 경우 

In [6]:
# Same number of rows as df1, but with different column names (key2 / value2).
df3 = pd.DataFrame(
    data={'key2': np.arange(10), 'value2': np.random.randn(10)},
)

In [7]:
# Column-wise concat of frames whose column names do not overlap.
pd.concat([df1, df3], axis='columns')

Unnamed: 0,key1,value1,key2,value2
0,0,0.597299,0,1.224742
1,1,0.910137,1,1.522369
2,2,-1.142114,2,-0.689215
3,3,-0.713986,3,-0.013529
4,4,-0.112986,4,-1.218712
5,5,-0.737238,5,0.554787
6,6,0.828065,6,0.863462
7,7,0.086722,7,1.258892
8,8,-0.222503,8,1.179902
9,9,-0.191225,9,-1.013785


## pandas DataFrame :  merge & join

#### dataframe merge
 - SQL의 join처럼 특정한 column을 기준으로 병합
   - join 방식: how 파라미터를 통해 명시
     - inner: 기본값, 양쪽 모두에 일치하는 키 값이 있는 행만 포함
     - left: left outer join
     - right: right outer join
     - outer: full outer join
     
 - pandas.merge 함수가 사용됨

In [8]:
# Customer master table: one row per customer, with customer_id running 0-5.
customer = pd.DataFrame({
    'customer_id': np.arange(6),
    'name': ['철수', '영희', '길동', '영수', '수민', '동건'],
    '나이': [40, 20, 21, 30, 31, 18],
})

customer

Unnamed: 0,customer_id,name,나이
0,0,철수,40
1,1,영희,20
2,2,길동,21
3,3,영수,30
4,4,수민,31
5,5,동건,18


In [9]:
# Order log, one (customer_id, item, quantity) record per purchase.
# customer_id 9 does not appear in `customer`, and customers 0 and 5 have no
# orders, which is what makes the join types below give different results.
orders = pd.DataFrame(
    [
        (1, '치약', 1),
        (1, '칫솔', 2),
        (2, '이어폰', 1),
        (2, '헤드셋', 1),
        (2, '수건', 3),
        (3, '생수', 2),
        (3, '수건', 2),
        (1, '치약', 3),
        (4, '생수', 2),
        (9, '케이스', 1),
    ],
    columns=['customer_id', 'item', 'quantity'],
)
orders.head()

Unnamed: 0,customer_id,item,quantity
0,1,치약,1
1,1,칫솔,2
2,2,이어폰,1
3,2,헤드셋,1
4,2,수건,3


* on 
 - join 대상이 되는 column 명시

In [10]:
# Inner join: keep only customer_id values present in both frames.
pd.merge(left=customer, right=orders, how='inner', on='customer_id')

Unnamed: 0,customer_id,name,나이,item,quantity
0,1,영희,20,치약,1
1,1,영희,20,칫솔,2
2,1,영희,20,치약,3
3,2,길동,21,이어폰,1
4,2,길동,21,헤드셋,1
5,2,길동,21,수건,3
6,3,영수,30,생수,2
7,3,영수,30,수건,2
8,4,수민,31,생수,2


In [11]:
# Left outer join: keep every customer; those without orders get NaN for
# item / quantity.
pd.merge(left=customer, right=orders, how='left', on='customer_id')

Unnamed: 0,customer_id,name,나이,item,quantity
0,0,철수,40,,
1,1,영희,20,치약,1.0
2,1,영희,20,칫솔,2.0
3,1,영희,20,치약,3.0
4,2,길동,21,이어폰,1.0
5,2,길동,21,헤드셋,1.0
6,2,길동,21,수건,3.0
7,3,영수,30,생수,2.0
8,3,영수,30,수건,2.0
9,4,수민,31,생수,2.0


In [12]:
# Right outer join: keep every order; orders whose customer_id is missing from
# `customer` get NaN for the customer columns.
pd.merge(left=customer, right=orders, how='right', on='customer_id')

Unnamed: 0,customer_id,name,나이,item,quantity
0,1,영희,20.0,치약,1
1,1,영희,20.0,칫솔,2
2,1,영희,20.0,치약,3
3,2,길동,21.0,이어폰,1
4,2,길동,21.0,헤드셋,1
5,2,길동,21.0,수건,3
6,3,영수,30.0,생수,2
7,3,영수,30.0,수건,2
8,4,수민,31.0,생수,2
9,9,,,케이스,1


In [13]:
# Full outer join: keep rows from both sides, filling unmatched columns with NaN.
pd.merge(left=customer, right=orders, how='outer', on='customer_id')

Unnamed: 0,customer_id,name,나이,item,quantity
0,0,철수,40.0,,
1,1,영희,20.0,치약,1.0
2,1,영희,20.0,칫솔,2.0
3,1,영희,20.0,치약,3.0
4,2,길동,21.0,이어폰,1.0
5,2,길동,21.0,헤드셋,1.0
6,2,길동,21.0,수건,3.0
7,3,영수,30.0,생수,2.0
8,3,영수,30.0,수건,2.0
9,4,수민,31.0,생수,2.0


* index 기준으로 join하기

In [14]:
# Move customer_id from a regular column into the index of each frame so the
# following cells can join on the index instead of on a column.
cust1, order1 = (frame.set_index('customer_id') for frame in (customer, orders))

In [15]:
# Display cust1: the customer table with customer_id as its index.
cust1

Unnamed: 0_level_0,name,나이
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,철수,40
1,영희,20
2,길동,21
3,영수,30
4,수민,31
5,동건,18


In [16]:
# Display order1: the order table with customer_id as its (non-unique) index.
order1

Unnamed: 0_level_0,item,quantity
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,치약,1
1,칫솔,2
2,이어폰,1
2,헤드셋,1
2,수건,3
3,생수,2
3,수건,2
1,치약,3
4,생수,2
9,케이스,1


In [17]:
# Join on the index of both frames instead of a column; how='inner' is the
# pd.merge default, spelled out here for clarity.
pd.merge(
    left=cust1,
    right=order1,
    how='inner',
    left_index=True,
    right_index=True,
)

Unnamed: 0_level_0,name,나이,item,quantity
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,영희,20,치약,1
1,영희,20,칫솔,2
1,영희,20,치약,3
2,길동,21,이어폰,1
2,길동,21,헤드셋,1
2,길동,21,수건,3
3,영수,30,생수,2
3,영수,30,수건,2
4,수민,31,생수,2


#### 연습문제
1. 가장 많이 팔린 아이템은?
2. 영희가 가장 많이 구매한 아이템은?

In [18]:
# Exercise 1: total units sold per item, best seller first.
# numeric_only=True restricts the aggregation to numeric columns; without it,
# pandas 2.x also "sums" (string-concatenates) the `name` column.
# Only `quantity` answers the question; the customer_id and age sums shown
# alongside it carry no meaning.
(
    pd.merge(customer, orders, on='customer_id')
    .groupby('item')
    .sum(numeric_only=True)
    .sort_values(by='quantity', ascending=False)
)

Unnamed: 0_level_0,customer_id,나이,quantity
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
수건,5,51,5
생수,7,61,4
치약,2,40,4
칫솔,1,20,2
이어폰,2,21,1
헤드셋,2,21,1


In [19]:
# Exercise 2: units bought per item by the customer named '영희'.
# Select `quantity` before aggregating so unrelated columns are not summed,
# then sort descending so the most-purchased item is always the first row
# rather than depending on the alphabetical order of the group keys.
(
    pd.merge(customer, orders, on='customer_id')
    .groupby(['name', 'item'])['quantity']
    .sum()
    .loc['영희']
    .sort_values(ascending=False)
)

item
치약    4
칫솔    2
Name: quantity, dtype: int64

#### join 함수
 - 내부적으로 pandas.merge 함수 사용
 - 기본적으로 index를 기준으로 left join (how 파라미터로 join 방식 변경 가능)

In [20]:
# DataFrame.join matches rows on the index; how='inner' is passed explicitly
# here to keep only customer_id values found in both frames.
cust1.join(other=order1, how='inner')

Unnamed: 0_level_0,name,나이,item,quantity
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,영희,20,치약,1
1,영희,20,칫솔,2
1,영희,20,치약,3
2,길동,21,이어폰,1
2,길동,21,헤드셋,1
2,길동,21,수건,3
3,영수,30,생수,2
3,영수,30,수건,2
4,수민,31,생수,2
