# Merging DataFrames with pandas

In [1]:
import pandas as pd
import numpy as np

## 1. Reading multiple data files

In [2]:
# Sales files, one per medal tier; they are read from the data/ directory.
filenames = ['sales_{}.csv'.format(medal) for medal in ('gold', 'silver', 'bronze')]

In [3]:
# 1. Loop: read every sales file under data/ into its own DataFrame.
dataframes = []
for name in filenames:
    frame = pd.read_csv('data/' + name)
    dataframes.append(frame)

# Preview the first file that was loaded.
dataframes[0].head()

Unnamed: 0,item,total
0,apple,500
1,orange,300
2,peach,200
3,berry,100


In [4]:
# 2. Comprehension: load every sales file under data/ in a single expression.
dataframes = [pd.read_csv('data/{}'.format(name)) for name in filenames]

# Preview the first file that was loaded.
dataframes[0].head()

Unnamed: 0,item,total
0,apple,500
1,orange,300
2,peach,200
3,berry,100


In [5]:
# 3. glob
from glob import glob

In [6]:
# Discover every sales file under data/ instead of listing the names by hand.
# glob() returns matches in arbitrary, platform-dependent order, so sort the
# paths to make positional access such as dataframes[0] reproducible.
filenames = sorted(glob('data/sales*.csv'))
dataframes = [pd.read_csv(f) for f in filenames]

# Preview the first file in sorted order.
dataframes[0].head()

Unnamed: 0,month,total
0,apple,600
1,orange,750
2,peach,570
3,berry,210


In [7]:
# combine

In [8]:
filenames

['data\\sales_bronze.csv', 'data\\sales_gold.csv', 'data\\sales_silver.csv']

In [9]:
# Pick each medal's frame by file name rather than by list position, so the
# column labels stay correct whatever order the paths were discovered in.
# A missing file surfaces as a KeyError naming the medal.
frames_by_medal = {
    medal: frame
    for path, frame in zip(filenames, dataframes)
    for medal in ('gold', 'silver', 'bronze')
    if path.endswith('sales_' + medal + '.csv')
}

# Start from the gold file: its two columns become 'item' and 'gold'.
items = frames_by_medal['gold'].copy()
items.columns = ['item', 'gold']
# Column assignment aligns on the row index, so this assumes all three files
# list the items in the same row order -- TODO confirm against the data files.
items['silver'] = frames_by_medal['silver']['total']
items['bronze'] = frames_by_medal['bronze']['total']
items

Unnamed: 0,item,gold,silver,bronze
0,apple,500,700,600
1,orange,300,550,750
2,peach,200,400,570
3,berry,100,180,210


### index & sort

In [10]:
# Promote the 'item' column to the row index so rows are labelled by item name.
items2 = items.set_index('item')
items2

Unnamed: 0_level_0,gold,silver,bronze
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
apple,500,700,600
orange,300,550,750
peach,200,400,570
berry,100,180,210


In [11]:
# Show the rows ordered alphabetically by index label (item name).
# The result is only displayed; items2 keeps its original row order.
items2.sort_index()

Unnamed: 0_level_0,gold,silver,bronze
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
apple,500,700,600
berry,100,180,210
orange,300,550,750
peach,200,400,570


In [12]:
# Show the rows ordered by ascending 'gold' value.
items2.sort_values('gold')

Unnamed: 0_level_0,gold,silver,bronze
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
berry,100,180,210
peach,200,400,570
orange,300,550,750
apple,500,700,600


In [13]:
# Reindexing DataFrame from a list

In [14]:
# Labels to reindex by; 'banana' and 'mango' have no row in items2.
itemlist = ['apple', 'banana', 'berry', 'orange', 'peach', 'mango']
# Rows come out in itemlist order. Labels missing from items2 become all-NaN
# rows, which is why the numeric columns are displayed as floats.
items3 = items2.reindex(index=itemlist)
items3

Unnamed: 0_level_0,gold,silver,bronze
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
apple,500.0,700.0,600.0
banana,,,
berry,100.0,180.0,210.0
orange,300.0,550.0,750.0
peach,200.0,400.0,570.0
mango,,,


In [15]:
# Reindexing using another DataFrame Index

In [16]:
# Reindex with items2's own index: this drops the all-NaN 'banana'/'mango' rows
# and restores items2's row order. Values stay float, as they were in items3.
items4 = items3.reindex(items2.index)
items4

Unnamed: 0_level_0,gold,silver,bronze
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
apple,500.0,700.0,600.0
orange,300.0,550.0,750.0
peach,200.0,400.0,570.0
berry,100.0,180.0,210.0


### Arithmetic with Series & DataFrames

In [17]:
# Daily temperatures in Fahrenheit, one row per date string.
df = pd.DataFrame(
    {
        'Date': ['2017-01-10', '2017-01-11', '2017-01-12'],
        'TemperatureF': [32, 25, 34],
    }
)
# Label the rows by date so later arithmetic touches only the temperatures.
temps_f = df.set_index(keys='Date')
temps_f

Unnamed: 0_level_0,TemperatureF
Date,Unnamed: 1_level_1
2017-01-10,32
2017-01-11,25
2017-01-12,34


In [18]:
# Print a summary of the frame: index range, column dtype, non-null count and
# memory usage.
temps_f.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 2017-01-10 to 2017-01-12
Data columns (total 1 columns):
TemperatureF    3 non-null int64
dtypes: int64(1)
memory usage: 48.0+ bytes


In [19]:
# Convert Fahrenheit to Celsius, C = (F - 32) * 5 / 9, rounded to 2 decimals.
celsius = (temps_f - 32) * 5 / 9
temps_c = celsius.round(2)
# Swap 'F' for 'C' in the column names so the label matches the new unit
# (TemperatureF -> TemperatureC).
temps_c.columns = temps_c.columns.str.replace('F', 'C')
temps_c

Unnamed: 0_level_0,TemperatureC
Date,Unnamed: 1_level_1
2017-01-10,0.0
2017-01-11,-3.89
2017-01-12,1.11


## 2. Concatenating data

* `result = s1.append(s2).append(s3)` (note: `append` was deprecated in pandas 1.4 and removed in 2.0)
* `result = pd.concat([s1, s2, s3])`

In [20]:
# Store counts for three areas.
df1 = pd.DataFrame(
    {
        'area': ['CT', 'MA', 'NY'],
        'stores': [320, 205, 845],
    }
)
# Label the rows by area code.
df_area1 = df1.set_index(keys='area')
df_area1

Unnamed: 0_level_0,stores
area,Unnamed: 1_level_1
CT,320
MA,205
NY,845


In [21]:
# Store counts for three further areas; none overlap with df_area1's labels.
df2 = pd.DataFrame(
    {
        'area': ['FL', 'GA', 'AL'],
        'stores': [90, 115, 25],
    }
)
# Label the rows by area code.
df_area2 = df2.set_index(keys='area')
df_area2

Unnamed: 0_level_0,stores
area,Unnamed: 1_level_1
FL,90
GA,115
AL,25


In [22]:
# append

In [23]:
# Stack df_area2's rows beneath df_area1's. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat performs the same row-wise stacking
# and works on both old and new pandas.
store = pd.concat([df_area1, df_area2])
store

Unnamed: 0_level_0,stores
area,Unnamed: 1_level_1
CT,320
MA,205
NY,845
FL,90
GA,115
AL,25


In [24]:
# concat

In [25]:
# Row-wise concatenation (the default axis): df_area2's rows follow
# df_area1's, each keeping its own index label.
store = pd.concat([df_area1, df_area2])
store

Unnamed: 0_level_0,stores
area,Unnamed: 1_level_1
CT,320
MA,205
NY,845
FL,90
GA,115
AL,25


In [26]:
# When the DataFrames have different columns

In [27]:
df_area1

Unnamed: 0_level_0,stores
area,Unnamed: 1_level_1
CT,320
MA,205
NY,845


In [28]:
# Member counts per area; shares the 'NY' label with df_area1 but carries a
# different column ('members' instead of 'stores').
df3 = pd.DataFrame(
    {
        'area': ['FL', 'NY', 'AL'],
        'members': [90, 115, 25],
    }
)
# Label the rows by area code.
df_area3 = df3.set_index(keys='area')
df_area3

Unnamed: 0_level_0,members
area,Unnamed: 1_level_1
FL,90
NY,115
AL,25


In [29]:
# Row-wise stacking of frames with different columns: the result holds the
# union of the columns, with NaN where a frame lacks one. An index label that
# exists in both frames (here 'NY') appears twice. DataFrame.append was
# removed in pandas 2.0, so the equivalent pd.concat call is used instead.
store = pd.concat([df_area1, df_area3], sort=False)
store

Unnamed: 0_level_0,stores,members
area,Unnamed: 1_level_1,Unnamed: 2_level_1
CT,320.0,
MA,205.0,
NY,845.0,
FL,,90.0
NY,,115.0
AL,,25.0


In [30]:
# Column-wise concatenation aligns rows on the index; labels missing from one
# frame get NaN in that frame's column.
store = pd.concat([df_area1, df_area3], axis=1, sort=False)    # rows with the same index label are combined into one.
store

Unnamed: 0,stores,members
CT,320.0,
MA,205.0,
NY,845.0,115.0
FL,,90.0
AL,,25.0


### Concat using multi-index on rows / columns

In [31]:
# Copy of df_area1 that serves as the 2015 store counts.
df2015 = df_area1.copy()
df2015

Unnamed: 0_level_0,stores
area,Unnamed: 1_level_1
CT,320
MA,205
NY,845


In [32]:
# 2016 store counts for the same three areas as df2015.
df1 = pd.DataFrame(
    {
        'area': ['CT', 'MA', 'NY'],
        'stores': [839, 560, 745],
    }
)
# Label the rows by area code.
df2016 = df1.set_index(keys='area')
df2016

Unnamed: 0_level_0,stores
area,Unnamed: 1_level_1
CT,839
MA,560
NY,745


In [33]:
# Stack the two years row-wise. keys= adds an outer index level (2015 / 2016)
# above 'area', producing a two-level row index.
df_store = pd.concat([df2015, df2016], keys=[2015,2016], axis=0)
df_store

Unnamed: 0_level_0,Unnamed: 1_level_0,stores
Unnamed: 0_level_1,area,Unnamed: 2_level_1
2015,CT,320
2015,MA,205
2015,NY,845
2016,CT,839
2016,MA,560
2016,NY,745


In [34]:
# Place the two years side by side. keys= becomes the outer column level
# (2015 / 2016) above each frame's 'stores' column; rows align on 'area'.
df_store = pd.concat([df2015, df2016], keys=[2015,2016], axis='columns')
df_store

Unnamed: 0_level_0,2015,2016
Unnamed: 0_level_1,stores,stores
area,Unnamed: 1_level_2,Unnamed: 2_level_2
CT,320,839
MA,205,560
NY,845,745


### concat with Dictionary

In [35]:
# Map each year to its frame. pd.concat uses the dict keys as the outer column
# level, giving the same layout as a list of frames plus keys=[2015, 2016].
store_dic = {2015: df2015, 2016: df2016}
df_store = pd.concat(store_dic, axis=1)
df_store

Unnamed: 0_level_0,2015,2016
Unnamed: 0_level_1,stores,stores
area,Unnamed: 1_level_2,Unnamed: 2_level_2
CT,320,839
MA,205,560
NY,845,745


### inner join / outer join

In [36]:
df_area1

Unnamed: 0_level_0,stores
area,Unnamed: 1_level_1
CT,320
MA,205
NY,845


In [37]:
df_area3

Unnamed: 0_level_0,members
area,Unnamed: 1_level_1
FL,90
NY,115
AL,25


In [38]:
# Inner join on the index: keep only labels present in both frames ('NY').
pd.concat([df_area1, df_area3], axis=1, join='inner')

Unnamed: 0_level_0,stores,members
area,Unnamed: 1_level_1,Unnamed: 2_level_1
NY,845,115


In [39]:
# Outer join on the index: keep every label from either frame, with NaN where
# a frame has no row for that label.
pd.concat([df_area1, df_area3], axis=1, join='outer', sort=False)

Unnamed: 0,stores,members
CT,320.0,
MA,205.0,
NY,845.0,115.0
FL,,90.0
AL,,25.0


## 3. Merging DataFrames

In [40]:
# pd.merge: joins DataFrames that have no index set, matching rows on column values.

### 컬럼명이 서로 다른 경우

In [41]:
# Store counts per area, kept as plain columns (default integer index).
df1 = pd.DataFrame(
    {
        'area': ['CT', 'MA', 'NY'],
        'stores': [320, 205, 845],
    }
)
df1

Unnamed: 0,area,stores
0,CT,320
1,MA,205
2,NY,845


In [42]:
# Member counts per area, kept as plain columns (default integer index).
df2 = pd.DataFrame(
    {
        'area': ['FL', 'NY', 'AL'],
        'members': [90, 115, 25],
    }
)
df2

Unnamed: 0,area,members
0,FL,90
1,NY,115
2,AL,25


In [43]:
# With no on= argument, merge joins on the column the frames share ('area').
# Only 'NY' occurs in both, so the result has a single row.
pd.merge(df1, df2)

Unnamed: 0,area,stores,members
0,NY,845,115


### 컬럼명이 서로 같은 경우

In [44]:
# Two yearly frames with identical column names and identical areas.
df2015 = pd.DataFrame(
    {
        'area': ['CT', 'MA', 'NY'],
        'stores': [320, 205, 845],
    }
)
df2016 = pd.DataFrame(
    {
        'area': ['CT', 'MA', 'NY'],
        'stores': [900, 115, 250],
    }
)

In [45]:
# With no on= argument, merge joins on every shared column ('area' and
# 'stores'). No row matches on both, so the result is empty.
pd.merge(df2015, df2016)

Unnamed: 0,area,stores


In [46]:
# Join on 'area' only; the clashing 'stores' columns receive the default
# '_x' (left) and '_y' (right) suffixes.
pd.merge(df2015, df2016, on='area')

Unnamed: 0,area,stores_x,stores_y
0,CT,320,900
1,MA,205,115
2,NY,845,250


In [47]:
# Same join, with explicit suffixes so the overlapping columns are labelled
# by year instead of '_x' / '_y'.
pd.merge(df2015, df2016, on='area', suffixes=['_2015', '_2016'])

Unnamed: 0,area,stores_2015,stores_2016
0,CT,320,900
1,MA,205,115
2,NY,845,250


### 기준이 되는 컬럼명이 서로 다른 경우

In [48]:
# Two frames whose key columns hold the same values under different names
# ('area' versus 'state').
df1 = pd.DataFrame(
    {
        'area': ['CT', 'MA', 'NY'],
        'stores': [320, 205, 845],
    }
)
df2 = pd.DataFrame(
    {
        'state': ['CT', 'MA', 'NY'],
        'stores': [900, 115, 250],
    }
)

In [49]:
# Join when the key column is named differently on each side. Both key
# columns ('area' and 'state') are kept in the result.
pd.merge(df1, df2, left_on='area', right_on='state')

Unnamed: 0,area,stores_x,state,stores_y
0,CT,320,CT,900
1,MA,205,MA,115
2,NY,845,NY,250


### Merging with inner / left / outer join

In [50]:
# Yearly frames that only partly overlap: 'MA' exists only in 2015 and 'TX'
# only in 2016.
df2015 = pd.DataFrame(
    {
        'area': ['NY', 'MA', 'CT'],
        'stores': [320, 205, 845],
    }
)
df2016 = pd.DataFrame(
    {
        'area': ['CT', 'TX', 'NY'],
        'stores': [900, 115, 250],
    }
)

In [51]:
# Default (inner) join: keep only areas present in both years ('NY' and 'CT').
pd.merge(df2015, df2016, on='area', suffixes=['_2015', '_2016'])   # inner

Unnamed: 0,area,stores_2015,stores_2016
0,NY,320,250
1,CT,845,900


In [52]:
# Left join: keep every 2015 area; areas with no 2016 row ('MA') get NaN in
# stores_2016.
pd.merge(df2015, df2016, on='area', suffixes=['_2015', '_2016'], how='left')

Unnamed: 0,area,stores_2015,stores_2016
0,NY,320,250.0
1,MA,205,
2,CT,845,900.0


In [53]:
# Right join: keep every 2016 area; areas with no 2015 row ('TX') get NaN in
# stores_2015.
pd.merge(df2015, df2016, on='area', suffixes=['_2015', '_2016'], how='right')

Unnamed: 0,area,stores_2015,stores_2016
0,NY,320.0,250
1,CT,845.0,900
2,TX,,115


In [54]:
# Outer join: keep every area from either year, with NaN on the side that has
# no matching row.
pd.merge(df2015, df2016, on='area', suffixes=['_2015', '_2016'], how='outer')

Unnamed: 0,area,stores_2015,stores_2016
0,NY,320.0,250.0
1,MA,205.0,
2,CT,845.0,900.0
3,TX,,115.0


In [55]:
# Ordered Merge

In [56]:
# Ordered outer merge: the same rows as the outer join, but returned sorted by
# the key column 'area'.
pd.merge_ordered(df2015, df2016, on='area', suffixes=['_2015', '_2016'], how='outer')

Unnamed: 0,area,stores_2015,stores_2016
0,CT,845.0,900.0
1,MA,205.0,
2,NY,320.0,250.0
3,TX,,115.0
