In [3]:
# rfriend pandas study
import numpy as np
import pandas as pd
from pandas import DataFrame as df


In [6]:
df_1 = df(data=np.arange(12).reshape(3, 4),
    index=['r0', 'r1', 'r2'], # Will default to np.arange(n) if no indexing
    columns=['c0', 'c1', 'c2', 'c3'],
    dtype='int', # Data type to force, otherwise infer
    copy=False) # Copy data from inputs
df_1

Unnamed: 0,c0,c1,c2,c3
r0,0,1,2,3
r1,4,5,6,7
r2,8,9,10,11


In [7]:
df_1.T

Unnamed: 0,r0,r1,r2
c0,0,4,8
c1,1,5,9
c2,2,6,10
c3,3,7,11


In [8]:
df_1.axes

[Index(['r0', 'r1', 'r2'], dtype='object'),
 Index(['c0', 'c1', 'c2', 'c3'], dtype='object')]

In [9]:
df_1.dtypes

c0    int32
c1    int32
c2    int32
c3    int32
dtype: object

In [10]:
df_1.index

Index(['r0', 'r1', 'r2'], dtype='object')

In [14]:
df_1.head(2)

Unnamed: 0,c0,c1,c2,c3
r0,0,1,2,3
r1,4,5,6,7


In [15]:
df_1.columns

Index(['c0', 'c1', 'c2', 'c3'], dtype='object')

In [17]:
df_1['c0']

r0    0
r1    4
r2    8
Name: c0, dtype: int32

In [18]:
df_1['r0'] # 열 기준으로는 안됨

KeyError: 'r0'

In [None]:
# [Python pandas] DataFrame의 index 재설정(reindex) 와 결측값 채우기(fill in missing values)
# 출처: https://rfriend.tistory.com/255?category=675917 [R, Python 분석과 프로그래밍의 친구 (by R Friend)]

In [27]:
new_idx= ['r0', 'r1', 'r2', 'r5', 'r6']
df_2 = df_1
df_2

Unnamed: 0,c0,c1,c2,c3
r0,0,1,2,3
r1,4,5,6,7
r2,8,9,10,11


In [28]:
df_2.reindex(new_idx, fill_value=0)

Unnamed: 0,c0,c1,c2,c3
r0,0,1,2,3
r1,4,5,6,7
r2,8,9,10,11
r5,0,0,0,0
r6,0,0,0,0


In [29]:
df_2.reindex(new_idx, fill_value='missing')

Unnamed: 0,c0,c1,c2,c3
r0,0,1,2,3
r1,4,5,6,7
r2,8,9,10,11
r5,missing,missing,missing,missing
r6,missing,missing,missing,missing


In [30]:
df_2.reindex(new_idx, fill_value='NA')

Unnamed: 0,c0,c1,c2,c3
r0,0.0,1.0,2.0,3.0
r1,4.0,5.0,6.0,7.0
r2,8.0,9.0,10.0,11.0
r5,,,,
r6,,,,


In [32]:
date_idx = pd.date_range('11/27/2016', periods=5, freq='D')
date_idx

DatetimeIndex(['2016-11-27', '2016-11-28', '2016-11-29', '2016-11-30',
               '2016-12-01'],
              dtype='datetime64[ns]', freq='D')

In [34]:
df_3 = pd.DataFrame({"c1": [10, 20, 30, 40, 50]}, index=date_idx)
df_3

Unnamed: 0,c1
2016-11-27,10
2016-11-28,20
2016-11-29,30
2016-11-30,40
2016-12-01,50


In [35]:
date_idx_3 = pd.date_range('11/25/2016', periods=10, freq='D')
df_3.reindex(date_idx_3)

Unnamed: 0,c1
2016-11-25,
2016-11-26,
2016-11-27,10.0
2016-11-28,20.0
2016-11-29,30.0
2016-11-30,40.0
2016-12-01,50.0
2016-12-02,
2016-12-03,
2016-12-04,


In [36]:
df_3.reindex(date_idx_3, method='ffill') # forward-propagation

Unnamed: 0,c1
2016-11-25,
2016-11-26,
2016-11-27,10.0
2016-11-28,20.0
2016-11-29,30.0
2016-11-30,40.0
2016-12-01,50.0
2016-12-02,50.0
2016-12-03,50.0
2016-12-04,50.0


In [37]:
df_3.reindex(date_idx_3, method='ffill') # forward-propagation

Unnamed: 0,c1
2016-11-25,
2016-11-26,
2016-11-27,10.0
2016-11-28,20.0
2016-11-29,30.0
2016-11-30,40.0
2016-12-01,50.0
2016-12-02,50.0
2016-12-03,50.0
2016-12-04,50.0


In [38]:
df_3.reindex(date_idx_3, method='bfill') # forward-propagation

Unnamed: 0,c1
2016-11-25,10.0
2016-11-26,10.0
2016-11-27,10.0
2016-11-28,20.0
2016-11-29,30.0
2016-11-30,40.0
2016-12-01,50.0
2016-12-02,
2016-12-03,
2016-12-04,


In [39]:
# [Python pandas] 여러개의 동일한 형태 DataFrame 합치기 : pd.concat()
# 출처: https://rfriend.tistory.com/256?category=675917 [R, Python 분석과 프로그래밍의 친구 (by R Friend)]

NameError: name 'reindex' is not defined

In [41]:
df_1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2'],
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2']},
    index=[0, 1, 2])
df_1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2


In [42]:
df_2 = pd.DataFrame({'A': ['A3', 'A4', 'A5'],
    'B': ['B3', 'B4', 'B5'],
    'C': ['C3', 'C4', 'C5'],
    'D': ['D3', 'D4', 'D5']},
    index=[3, 4, 5])
df_2

Unnamed: 0,A,B,C,D
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5


In [47]:
df_12_axis0 = pd.concat([df_1, df_2]) # row bind : axis = 0, default
df_12_axis0

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5


In [48]:
df_13_axis0 = pd.concat([df_1, df_2], axis=1) # column bind
df_13_axis0

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,,,,,A3,B3,C3,D3
4,,,,,A4,B4,C4,D4
5,,,,,A5,B5,C5,D5


In [45]:
df_3 = pd.DataFrame({'E': ['A6', 'A7', 'A8'],
    'F': ['B6', 'B7', 'B8'],
    'G': ['C6', 'C7', 'C8'],
    'H': ['D6', 'D7', 'D8']},
    index=[0, 1, 2])
df_3

Unnamed: 0,E,F,G,H
0,A6,B6,C6,D6
1,A7,B7,C7,D7
2,A8,B8,C8,D8


In [46]:
df_13_axis1 = pd.concat([df_1, df_3], axis=1) # column bind
df_13_axis1

Unnamed: 0,A,B,C,D,E,F,G,H
0,A0,B0,C0,D0,A6,B6,C6,D6
1,A1,B1,C1,D1,A7,B7,C7,D7
2,A2,B2,C2,D2,A8,B8,C8,D8


In [49]:
df_4 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],

     'B': ['B0', 'B1', 'B2'],

     'C': ['C0', 'C1', 'C2'],

     'E': ['E0', 'E1', 'E2']},

     index=[0, 1, 3])
df_14_outer = pd.concat([df_1, df_4], join='outer') # union, default
df_14_outer

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,A,B,C,D,E
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
0,A0,B0,C0,,E0
1,A1,B1,C1,,E1
3,A2,B2,C2,,E2


In [51]:
df_14_inner = pd.concat([df_1, df_4], join='inner') # union, default
df_14_inner

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
0,A0,B0,C0
1,A1,B1,C1
3,A2,B2,C2


In [52]:
#  (3) axis=1일 경우 특정 DataFrame의 index를 그대로 이용하고자 할 경우 : join_axes
df_14_outer_axis1 = pd.concat([df_1, df_4], join='outer', axis=1) # default
df_14_outer_axis1

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,E
0,A0,B0,C0,D0,A0,B0,C0,E0
1,A1,B1,C1,D1,A1,B1,C1,E1
2,A2,B2,C2,D2,,,,
3,,,,,A2,B2,C2,E2


In [53]:
df_14_inner_axis1 = pd.concat([df_1, df_4], join='inner', axis=1)
df_14_inner_axis1

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,E
0,A0,B0,C0,D0,A0,B0,C0,E0
1,A1,B1,C1,D1,A1,B1,C1,E1


In [54]:
df_14_axis1_reindex = pd.concat([df_1, df_4], axis=1).reindex(df_1.index)
df_14_axis1_reindex

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,E
0,A0,B0,C0,D0,A0,B0,C0,E0
1,A1,B1,C1,D1,A1,B1,C1,E1
2,A2,B2,C2,D2,,,,


In [None]:
#  (4) 기존 index를 무시하고 싶을 때 : ignore_index

In [6]:
df_5 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
     'B': ['B0', 'B1', 'B2'],
     'C': ['C0', 'C1', 'C2'],
     'D': ['D0', 'D1', 'D2']},
     index=['r0', 'r1', 'r2'])
df_5

Unnamed: 0,A,B,C,D
r0,A0,B0,C0,D0
r1,A1,B1,C1,D1
r2,A2,B2,C2,D2


In [7]:
df_6 = pd.DataFrame({'A': ['A3', 'A4', 'A5'],
     'B': ['B3', 'B4', 'B5'],
     'C': ['C3', 'C4', 'C5'],
     'D': ['D3', 'D4', 'D5']},
    index=['r3', 'r4', 'r5'])
df_6

Unnamed: 0,A,B,C,D
r3,A3,B3,C3,D3
r4,A4,B4,C4,D4
r5,A5,B5,C5,D5


In [8]:
df_56_with_index = pd.concat([df_5, df_6], ignore_index=False) # default
df_56_with_index

Unnamed: 0,A,B,C,D
r0,A0,B0,C0,D0
r1,A1,B1,C1,D1
r2,A2,B2,C2,D2
r3,A3,B3,C3,D3
r4,A4,B4,C4,D4
r5,A5,B5,C5,D5


In [18]:
df_56_ignore_index = pd.concat([df_5, df_6], ignore_index=True)# index 0~(n-1)
df_56_ignore_index.loc[:1]

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


In [10]:
df_56_with_keys = pd.concat([df_5, df_6], keys=['df_5', 'df_6'])
df_56_with_keys

Unnamed: 0,Unnamed: 1,A,B,C,D
df_5,r0,A0,B0,C0,D0
df_5,r1,A1,B1,C1,D1
df_5,r2,A2,B2,C2,D2
df_6,r3,A3,B3,C3,D3
df_6,r4,A4,B4,C4,D4
df_6,r5,A5,B5,C5,D5


In [19]:
df_56_with_keys.loc['df_5']

Unnamed: 0,A,B,C,D
r0,A0,B0,C0,D0
r1,A1,B1,C1,D1
r2,A2,B2,C2,D2


In [20]:
df_56_with_keys.loc['df_6'][0:2]

Unnamed: 0,A,B,C,D
r3,A3,B3,C3,D3
r4,A4,B4,C4,D4


In [21]:
df_56_with_name = pd.concat([df_5, df_6],
     keys=['df_5', 'df_6'],
     names=['df_name', 'row_number'])
df_56_with_name

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
df_name,row_number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df_5,r0,A0,B0,C0,D0
df_5,r1,A1,B1,C1,D1
df_5,r2,A2,B2,C2,D2
df_6,r3,A3,B3,C3,D3
df_6,r4,A4,B4,C4,D4
df_6,r5,A5,B5,C5,D5


In [24]:
df_7 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
     'B': ['B0', 'B1', 'B2'],
     'C': ['C0', 'C1', 'C2'],
     'D': ['D0', 'D1', 'D2']},
     index=['r0', 'r1', 'r2'])
df_8 = pd.DataFrame({'A': ['A2', 'A3', 'A4'],
     'B': ['B2', 'B3', 'B4'],
     'C': ['C2', 'C3', 'C4'],
     'D': ['D2', 'D3', 'D4']},
     index=['r2', 'r3', 'r4'])

In [25]:
df_78_F_verify_integrity = pd.concat([df_7, df_8],
     verify_integrity=False) # default
df_78_F_verify_integrity

Unnamed: 0,A,B,C,D
r0,A0,B0,C0,D0
r1,A1,B1,C1,D1
r2,A2,B2,C2,D2
r2,A2,B2,C2,D2
r3,A3,B3,C3,D3
r4,A4,B4,C4,D4


In [26]:
df_78_ignore_index = pd.concat([df_7, df_8], ignore_index=True)# index 0~(n-1)
df_78_ignore_index

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A2,B2,C2,D2
4,A3,B3,C3,D3
5,A4,B4,C4,D4
