In [24]:
import numpy as np
import pandas as pd

# Five-row demo frame labelled r0..r4: 'c1' counts 0..4, 'c2' holds fresh
# standard-normal draws (no seed is set, so 'c2' changes on every run).
df = pd.DataFrame(
    data={
        'c1': np.arange(5),
        'c2': np.random.randn(5),
    },
    index=['r' + str(i) for i in range(5)],
)
df

Unnamed: 0,c1,c2
r0,0,-0.712064
r1,1,-0.350082
r2,2,0.912927
r3,3,-0.214117
r4,4,-0.976888


### 행 인덱스 재설정을 통해 행 제거

In [25]:
# Keep only the listed row labels; labels left out of the list are dropped.
df = df.reindex(index=['r0', 'r1', 'r2'])
df

Unnamed: 0,c1,c2
r0,0,-0.712064
r1,1,-0.350082
r2,2,0.912927


In [21]:
# Bring back the labels r3/r4; every cell they introduce is filled with 0.
# The result is displayed but NOT assigned back to `df`: `df` keeps its three
# rows, so the following reindex with fill_value='missing' still has rows to fill.
df.reindex(['r0', 'r1', 'r2', 'r3', 'r4'], fill_value=0)

Unnamed: 0,c1,c2
r0,0,-0.705095
r1,1,-0.147778
r2,2,0.151457
r3,0,0.0
r4,0,0.0


In [26]:
# Row labels absent from `df` are created with the string 'missing' in every column.
df.reindex(index=['r0', 'r1', 'r2', 'r3', 'r4'], fill_value='missing')

Unnamed: 0,c1,c2
r0,0,-0.712064
r1,1,-0.350082
r2,2,0.912927
r3,missing,missing
r4,missing,missing


### concat()

In [144]:
# Fixture frames for the concat / merge examples that follow.

# df1 / df2: same columns A-D, integer row labels 0-2 and 3-5.
df1 = pd.DataFrame(
    data={'A': ['A0', 'A1', 'A2'],
          'B': ['B0', 'B1', 'B2'],
          'C': ['C0', 'C1', 'C2'],
          'D': ['D0', 'D1', 'D2']},
    index=[0, 1, 2],
)
df2 = pd.DataFrame(
    data={'A': ['A3', 'A4', 'A5'],
          'B': ['B3', 'B4', 'B5'],
          'C': ['C3', 'C4', 'C5'],
          'D': ['D3', 'D4', 'D5']},
    index=[3, 4, 5],
)

# df3: completely different column names (E-H), row labels 0-2.
df3 = pd.DataFrame(
    data={'E': ['A6', 'A7', 'A8'],
          'F': ['B6', 'B7', 'B8'],
          'G': ['C6', 'C7', 'C8'],
          'H': ['D6', 'D7', 'D8']},
    index=[0, 1, 2],
)

# df4: columns A, B, C plus E (no D); row labels 0, 1, 3.
df4 = pd.DataFrame(
    data={'A': ['A0', 'A1', 'A2'],
          'B': ['B0', 'B1', 'B2'],
          'C': ['C0', 'C1', 'C2'],
          'E': ['E0', 'E1', 'E2']},
    index=[0, 1, 3],
)

# df5 / df6: columns A-D with string row labels r0-r2 and r3-r5.
df5 = pd.DataFrame(
    data={'A': ['A0', 'A1', 'A2'],
          'B': ['B0', 'B1', 'B2'],
          'C': ['C0', 'C1', 'C2'],
          'D': ['D0', 'D1', 'D2']},
    index=['r0', 'r1', 'r2'],
)
df6 = pd.DataFrame(
    data={'A': ['A3', 'A4', 'A5'],
          'B': ['B3', 'B4', 'B5'],
          'C': ['C3', 'C4', 'C5'],
          'D': ['D3', 'D4', 'D5']},
    index=['r3', 'r4', 'r5'],
)

# df7 / df8: columns A-D; their row labels overlap on r2.
df7 = pd.DataFrame(
    data={'A': ['A0', 'A1', 'A2'],
          'B': ['B0', 'B1', 'B2'],
          'C': ['C0', 'C1', 'C2'],
          'D': ['D0', 'D1', 'D2']},
    index=['r0', 'r1', 'r2'],
)
df8 = pd.DataFrame(
    data={'A': ['A2', 'A3', 'A4'],
          'B': ['B2', 'B3', 'B4'],
          'C': ['C2', 'C3', 'C4'],
          'D': ['D2', 'D3', 'D4']},
    index=['r2', 'r3', 'r4'],
)

# dfleft / dfright: share only the KEY column; KEY values overlap on K2 and K3.
dfleft = pd.DataFrame(
    data={'KEY': ['K0', 'K1', 'K2', 'K3'],
          'A': ['A0', 'A1', 'A2', 'A3'],
          'B': ['B0', 'B1', 'B2', 'B3']},
)
dfright = pd.DataFrame(
    data={'KEY': ['K2', 'K3', 'K4', 'K5'],
          'C': ['C2', 'C3', 'C4', 'C5'],
          'D': ['D2', 'D3', 'D4', 'D5']},
)

# dfleft2 / dfright_2: same KEY values; columns B and C exist in both frames.
dfleft2 = pd.DataFrame(
    data={'KEY': ['K0', 'K1', 'K2', 'K3'],
          'A': ['A0', 'A1', 'A2', 'A3'],
          'B': ['B0', 'B1', 'B2', 'B3'],
          'C': ['C0', 'C1', 'C2', 'C3']},
)
dfright_2 = pd.DataFrame(
    data={'KEY': ['K0', 'K1', 'K2', 'K3'],
          'B': ['B0_2', 'B1_2', 'B2_2', 'B3_2'],
          'C': ['C0_2', 'C1_2', 'C2_2', 'C3_2'],
          'D': ['D0_2', 'D1_2', 'D2_2', 'D3_3']},
)

In [33]:
# Stack rows; the outer join keeps the union of columns (gaps become NaN).
pd.concat(objs=[df1, df4], axis='index', join='outer')

Unnamed: 0,A,B,C,D,E
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
0,A0,B0,C0,,E0
1,A1,B1,C1,,E1
3,A2,B2,C2,,E2


In [34]:
# Place side by side; the outer join keeps the union of row labels.
pd.concat(objs=[df1, df4], axis='columns', join='outer')

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,E
0,A0,B0,C0,D0,A0,B0,C0,E0
1,A1,B1,C1,D1,A1,B1,C1,E1
2,A2,B2,C2,D2,,,,
3,,,,,A2,B2,C2,E2


In [36]:
# Stack rows; the inner join keeps only the columns present in both frames.
pd.concat(objs=[df1, df4], axis='index', join='inner')

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
0,A0,B0,C0
1,A1,B1,C1
3,A2,B2,C2


In [35]:
# Place side by side; the inner join keeps only the row labels present in both frames.
pd.concat(objs=[df1, df4], axis='columns', join='inner')

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,E
0,A0,B0,C0,D0,A0,B0,C0,E0
1,A1,B1,C1,D1,A1,B1,C1,E1


In [37]:
# Row-wise concatenation; the original row labels r0..r5 are kept.
pd.concat(objs=[df5, df6], axis='index')

Unnamed: 0,A,B,C,D
r0,A0,B0,C0,D0
r1,A1,B1,C1,D1
r2,A2,B2,C2,D2
r3,A3,B3,C3,D3
r4,A4,B4,C4,D4
r5,A5,B5,C5,D5


In [38]:
# Row-wise concatenation that discards the row labels and renumbers from 0.
pd.concat(objs=[df5, df6], axis='index', ignore_index=True)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5


### 계층구조

In [40]:
# Passing a mapping uses its keys as the outer level of a two-level row index,
# so each row remembers which source frame it came from.
df56_with_keys = pd.concat({'df5': df5, 'df6': df6})
df56_with_keys

Unnamed: 0,Unnamed: 1,A,B,C,D
df5,r0,A0,B0,C0,D0
df5,r1,A1,B1,C1,D1
df5,r2,A2,B2,C2,D2
df6,r3,A3,B3,C3,D3
df6,r4,A4,B4,C4,D4
df6,r5,A5,B5,C5,D5


In [42]:
# Select by the outer index level: returns the rows that came from df5.
df56_with_keys.loc['df5']

Unnamed: 0,A,B,C,D
r0,A0,B0,C0,D0
r1,A1,B1,C1,D1
r2,A2,B2,C2,D2


In [52]:
# Drill down one more level: row 'r1' of the 'df5' group, returned as a Series.
df56_with_keys.loc['df5'].loc['r1']

A    A1
B    B1
C    C1
D    D1
Name: r1, dtype: object

In [54]:
# Two spellings of the same scalar lookup (row 'r1', column 'A' of the 'df5' group).
# A notebook cell only displays its last expression, so the first value is discarded.
df56_with_keys.loc['df5'].loc['r1', 'A']
df56_with_keys.loc['df5'].loc['r1']['A']

'A1'

In [56]:
# Same keyed concatenation, additionally naming the two row-index levels.
pd.concat({'df5': df5, 'df6': df6}, names=['dfName', 'rowNum'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
dfName,rowNum,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df5,r0,A0,B0,C0,D0
df5,r1,A1,B1,C1,D1
df5,r2,A2,B2,C2,D2
df6,r3,A3,B3,C3,D3
df6,r4,A4,B4,C4,D4
df6,r5,A5,B5,C5,D5


### Series Concat

In [59]:
# A Series with the default integer index and no name ...
# (not displayed: a notebook cell only shows its last expression)
pd.Series(['S1', 'S2', 'S3'])
# ... and the same values with an explicit name and index labels.
pd.Series(['S1', 'S2', 'S3'], name='S', index=['c', 'd', 'e'])

c    S1
d    S2
e    S3
Name: S, dtype: object

In [61]:
# A named Series to combine with df1.
s1 = pd.Series(data=['S1', 'S2', 'S3'], name='S')
# Row-wise concatenation of a frame and a Series.
pd.concat(objs=[df1, s1], axis='index')

Unnamed: 0,A,B,C,D,0
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
0,,,,,S1
1,,,,,S2
2,,,,,S3


In [62]:
# Column-wise: the Series becomes a new column.
pd.concat(objs=[df1, s1], axis='columns')

Unnamed: 0,A,B,C,D,S
0,A0,B0,C0,D0,S1
1,A1,B1,C1,D1,S2
2,A2,B2,C2,D2,S3


In [63]:
# Column-wise with ignore_index=True: the labels along the concatenation axis
# (here the columns) are discarded.
pd.concat(objs=[df1, s1], axis='columns', ignore_index=True)

Unnamed: 0,0,1,2,3,4
0,A0,B0,C0,D0,S1
1,A1,B1,C1,D1,S2
2,A2,B2,C2,D2,S3


In [65]:
# One named string Series and two unnamed integer Series, all on the default index.
s1 = pd.Series(data=['S1', 'S2', 'S3'], name='S')
s2 = pd.Series(data=[0, 1, 2])
s3 = pd.Series(data=[3, 4, 5])

In [66]:
# Two Series side by side, one column per Series.
pd.concat(objs=[s1, s2], axis='columns')

Unnamed: 0,S,0
0,S1,0
1,S2,1
2,S3,2


In [71]:
# Concatenating Series along the columns produces a DataFrame, not a Series.
s123 = pd.concat(objs=[s1, s2, s3], axis='columns')
type(s123)

pandas.core.frame.DataFrame

In [74]:
# The mapping keys become the column labels of the result.
pd.concat({'c0': s1, 'c1': s2, 'c2': s3}, axis='columns')

Unnamed: 0,c0,c1,c2
0,S1,0,3
1,S2,1,4
2,S3,2,5


### append

In [75]:
# Show df1 before a row is added to it.
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2


In [80]:
# One new row of data: the Series index matches df1's column labels A-D.
s4 = pd.Series(['S1', 'S2', 'S3', 'S4'], index=['A', 'B', 'C', 'D'])
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# Equivalent: turn the Series into a one-row frame and concatenate it;
# ignore_index=True renumbers the rows 0..n-1.
pd.concat([df1, s4.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,S1,S2,S3,S4


In [81]:
# The integers 0..11 arranged as 3 rows x 4 columns.
arr = np.arange(12).reshape(3, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [84]:
# Rebind df1 to a small numeric frame: 0..5 laid out 3x2, rows a-c.
df1 = pd.DataFrame(
    data=np.arange(6).reshape(3, 2),
    index=list('abc'),
    columns=['one', 'two'],
)
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [86]:
# Rebind df2 to a 2x2 frame holding 5..8; it has rows a and c only (no b).
df2 = pd.DataFrame(
    data=np.arange(5, 9).reshape(2, 2),
    index=['a', 'c'],
    columns=['three', 'four'],
)
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [89]:
# Side by side, aligned on row labels; row 'b' is missing from df2 and gets NaN there.
pd.concat(objs=[df1, df2], axis='columns')

Unnamed: 0,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [88]:
# With axis='columns' the keys become the outer level of a two-level column index.
pd.concat(objs=[df1, df2], axis='columns', keys=['first', 'second'])

Unnamed: 0_level_0,first,first,second,second
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [92]:
# Same result from a mapping: its keys are used as the outer column level.
df3 = pd.concat(
    {
        'first': df1,
        'second': df2,
    },
    axis='columns',
)
df3

Unnamed: 0_level_0,first,first,second,second
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [94]:
# Pick the 'first' column group, then the cell at row 'a', column 'one'.
df3['first'].loc['a', 'one']

0

In [106]:
# Float Series with gaps, indexed f..a.
a = pd.Series(
    data=[np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
    index=list('fedcba'),
)

# 0.0 .. 5.0 on the same index labels.
b = pd.Series(
    data=np.arange(len(a), dtype=np.float64),
    index=list('fedcba'),
)

# Element-wise choice: b where a is missing, otherwise a + b (returns a NumPy array).
np.where(a.isnull(), b, a + b)

array([0. , 3.5, 2. , 6.5, 8.5, 5. ])

### merge
Defaults: `how='inner'`; `on` = the intersection of the column names in both DataFrames.

#### columns

In [155]:
# Plain-text dump of the right-hand frame used in the merges below.
print(dfright)

  KEY   C   D
0  K2  C2  D2
1  K3  C3  D3
2  K4  C4  D4
3  K5  C5  D5


In [108]:
# No options given: inner join on the column names the two frames share.
pd.merge(left=dfleft, right=dfright)

Unnamed: 0,KEY,A,B,C,D
0,K2,A2,B2,C2,D2
1,K3,A3,B3,C3,D3


In [109]:
# Inner join spelled out explicitly: only keys present in both frames survive.
pd.merge(left=dfleft, right=dfright, how='inner')

Unnamed: 0,KEY,A,B,C,D
0,K2,A2,B2,C2,D2
1,K3,A3,B3,C3,D3


In [110]:
# Outer join: keys from either frame are kept; unmatched cells become NaN.
pd.merge(left=dfleft, right=dfright, how='outer')

Unnamed: 0,KEY,A,B,C,D
0,K0,A0,B0,,
1,K1,A1,B1,,
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3
4,K4,,,C4,D4
5,K5,,,C5,D5


In [139]:
# Pitfall demo: with no `on=`, pd.merge joins on EVERY column name the frames share.
# Both frames below have 'KEY' and 'A', and no row agrees on both, so the default
# inner join comes back empty.
dfleft = pd.DataFrame({'KEY': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']})


# Bound to its own name instead of `dfright`: the merge cells that follow use the
# shared `dfright` (columns KEY/C/D) and would give different results if it were
# overwritten with this frame.
dfright_dup_a = pd.DataFrame({'KEY': ['K2', 'K3', 'K4', 'K5'],
    'A': ['C2', 'C3', 'C4', 'C5'],
    'D': ['D2', 'D3', 'D4', 'D5']})

pd.merge(dfleft, dfright_dup_a)

Unnamed: 0,KEY,A,B,D


In [147]:
pd.merge(dfleft, dfright, how='inner', on='KEY') # how='inner', on='KEY' can be omitted (they match the defaults when KEY is the only shared column)

Unnamed: 0,KEY,A,B,C,D
0,K2,A2,B2,C2,D2
1,K3,A3,B3,C3,D3


In [148]:
# Right join: every KEY of dfright is kept; columns coming from dfleft are NaN
# where it has no matching KEY.
# The misspelled keyword `indictator=True` was dropped: pd.merge rejects unknown
# keywords with a TypeError. The real `indicator` option is shown in the next section.
pd.merge(dfleft, dfright, how='right')

Unnamed: 0,KEY,A,B,C,D
0,K2,A2,B2,C2,D2
1,K3,A3,B3,C3,D3
2,K4,,,C4,D4
3,K5,,,C5,D5


#### indicator

In [151]:
# indicator=True adds a `_merge` column saying where each row came from.
pd.merge(left=dfleft, right=dfright, how='right', indicator=True)

Unnamed: 0,KEY,A,B,C,D,_merge
0,K2,A2,B2,C2,D2,both
1,K3,A3,B3,C3,D3,both
2,K4,,,C4,D4,right_only
3,K5,,,C5,D5,right_only


#### index

In [156]:
# Frames whose join keys live in the row index instead of a KEY column.
# Note the naming: `dfright` holds columns A/B, `dfleft` holds columns C/D.
dfright = pd.DataFrame(
    data={'A': ['A0', 'A1', 'A2', 'A3'],
          'B': ['B0', 'B1', 'B2', 'B3']},
    index=['K0', 'K1', 'K2', 'K3'],
)

dfleft = pd.DataFrame(
    data={'C': ['C0', 'C1', 'C2', 'C3'],
          'D': ['D0', 'D1', 'D2', 'D3']},
    index=['K2', 'K3', 'K4', 'K5'],
)

In [202]:
### Merging DataFrames on their index ######

In [205]:
# Index-keyed frames for the index-based merges; their labels overlap on K2 and K3.
dfright = pd.DataFrame(
    data={'A': ['A0', 'A1', 'A2', 'A3'],
          'B': ['B0', 'B1', 'B2', 'B3']},
    index=['K0', 'K1', 'K2', 'K3'],
)
dfleft = pd.DataFrame(
    data={'C': ['C0', 'C1', 'C2', 'C3'],
          'D': ['D0', 'D1', 'D2', 'D3']},
    index=['K2', 'K3', 'K4', 'K5'],
)
# Plain-text dump of both frames, right first.
print(dfright)
print(dfleft)

     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
K3  A3  B3
     C   D
K2  C0  D0
K3  C1  D1
K4  C2  D2
K5  C3  D3


In [208]:
# Join on the row index of both frames (default inner join).
pd.merge(left=dfleft, right=dfright, left_index=True, right_index=True)

Unnamed: 0,C,D,A,B
K2,C0,D0,A2,B2
K3,C1,D1,A3,B3


In [209]:
# Index-based left join: every row label of dfleft is kept.
pd.merge(left=dfleft, right=dfright, how='left', left_index=True, right_index=True)

Unnamed: 0,C,D,A,B
K2,C0,D0,A2,B2
K3,C1,D1,A3,B3
K4,C2,D2,,
K5,C3,D3,,


In [210]:
# Index-based right join: every row label of dfright is kept.
pd.merge(left=dfleft, right=dfright, how='right', left_index=True, right_index=True)

Unnamed: 0,C,D,A,B
K0,,,A0,B0
K1,,,A1,B1
K2,C0,D0,A2,B2
K3,C1,D1,A3,B3


In [211]:
# Show the left frame (columns C/D, row labels K2..K5).
dfleft

Unnamed: 0,C,D
K2,C0,D0
K3,C1,D1
K4,C2,D2
K5,C3,D3


In [212]:
# Show the right frame (columns A/B, row labels K0..K3).
dfright

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,B3


In [213]:
# DataFrame.join matches on the row index by default, so this is the same right join as:
#   pd.merge(dfleft, dfright, left_index=True, right_index=True, how='right')

dfleft.join(dfright, how='right')

Unnamed: 0,C,D,A,B
K0,,,A0,B0
K1,,,A1,B1
K2,C0,D0,A2,B2
K3,C1,D1,A3,B3


In [214]:
# Scratch note: a pasted copy of the right-join result, kept as a bare string
# literal. The cell has no effect other than echoing the string.
"""
C	D	A	B
K0	NaN	NaN	A0	B0
K1	NaN	NaN	A1	B1
K2	C0	D0	A2	B2
K3	C1	D1	A3	B3
"""



'\nC\tD\tA\tB\nK0\tNaN\tNaN\tA0\tB0\nK1\tNaN\tNaN\tA1\tB1\nK2\tC0\tD0\tA2\tB2\nK3\tC1\tD1\tA3\tB3\n'

In [217]:
# Index-based inner join, with how='inner' spelled out.
pd.merge(left=dfleft, right=dfright, how='inner', left_index=True, right_index=True)

Unnamed: 0,C,D,A,B
K2,C0,D0,A2,B2
K3,C1,D1,A3,B3


In [218]:
# Index join keeping only the row labels found in both frames.
dfleft.join(other=dfright, how='inner')

Unnamed: 0,C,D,A,B
K2,C0,D0,A2,B2
K3,C1,D1,A3,B3


In [219]:
# Index join keeping the row labels of either frame.
dfleft.join(other=dfright, how='outer')

Unnamed: 0,C,D,A,B
K0,,,A0,B0
K1,,,A1,B1
K2,C0,D0,A2,B2
K3,C1,D1,A3,B3
K4,C2,D2,,
K5,C3,D3,,


In [220]:
### Handling missing values? ###

In [223]:
# Left frame for the missing-value examples: string columns KEY/A, float column B.
dfleft = pd.DataFrame(
    data={
        'KEY': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': [0.5, 2.1, 3.1, 0.3],
    }
)
dfleft

Unnamed: 0,KEY,A,B
0,K0,A0,0.5
1,K1,A1,2.1
2,K2,A2,3.1
3,K3,A3,0.3


In [224]:
# Right frame for the missing-value examples; its KEY values overlap dfleft on K2 and K3.
dfright = pd.DataFrame(
    data={
        'KEY': ['K2', 'K3', 'K4', 'K5'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D2', 'D3', 'D4', 'D5'],
    }
)
dfright

Unnamed: 0,KEY,C,D
0,K2,C0,D2
1,K3,C1,D3
2,K4,C2,D4
3,K5,C3,D5


In [227]:
# Defaults apply here: how='inner', on='KEY'.
pd.merge(left=dfleft, right=dfright)

Unnamed: 0,KEY,A,B,C,D
0,K2,A2,3.1,C0,D2
1,K3,A3,0.3,C1,D3


In [229]:
# Outer join keeps every KEY from both sides; unmatched cells are NaN, which
# gives a frame with missing values to work on.
dfall = pd.merge(left=dfleft, right=dfright, how='outer')
dfall

Unnamed: 0,KEY,A,B,C,D
0,K0,A0,0.5,,
1,K1,A1,2.1,,
2,K2,A2,3.1,C0,D2
3,K3,A3,0.3,C1,D3
4,K4,,,C2,D4
5,K5,,,C3,D5


In [231]:
pd.isnull(dfall) # same as dfall.isnull()

Unnamed: 0,KEY,A,B,C,D
0,False,False,False,True,True
1,False,False,False,True,True
2,False,False,False,False,False
3,False,False,False,False,False
4,False,True,True,False,False
5,False,True,True,False,False


In [233]:
pd.notnull(dfall) # same as dfall.notnull()

Unnamed: 0,KEY,A,B,C,D
0,True,True,True,False,False
1,True,True,True,False,False
2,True,True,True,True,True
3,True,True,True,True,True
4,True,False,False,True,True
5,True,False,False,True,True


In [243]:
# Inspect dfall with positional (.iloc) and label-based (.loc) selection.
print(dfall)
print(dfall.iloc[0])
print(dfall.iloc[0].loc['A'])
# Task: store None in columns A and B of the rows labelled 0 and 1.
# (The prints below only look at the current values; the assignment is in the next cell.)
print(dfall.loc[0])
print("="*50)

print(dfall.loc[[0,1],['A','B']])

  KEY    A    B    C    D
0  K0   A0  0.5  NaN  NaN
1  K1   A1  2.1  NaN  NaN
2  K2   A2  3.1   C0   D2
3  K3   A3  0.3   C1   D3
4  K4  NaN  NaN   C2   D4
5  K5  NaN  NaN   C3   D5
KEY     K0
A       A0
B      0.5
C      NaN
D      NaN
Name: 0, dtype: object
A0
KEY     K0
A       A0
B      0.5
C      NaN
D      NaN
Name: 0, dtype: object
    A    B
0  A0  0.5
1  A1  2.1


In [248]:
# Overwrite columns A and B of the rows labelled 0 and 1 with None (modifies dfall in place).
dfall.loc[[0,1],['A','B']]=None
# Not displayed: a notebook cell only shows its last expression.
dfall
# Task: show the null mask (isnull()) for columns A and B of dfall only.
dfall[['A','B']].isnull()

Unnamed: 0,A,B
0,True,True
1,True,True
2,False,False
3,False,False
4,True,True
5,True,True


In [255]:
# Per-row count of null cells. Not displayed: only the last expression of a cell is shown.
dfall.isnull().sum(axis=1)
# Full boolean null mask of dfall.
dfall.isnull()

Unnamed: 0,KEY,A,B,C,D
0,False,True,True,True,True
1,False,True,True,True,True
2,False,False,False,False,False
3,False,False,False,False,False
4,False,True,True,False,False
5,False,True,True,False,False


In [256]:
dfall
# Task: add a NaNCnt column and a NotNulCnt column to dfall.
# NaNCnt: number of NaN values in the row; NotNulCnt: number of non-null values in the row.

Unnamed: 0,KEY,A,B,C,D
0,K0,,,,
1,K1,,,,
2,K2,A2,3.1,C0,D2
3,K3,A3,0.3,C1,D3
4,K4,,,C2,D4
5,K5,,,C3,D5


In [259]:
# Summing the boolean null mask across the columns gives each row's count of missing cells.
dfall.isnull().sum(axis='columns')

0    4
1    4
2    0
3    0
4    2
5    2
dtype: int64

In [260]:
# Store the per-row count of missing cells as a new 'NanCnt' column.
dfall['NanCnt'] = dfall.isnull().sum(axis='columns')
dfall

Unnamed: 0,KEY,A,B,C,D,NanCnt
0,K0,,,,,4
1,K1,,,,,4
2,K2,A2,3.1,C0,D2,0
3,K3,A3,0.3,C1,D3,0
4,K4,,,C2,D4,2
5,K5,,,C3,D5,2
