In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [38]:
# Six consecutive calendar days starting 2017-01-01; used as the row index below.
dates = pd.date_range(start='2017-01-01', periods=6, freq='D')

In [39]:
# Display the DatetimeIndex held in `dates`.
dates

DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06'],
              dtype='datetime64[ns]', freq='D')

In [40]:
# 6x4 frame of standard-normal draws: one row per entry in `dates`, columns A-D.
# NOTE(review): no seed is set in this cell, so the values change on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

In [41]:
# Display the whole frame.
df

Unnamed: 0,A,B,C,D
2017-01-01,0.657101,1.967444,0.021556,-0.400942
2017-01-02,0.533256,1.166478,1.196742,1.281902
2017-01-03,0.628022,-0.231812,0.86672,0.108487
2017-01-04,-0.768522,-2.192901,-1.216873,-0.009265
2017-01-05,0.400187,0.873615,0.873664,-0.338175
2017-01-06,0.153867,-1.075675,-0.293745,0.954509


In [42]:
# First row only.
df.head(1)

Unnamed: 0,A,B,C,D
2017-01-01,0.657101,1.967444,0.021556,-0.400942


In [43]:
# Last two rows.
df.tail(2)

Unnamed: 0,A,B,C,D
2017-01-05,0.400187,0.873615,0.873664,-0.338175
2017-01-06,0.153867,-1.075675,-0.293745,0.954509


In [44]:
# Per-column summary: count, mean, std, min, quartiles and max.
df.describe() # view summary statistics

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.267318,0.084525,0.241344,0.266086
std,0.539676,1.547351,0.912659,0.695201
min,-0.768522,-2.192901,-1.216873,-0.400942
25%,0.215447,-0.864709,-0.21492,-0.255947
50%,0.466721,0.320902,0.444138,0.049611
75%,0.60433,1.093262,0.871928,0.743004
max,0.657101,1.967444,1.196742,1.281902


In [45]:
# Swap rows and columns: the dates become the column labels.
df.T # transpose

Unnamed: 0,2017-01-01,2017-01-02,2017-01-03,2017-01-04,2017-01-05,2017-01-06
A,0.657101,0.533256,0.628022,-0.768522,0.400187,0.153867
B,1.967444,1.166478,-0.231812,-2.192901,0.873615,-1.075675
C,0.021556,1.196742,0.86672,-1.216873,0.873664,-0.293745
D,-0.400942,1.281902,0.108487,-0.009265,-0.338175,0.954509


In [46]:
# Sort by the row index labels (the dates) in descending order; returns a new frame.
df.sort_index(axis=0, ascending = False) # axis=0 means 'row'

Unnamed: 0,A,B,C,D
2017-01-06,0.153867,-1.075675,-0.293745,0.954509
2017-01-05,0.400187,0.873615,0.873664,-0.338175
2017-01-04,-0.768522,-2.192901,-1.216873,-0.009265
2017-01-03,0.628022,-0.231812,0.86672,0.108487
2017-01-02,0.533256,1.166478,1.196742,1.281902
2017-01-01,0.657101,1.967444,0.021556,-0.400942


In [47]:
# Sort by the column labels in descending order (D, C, B, A); returns a new frame.
df.sort_index(axis=1, ascending = False) # axis=1 means 'column'

Unnamed: 0,D,C,B,A
2017-01-01,-0.400942,0.021556,1.967444,0.657101
2017-01-02,1.281902,1.196742,1.166478,0.533256
2017-01-03,0.108487,0.86672,-0.231812,0.628022
2017-01-04,-0.009265,-1.216873,-2.192901,-0.768522
2017-01-05,-0.338175,0.873664,0.873615,0.400187
2017-01-06,0.954509,-0.293745,-1.075675,0.153867


In [48]:
# Slicing the frame directly selects rows: here the first three.
df[:3]

Unnamed: 0,A,B,C,D
2017-01-01,0.657101,1.967444,0.021556,-0.400942
2017-01-02,0.533256,1.166478,1.196742,1.281902
2017-01-03,0.628022,-0.231812,0.86672,0.108487


In [49]:
# Label-based row lookup, written two ways (date string vs. the index entry itself).
# Only the last expression of a cell is displayed, so the output comes from the
# second line; the first line's result is discarded.
df.loc['2017-01-01'] # label-based indexing
df.loc[dates[0]]

A    0.657101
B    1.967444
C    0.021556
D   -0.400942
Name: 2017-01-01 00:00:00, dtype: float64

In [50]:
# All rows, columns A through C; a label slice includes its end label.
df.loc[:, 'A':'C'] 

Unnamed: 0,A,B,C
2017-01-01,0.657101,1.967444,0.021556
2017-01-02,0.533256,1.166478,1.196742
2017-01-03,0.628022,-0.231812,0.86672
2017-01-04,-0.768522,-2.192901,-1.216873
2017-01-05,0.400187,0.873615,0.873664
2017-01-06,0.153867,-1.075675,-0.293745


In [51]:
# All rows, only the explicitly listed columns A and C.
df.loc[:, ['A', 'C']]

Unnamed: 0,A,C
2017-01-01,0.657101,0.021556
2017-01-02,0.533256,1.196742
2017-01-03,0.628022,0.86672
2017-01-04,-0.768522,-1.216873
2017-01-05,0.400187,0.873664
2017-01-06,0.153867,-0.293745


In [52]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3] # integer-position-based indexing

A   -0.768522
B   -2.192901
C   -1.216873
D   -0.009265
Name: 2017-01-04 00:00:00, dtype: float64

In [53]:
# Positional slices exclude the stop: rows at positions 3-4, columns at positions 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2017-01-04,-0.768522,-2.192901
2017-01-05,0.400187,0.873615


In [54]:
# Boolean-mask the frame: entries that are not greater than 0 become NaN.
df_filter = df[df>0] # filter that keeps only values greater than 0
# The result of fillna is displayed but not stored; the masked cells show the string 'zero'.
df_filter.fillna('zero') # values removed by the filter are NaN; replace them with 'zero'

Unnamed: 0,A,B,C,D
2017-01-01,0.657101,1.967444,0.021556,zero
2017-01-02,0.533256,1.166478,1.196742,1.281902
2017-01-03,0.628022,zero,0.86672,0.108487
2017-01-04,zero,zero,zero,zero
2017-01-05,0.400187,0.873615,0.873664,zero
2017-01-06,0.153867,zero,zero,0.954509


In [56]:
# Work on a copy so the new column is added to df_copy rather than to df.
df_copy = df.copy()
# One list entry per row; np.nan marks the rows with no value.
df_copy['copy'] = ['one', 'two', 'three', np.nan, np.nan, 'six'] # add a column
df_copy

Unnamed: 0,A,B,C,D,copy
2017-01-01,0.657101,1.967444,0.021556,-0.400942,one
2017-01-02,0.533256,1.166478,1.196742,1.281902,two
2017-01-03,0.628022,-0.231812,0.86672,0.108487,three
2017-01-04,-0.768522,-2.192901,-1.216873,-0.009265,
2017-01-05,0.400187,0.873615,0.873664,-0.338175,
2017-01-06,0.153867,-1.075675,-0.293745,0.954509,six


In [57]:
# Create a new column from an existing one: 'A+10' holds each value of 'A' plus 10.
# Series arithmetic is vectorized, so no per-element lambda through .apply is needed.
df_copy['A+10'] = df_copy['A'] + 10
df_copy

Unnamed: 0,A,B,C,D,copy,A+10
2017-01-01,0.657101,1.967444,0.021556,-0.400942,one,10.657101
2017-01-02,0.533256,1.166478,1.196742,1.281902,two,10.533256
2017-01-03,0.628022,-0.231812,0.86672,0.108487,three,10.628022
2017-01-04,-0.768522,-2.192901,-1.216873,-0.009265,,9.231478
2017-01-05,0.400187,0.873615,0.873664,-0.338175,,10.400187
2017-01-06,0.153867,-1.075675,-0.293745,0.954509,six,10.153867


In [61]:
# With no index given, the Series gets integer labels 0..5 in order (much like enumerate).
# The np.nan entry makes the values float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [64]:
# Same values as before, now labelled by the six dates instead of 0..5.
s = pd.Series([1,3,5,np.nan,6,8], index=dates) # map the index to `dates`
s

2017-01-01    1.0
2017-01-02    3.0
2017-01-03    5.0
2017-01-04    NaN
2017-01-05    6.0
2017-01-06    8.0
Freq: D, dtype: float64

In [65]:
# shift(2) moves every value two rows later while the index stays fixed: the first two
# rows become NaN, and the original NaN (2017-01-04) lands on 2017-01-06.
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2) # shift values by two positions; the leading slots are filled with NaN
s

2017-01-01    NaN
2017-01-02    NaN
2017-01-03    1.0
2017-01-04    3.0
2017-01-05    5.0
2017-01-06    NaN
Freq: D, dtype: float64

In [73]:
# Subtract `s` from every column of df, matching on the row index (axis=0).
# Rows where `s` is NaN come out as NaN in every column.
df.sub(s, axis = 0) # subtract s from df (axis = 0); note to self: understand how axis selects the alignment

Unnamed: 0,A,B,C,D
2017-01-01,,,,
2017-01-02,,,,
2017-01-03,-0.371978,-1.231812,-0.13328,-0.891513
2017-01-04,-3.768522,-5.192901,-4.216873,-3.009265
2017-01-05,-4.599813,-4.126385,-4.126336,-5.338175
2017-01-06,,,,


In [79]:
# Display df again for reference.
df

Unnamed: 0,A,B,C,D
2017-01-01,0.657101,1.967444,0.021556,-0.400942
2017-01-02,0.533256,1.166478,1.196742,1.281902
2017-01-03,0.628022,-0.231812,0.86672,0.108487
2017-01-04,-0.768522,-2.192901,-1.216873,-0.009265
2017-01-05,0.400187,0.873615,0.873664,-0.338175
2017-01-06,0.153867,-1.075675,-0.293745,0.954509


In [74]:
# Running total across the columns of each row: A, A+B, A+B+C, A+B+C+D.
# DataFrame.cumsum is the built-in form of apply(np.cumsum, axis=1).
df.cumsum(axis=1)

Unnamed: 0,A,B,C,D
2017-01-01,0.657101,2.624545,2.646101,2.245159
2017-01-02,0.533256,1.699734,2.896476,4.178378
2017-01-03,0.628022,0.39621,1.26293,1.371418
2017-01-04,-0.768522,-2.961424,-4.178297,-4.187561
2017-01-05,0.400187,1.273802,2.147466,1.809291
2017-01-06,0.153867,-0.921808,-1.215553,-0.261043


In [75]:
# Running total down the rows of each column (each date adds to the dates before it).
# DataFrame.cumsum is the built-in form of apply(np.cumsum, axis=0).
df.cumsum(axis=0)

Unnamed: 0,A,B,C,D
2017-01-01,0.657101,1.967444,0.021556,-0.400942
2017-01-02,1.190357,3.133922,1.218298,0.88096
2017-01-03,1.818379,2.902111,2.085018,0.989448
2017-01-04,1.049856,0.70921,0.868145,0.980183
2017-01-05,1.450043,1.582825,1.74181,0.642008
2017-01-06,1.60391,0.507149,1.448065,1.596517


In [78]:
# Range (max - min) of each column. The reductions work column-wise by default,
# so the result is a Series indexed by column label, as apply(lambda x: ...) gave.
df.max() - df.min()

A    1.425623
B    4.160346
C    2.413615
D    1.682844
dtype: float64

In [80]:
# Rebinds `df` to a new 12-row frame: three repeating label columns (A, B, C)
# and two columns of standard-normal draws (D, E).
# NOTE(review): no seed is set in this cell, so D and E change on every run.
df = pd.DataFrame(
    {
        'A': ['one', 'one', 'two', 'three'] * 3,
        'B': ['A', 'B', 'C'] * 4,
        'C': (['foo'] * 3 + ['bar'] * 3) * 2,
        'D': np.random.randn(12),
        'E': np.random.randn(12),
    }
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,1.101503,0.407794
1,one,B,foo,-1.055297,-0.098427
2,two,C,foo,-0.484068,-0.839053
3,three,A,bar,0.673642,0.760965
4,one,B,bar,-0.436574,0.37716
5,one,C,bar,-0.324239,2.050735
6,two,A,foo,-0.005922,-2.102601
7,three,B,foo,-0.537046,-1.379635
8,one,C,foo,-0.30199,0.306149
9,one,A,bar,-0.987462,0.719444


In [84]:
# Reshape to a table of D values: one row per (A, B) pair, one column per value of C.
# (A, B, C) combinations that do not occur in df show up as NaN.
# Reference: https://wikidocs.net/46755
df_val = pd.pivot_table(
    data=df,
    values='D',
    index=['A', 'B'],
    columns=['C'],
)
df_val

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.987462,1.101503
one,B,-0.436574,-1.055297
one,C,-0.324239,-0.30199
three,A,0.673642,
three,B,,-0.537046
three,C,1.101202,
two,A,,-0.005922
two,B,-0.960519,
two,C,,-0.484068
