##### [pandas cheat sheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)

##### [10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html)

# 1. pandas의 설치 여부를 확인한다.
!pip show pandas

In [24]:
!pip show pandas

Name: pandas
Version: 0.25.1
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: http://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: /opt/conda/lib/python3.7/site-packages
Requires: numpy, pytz, python-dateutil
Required-by: vincent, statsmodels, seaborn, plotnine, mizani


# 2. pandas를 import한다.
import pandas as pd

In [2]:
import numpy as np
import pandas as pd

In [8]:
# A Series can be built from a list or a tuple. It behaves like one table
# column with a single dtype; mixing ints, the str '5' and NaN yields object.
pd.Series((1,3,'5',np.nan,6,8))

0      1
1      3
2      5
3    NaN
4      6
5      8
dtype: object

In [9]:
# Six consecutive daily timestamps starting at 2020-03-28.
dates = pd.date_range('20200328', periods = 6)

In [11]:
dates # the value of a cell's last expression is displayed without print()

DatetimeIndex(['2020-03-28', '2020-03-29', '2020-03-30', '2020-03-31',
               '2020-04-01', '2020-04-02'],
              dtype='datetime64[ns]', freq='D')

In [16]:
# randn(6, 4): 6 rows x 4 columns of random values; rows are labelled by
# `dates` and columns by the letters A-D.
df = pd.DataFrame(np.random.randn(6, 4), index = dates, columns = list('ABCD'))

In [17]:
df # the dates are the index (row labels); A, B, C, D are the column labels

Unnamed: 0,A,B,C,D
2020-03-28,-0.921923,0.007243,0.827131,-0.052369
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943
2020-03-30,0.41721,0.990094,-0.505462,-0.786353
2020-03-31,0.090566,0.476263,-1.465175,0.371908
2020-04-01,0.155963,0.004905,-0.508943,-0.304544
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654


In [18]:
# Build a DataFrame from a dict of columns. Scalar values ('A', 'B', 'F') are
# repeated for every row, and each column keeps its own dtype.
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})

In [19]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [20]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [21]:
# Tab completion is an interactive editor feature, not Python syntax: type
# "df2." and press the Tab key to see the available attributes. Executing the
# literal text `df2.<TAB>` raises SyntaxError.
# Programmatic equivalent: list the public attribute names of df2.
[name for name in dir(df2) if not name.startswith('_')]

In [36]:
df.head() # head(n) shows the first n rows; with no argument it shows 5

Unnamed: 0,A,B,C,D
2020-03-28,-0.921923,0.007243,0.827131,-0.052369
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943
2020-03-30,0.41721,0.990094,-0.505462,-0.786353
2020-03-31,0.090566,0.476263,-1.465175,0.371908
2020-04-01,0.155963,0.004905,-0.508943,-0.304544


In [30]:
df.tail(3) # tail(n) shows the last n rows

Unnamed: 0,A,B,C,D
2020-03-31,0.090566,0.476263,-1.465175,0.371908
2020-04-01,0.155963,0.004905,-0.508943,-0.304544
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654


In [28]:
df.sample(3) # 3 randomly chosen rows, so the original order is not preserved

Unnamed: 0,A,B,C,D
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654
2020-03-31,0.090566,0.476263,-1.465175,0.371908


In [37]:
df.index

DatetimeIndex(['2020-03-28', '2020-03-29', '2020-03-30', '2020-03-31',
               '2020-04-01', '2020-04-02'],
              dtype='datetime64[ns]', freq='D')

In [38]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [39]:
df.to_numpy() # convert the values to a NumPy array (row/column labels are dropped)

array([[-0.9219225 ,  0.0072426 ,  0.82713128, -0.0523689 ],
       [-0.33600926, -0.43990393, -0.07136358, -0.4459431 ],
       [ 0.4172102 ,  0.99009425, -0.50546162, -0.78635314],
       [ 0.0905659 ,  0.47626336, -1.46517537,  0.37190824],
       [ 0.15596318,  0.00490538, -0.50894326, -0.30454415],
       [-0.97916176,  0.36949023, -1.84101242, -0.25365443]])

In [41]:
df2.to_numpy() # still one 2-D array; mixed column dtypes make it dtype=object

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [43]:
df.describe() # per-column summary statistics; the 50% row is the median

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.262226,0.234682,-0.594137,-0.245159
std,0.585768,0.491279,0.961439,0.388521
min,-0.979162,-0.439904,-1.841012,-0.786353
25%,-0.775444,0.00549,-1.226117,-0.410593
50%,-0.122722,0.188366,-0.507202,-0.279099
75%,0.139614,0.44957,-0.179888,-0.10269
max,0.41721,0.990094,0.827131,0.371908


In [46]:
df.T # transpose: the index becomes the columns and the columns become the index

Unnamed: 0,2020-03-28,2020-03-29,2020-03-30,2020-03-31,2020-04-01,2020-04-02
A,-0.921923,-0.336009,0.41721,0.090566,0.155963,-0.979162
B,0.007243,-0.439904,0.990094,0.476263,0.004905,0.36949
C,0.827131,-0.071364,-0.505462,-1.465175,-0.508943,-1.841012
D,-0.052369,-0.445943,-0.786353,0.371908,-0.304544,-0.253654


In [47]:
df

Unnamed: 0,A,B,C,D
2020-03-28,-0.921923,0.007243,0.827131,-0.052369
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943
2020-03-30,0.41721,0.990094,-0.505462,-0.786353
2020-03-31,0.090566,0.476263,-1.465175,0.371908
2020-04-01,0.155963,0.004905,-0.508943,-0.304544
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654


#### [pandas.DataFrame.sort_index](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_index.html?highlight=sort_index)

In [52]:
df.sort_index(axis = 0, ascending = False)
# Sort the rows by index label in descending order (latest date first).
# Signature, for reference:
# DataFrame.sort_index(self, axis=0, level=None, ascending=True,
# inplace=False, kind='quicksort', na_position='last', sort_remaining=True,
# ignore_index: bool = False)

Unnamed: 0,A,B,C,D
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654
2020-04-01,0.155963,0.004905,-0.508943,-0.304544
2020-03-31,0.090566,0.476263,-1.465175,0.371908
2020-03-30,0.41721,0.990094,-0.505462,-0.786353
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943
2020-03-28,-0.921923,0.007243,0.827131,-0.052369


In [55]:
# axis=1 reorders the columns, ascending by their values in the row labelled
# 2020-03-30.
df.sort_values(by = '20200330', axis = 1)

Unnamed: 0,D,C,A,B
2020-03-28,-0.052369,0.827131,-0.921923,0.007243
2020-03-29,-0.445943,-0.071364,-0.336009,-0.439904
2020-03-30,-0.786353,-0.505462,0.41721,0.990094
2020-03-31,0.371908,-1.465175,0.090566,0.476263
2020-04-01,-0.304544,-0.508943,0.155963,0.004905
2020-04-02,-0.253654,-1.841012,-0.979162,0.36949


## - 특정 값만 행, 열 지정해서 가지고 올 때
df.컬럼명
df['컬럼명']
df[['컬럼명', '컬럼명']]
df.loc[행]
df.loc[행, 열]

In [56]:
df.A

2020-03-28   -0.921923
2020-03-29   -0.336009
2020-03-30    0.417210
2020-03-31    0.090566
2020-04-01    0.155963
2020-04-02   -0.979162
Freq: D, Name: A, dtype: float64

In [57]:
df['A']

2020-03-28   -0.921923
2020-03-29   -0.336009
2020-03-30    0.417210
2020-03-31    0.090566
2020-04-01    0.155963
2020-04-02   -0.979162
Freq: D, Name: A, dtype: float64

In [61]:
df[['A', 'B']] # to select several columns, pass their names as a list

Unnamed: 0,A,B
2020-03-28,-0.921923,0.007243
2020-03-29,-0.336009,-0.439904
2020-03-30,0.41721,0.990094
2020-03-31,0.090566,0.476263
2020-04-01,0.155963,0.004905
2020-04-02,-0.979162,0.36949


In [66]:
df[0:3] # a slice inside [] selects rows by position (here rows 0, 1 and 2)

Unnamed: 0,A,B,C,D
2020-03-28,-0.921923,0.007243,0.827131,-0.052369
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943
2020-03-30,0.41721,0.990094,-0.505462,-0.786353


In [68]:
df[2:3] # a single row must also be written as a slice ([2:3]); df[2] would be looked up as a column label

Unnamed: 0,A,B,C,D
2020-03-30,0.41721,0.990094,-0.505462,-0.786353


In [69]:
df.loc[dates[0]] # the row whose index label is 2020-03-28, returned as a Series

A   -0.921923
B    0.007243
C    0.827131
D   -0.052369
Name: 2020-03-28 00:00:00, dtype: float64

In [71]:
df

Unnamed: 0,A,B,C,D
2020-03-28,-0.921923,0.007243,0.827131,-0.052369
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943
2020-03-30,0.41721,0.990094,-0.505462,-0.786353
2020-03-31,0.090566,0.476263,-1.465175,0.371908
2020-04-01,0.155963,0.004905,-0.508943,-0.304544
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654


In [73]:
df.loc['20200331']

A    0.090566
B    0.476263
C   -1.465175
D    0.371908
Name: 2020-03-31 00:00:00, dtype: float64

In [75]:
df.loc[:, ['A', 'B']] # loc[rows, columns]: ':' selects every row, ['A', 'B'] selects only columns A and B

Unnamed: 0,A,B
2020-03-28,-0.921923,0.007243
2020-03-29,-0.336009,-0.439904
2020-03-30,0.41721,0.990094
2020-03-31,0.090566,0.476263
2020-04-01,0.155963,0.004905
2020-04-02,-0.979162,0.36949


In [77]:
df.loc['20200328' : '20200330', ['A', 'B']]

Unnamed: 0,A,B
2020-03-28,-0.921923,0.007243
2020-03-29,-0.336009,-0.439904
2020-03-30,0.41721,0.990094


In [80]:
df

Unnamed: 0,A,B,C,D
2020-03-28,-0.921923,0.007243,0.827131,-0.052369
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943
2020-03-30,0.41721,0.990094,-0.505462,-0.786353
2020-03-31,0.090566,0.476263,-1.465175,0.371908
2020-04-01,0.155963,0.004905,-0.508943,-0.304544
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654


In [79]:
df.iloc[3]

A    0.090566
B    0.476263
C   -1.465175
D    0.371908
Name: 2020-03-31 00:00:00, dtype: float64

In [82]:
df.iloc[:, 1:3] # iloc[rows, columns] by position: 1:3 selects columns 1 and 2 (B, C); the end is exclusive

Unnamed: 0,B,C
2020-03-28,0.007243,0.827131
2020-03-29,-0.439904,-0.071364
2020-03-30,0.990094,-0.505462
2020-03-31,0.476263,-1.465175
2020-04-01,0.004905,-0.508943
2020-04-02,0.36949,-1.841012


In [83]:
df

Unnamed: 0,A,B,C,D
2020-03-28,-0.921923,0.007243,0.827131,-0.052369
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943
2020-03-30,0.41721,0.990094,-0.505462,-0.786353
2020-03-31,0.090566,0.476263,-1.465175,0.371908
2020-04-01,0.155963,0.004905,-0.508943,-0.304544
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654


In [85]:
df['A'] > 0 # element-wise comparison on column A: yields a boolean Series, not a DataFrame

2020-03-28    False
2020-03-29    False
2020-03-30     True
2020-03-31     True
2020-04-01     True
2020-04-02    False
Freq: D, Name: A, dtype: bool

In [88]:
df[df['A'] > 0] # use that boolean Series as a row filter: only rows where it is True are kept

Unnamed: 0,A,B,C,D
2020-03-30,0.41721,0.990094,-0.505462,-0.786353
2020-03-31,0.090566,0.476263,-1.465175,0.371908
2020-04-01,0.155963,0.004905,-0.508943,-0.304544


In [94]:
# Masking with a whole boolean frame keeps the shape: cells that are not > 0
# become NaN. The result is a new object (its id differs from id(df) below).
df4 = df[df > 0]

In [90]:
# Plain assignment does not copy: df3 is a second name for the same object,
# which is why id(df3) and id(df) are equal below.
df3 = df

In [92]:
id(df3)

140360509920720

In [93]:
id(df)

140360509920720

In [95]:
id(df4)

140360498712528

In [96]:
# copy() creates a separate DataFrame, so id(df3) now differs from id(df).
df3 = df.copy()

In [97]:
id(df3)

140360498534672

In [98]:
id(df)

140360509920720

In [100]:
df3

Unnamed: 0,A,B,C,D
2020-03-28,-0.921923,0.007243,0.827131,-0.052369
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943
2020-03-30,0.41721,0.990094,-0.505462,-0.786353
2020-03-31,0.090566,0.476263,-1.465175,0.371908
2020-04-01,0.155963,0.004905,-0.508943,-0.304544
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654


In [107]:
df3['E'] = ['one', 'one', 'two', 'three', 'four', 'three'] # new column E: the list needs one value per row (6 here)

In [109]:
df

Unnamed: 0,A,B,C,D
2020-03-28,-0.921923,0.007243,0.827131,-0.052369
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943
2020-03-30,0.41721,0.990094,-0.505462,-0.786353
2020-03-31,0.090566,0.476263,-1.465175,0.371908
2020-04-01,0.155963,0.004905,-0.508943,-0.304544
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654


In [110]:
df3

Unnamed: 0,A,B,C,D,E
2020-03-28,-0.921923,0.007243,0.827131,-0.052369,one
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,one
2020-03-30,0.41721,0.990094,-0.505462,-0.786353,two
2020-03-31,0.090566,0.476263,-1.465175,0.371908,three
2020-04-01,0.155963,0.004905,-0.508943,-0.304544,four
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654,three


In [112]:
df3.E.isin(['two', 'four'])

2020-03-28    False
2020-03-29    False
2020-03-30     True
2020-03-31    False
2020-04-01     True
2020-04-02    False
Freq: D, Name: E, dtype: bool

In [114]:
df3[df3.E.isin(['two', 'four'])] # 위의 값을 DataFrame 상으로 보여줌

Unnamed: 0,A,B,C,D,E
2020-03-30,0.41721,0.990094,-0.505462,-0.786353,two
2020-04-01,0.155963,0.004905,-0.508943,-0.304544,four


In [115]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index = pd.date_range('20200329', periods = 6))

In [116]:
s1

2020-03-29    1
2020-03-30    2
2020-03-31    3
2020-04-01    4
2020-04-02    5
2020-04-03    6
Freq: D, dtype: int64

In [117]:
# Assigning a Series aligns it on the index: 2020-03-28 has no entry in s1 and
# becomes NaN, while s1's extra label 2020-04-03 is not added to df.
df['F'] = s1

In [118]:
df

Unnamed: 0,A,B,C,D,F
2020-03-28,-0.921923,0.007243,0.827131,-0.052369,
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,1.0
2020-03-30,0.41721,0.990094,-0.505462,-0.786353,2.0
2020-03-31,0.090566,0.476263,-1.465175,0.371908,3.0
2020-04-01,0.155963,0.004905,-0.508943,-0.304544,4.0
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654,5.0


In [121]:
df.at[dates[0], 'A'] = 0 # set a single cell, addressed by (row label, column label)

In [122]:
df

Unnamed: 0,A,B,C,D,F
2020-03-28,0.0,0.007243,0.827131,-0.052369,
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,1.0
2020-03-30,0.41721,0.990094,-0.505462,-0.786353,2.0
2020-03-31,0.090566,0.476263,-1.465175,0.371908,3.0
2020-04-01,0.155963,0.004905,-0.508943,-0.304544,4.0
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654,5.0


In [123]:
df.iat[0, 1] = 0 # set a single cell by integer position: row 0, column 1 (column B)

In [124]:
df

Unnamed: 0,A,B,C,D,F
2020-03-28,0.0,0.0,0.827131,-0.052369,
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,1.0
2020-03-30,0.41721,0.990094,-0.505462,-0.786353,2.0
2020-03-31,0.090566,0.476263,-1.465175,0.371908,3.0
2020-04-01,0.155963,0.004905,-0.508943,-0.304544,4.0
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654,5.0


In [125]:
df5 = df.copy()

In [126]:
df5

Unnamed: 0,A,B,C,D,F
2020-03-28,0.0,0.0,0.827131,-0.052369,
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,1.0
2020-03-30,0.41721,0.990094,-0.505462,-0.786353,2.0
2020-03-31,0.090566,0.476263,-1.465175,0.371908,3.0
2020-04-01,0.155963,0.004905,-0.508943,-0.304544,4.0
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654,5.0


In [128]:
df5 > 0

Unnamed: 0,A,B,C,D,F
2020-03-28,False,False,True,False,False
2020-03-29,False,False,False,False,True
2020-03-30,True,True,False,False,True
2020-03-31,True,True,False,True,True
2020-04-01,True,True,False,False,True
2020-04-02,False,True,False,False,True


In [129]:
df[df5 > 0]

Unnamed: 0,A,B,C,D,F
2020-03-28,,,0.827131,,
2020-03-29,,,,,1.0
2020-03-30,0.41721,0.990094,,,2.0
2020-03-31,0.090566,0.476263,,0.371908,3.0
2020-04-01,0.155963,0.004905,,,4.0
2020-04-02,,0.36949,,,5.0


In [130]:
# Wherever df5 is positive, overwrite the matching cell of df with the negated
# df5 value. Only df is modified; df5 itself stays unchanged.
df[df5 > 0] = -df5

In [131]:
df5

Unnamed: 0,A,B,C,D,F
2020-03-28,0.0,0.0,0.827131,-0.052369,
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,1.0
2020-03-30,0.41721,0.990094,-0.505462,-0.786353,2.0
2020-03-31,0.090566,0.476263,-1.465175,0.371908,3.0
2020-04-01,0.155963,0.004905,-0.508943,-0.304544,4.0
2020-04-02,-0.979162,0.36949,-1.841012,-0.253654,5.0


In [133]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

In [134]:
df1

Unnamed: 0,A,B,C,D,F,E
2020-03-28,0.0,0.0,-0.827131,-0.052369,,
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,-1.0,
2020-03-30,-0.41721,-0.990094,-0.505462,-0.786353,-2.0,
2020-03-31,-0.090566,-0.476263,-1.465175,-0.371908,-3.0,


In [135]:
df1.loc[dates[0]:dates[1], 'E'] = 1

In [136]:
df1

Unnamed: 0,A,B,C,D,F,E
2020-03-28,0.0,0.0,-0.827131,-0.052369,,1.0
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,-1.0,1.0
2020-03-30,-0.41721,-0.990094,-0.505462,-0.786353,-2.0,
2020-03-31,-0.090566,-0.476263,-1.465175,-0.371908,-3.0,


In [138]:
df1.dropna(how = 'any') # returns a new frame without any row that contains a NaN; df1 is not modified

Unnamed: 0,A,B,C,D,F,E
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,-1.0,1.0


In [140]:
df1.fillna(value = 5)

Unnamed: 0,A,B,C,D,F,E
2020-03-28,0.0,0.0,-0.827131,-0.052369,5.0,1.0
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,-1.0,1.0
2020-03-30,-0.41721,-0.990094,-0.505462,-0.786353,-2.0,5.0
2020-03-31,-0.090566,-0.476263,-1.465175,-0.371908,-3.0,5.0


In [141]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2020-03-28,False,False,False,False,True,False
2020-03-29,False,False,False,False,False,False
2020-03-30,False,False,False,False,False,True
2020-03-31,False,False,False,False,False,True


In [142]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)

In [143]:
s

2020-03-28    NaN
2020-03-29    NaN
2020-03-30    1.0
2020-03-31    3.0
2020-04-01    5.0
2020-04-02    NaN
Freq: D, dtype: float64

In [144]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2020-03-28,,,,,
2020-03-29,,,,,
2020-03-30,-1.41721,-1.990094,-1.505462,-1.786353,-3.0
2020-03-31,-3.090566,-3.476263,-4.465175,-3.371908,-6.0
2020-04-01,-5.155963,-5.004905,-5.508943,-5.304544,-9.0
2020-04-02,,,,,


In [145]:
df

Unnamed: 0,A,B,C,D,F
2020-03-28,0.0,0.0,-0.827131,-0.052369,
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,-1.0
2020-03-30,-0.41721,-0.990094,-0.505462,-0.786353,-2.0
2020-03-31,-0.090566,-0.476263,-1.465175,-0.371908,-3.0
2020-04-01,-0.155963,-0.004905,-0.508943,-0.304544,-4.0
2020-04-02,-0.979162,-0.36949,-1.841012,-0.253654,-5.0


In [146]:
df.add(s, axis='index')

Unnamed: 0,A,B,C,D,F
2020-03-28,,,,,
2020-03-29,,,,,
2020-03-30,0.58279,0.009906,0.494538,0.213647,-1.0
2020-03-31,2.909434,2.523737,1.534825,2.628092,0.0
2020-04-01,4.844037,4.995095,4.491057,4.695456,1.0
2020-04-02,,,,,


In [148]:
df

Unnamed: 0,A,B,C,D,F
2020-03-28,0.0,0.0,-0.827131,-0.052369,
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,-1.0
2020-03-30,-0.41721,-0.990094,-0.505462,-0.786353,-2.0
2020-03-31,-0.090566,-0.476263,-1.465175,-0.371908,-3.0
2020-04-01,-0.155963,-0.004905,-0.508943,-0.304544,-4.0
2020-04-02,-0.979162,-0.36949,-1.841012,-0.253654,-5.0


In [149]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2020-03-28,0.0,0.0,-0.827131,-0.052369,
2020-03-29,-0.336009,-0.439904,-0.898495,-0.498312,-1.0
2020-03-30,-0.753219,-1.429998,-1.403956,-1.284665,-3.0
2020-03-31,-0.843785,-1.906262,-2.869132,-1.656573,-6.0
2020-04-01,-0.999749,-1.911167,-3.378075,-1.961118,-10.0
2020-04-02,-1.97891,-2.280657,-5.219088,-2.214772,-15.0


In [150]:
df.apply(lambda x : x.max() - x.min())

A    0.979162
B    0.990094
C    1.769649
D    0.733984
F    4.000000
dtype: float64

In [151]:
df

Unnamed: 0,A,B,C,D,F
2020-03-28,0.0,0.0,-0.827131,-0.052369,
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,-1.0
2020-03-30,-0.41721,-0.990094,-0.505462,-0.786353,-2.0
2020-03-31,-0.090566,-0.476263,-1.465175,-0.371908,-3.0
2020-04-01,-0.155963,-0.004905,-0.508943,-0.304544,-4.0
2020-04-02,-0.979162,-0.36949,-1.841012,-0.253654,-5.0


In [152]:
s = pd.Series(np.random.randint(0, 7, size=10))

In [153]:
s

0    2
1    2
2    6
3    0
4    1
5    1
6    6
7    3
8    6
9    3
dtype: int64

In [156]:
s.value_counts()

6    3
3    2
2    2
1    2
0    1
dtype: int64

In [157]:
df['A'].value_counts()

-0.090566    1
-0.336009    1
-0.155963    1
-0.979162    1
-0.417210    1
 0.000000    1
Name: A, dtype: int64

In [158]:
df

Unnamed: 0,A,B,C,D,F
2020-03-28,0.0,0.0,-0.827131,-0.052369,
2020-03-29,-0.336009,-0.439904,-0.071364,-0.445943,-1.0
2020-03-30,-0.41721,-0.990094,-0.505462,-0.786353,-2.0
2020-03-31,-0.090566,-0.476263,-1.465175,-0.371908,-3.0
2020-04-01,-0.155963,-0.004905,-0.508943,-0.304544,-4.0
2020-04-02,-0.979162,-0.36949,-1.841012,-0.253654,-5.0


In [159]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

In [160]:
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [162]:
s.str.lower() # convert every string to lower case (NaN stays NaN)

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [164]:
s.str.upper() # convert every string to upper case

0       A
1       B
2       C
3    AABA
4    BACA
5     NaN
6    CABA
7     DOG
8     CAT
dtype: object

In [165]:
s.str.capitalize() # first character upper case, the rest lower case

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    Caba
7     Dog
8     Cat
dtype: object

In [171]:
s.str.swapcase() # swap case: upper-case letters become lower case and vice versa

0       a
1       b
2       c
3    aABA
4    bACA
5     NaN
6    caba
7     DOG
8     CAT
dtype: object

In [172]:
df = pd.DataFrame(np.random.randn(10, 4))

In [181]:
df

Unnamed: 0,0,1,2,3
0,-0.037273,-1.139965,0.168816,0.505335
1,0.331841,0.953172,-0.042143,-1.931078
2,1.256728,-1.220628,-0.403009,-0.680491
3,-1.00248,0.328378,0.211094,0.587133
4,0.430835,-0.9561,-2.682885,0.360978
5,0.230893,-0.02239,1.57789,-0.185423
6,-0.443073,0.872477,-0.597225,-0.951318
7,-0.488992,0.482843,0.18835,0.22014
8,0.60427,1.743458,-1.04521,-2.86052
9,-0.3066,0.088832,-0.719116,-1.208603


In [182]:
pieces = [df[:3], df[3:7], df[7:]]

In [184]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.037273,-1.139965,0.168816,0.505335
1,0.331841,0.953172,-0.042143,-1.931078
2,1.256728,-1.220628,-0.403009,-0.680491
3,-1.00248,0.328378,0.211094,0.587133
4,0.430835,-0.9561,-2.682885,0.360978
5,0.230893,-0.02239,1.57789,-0.185423
6,-0.443073,0.872477,-0.597225,-0.951318
7,-0.488992,0.482843,0.18835,0.22014
8,0.60427,1.743458,-1.04521,-2.86052
9,-0.3066,0.088832,-0.719116,-1.208603


In [185]:
pieces

[          0         1         2         3
 0 -0.037273 -1.139965  0.168816  0.505335
 1  0.331841  0.953172 -0.042143 -1.931078
 2  1.256728 -1.220628 -0.403009 -0.680491,
           0         1         2         3
 3 -1.002480  0.328378  0.211094  0.587133
 4  0.430835 -0.956100 -2.682885  0.360978
 5  0.230893 -0.022390  1.577890 -0.185423
 6 -0.443073  0.872477 -0.597225 -0.951318,
           0         1         2         3
 7 -0.488992  0.482843  0.188350  0.220140
 8  0.604270  1.743458 -1.045210 -2.860520
 9 -0.306600  0.088832 -0.719116 -1.208603]

In [188]:
df1 = pd.concat(pieces)

In [192]:
pd.concat([df, df1]) # concat() joins several DataFrames into one
                     # default axis=0 stacks the rows, matching columns by label
                     # axis=1 would place the frames side by side instead

Unnamed: 0,0,1,2,3
0,-0.037273,-1.139965,0.168816,0.505335
1,0.331841,0.953172,-0.042143,-1.931078
2,1.256728,-1.220628,-0.403009,-0.680491
3,-1.00248,0.328378,0.211094,0.587133
4,0.430835,-0.9561,-2.682885,0.360978
5,0.230893,-0.02239,1.57789,-0.185423
6,-0.443073,0.872477,-0.597225,-0.951318
7,-0.488992,0.482843,0.18835,0.22014
8,0.60427,1.743458,-1.04521,-2.86052
9,-0.3066,0.088832,-0.719116,-1.208603


#### [concat url](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html?highlight=concat#pandas.concat)

In [193]:
df

Unnamed: 0,0,1,2,3
0,-0.037273,-1.139965,0.168816,0.505335
1,0.331841,0.953172,-0.042143,-1.931078
2,1.256728,-1.220628,-0.403009,-0.680491
3,-1.00248,0.328378,0.211094,0.587133
4,0.430835,-0.9561,-2.682885,0.360978
5,0.230893,-0.02239,1.57789,-0.185423
6,-0.443073,0.872477,-0.597225,-0.951318
7,-0.488992,0.482843,0.18835,0.22014
8,0.60427,1.743458,-1.04521,-2.86052
9,-0.3066,0.088832,-0.719116,-1.208603


In [194]:
df.to_csv('foo.csv') # save as a CSV file (by default the index is written as the first column)

In [199]:
df1 = pd.read_csv('user.csv') # load a CSV file into a DataFrame (note: a different file from the foo.csv written above)

In [200]:
df1

Unnamed: 0,userid,username,userage,usertel
0,hong,홍길동,23,010-1111-2222
1,kim,김철수,33,010-3333-4444
