# Pandas 10분 완성

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 목차

1. Object Creation (객체 생성)
2. Viewing Data (데이터 확인하기)
3. Selection (선택)
4. Missing Data (결측치)
5. Operation (연산)
6. Merge (병합)
7. Grouping (그룹화)
8. Reshaping (변형)
9. Time Series (시계열)
10. Categoricals (범주화)
11. Plotting (그래프)
12. Getting Data In / Out (데이터 입 / 출력)
13. Gotchas (잡았다!)

In [16]:
# 객체 생성
s = pd.Series([1,3,5,np.nan,6,8])
print(s)

dates = pd.date_range('20130101', periods=6)
print(dates)

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
print(np.random.randn(6,4))
print(list('ABCD'))
print(df)

df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(5)), dtype='float32'),
    'D': np.array([3] * 5, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train", "*?"]),
    'F': 'foo'
})
print(df2)
print(df2.dtypes)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
[[-0.3588458  -0.92611796  0.58745472 -0.22728711]
 [ 0.57004508  1.90634618  0.67136806  1.10942256]
 [ 0.81876016 -0.49843181 -0.84789308  0.29797585]
 [ 2.04439866 -1.70974802  0.3408946  -1.66946368]
 [ 1.93700418  0.74160032 -0.40274651 -0.800447  ]
 [ 0.33036735  1.34180716  0.78762185 -0.09690301]]
['A', 'B', 'C', 'D']
                   A         B         C         D
2013-01-01 -1.044815 -0.556482 -1.660190  0.614819
2013-01-02  0.143443  0.216248 -1.227351 -1.131831
2013-01-03 -1.876789 -1.277111 -0.582013  0.337718
2013-01-04  0.604158 -0.049026  1.499959  1.359599
2013-01-05  1.200862 -0.879425  0.051814  0.095130
2013-01-06  1.013146  0.589762 -0.041208  0.721239
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 

In [30]:
# 데이터 확인
df.tail(3) # 끝에서 마지막 3줄
df.tail() # 끝에서 마지막 5줄

print(df.head())
print(df.tail(3))

print(df.index)
print(df.columns)
print(df.values)
print(df.describe())

print(df.T)
print(df.sort_index(axis=1, ascending=False)) # axis=0 은 y축 axis=1은 x축
print(df.sort_values(by='B'))

                   A         B         C         D
2013-01-01 -1.044815 -0.556482 -1.660190  0.614819
2013-01-02  0.143443  0.216248 -1.227351 -1.131831
2013-01-03 -1.876789 -1.277111 -0.582013  0.337718
2013-01-04  0.604158 -0.049026  1.499959  1.359599
2013-01-05  1.200862 -0.879425  0.051814  0.095130
                   A         B         C         D
2013-01-04  0.604158 -0.049026  1.499959  1.359599
2013-01-05  1.200862 -0.879425  0.051814  0.095130
2013-01-06  1.013146  0.589762 -0.041208  0.721239
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')
[[-1.04481525 -0.55648241 -1.66019024  0.61481857]
 [ 0.14344323  0.21624829 -1.22735149 -1.13183128]
 [-1.87678934 -1.2771114  -0.58201267  0.33771819]
 [ 0.60415847 -0.04902557  1.4999588   1.35959869]
 [ 1.20086244 -0.87942509  0.05181414  0.09513006]
 [ 1.0131456   0.58976228 -0

In [47]:
# selection
print(df['A'])
print(df[0:3])
print(df['20130102':'20130104'])

2013-01-01   -1.044815
2013-01-02    0.143443
2013-01-03   -1.876789
2013-01-04    0.604158
2013-01-05    1.200862
2013-01-06    1.013146
Freq: D, Name: A, dtype: float64
                   A         B         C         D
2013-01-01 -1.044815 -0.556482 -1.660190  0.614819
2013-01-02  0.143443  0.216248 -1.227351 -1.131831
2013-01-03 -1.876789 -1.277111 -0.582013  0.337718
                   A         B         C         D
2013-01-02  0.143443  0.216248 -1.227351 -1.131831
2013-01-03 -1.876789 -1.277111 -0.582013  0.337718
2013-01-04  0.604158 -0.049026  1.499959  1.359599


In [48]:
# selection by Label
print(df.loc[dates[0]])
print(df.loc[:, ['A','B']])
print(df.loc['20130102':'20130104', ['A', 'B']])
print(df.loc['20130102', ['A','B']])
print(df.loc[dates[0],'A'])
print(df.at[dates[0],'A'])

A   -1.044815
B   -0.556482
C   -1.660190
D    0.614819
Name: 2013-01-01 00:00:00, dtype: float64
                   A         B
2013-01-01 -1.044815 -0.556482
2013-01-02  0.143443  0.216248
2013-01-03 -1.876789 -1.277111
2013-01-04  0.604158 -0.049026
2013-01-05  1.200862 -0.879425
2013-01-06  1.013146  0.589762
                   A         B
2013-01-02  0.143443  0.216248
2013-01-03 -1.876789 -1.277111
2013-01-04  0.604158 -0.049026
A    0.143443
B    0.216248
Name: 2013-01-02 00:00:00, dtype: float64
-1.0448152504090664
-1.0448152504090664


In [52]:
# selection by position
print(df.iloc[3])
print(df.iloc[3:5, 0:2])
print(df.iloc[[1,2,4], [0,2]])
print(df.iloc[1:3,:])
print(df.iloc[:,1:3])
print(df.iloc[1,1])
print(df.iat[1,1])

A    0.604158
B   -0.049026
C    1.499959
D    1.359599
Name: 2013-01-04 00:00:00, dtype: float64
                   A         B
2013-01-04  0.604158 -0.049026
2013-01-05  1.200862 -0.879425
                   A         C
2013-01-02  0.143443 -1.227351
2013-01-03 -1.876789 -0.582013
2013-01-05  1.200862  0.051814
                   A         B         C         D
2013-01-02  0.143443  0.216248 -1.227351 -1.131831
2013-01-03 -1.876789 -1.277111 -0.582013  0.337718
                   B         C
2013-01-01 -0.556482 -1.660190
2013-01-02  0.216248 -1.227351
2013-01-03 -1.277111 -0.582013
2013-01-04 -0.049026  1.499959
2013-01-05 -0.879425  0.051814
2013-01-06  0.589762 -0.041208
0.21624828557299305
0.21624828557299305


In [58]:
# Boolean indexing
print(df[df.A > 0]) #filter row
print(df[df > 0]) #Boolean 조건을 충족하는 데이터프레임에서 값을 선택

df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
print(df2)
print(df2[df2['E'].isin(['two','four'])])

                   A         B         C         D
2013-01-02  0.143443  0.216248 -1.227351 -1.131831
2013-01-04  0.604158 -0.049026  1.499959  1.359599
2013-01-05  1.200862 -0.879425  0.051814  0.095130
2013-01-06  1.013146  0.589762 -0.041208  0.721239
                   A         B         C         D
2013-01-01       NaN       NaN       NaN  0.614819
2013-01-02  0.143443  0.216248       NaN       NaN
2013-01-03       NaN       NaN       NaN  0.337718
2013-01-04  0.604158       NaN  1.499959  1.359599
2013-01-05  1.200862       NaN  0.051814  0.095130
2013-01-06  1.013146  0.589762       NaN  0.721239
                   A         B         C         D      E
2013-01-01 -1.044815 -0.556482 -1.660190  0.614819    one
2013-01-02  0.143443  0.216248 -1.227351 -1.131831    one
2013-01-03 -1.876789 -1.277111 -0.582013  0.337718    two
2013-01-04  0.604158 -0.049026  1.499959  1.359599  three
2013-01-05  1.200862 -0.879425  0.051814  0.095130   four
2013-01-06  1.013146  0.589762 -0.041208

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.876789,-1.277111,-0.582013,0.337718,two
2013-01-05,1.200862,-0.879425,0.051814,0.09513,four


In [68]:
# Setting
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
print(s1)
df['F'] = s1

# set by label
df.at[dates[0], 'A'] = 0
# set by position
df.iat[0,1] = 0
# using Numpy array
df.loc[:, 'D'] = np.array([5] * len(df))
print(df)

df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64
                   A         B         C  D    F
2013-01-01  0.000000  0.000000 -1.660190  5  NaN
2013-01-02  0.143443  0.216248 -1.227351  5  1.0
2013-01-03 -1.876789 -1.277111 -0.582013  5  2.0
2013-01-04  0.604158 -0.049026  1.499959  5  3.0
2013-01-05  1.200862 -0.879425  0.051814  5  4.0
2013-01-06  1.013146  0.589762 -0.041208  5  5.0
                   A         B         C  D    F
2013-01-01  0.000000  0.000000 -1.660190 -5  NaN
2013-01-02 -0.143443 -0.216248 -1.227351 -5 -1.0
2013-01-03 -1.876789 -1.277111 -0.582013 -5 -2.0
2013-01-04 -0.604158 -0.049026 -1.499959 -5 -3.0
2013-01-05 -1.200862 -0.879425 -0.051814 -5 -4.0
2013-01-06 -1.013146 -0.589762 -0.041208 -5 -5.0


In [75]:
# 4. Missing Data
# Pandas는 기본적으로 np.nan값을 결측치로 포함
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1)

# delete nan
print(df1.dropna(how='any'))

# replace nan
print(df1.fillna(value=5))

# replace all nan as True and others as False
print(pd.isna(df1))

                   A         B         C  D    F    E
2013-01-01  0.000000  0.000000 -1.660190  5  NaN  1.0
2013-01-02  0.143443  0.216248 -1.227351  5  1.0  1.0
2013-01-03 -1.876789 -1.277111 -0.582013  5  2.0  NaN
2013-01-04  0.604158 -0.049026  1.499959  5  3.0  NaN
                   A         B         C  D    F    E
2013-01-02  0.143443  0.216248 -1.227351  5  1.0  1.0
                   A         B         C  D    F    E
2013-01-01  0.000000  0.000000 -1.660190  5  5.0  1.0
2013-01-02  0.143443  0.216248 -1.227351  5  1.0  1.0
2013-01-03 -1.876789 -1.277111 -0.582013  5  2.0  5.0
2013-01-04  0.604158 -0.049026  1.499959  5  3.0  5.0
                A      B      C      D      F      E
2013-01-01  False  False  False  False   True  False
2013-01-02  False  False  False  False  False  False
2013-01-03  False  False  False  False  False   True
2013-01-04  False  False  False  False  False   True


## 5. Operation(연산)

In [91]:
# stats are executed after excluding the nan
# descriptive statistics
print(df.mean())

# different axis
print(df.mean(1))

s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)

print(df)
# subtraction
print(df.sub(s, axis='index'))

# apply to each column values
print(df.apply(np.cumsum))
print(df.apply(lambda x: np.array([x.max() - x.min(), x.max(), x.min()])))

# histogramming
s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
print(s.value_counts())
print(s.value_counts().sort_index())

# string methods
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s.str.lower())

A    0.180803
B   -0.233259
C   -0.326498
D    5.000000
F    3.000000
dtype: float64
2013-01-01    0.834952
2013-01-02    1.026468
2013-01-03    0.652817
2013-01-04    2.011018
2013-01-05    1.874650
2013-01-06    2.312340
Freq: D, dtype: float64
                   A         B         C  D    F
2013-01-01  0.000000  0.000000 -1.660190  5  NaN
2013-01-02  0.143443  0.216248 -1.227351  5  1.0
2013-01-03 -1.876789 -1.277111 -0.582013  5  2.0
2013-01-04  0.604158 -0.049026  1.499959  5  3.0
2013-01-05  1.200862 -0.879425  0.051814  5  4.0
2013-01-06  1.013146  0.589762 -0.041208  5  5.0
                   A         B         C    D    F
2013-01-01       NaN       NaN       NaN  NaN  NaN
2013-01-02       NaN       NaN       NaN  NaN  NaN
2013-01-03 -2.876789 -2.277111 -1.582013  4.0  1.0
2013-01-04 -2.395842 -3.049026 -1.500041  2.0  0.0
2013-01-05 -3.799138 -5.879425 -4.948186  0.0 -1.0
2013-01-06       NaN       NaN       NaN  NaN  NaN
                   A         B         C   D     F
20

## Merge (병합)