패키지 호출

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Object Creation (객체 생성)

## Creating Series

Pandas는 값을 가지고 있는 리스트를 통해 Series를 만들고, 정수로 만들어진 인덱스를 기본값으로 불러온다.

> 아래 예제에서는 [1, 2, 5, NaN, 6, 8] 리스트를 사용하고 결과값의 오른쪽에 표시됨
>
> 0부터 시작하는 인덱스 값은 왼쪽에 표시됨

In [6]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

## Creating DataFrame

### By Numpy 배열

In [9]:
dates = pd.date_range('20200101', periods=6)
dates

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

__pd.date_range(시작날짜, periods=생성할 날짜-시간 개수)__ 함수를 사용하여 2020년 1월 1일부터 6일까지의 날짜 데이터 생성

__freq 옵션__ 은 'S' 1초 단위, '10S' 10초 단위, 'H' 1시간 단위, 'D' 1일 단위, 'M' 1달 단위(월 말일 기준), 'Y' 1년 단위 (년 말일 기준) 등으로 날짜-시간 시계열 데이터 생성 주기 나타냄 (default = 'D')

참고로, __pd.date_range(start='시작 날짜-시간', end='끝 날짜-시간')__ 처럼 명시적으로 시작과 끝의 날짜-시간을 지정해주어도 위의 perieds를 사용한 예와 동일한 결과를 얻을 수 있다.

In [11]:
pd.date_range(start='20200101', end='20200106')

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2020-01-01,-0.999641,-0.706367,-0.807672,0.320407
2020-01-02,-0.775322,1.28191,0.010319,0.399705
2020-01-03,-0.647642,-0.490912,-0.317904,0.773827
2020-01-04,-1.199544,2.09992,0.066917,-0.420068
2020-01-05,-0.573793,-0.871332,1.376489,-1.048176
2020-01-06,5.6e-05,0.40612,0.367155,1.305775


__np.random.randn(m, n)__ : 평균 0, 표준편차 1의 가우시안 표준정규분포 난수를 matrix array(m, n)으로 생성

> 예제에서는 6x4 크기의 행렬(행[index]: 1일부터 6일까지, 열[columns]: ABCD)에 난수를 할당

### By dict objects

Series와 같은 것으로 변환될 수 있는 객체들의 dict

In [13]:
df2 = pd.DataFrame({'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })

df

Unnamed: 0,A,B,C,D
2020-01-01,-0.999641,-0.706367,-0.807672,0.320407
2020-01-02,-0.775322,1.28191,0.010319,0.399705
2020-01-03,-0.647642,-0.490912,-0.317904,0.773827
2020-01-04,-1.199544,2.09992,0.066917,-0.420068
2020-01-05,-0.573793,-0.871332,1.376489,-1.048176
2020-01-06,5.6e-05,0.40612,0.367155,1.305775


데이터프레임 결과물의 열은 다양한 데이터 타입 (dtypes)으로 구성되어 있는 것을 확인

In [16]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing Data (데이터 확인)

데이터프레임의 가장 윗 줄과 마지막 줄을 확인하고 싶을 때에 사용하는 방법은 다음과 같습니다.

> 괄호() 안에는 숫자가 들어갈 수도 있고 안 들어갈 수도 있다. 
>
> 숫자가 들어간다면, 윗 / 마지막 줄의 특정 줄을 불러올 수 있다. 숫자가 들어가지 않다면, 기본값인 5로 처리

In [17]:
df.head() # 처음 5줄

Unnamed: 0,A,B,C,D
2020-01-01,-0.999641,-0.706367,-0.807672,0.320407
2020-01-02,-0.775322,1.28191,0.010319,0.399705
2020-01-03,-0.647642,-0.490912,-0.317904,0.773827
2020-01-04,-1.199544,2.09992,0.066917,-0.420068
2020-01-05,-0.573793,-0.871332,1.376489,-1.048176


In [18]:
df.tail(3) # 마지막 3줄

Unnamed: 0,A,B,C,D
2020-01-04,-1.199544,2.09992,0.066917,-0.420068
2020-01-05,-0.573793,-0.871332,1.376489,-1.048176
2020-01-06,5.6e-05,0.40612,0.367155,1.305775


인덱스 (index), 열 (column) 그리고 Numpy 데이터에 대한 세부 정보

In [19]:
df.index

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [21]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [22]:
df.values

array([[ -9.99641483e-01,  -7.06366633e-01,  -8.07672330e-01,
          3.20406591e-01],
       [ -7.75322192e-01,   1.28191019e+00,   1.03188122e-02,
          3.99704778e-01],
       [ -6.47641786e-01,  -4.90912248e-01,  -3.17903858e-01,
          7.73826789e-01],
       [ -1.19954409e+00,   2.09992007e+00,   6.69172254e-02,
         -4.20067952e-01],
       [ -5.73792997e-01,  -8.71332302e-01,   1.37648876e+00,
         -1.04817607e+00],
       [  5.60698051e-05,   4.06120029e-01,   3.67155470e-01,
          1.30577538e+00]])

describe()는 데이터의 대략적인 통계적 정보 요약

In [23]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.699314,0.286557,0.115884,0.221912
std,0.413219,1.202029,0.73579,0.842133
min,-1.199544,-0.871332,-0.807672,-1.048176
25%,-0.943562,-0.652503,-0.235848,-0.234949
50%,-0.711482,-0.042396,0.038618,0.360056
75%,-0.592255,1.062963,0.292096,0.680296
max,5.6e-05,2.09992,1.376489,1.305775


# Selection (선택)

> 선택과 설정을 위한 Python / Numpy의 표준화된 표현들이 직관적이며, 코드 작성을 위한 양방향 작업에 유용하지만 
>
> Pandas에 최적화된 데이터 접근 방법인 .at, .iat, .loc 및 .iloc 을 추천

## Getting (데이터 얻기)

df.A 와 동일한 Series를 생성하는 단일 열을 선택

In [25]:
df['A']

2020-01-01   -0.999641
2020-01-02   -0.775322
2020-01-03   -0.647642
2020-01-04   -1.199544
2020-01-05   -0.573793
2020-01-06    0.000056
Freq: D, Name: A, dtype: float64

행을 분할하는 [ ]를 통해 선택

In [26]:
df[0:3]

Unnamed: 0,A,B,C,D
2020-01-01,-0.999641,-0.706367,-0.807672,0.320407
2020-01-02,-0.775322,1.28191,0.010319,0.399705
2020-01-03,-0.647642,-0.490912,-0.317904,0.773827


In [28]:
df['20200104':'20200106']

Unnamed: 0,A,B,C,D
2020-01-04,-1.199544,2.09992,0.066917,-0.420068
2020-01-05,-0.573793,-0.871332,1.376489,-1.048176
2020-01-06,5.6e-05,0.40612,0.367155,1.305775


## Selection by Label (Label 을 통한 선택)

횡단면(cross section)을 출력

> 예제는 2020년 1월 1일 값을 출력함

In [29]:
df.loc[dates[0]]

A   -0.999641
B   -0.706367
C   -0.807672
D    0.320407
Name: 2020-01-01 00:00:00, dtype: float64

여러 축(multi-axis)을 출력

In [30]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2020-01-01,-0.999641,-0.706367
2020-01-02,-0.775322,1.28191
2020-01-03,-0.647642,-0.490912
2020-01-04,-1.199544,2.09992
2020-01-05,-0.573793,-0.871332
2020-01-06,5.6e-05,0.40612


양쪽 종단점을 포함한 라벨 슬라이싱

In [32]:
df.loc['20200102':'20200104', ['A', 'B']]

Unnamed: 0,A,B
2020-01-02,-0.775322,1.28191
2020-01-03,-0.647642,-0.490912
2020-01-04,-1.199544,2.09992


스칼라 값

In [37]:
df.loc[dates[0], 'A']

-0.99964148295041122

스칼라 값을 더 빠르게 구하는 방법(앞선 메소드와 동일)

In [36]:
df.at[dates[0], 'A']

-0.99964148295041122

## Selection by Position (위치로 선택하기)

넘겨받은 정수의 위치를 기준으로 선택

In [38]:
df.iloc[3]

A   -1.199544
B    2.099920
C    0.066917
D   -0.420068
Name: 2020-01-04 00:00:00, dtype: float64

정수로 표기된 슬라이스들을 통해, numpy / python과 유사하게 작동

In [39]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2020-01-04,-1.199544,2.09992
2020-01-05,-0.573793,-0.871332


정수로 표기된 위치값의 리스트들을 통해, numpy / python의 스타일과 유사

In [40]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2020-01-02,-0.775322,0.010319
2020-01-03,-0.647642,-0.317904
2020-01-05,-0.573793,1.376489


명시적으로 (특정한) 값을 얻고자 하는 경우 (스칼라 값)

In [41]:
df.iloc[1,1]

1.2819101876702725

스칼라 값을 빠르게 얻는 방법(위의 방식과 동일)

In [43]:
df.iat[1,1]

1.2819101876702725

## Boolean Indexing

In [44]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2020-01-06,5.6e-05,0.40612,0.367155,1.305775


In [45]:
df[df > 0]

Unnamed: 0,A,B,C,D
2020-01-01,,,,0.320407
2020-01-02,,1.28191,0.010319,0.399705
2020-01-03,,,,0.773827
2020-01-04,,2.09992,0.066917,
2020-01-05,,,1.376489,
2020-01-06,5.6e-05,0.40612,0.367155,1.305775


필터링을 위한 메소드 isin()을 사용

In [46]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

df2

Unnamed: 0,A,B,C,D,E
2020-01-01,-0.999641,-0.706367,-0.807672,0.320407,one
2020-01-02,-0.775322,1.28191,0.010319,0.399705,one
2020-01-03,-0.647642,-0.490912,-0.317904,0.773827,two
2020-01-04,-1.199544,2.09992,0.066917,-0.420068,three
2020-01-05,-0.573793,-0.871332,1.376489,-1.048176,four
2020-01-06,5.6e-05,0.40612,0.367155,1.305775,three


In [47]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2020-01-03,-0.647642,-0.490912,-0.317904,0.773827,two
2020-01-05,-0.573793,-0.871332,1.376489,-1.048176,four


## Setting (설정)

In [54]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20200102', periods=6))
s1

2020-01-02    1
2020-01-03    2
2020-01-04    3
2020-01-05    4
2020-01-06    5
2020-01-07    6
Freq: D, dtype: int64

In [57]:
df['F'] = s1
df.at[dates[0],'A'] = 0
df.iat[0,1] = 0
df.loc[:,'D'] = np.array([5] * len(df))

In [58]:
df

Unnamed: 0,A,B,C,D,F
2020-01-01,0.0,0.0,-0.807672,5,
2020-01-02,-0.775322,1.28191,0.010319,5,1.0
2020-01-03,-0.647642,-0.490912,-0.317904,5,2.0
2020-01-04,-1.199544,2.09992,0.066917,5,3.0
2020-01-05,-0.573793,-0.871332,1.376489,5,4.0
2020-01-06,5.6e-05,0.40612,0.367155,5,5.0


In [59]:
df2 = df.copy()
df2[df2 > 0] = -df2

df2

Unnamed: 0,A,B,C,D,F
2020-01-01,0.0,0.0,-0.807672,-5,
2020-01-02,-0.775322,-1.28191,-0.010319,-5,-1.0
2020-01-03,-0.647642,-0.490912,-0.317904,-5,-2.0
2020-01-04,-1.199544,-2.09992,-0.066917,-5,-3.0
2020-01-05,-0.573793,-0.871332,-1.376489,-5,-4.0
2020-01-06,-5.6e-05,-0.40612,-0.367155,-5,-5.0
