## pandas 기초

- pandas는 R의 data.frame에서 영감을 받은 Series(시리즈)와 DataFrame(데이터프레임) 자료구조를 파이썬에 제공하는 라이브러리이다.
- numpy를 기반으로 구현되었고, numpy 대비 기능(라벨 인덱스, 결측값 처리 등)을 더 확장한 것이다.
- Python Data Analysis Library
> https://pandas.pydata.org/

In [2]:
import numpy as np
import pandas as pd

- 파이썬의 자료구조
> 수치형, 문자열, 리스트, 딕셔너리, 튜플, 집합, boolean
- numpy의 자료구조
> ndarray(배열) : 배열의 데이터는 모두 같은 타입이다.
- pandas의 자료구조
> Series(시리즈), DataFrame(데이터프레임)  
> - Series : 인덱스와 데이터만 존재하는, 컬럼이 없는 자료구조  
> - DataFrame : 인덱스와 컬럼이 존재하는 자료구조  
> DataFrame을 인덱싱하면 Series, Series를 인덱싱하면 값(스칼라: 수치, 문자, boolean, NaN 등)이 나온다.  
> NaN(Not a Number → np.nan) : 데이터가 없음(결측값)을 나타낸다.  

In [4]:
# Series
# 정수를 넣었지만 np.nan(결측값)이 float 타입이므로 전체 dtype이 float64로 승격되었다
a = pd.Series([1, 3, 5, np.nan, 6, 8])   # 생성자
a

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
# 속성-타입
a.dtype

dtype('float64')

In [6]:
# 속성-크기
a.shape
# (6,) -> 1차원 데이터로 총 6개의 데이터가 존재

(6,)

In [8]:
# DataFrame : 인덱스와 컬럼이 존재하는 자료구조
cols = list('ABCD')
indexs = pd.date_range('20190812', periods=7)
# 컬럼 4개, 인덱스 7개
cols, indexs

(['A', 'B', 'C', 'D'],
 DatetimeIndex(['2019-08-12', '2019-08-13', '2019-08-14', '2019-08-15',
                '2019-08-16', '2019-08-17', '2019-08-18'],
               dtype='datetime64[ns]', freq='D'))

In [10]:
# 데이터 shape -> (7, 4)
datas = np.random.randn(7, 4)
datas, datas.shape

(array([[-0.75554389, -0.0599567 ,  0.42666649, -1.09926096],
        [ 0.6135535 , -1.01539292, -0.81516382, -2.6537993 ],
        [-2.00661708, -0.50071063, -1.62881814, -1.90799813],
        [-2.00625455, -1.47571026, -0.85068141,  0.21002483],
        [-1.75539127,  2.15996551,  1.25102319, -0.70120523],
        [-0.86225417,  0.64819462, -0.03557292, -0.23070858],
        [ 1.50373213,  0.25558352,  1.49665739,  1.17681281]]), (7, 4))

In [11]:
# df 생성
df = pd.DataFrame(datas, index = indexs, columns = cols)  # 생성자
df

Unnamed: 0,A,B,C,D
2019-08-12,-0.755544,-0.059957,0.426666,-1.099261
2019-08-13,0.613554,-1.015393,-0.815164,-2.653799
2019-08-14,-2.006617,-0.500711,-1.628818,-1.907998
2019-08-15,-2.006255,-1.47571,-0.850681,0.210025
2019-08-16,-1.755391,2.159966,1.251023,-0.701205
2019-08-17,-0.862254,0.648195,-0.035573,-0.230709
2019-08-18,1.503732,0.255584,1.496657,1.176813


In [12]:
# 상위 5개
df.head()

Unnamed: 0,A,B,C,D
2019-08-12,-0.755544,-0.059957,0.426666,-1.099261
2019-08-13,0.613554,-1.015393,-0.815164,-2.653799
2019-08-14,-2.006617,-0.500711,-1.628818,-1.907998
2019-08-15,-2.006255,-1.47571,-0.850681,0.210025
2019-08-16,-1.755391,2.159966,1.251023,-0.701205


In [13]:
# 하위 2개
df.tail(2)

Unnamed: 0,A,B,C,D
2019-08-17,-0.862254,0.648195,-0.035573,-0.230709
2019-08-18,1.503732,0.255584,1.496657,1.176813


In [15]:
df.index

DatetimeIndex(['2019-08-12', '2019-08-13', '2019-08-14', '2019-08-15',
               '2019-08-16', '2019-08-17', '2019-08-18'],
              dtype='datetime64[ns]', freq='D')

In [14]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [16]:
df.values

array([[-0.75554389, -0.0599567 ,  0.42666649, -1.09926096],
       [ 0.6135535 , -1.01539292, -0.81516382, -2.6537993 ],
       [-2.00661708, -0.50071063, -1.62881814, -1.90799813],
       [-2.00625455, -1.47571026, -0.85068141,  0.21002483],
       [-1.75539127,  2.15996551,  1.25102319, -0.70120523],
       [-0.86225417,  0.64819462, -0.03557292, -0.23070858],
       [ 1.50373213,  0.25558352,  1.49665739,  1.17681281]])

In [17]:
type(df.values)   # numpy.ndarray

numpy.ndarray

In [18]:
df.shape

(7, 4)

In [20]:
df.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [21]:
# df의 개요
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2019-08-12 to 2019-08-18
Freq: D
Data columns (total 4 columns):
A    7 non-null float64
B    7 non-null float64
C    7 non-null float64
D    7 non-null float64
dtypes: float64(4)
memory usage: 280.0 bytes


In [22]:
# 통계요약 : 개수, 평균, 표준편차, 최소, 25%, 50%, 75%, 최대
df.describe()

Unnamed: 0,A,B,C,D
count,7.0,7.0,7.0,7.0
mean,-0.752682,0.00171,-0.02227,-0.743734
std,1.361325,1.19819,1.156581,1.29057
min,-2.006617,-1.47571,-1.628818,-2.653799
25%,-1.880823,-0.758052,-0.832923,-1.50363
50%,-0.862254,-0.059957,-0.035573,-0.701205
75%,-0.070995,0.451889,0.838845,-0.010342
max,1.503732,2.159966,1.496657,1.176813


In [23]:
# B열 기준 데이터를 정렬, 내림차순
df.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C,D
2019-08-16,-1.755391,2.159966,1.251023,-0.701205
2019-08-17,-0.862254,0.648195,-0.035573,-0.230709
2019-08-18,1.503732,0.255584,1.496657,1.176813
2019-08-12,-0.755544,-0.059957,0.426666,-1.099261
2019-08-14,-2.006617,-0.500711,-1.628818,-1.907998
2019-08-13,0.613554,-1.015393,-0.815164,-2.653799
2019-08-15,-2.006255,-1.47571,-0.850681,0.210025


In [25]:
# 특정 컬럼의 데이터만 보기 -> 인덱싱 -> 차원 축소
df['C'], type(df['C'])   # DataFrame을 인덱싱하면 Series

(2019-08-12    0.426666
 2019-08-13   -0.815164
 2019-08-14   -1.628818
 2019-08-15   -0.850681
 2019-08-16    1.251023
 2019-08-17   -0.035573
 2019-08-18    1.496657
 Freq: D, Name: C, dtype: float64, pandas.core.series.Series)

In [26]:
# 데이터가 슬라이싱되어 나온다 -> 차원을 유지해야 하니까
# a <= x < b
df[1:3]

Unnamed: 0,A,B,C,D
2019-08-13,0.613554,-1.015393,-0.815164,-2.653799
2019-08-14,-2.006617,-0.500711,-1.628818,-1.907998


In [27]:
# 슬라이싱을 하는데 정수 위치가 아닌 인덱스 라벨(날짜 문자열)로 자르기 -> 끝 경계값도 포함된다
# a <= x <= b
df['2019-08-13':'2019-08-15']

Unnamed: 0,A,B,C,D
2019-08-13,0.613554,-1.015393,-0.815164,-2.653799
2019-08-14,-2.006617,-0.500711,-1.628818,-1.907998
2019-08-15,-2.006255,-1.47571,-0.850681,0.210025


### loc

- 전통적인 인덱싱과 슬라이싱을 진행하면 표현의 한계점에 도달
- 이를 극복하기 위해서 pandas만의 데이터 추출법이 추가되었다.
- loc, iloc ← 2개를 주로 사용한다
- 연속 데이터 추출 ↔ 비연속 데이터 추출(팬시 인덱싱, 쿼리 수행 등)

In [28]:
# loc : 라벨(인덱스명, 컬럼명)을 기준으로 인덱싱/슬라이싱 지원
# loc을 통한 데이터 추출
# df.loc[ 인덱스명 ]   // ()가 아닌 []임에 주의!!
df.loc['2019-08-12'], type(df.loc['2019-08-12'])

(A   -0.755544
 B   -0.059957
 C    0.426666
 D   -1.099261
 Name: 2019-08-12 00:00:00, dtype: float64, pandas.core.series.Series)

In [29]:
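# 전체 슬라이싱 : 모든 행을 담은 새 DataFrame 객체를 반환한다
# (pandas 버전/설정에 따라 원본과 데이터를 공유하는 뷰일 수 있으므로, 독립적인 복사본이 필요하면 df.copy() 사용)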
df[:]

Unnamed: 0,A,B,C,D
2019-08-12,-0.755544,-0.059957,0.426666,-1.099261
2019-08-13,0.613554,-1.015393,-0.815164,-2.653799
2019-08-14,-2.006617,-0.500711,-1.628818,-1.907998
2019-08-15,-2.006255,-1.47571,-0.850681,0.210025
2019-08-16,-1.755391,2.159966,1.251023,-0.701205
2019-08-17,-0.862254,0.648195,-0.035573,-0.230709
2019-08-18,1.503732,0.255584,1.496657,1.176813


In [30]:
# 행(첫 번째 축)은 전부 포함시키고, 컬럼(두 번째 축)은 A, C만 포함
df.loc[:, ['A', 'C']]

Unnamed: 0,A,C
2019-08-12,-0.755544,0.426666
2019-08-13,0.613554,-0.815164
2019-08-14,-2.006617,-1.628818
2019-08-15,-2.006255,-0.850681
2019-08-16,-1.755391,1.251023
2019-08-17,-0.862254,-0.035573
2019-08-18,1.503732,1.496657


In [34]:
# 차원을 축소하고 싶다면
df.loc[:, 'A']

2019-08-12   -0.755544
2019-08-13    0.613554
2019-08-14   -2.006617
2019-08-15   -2.006255
2019-08-16   -1.755391
2019-08-17   -0.862254
2019-08-18    1.503732
Freq: D, Name: A, dtype: float64

In [35]:
type(df.loc[ : , 'A'])

pandas.core.series.Series

In [36]:
# 차원을 유지하고 싶다면
df.loc[:, ['A']]

Unnamed: 0,A
2019-08-12,-0.755544
2019-08-13,0.613554
2019-08-14,-2.006617
2019-08-15,-2.006255
2019-08-16,-1.755391
2019-08-17,-0.862254
2019-08-18,1.503732


In [37]:
type(df.loc[ : , ['A']])

pandas.core.frame.DataFrame

In [38]:
df.loc['2019-08-13' :'2019-08-15' , ['A', 'C'] ]

Unnamed: 0,A,C
2019-08-13,0.613554,-0.815164
2019-08-14,-2.006617,-1.628818
2019-08-15,-2.006255,-0.850681


In [39]:
# 차원 축소 -> 인덱스를 한 개만 지정
df.loc['2019-08-13' , ['A', 'C'] ]

A    0.613554
C   -0.815164
Name: 2019-08-13 00:00:00, dtype: float64

In [None]:
# 주의 : 행 인덱스를 리스트 []로 감싸는 경우
# df.loc[['2019-08-13'] , ['A', 'C'] ]   # 날짜 문자열 리스트 : pandas 버전에 따라 KeyError가 발생할 수 있다
# df.loc[['2019-08-13' : '2019-08-13'] , ['A', 'C'] ]   # SyntaxError : 슬라이스(a:b)는 리스트 리터럴 [] 안에 쓸 수 없다

In [41]:
df.loc['2019-08-13' :'2019-08-13' , ['A', 'C'] ]

Unnamed: 0,A,C
2019-08-13,0.613554,-0.815164


In [42]:
# 차원축소가 2회 진행 -> 스칼라(값)
df.loc['2019-08-13', 'A']

0.613553500610859

### iloc

- 팬시 인덱싱과 유사
- 행과 열의 번호(0부터 시작하는 정수 위치)를 이용하여 데이터에 접근하는 방식
- i → integer (정수 위치 기반, integer-location)

In [43]:
# 위치(0부터 시작)가 1인, 즉 두 번째 행의 데이터
df.iloc[1]

A    0.613554
B   -1.015393
C   -0.815164
D   -2.653799
Name: 2019-08-13 00:00:00, dtype: float64

In [44]:
# iloc 슬라이싱
# 경계값 미포함
# a <= index < b, c <= column < d
df.iloc[ 1:3 , 1:3 ]

Unnamed: 0,B,C
2019-08-13,-1.015393,-0.815164
2019-08-14,-0.500711,-1.628818


In [45]:
# iloc + 팬시 인덱싱 기법 사용 (행, 컬럼의 비연속적인 위치를 나열)
df.iloc[[1, 4, 2], [0, 2]]

Unnamed: 0,A,C
2019-08-13,0.613554,-0.815164
2019-08-16,-1.755391,1.251023
2019-08-14,-2.006617,-1.628818


In [None]:
# 특정 조건을 만족하는 데이터만 추출
# 컬럼명이 유효한 파이썬 식별자이고 기존 속성/메서드명과 겹치지 않으면 df.C처럼 속성으로 접근할 수 있다
# 조건을 부여하여 boolean 데이터를 만들어 참만 포함시키는 방식 : boolean 인덱싱
# ex) df.C > 0 → [ T, F, F, F, T, F, T ] 마스크에서 True인 행만 남아서 아래와 같은 결과가 나온다
# df에 비교/산술식을 적용하면 -> 모든 원소에 연산이 진행된다 (벡터화 연산)
# 행렬 (연산) 값 -> 각 원소에 일일이 연산하는 것과 동일

In [46]:
# C 컬럼에 존재하는 데이터 중에 양수만 (양수면 True, 0 이하면 False)
df[df.C > 0]

Unnamed: 0,A,B,C,D
2019-08-12,-0.755544,-0.059957,0.426666,-1.099261
2019-08-16,-1.755391,2.159966,1.251023,-0.701205
2019-08-18,1.503732,0.255584,1.496657,1.176813


In [47]:
# 데이터 전체를 기준으로 0보다 큰가?
# 0보다 작거나 같은 데이터들은 NaN으로 대체된다.
df[df > 0]

Unnamed: 0,A,B,C,D
2019-08-12,,,0.426666,
2019-08-13,0.613554,,,
2019-08-14,,,,
2019-08-15,,,,0.210025
2019-08-16,,2.159966,1.251023,
2019-08-17,,0.648195,,
2019-08-18,1.503732,0.255584,1.496657,1.176813


In [48]:
# df[:]는 전체 슬라이싱(pandas 버전/설정에 따라 원본과 데이터를 공유할 수 있음), df.copy()는 독립적인 복사본
df[:], df.copy()

(                   A         B         C         D
 2019-08-12 -0.755544 -0.059957  0.426666 -1.099261
 2019-08-13  0.613554 -1.015393 -0.815164 -2.653799
 2019-08-14 -2.006617 -0.500711 -1.628818 -1.907998
 2019-08-15 -2.006255 -1.475710 -0.850681  0.210025
 2019-08-16 -1.755391  2.159966  1.251023 -0.701205
 2019-08-17 -0.862254  0.648195 -0.035573 -0.230709
 2019-08-18  1.503732  0.255584  1.496657  1.176813,
                    A         B         C         D
 2019-08-12 -0.755544 -0.059957  0.426666 -1.099261
 2019-08-13  0.613554 -1.015393 -0.815164 -2.653799
 2019-08-14 -2.006617 -0.500711 -1.628818 -1.907998
 2019-08-15 -2.006255 -1.475710 -0.850681  0.210025
 2019-08-16 -1.755391  2.159966  1.251023 -0.701205
 2019-08-17 -0.862254  0.648195 -0.035573 -0.230709
 2019-08-18  1.503732  0.255584  1.496657  1.176813)

In [49]:
# 기존 데이터 df에 새로운 컬럼을 추가한다(아주 중요) -> 파생변수
# 리스트로 추가할 때는 기존 df의 행 개수와 같은 길이의 데이터가 존재해야 한다
# 데이터는 리스트 ok, Series도 ok (Series는 위치가 아닌 인덱스 라벨 기준으로 맞춰지고, 없는 라벨은 NaN이 된다)
new_data = ['one', 'one', 'two', 'three', 'four', 'three', 'five']
# 데이터 추가
# 대상[컬럼] = 데이터
df['E'] = new_data
df

Unnamed: 0,A,B,C,D,E
2019-08-12,-0.755544,-0.059957,0.426666,-1.099261,one
2019-08-13,0.613554,-1.015393,-0.815164,-2.653799,one
2019-08-14,-2.006617,-0.500711,-1.628818,-1.907998,two
2019-08-15,-2.006255,-1.47571,-0.850681,0.210025,three
2019-08-16,-1.755391,2.159966,1.251023,-0.701205,four
2019-08-17,-0.862254,0.648195,-0.035573,-0.230709,three
2019-08-18,1.503732,0.255584,1.496657,1.176813,five


In [50]:
# 데이터 조사
# 안에 해당 값이 있는가?
df['E'].isin(['two', 'four'])

2019-08-12    False
2019-08-13    False
2019-08-14     True
2019-08-15    False
2019-08-16     True
2019-08-17    False
2019-08-18    False
Freq: D, Name: E, dtype: bool

In [51]:
# df에서 True인 값만 추출
df[ df['E'].isin(['two', 'four']) ]

Unnamed: 0,A,B,C,D,E
2019-08-14,-2.006617,-0.500711,-1.628818,-1.907998,two
2019-08-16,-1.755391,2.159966,1.251023,-0.701205,four


In [52]:
# 누적합
# apply(함수) -> 기본(axis=0)으로 각 컬럼(Series)마다 함수를 적용한다
# 문자열 컬럼 E는 덧셈이 문자열 이어붙이기이므로 누적합 대신 누적 연결 결과가 나온다
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,E
2019-08-12,-0.755544,-0.059957,0.426666,-1.099261,one
2019-08-13,-0.14199,-1.07535,-0.388497,-3.75306,oneone
2019-08-14,-2.148607,-1.57606,-2.017315,-5.661058,oneonetwo
2019-08-15,-4.154862,-3.051771,-2.867997,-5.451034,oneonetwothree
2019-08-16,-5.910253,-0.891805,-1.616974,-6.152239,oneonetwothreefour
2019-08-17,-6.772507,-0.24361,-1.652547,-6.382947,oneonetwothreefourthree
2019-08-18,-5.268775,0.011973,-0.155889,-5.206135,oneonetwothreefourthreefive
