In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**Pandas version 0.25.1 (`pip install pandas==0.25.1`)**

# `Series` Data type

-  Series는 Numpy의 Wrapper (numpy를 한번 감싸서, 넘파이의 기능 + 부가기능을 추가)
-  index를 지정 안해주면 numpy처럼 0부터 1씩 오름차순으로
-  Numpy's ndarray + 숫자가 아닌 다른 type의 index (E.g. 문자열)

In [3]:
import pandas as pd

In [11]:
a = pd.Series([1,2,3,4])
a

0    1
1    2
2    3
3    4
dtype: int64

In [12]:
# 첫번째 방법
s2 = pd.Series(
    [1, 2, 3, 4],
    index=['a', 'b', 'c', 'd']
)
s2

a    1
b    2
c    3
d    4
dtype: int64

In [13]:
s2.head(2)

a    1
b    2
dtype: int64

In [14]:
# 두번째방법
s2 = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5})
s2.head()

a    1
b    2
c    3
d    4
e    5
dtype: int64


- 한가지 data type만 가지고 있을 수 있음 

## `nan`과 관련된 함수

- nan이 존재하면 해당 컬럼 데이터타입 int더라도 float로 바뀜

In [4]:
import numpy as np

In [5]:
np.nan

nan

In [6]:
s = pd.Series([10, 0, 1, 1, 2, 3, 4, 5, 6, np.nan])
s

0    10.0
1     0.0
2     1.0
3     1.0
4     2.0
5     3.0
6     4.0
7     5.0
8     6.0
9     NaN
dtype: float64

In [7]:
len(s)       # 물리적 길이 출력
s.shape      
s.count()    # not count `nan`, 유효한 값을 가진 element의 개수만 카운팅

10

(10,)

9

In [8]:
s.unique()  # nan도 포함!

# 수업에서는 다루지 않았지만, nunique()는 unique한 값들의 총 갯수를 알려주는 함수입니다.
# s.nunique()

array([10.,  0.,  1.,  2.,  3.,  4.,  5.,  6., nan])

In [9]:
s.value_counts()  # 내가 넣은 value 개수 series 형태로 반환 (nan 포함 안됨)

1.0     2
10.0    1
0.0     1
2.0     1
3.0     1
4.0     1
5.0     1
6.0     1
dtype: int64

- 이 외의 함수들에 대해서는 이후 수업에서 하나씩 다룰 예정!

## index label을 기준으로 Series간에 operation이 일어남  (매우 중요!!)

- Data의 '순서'가 아니라 index label이 자동으로 정렬되어 연산이 진행됨!

In [10]:
s3 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s4 = pd.Series([4, 3, 2, 1], index=['d', 'c', 'b', 'a']) 

In [11]:
s3 + s4  # 만약 numpy 연산이라면 [5, 5, 5, 5] 나올것

a    2
b    4
c    6
d    8
dtype: int64

# `DataFrame` Data type

- 다수의 Series를 하나의 변수로 관리할 수 있도록 만든 자료형
    - Series의 dict 형태라고 보면됨
        - `{'컬럼명1': Series1, '컬럼명2': Series2}`
        - 각 Series는 DataFrame의 column을 이룸
        - 당연히 DataFrame을 이루는 Series간의 index는 서로 다 같음! => 동일 index 사용

## DataFrame을 만드는 다양한 방법들

In [12]:
s1 = np.arange(1, 6, 1)
s2 = np.arange(6, 11, 1)
s1
s2

array([1, 2, 3, 4, 5])

array([ 6,  7,  8,  9, 10])

In [13]:
df = pd.DataFrame(
    {
        'c1': s1,
        'c2': s2
    }
)
df

Unnamed: 0,c1,c2
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [14]:
# 1번째 방법  (Default index and columns would be set) (index와 column가 설정 안돼있을 때)
pd.DataFrame(
    [
        [10,11],
        [10,12]
    ]
)
pd.DataFrame(
    np.array(
        [
            [10, 11],
            [20, 21]
        ]
    )
) 

Unnamed: 0,0,1
0,10,11
1,10,12


Unnamed: 0,0,1
0,10,11
1,20,21


In [15]:
# 2번째 방법 (많이 안쓰임)

pd.Series(np.arange(10, 15)),   
pd.Series(np.arange(15, 20)), 

# DataFrame 만들면 가로로 뉘어짐
pd.DataFrame(
    [
        pd.Series(np.arange(10, 15)),   # 굳이 Series가 아니고 list형태이기만 하면 됨(=iterable한 object면 다 가능)
        pd.Series(np.arange(15, 20)),   # 굳이 Series가 아니고 list형태이기만 하면 됨(=iterable한 object면 다 가능)
    ]
)

pd.DataFrame(
    [
        np.arange(10, 15),
        np.arange(15, 20),
    ]
)

(0    10
 1    11
 2    12
 3    13
 4    14
 dtype: int32,)

(0    15
 1    16
 2    17
 3    18
 4    19
 dtype: int32,)

Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


In [16]:
# 3번째 방법 (with column & index names)
pd.DataFrame(
    np.array(
        [
            [10, 11],
            [20, 21]
        ]
    ), 
    columns=['a', 'b'],
    index=['r1', 'r2']
)

    

Unnamed: 0,a,b
r1,10,11
r2,20,21


In [17]:
# 4번째 방법
# 이러한 series의 dictionary 형태가 제일 많이 쓰임

s1 = pd.Series(np.arange(1, 6, 1))    # 굳이 Series가 아니고 list형태이기만 하면 됨(=iterable한 object면 다 가능)
s2 = pd.Series(np.arange(6, 11, 1))   # 굳이 Series가 아니고 list형태이기만 하면 됨(=iterable한 object면 다 가능)
pd.DataFrame(
    {
        'c1': s1,    # list, np.array, Series 전부 다 올 수 있음!
        'c2': s2
    }
)

Unnamed: 0,c1,c2
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [18]:
# 참고: 1줄짜리 만들 때도 dictionary의 value에 해당하는 값들은 iterable한 data type(e.g. list, np.array, Series 등)으로 설정해줘야함
pd.DataFrame({'c1': [0], 'c2': [1]})

Unnamed: 0,c1,c2
0,0,1


In [19]:
s1 = pd.Series(np.arange(1, 6, 1), index=['a', 'b', 'c', 'd', 'e'])
s2 = pd.Series(np.arange(6, 11, 1), index=['b', 'c', 'd', 'f', 'g'])
df = pd.DataFrame(
    {
        'c1': s1,
        'c2': s2
    }
)
df

Unnamed: 0,c1,c2
a,1.0,
b,2.0,6.0
c,3.0,7.0
d,4.0,8.0
e,5.0,
f,,9.0
g,,10.0


## DataFrame 생성시, Series간에 Index 기준으로 자동정렬!

In [20]:
s1 = pd.Series(np.arange(1, 6, 1))
s2 = pd.Series(np.arange(6, 11, 1))
s3 = pd.Series(np.arange(12, 15), index=[1, 2, 10])  # this one has index values unlike s1, s2
s1
s2
s3

0    1
1    2
2    3
3    4
4    5
dtype: int32

0     6
1     7
2     8
3     9
4    10
dtype: int32

1     12
2     13
10    14
dtype: int32

In [21]:
df = pd.DataFrame({'c1': s1, 'c2': s2, 'c3': s3}) 
df

Unnamed: 0,c1,c2,c3
0,1.0,6.0,
1,2.0,7.0,12.0
2,3.0,8.0,13.0
3,4.0,9.0,
4,5.0,10.0,
10,,,14.0


## DataFrame에 새로운 column 추가하기

- DataFrame에서 column에 대한 인덱싱은 []를 사용 (딕셔너리처럼)
- DataFrame은 Series의 딕셔너리이므로 위와 같이 사용


In [22]:
df['c4'] = pd.Series([1,2,3,4], index=[0, 1, 2, 10])

In [23]:
df

Unnamed: 0,c1,c2,c3,c4
0,1.0,6.0,,1.0
1,2.0,7.0,12.0,2.0
2,3.0,8.0,13.0,3.0
3,4.0,9.0,,
4,5.0,10.0,,
10,,,14.0,4.0


## Reindexing

### 참고: index 자체를 바꾸는 것("index-value" mapping이 깨짐)

In [24]:
s = pd.Series([1,2,3,4,5])
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [25]:
s.index = ['a', 'b', 'c', 'd', 'e']
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

### 참고 :  `set_index()` : 특정 column을 index로 만듦

In [26]:
# 위의 'DataFrame 생성시, Series간에 Index 기준으로 자동정렬!' 챕터에서 정의한 dataframe입니다
df

Unnamed: 0,c1,c2,c3,c4
0,1.0,6.0,,1.0
1,2.0,7.0,12.0,2.0
2,3.0,8.0,13.0,3.0
3,4.0,9.0,,
4,5.0,10.0,,
10,,,14.0,4.0


In [27]:
df['c5'] = pd.Series([1,2,3,4,5,6], index=[0,1,2,3,4,10])
df

Unnamed: 0,c1,c2,c3,c4,c5
0,1.0,6.0,,1.0,1
1,2.0,7.0,12.0,2.0,2
2,3.0,8.0,13.0,3.0,3
3,4.0,9.0,,,4
4,5.0,10.0,,,5
10,,,14.0,4.0,6


In [29]:
df.set_index("c5")  # c5는 컬럼명이 아닌 index명 (평소에 index명은 nan)

Unnamed: 0_level_0,c1,c2,c3,c4
c5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,6.0,,1.0
2,2.0,7.0,12.0,2.0
3,3.0,8.0,13.0,3.0
4,4.0,9.0,,
5,5.0,10.0,,
6,,,14.0,4.0


### Reindex

- 새로운 index label을 기반으로 기존의 "index-value" mapping은 유지한채 재배열하는 것
- 새로운 index label에 해당하는 값이 기존에 있으면 가져오고 없으면 nan. 해당하지 않는 기존의 index label은 버림

In [30]:
s2 = s.reindex(
    ['a', 'c', 'e', 'g']
)
s2

a    1.0
c    3.0
e    5.0
g    NaN
dtype: float64

In [31]:
# Copied
s2['a'] = 0
s2

a    0.0
c    3.0
e    5.0
g    NaN
dtype: float64

In [32]:
# s는 s2의 값을 바꿔도 안 건드려짐
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [33]:
# [X] 이렇게 하면 안됨
s1 = pd.Series([0, 1, 2], index=[0, 1, 2])
s2 = pd.Series([3, 4, 5], index=['0', '1', '2'])
s1
s2

0    0
1    1
2    2
dtype: int64

0    3
1    4
2    5
dtype: int64

In [34]:
s1 + s2

0   NaN
1   NaN
2   NaN
0   NaN
1   NaN
2   NaN
dtype: float64

In [35]:
s1.index

Int64Index([0, 1, 2], dtype='int64')

In [36]:
s2 = s2.reindex(s1.index)
s2

0   NaN
1   NaN
2   NaN
dtype: float64

In [37]:
# 첫번째 방법
s1 = pd.Series([0, 1, 2], index=[0, 1, 2])
s2 = pd.Series([3, 4, 5], index=['0', '1', '2'])

In [38]:
s2.index = s2.index.astype(int)

In [39]:
s2

0    3
1    4
2    5
dtype: int64

In [40]:
s2.index

Int64Index([0, 1, 2], dtype='int64')

In [41]:
s1 + s2

0    3
1    5
2    7
dtype: int64

In [42]:
# 두번째 방법
s1 = pd.Series([0, 1, 2], index=[0, 1, 2])
s2 = pd.Series([3, 4, 5], index=['0', '1', '2'])

In [43]:
s1.index = ['a', 'b', 'c']
s2.index = ['a', 'b', 'c']

In [44]:
s1 + s2

a    3
b    5
c    7
dtype: int64

#### `reindex()`의 유용한 Arguments

- `fill_value`

In [45]:
s2 = s.copy()
s2

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [46]:
s2.reindex(['a', 'f'])

a    1.0
f    NaN
dtype: float64

In [47]:
s2.reindex(['a', 'f'], fill_value=0)  # fill 0 instead of Nan

a    1
f    0
dtype: int64

- `method`

In [48]:
s3 = pd.Series(['red', 'green', 'blue'], index=[0, 3, 5])
s3

0      red
3    green
5     blue
dtype: object

In [49]:
s3.reindex(np.arange(0,7))

0      red
1      NaN
2      NaN
3    green
4      NaN
5     blue
6      NaN
dtype: object

In [65]:
s3.reindex(np.arange(0,7), method='ffill')  # ffill: Forward Fill. 앞의 데이터로 채우기

0      red
1      red
2      red
3    green
4    green
5     blue
6     blue
dtype: object

#### 예제

In [51]:
# 맨 첫 강의에서 라이브러리를 설치할 때 requirements.txt를 이용해서 설치를 했으면, 건너뛰셔도 됩니다. 
!pip install finance_datareader == 0.9.1

ERROR: Invalid requirement: '=='

[notice] A new release of pip available: 22.2.2 -> 22.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [53]:
import FinanceDataReader as fdr

In [54]:
# 삼성전자
df1 = fdr.DataReader("005930", '2018-01-02', '2018-10-30')

# KODEX 200 (ETF)
df2 = fdr.DataReader("069500", '2018-01-03', '2018-10-30')

In [55]:
df1.head(2)
df1.tail(2)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,51380,51400,50780,51020,169485,0.001177
2018-01-03,52540,52560,51420,51620,200270,0.01176


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-29,40850,41950,40550,41400,14460521,0.009756
2018-10-30,41400,43000,41000,42350,14205190,0.022947


In [56]:
df2.head(2)
df2.tail(2)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-03,29894,29984,29822,29899,7371281,0.004299
2018-01-04,30066,30080,29651,29662,9062548,-0.007927


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-29,24335,24497,24009,24058,5389624,-0.008081
2018-10-30,24023,24439,23936,24225,8144135,0.006942


In [57]:
# 삼성전자
df1 = fdr.DataReader("005930", '2018-01-02', '2018-10-30')

# KODEX 200 (ETF)
df2 = fdr.DataReader("069500", '2018-01-02', '2018-10-30')

In [58]:
df1.shape
df2.shape

(202, 6)

(202, 6)

In [59]:
df2 = df2.drop(pd.to_datetime("2018-01-03"))
df2.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,29748,29836,29656,29771,5099782,0.00442
2018-01-04,30066,30080,29651,29662,9062548,-0.007927
2018-01-05,29736,30057,29736,30065,8256774,0.013586
2018-01-08,30184,30309,30016,30267,8156834,0.006719
2018-01-09,30175,30391,30011,30148,8103079,-0.003932


In [60]:
df1.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,51380,51400,50780,51020,169485,0.001177
2018-01-03,52540,52560,51420,51620,200270,0.01176
2018-01-04,52120,52180,50640,51080,233909,-0.010461
2018-01-05,51300,52120,51200,52120,189623,0.02036
2018-01-08,52400,52520,51500,52020,167673,-0.001919


- 데이터가 누락된 df2와 그렇지 않은 df1을 합치고 싶을 때 index가 맞지 않다.
- 이때 사용하는 것이 reindex

In [62]:
new_df2 = df2.reindex(df1.index)
new_df2.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,29748.0,29836.0,29656.0,29771.0,5099782.0,0.00442
2018-01-03,,,,,,
2018-01-04,30066.0,30080.0,29651.0,29662.0,9062548.0,-0.007927
2018-01-05,29736.0,30057.0,29736.0,30065.0,8256774.0,0.013586
2018-01-08,30184.0,30309.0,30016.0,30267.0,8156834.0,0.006719


In [63]:
df1.shape
new_df2.shape

(202, 6)

(202, 6)

In [64]:
new_df2.fillna(method="ffill") 

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,29748.0,29836.0,29656.0,29771.0,5099782.0,0.004420
2018-01-03,29748.0,29836.0,29656.0,29771.0,5099782.0,0.004420
2018-01-04,30066.0,30080.0,29651.0,29662.0,9062548.0,-0.007927
2018-01-05,29736.0,30057.0,29736.0,30065.0,8256774.0,0.013586
2018-01-08,30184.0,30309.0,30016.0,30267.0,8156834.0,0.006719
...,...,...,...,...,...,...
2018-10-24,25300.0,25321.0,24998.0,25045.0,12298450.0,-0.005164
2018-10-25,24521.0,24654.0,24283.0,24645.0,11874080.0,-0.015971
2018-10-26,24668.0,24673.0,24032.0,24254.0,8367475.0,-0.015865
2018-10-29,24335.0,24497.0,24009.0,24058.0,5389624.0,-0.008081
