In [7]:
import pandas as pd     # pandas 라이브러리를 pd 이름으로 호출

# Series

In [8]:
prices = [1000, 1010, 1020]     # list holding the stock prices

In [9]:
dates = pd.date_range('20181201', periods=3)     # build 3 consecutive daily dates with date_range, starting 2018-12-01
dates

DatetimeIndex(['2018-12-01', '2018-12-02', '2018-12-03'], dtype='datetime64[ns]', freq='D')

In [10]:
pd.date_range('20181201', '20181231', freq='W')     # weekly dates within December 2018; 'W' resolves to W-SUN (Sundays)

DatetimeIndex(['2018-12-02', '2018-12-09', '2018-12-16', '2018-12-23',
               '2018-12-30'],
              dtype='datetime64[ns]', freq='W-SUN')

In [11]:
s = pd.Series(prices, index=dates)     # Series with the prices as data and the dates as index
s

2018-12-01    1000
2018-12-02    1010
2018-12-03    1020
Freq: D, dtype: int64

In [12]:
s2 = pd.Series(prices)     # Series without an explicit index (gets a default 0..n-1 integer index)
s2

0    1000
1    1010
2    1020
dtype: int64

In [13]:
s2[3] = 1030     # append a value to the Series by assigning to a new label
s2

0    1000
1    1010
2    1020
3    1030
dtype: int64

In [14]:
print('123' + '123')     # + on strings concatenates
print(123 + 123)     # + on integers adds

123123
246


In [15]:
s[pd.to_datetime('2018-12-04')] = 1030     # append a value under a new index label
s

2018-12-01    1000
2018-12-02    1010
2018-12-03    1020
2018-12-04    1030
Freq: D, dtype: int64

Series 데이터 조회

In [16]:
# Array-style access by position. Use .iloc explicitly: a bare integer key on a
# Series with a non-integer (date) index relies on positional fallback, which is deprecated.
s.iloc[2]

1020

In [17]:
s['2018-12-03']     # select a value by index label

1020

# DataFrame 

In [12]:
# Mapping of column name -> price list. Note: this rebinds `prices`, which held a plain list earlier in the notebook.
prices = {'A전자' : [1000, 1010, 1020],
          'B화학' : [2000, 2010, 2020],
          'C금융' : [3000, 3010, 3020]}
df1 = pd.DataFrame(prices)     # dict keys become the columns; rows get a default integer index
df1

Unnamed: 0,A전자,B화학,C금융
0,1000,2000,3000
1,1010,2010,3010
2,1020,2020,3020


In [13]:
df2 = pd.DataFrame(prices, index=dates)     # DataFrame with an explicit (date) index
df2

Unnamed: 0,A전자,B화학,C금융
2018-12-01,1000,2000,3000
2018-12-02,1010,2010,3010
2018-12-03,1020,2020,3020


데이터 선택

In [14]:
df2.iloc[0]     # select a row by position

A전자    1000
B화학    2000
C금융    3000
Name: 2018-12-01 00:00:00, dtype: int64

In [15]:
df2.iloc[:, 0]     # select a column by position

2018-12-01    1000
2018-12-02    1010
2018-12-03    1020
Freq: D, Name: A전자, dtype: int64

In [16]:
df2.iloc[0, 0]     # select a single cell by (row, column) position

1000

In [17]:
df2.loc['2018-12-01']     # select a row by index label

A전자    1000
B화학    2000
C금융    3000
Name: 2018-12-01 00:00:00, dtype: int64

In [18]:
df2.loc[:, 'A전자']     # select a column by label

2018-12-01    1000
2018-12-02    1010
2018-12-03    1020
Freq: D, Name: A전자, dtype: int64

In [19]:
df2.loc['2018-12-01', 'A전자']     # select a single cell by (row label, column label)

1000

In [20]:
df2['A전자']     # select a column

2018-12-01    1000
2018-12-02    1010
2018-12-03    1020
Freq: D, Name: A전자, dtype: int64

In [21]:
df2.A전자     # same as df2['A전자']

2018-12-01    1000
2018-12-02    1010
2018-12-03    1020
Freq: D, Name: A전자, dtype: int64

In [22]:
df2['A전자']['2018-12-01']     # chained lookup: pick the column, then the row label; df2.loc['2018-12-01', 'A전자'] does this in one step

1000

In [23]:
df2.loc[:, 'A전자']['2018-12-01']     # same value: .loc column selection followed by a label lookup on the resulting Series

1000

데이터 추가

In [24]:
df2['D엔터'] = [4000, 4010, 4020]     # add a column to the DataFrame from a list
df2

Unnamed: 0,A전자,B화학,C금융,D엔터
2018-12-01,1000,2000,3000,4000
2018-12-02,1010,2010,3010,4010
2018-12-03,1020,2020,3020,4020


In [25]:
df2['E텔레콤'] = s     # add a column from a Series; values are matched on the index, so s's extra 2018-12-04 entry is not added
df2

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤
2018-12-01,1000,2000,3000,4000,1000
2018-12-02,1010,2010,3010,4010,1010
2018-12-03,1020,2020,3020,4020,1020


데이터프레임 확장

In [26]:
s.name = 'F소프트'     # name the Series; the name is used as the column label when it is concatenated below
s

2018-12-01    1000
2018-12-02    1010
2018-12-03    1020
2018-12-04    1030
Freq: D, Name: F소프트, dtype: int64

In [27]:
df2 = pd.concat([df2, s], axis=1)     # attach s as a new column; the indexes are unioned, so 2018-12-04 appears with NaN in the other columns
df2

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020
2018-12-04,,,,,,1030


In [28]:
df3 = df2.iloc[0]     # take the first row (a Series, despite the df3 name)
df3 = df3 + 60     # shift every value by 60
df3.name = pd.to_datetime('20181207')     # the Series name serves as the row label once the row is added to df2
df3

A전자     1060.0
B화학     2060.0
C금융     3060.0
D엔터     4060.0
E텔레콤    1060.0
F소프트    1060.0
Name: 2018-12-07 00:00:00, dtype: float64

In [29]:
# Add df3 as a new row. DataFrame.append was removed in pandas 2.0, so turn the
# Series into a one-row frame (its name becomes the row label) and concatenate.
df2 = pd.concat([df2, df3.to_frame().T])
df2

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0
2018-12-04,,,,,,1030.0
2018-12-07,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0


In [30]:
df3 = df2.iloc[0] + 50     # first row shifted by 50
df3.name = pd.to_datetime('20181206')     # row label for the new row
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
# The row lands at the end, so the index is no longer in date order.
df2 = pd.concat([df2, df3.to_frame().T])
df2

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0
2018-12-04,,,,,,1030.0
2018-12-07,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0
2018-12-06,1050.0,2050.0,3050.0,4050.0,1050.0,1050.0


In [31]:
df2 = df2.sort_index(axis=0)     # re-sort the rows by index (date order)
df2

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0
2018-12-04,,,,,,1030.0
2018-12-06,1050.0,2050.0,3050.0,4050.0,1050.0,1050.0
2018-12-07,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0


데이터 삭제

In [32]:
help(pd.DataFrame.drop)     # print the docstring of DataFrame.drop

Help on function drop in module pandas.core.generic:

drop(self, labels, axis=0, level=None, inplace=False, errors='raise')
    Return new object with labels in requested axis removed.
    
    Parameters
    ----------
    labels : single label or list-like
    axis : int or axis name
    level : int or level name, default None
        For MultiIndex
    inplace : bool, default False
        If True, do operation inplace and return None.
    errors : {'ignore', 'raise'}, default 'raise'
        If 'ignore', suppress error and existing labels are dropped.
    
        .. versionadded:: 0.16.1
    
    Returns
    -------
    dropped : type of caller



In [33]:
'''
    The DataFrame returned by drop() is not assigned to anything here,
    so the displayed result looks as if the row was deleted,
    but note that the deletion is NOT stored in df2
'''
df2.drop(pd.to_datetime('2018-12-06'))     # drop a row

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0
2018-12-04,,,,,,1030.0
2018-12-07,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0


In [34]:
df2.drop([pd.to_datetime('2018-12-02'), pd.to_datetime('2018-12-06')])     # drop several rows

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0
2018-12-04,,,,,,1030.0
2018-12-07,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0


In [35]:
df2.drop('D엔터', axis=1)     # drop a column

Unnamed: 0,A전자,B화학,C금융,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,1000.0,1000.0
2018-12-02,1010.0,2010.0,3010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,1020.0,1020.0
2018-12-04,,,,,1030.0
2018-12-06,1050.0,2050.0,3050.0,1050.0,1050.0
2018-12-07,1060.0,2060.0,3060.0,1060.0,1060.0


In [36]:
df2.drop(['C금융', 'E텔레콤'], axis=1)     # drop several columns

Unnamed: 0,A전자,B화학,D엔터,F소프트
2018-12-01,1000.0,2000.0,4000.0,1000.0
2018-12-02,1010.0,2010.0,4010.0,1010.0
2018-12-03,1020.0,2020.0,4020.0,1020.0
2018-12-04,,,,1030.0
2018-12-06,1050.0,2050.0,4050.0,1050.0
2018-12-07,1060.0,2060.0,4060.0,1060.0


In [37]:
df2.head()     # show the first 5 rows of the DataFrame

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0
2018-12-04,,,,,,1030.0
2018-12-06,1050.0,2050.0,3050.0,4050.0,1050.0,1050.0


In [38]:
df2.tail(3)     # show the last 3 rows of the DataFrame

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-04,,,,,,1030.0
2018-12-06,1050.0,2050.0,3050.0,4050.0,1050.0,1050.0
2018-12-07,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0


In [1]:
df2.iloc[2]     # select a row by its integer position (iloc: integer-location based)

A전자     1020.0
B화학     2020.0
C금융     3020.0
D엔터     4020.0
E텔레콤    1020.0
F소프트    1020.0
Name: 2018-12-03 00:00:00, dtype: float64

In [2]:
df2.loc['2018-12-03']     # to select by index label, use loc instead of iloc

A전자     1020.0
B화학     2020.0
C금융     3020.0
D엔터     4020.0
E텔레콤    1020.0
F소프트    1020.0
Name: 2018-12-03 00:00:00, dtype: float64

In [3]:
df2.iloc[1:3]     # select multiple rows by position

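Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0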

In [4]:
df2.loc['2018-12-02':'2018-12-03']     # select multiple rows by label

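Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0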

In [5]:
df2[1:3]     # multiple rows can also be sliced NumPy-style with plain []

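Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0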

In [6]:
df2['C금융']    # columns are selected with plain [] by column name

2018-12-01    3000.0
2018-12-02    3010.0
2018-12-03    3020.0
2018-12-04       NaN
2018-12-06    3050.0
2018-12-07    3060.0
Name: C금융, dtype: float64

In [45]:
df2.iloc[1:3, 2]     # select rows and a column by position

2018-12-02    3010.0
2018-12-03    3020.0
Name: C금융, dtype: float64

In [46]:
df2.loc['2018-12-02':'2018-12-03', 'C금융']    # select rows and a column by label

2018-12-02    3010.0
2018-12-03    3020.0
Name: C금융, dtype: float64

In [47]:
df2['E텔레콤'] * 10     # scalar operation on a column: every element is multiplied by 10 (NaN stays NaN)

2018-12-01    10000.0
2018-12-02    10100.0
2018-12-03    10200.0
2018-12-04        NaN
2018-12-06    10500.0
2018-12-07    10600.0
Name: E텔레콤, dtype: float64

In [48]:
df2.sum(axis=0)     # operate across rows (axis=0), i.e. the sum of each column

A전자      5140.0
B화학     10140.0
C금융     15140.0
D엔터     20140.0
E텔레콤     5140.0
F소프트     6170.0
dtype: float64

In [49]:
df2.median(axis=1)     # operate across columns (axis=1), i.e. the median (not the sum) of each row

2018-12-01    1500.0
2018-12-02    1510.0
2018-12-03    1520.0
2018-12-04    1030.0
2018-12-06    1550.0
2018-12-07    1560.0
dtype: float64

In [50]:
df2.describe()     # summary statistics

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
count,5.0,5.0,5.0,5.0,5.0,6.0
mean,1028.0,2028.0,3028.0,4028.0,1028.0,1028.333333
std,25.884358,25.884358,25.884358,25.884358,25.884358,23.166067
min,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
25%,1010.0,2010.0,3010.0,4010.0,1010.0,1012.5
50%,1020.0,2020.0,3020.0,4020.0,1020.0,1025.0
75%,1050.0,2050.0,3050.0,4050.0,1050.0,1045.0
max,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0


In [51]:
df2     # display the current DataFrame before handling its NaN values

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0
2018-12-04,,,,,,1030.0
2018-12-06,1050.0,2050.0,3050.0,4050.0,1050.0,1050.0
2018-12-07,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0


In [52]:
df2.dropna()     # drop rows that contain NaN

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0
2018-12-06,1050.0,2050.0,3050.0,4050.0,1050.0,1050.0
2018-12-07,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0


In [53]:
df2.fillna(0)     # replace NaN with 0

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0
2018-12-04,0.0,0.0,0.0,0.0,0.0,1030.0
2018-12-06,1050.0,2050.0,3050.0,4050.0,1050.0,1050.0
2018-12-07,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0


In [54]:
# Fill each NaN with the preceding row's value (forward fill).
# fillna(method=...) is deprecated in recent pandas; ffill() is the direct equivalent.
df2.ffill()

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0
2018-12-04,1020.0,2020.0,3020.0,4020.0,1020.0,1030.0
2018-12-06,1050.0,2050.0,3050.0,4050.0,1050.0,1050.0
2018-12-07,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0


In [55]:
# Fill each NaN with the following row's value (backward fill).
# fillna(method=...) is deprecated in recent pandas; bfill() is the direct equivalent.
df2.bfill()

Unnamed: 0,A전자,B화학,C금융,D엔터,E텔레콤,F소프트
2018-12-01,1000.0,2000.0,3000.0,4000.0,1000.0,1000.0
2018-12-02,1010.0,2010.0,3010.0,4010.0,1010.0,1010.0
2018-12-03,1020.0,2020.0,3020.0,4020.0,1020.0,1020.0
2018-12-04,1050.0,2050.0,3050.0,4050.0,1050.0,1030.0
2018-12-06,1050.0,2050.0,3050.0,4050.0,1050.0,1050.0
2018-12-07,1060.0,2060.0,3060.0,4060.0,1060.0,1060.0
