# pandas
- R의 자료구조인 data.frame, matrix(2차원 배열)에 대응하는 기능을 파이썬에서 제공
- 데이터 분석에서 데이터의 핸들링을 할 때 필수 라이브러리
- 자료형 : series와 dataframe
- 표(table) 형태 : 행(관측치,레코드),열(속성,피쳐)
- 시리즈 : 인덱스 + 값
- 1차원 : series
- 2차원 : dataframe
- 3차원 : panel (pandas 0.25에서 제거됨 — 대신 MultiIndex를 가진 DataFrame 또는 xarray 사용)

## series
- 1차원 배열
- 구성요소 : index, value
- 생성 : pd.Series(data, index)

In [1]:
# 모듈로딩
import numpy as np
import pandas as pd

In [3]:
# Build a Series from a plain list; no index is given, so pandas labels the
# elements with default integer positions.
price = pd.Series(data=[4000, 5000, 3000, 2000])
price

0    4000
1    5000
2    3000
3    2000
dtype: int64

In [4]:
# No index was passed when `price` was built, so this is the default RangeIndex.
price.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# .values exposes the underlying data as a NumPy array.
price.values

array([4000, 5000, 3000, 2000], dtype=int64)

In [8]:
# Series with explicit string labels instead of the default integer index.
fruit = pd.Series(
    data=[4000, 5000, 3000, 2000],
    index=['aaa', 'bbb', 'ccc', 'ddd'],
)
fruit

aaa    4000
bbb    5000
ccc    3000
ddd    2000
dtype: int64

In [10]:
# A single label inside [] returns the scalar stored under that label.
fruit['aaa']

4000

In [12]:
# Slicing by label includes BOTH endpoints: 'ccc' is part of the result.
fruit['aaa':'ccc']

aaa    4000
bbb    5000
ccc    3000
dtype: int64

In [13]:
# Creating a Series from a dict: this mapping is the source data.
city_dict = dict(seoul=82, busan=90, incheon=84, daejeon=42)
city_dict

{'seoul': 82, 'busan': 90, 'incheon': 84, 'daejeon': 42}

In [14]:
# Passing a dict to pd.Series uses the keys as the index and the values as data.
city = pd.Series(data=city_dict)
city

seoul      82
busan      90
incheon    84
daejeon    42
dtype: int64

In [15]:
# The dict keys became the index labels, in insertion order.
city.index

Index(['seoul', 'busan', 'incheon', 'daejeon'], dtype='object')

In [16]:
# The dict values became the Series data.
city.values

array([82, 90, 84, 42], dtype=int64)

In [17]:
# Look up one element by its index label.
city['incheon']

84

In [18]:
# Integer Series (including a negative value) with the default integer index.
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [24]:
# Same kind of integer data, but labelled with single-letter strings in a
# deliberately non-alphabetical order.
obj2 = pd.Series(
    data=[4, 7, -5, 3],
    index=list('dbac'),
)
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [20]:
# Label-based lookup: returns the value stored under 'c'.
obj2['c']

3

In [25]:
# A list of labels selects several elements, returned in the requested order.
obj2[['c','a','d']]

c    3
a   -5
d    4
dtype: int64

In [26]:
# Element-wise comparison yields a boolean Series with the same index.
obj2>0

d     True
b     True
a    False
c     True
dtype: bool

In [27]:
# Boolean-mask indexing keeps only the elements where the mask is True.
obj2[obj2>0]

d    4
b    7
c    3
dtype: int64

In [29]:
# `in` / `not in` on a Series test membership among the index LABELS
# ('c' is a label of obj2 here, not one of its values).
'c' in obj2

True

## DataFrame
- 행과 열로 구성
- pd.DataFrame(data,index,columns)
- 데이터 분석에서 가장 기본이 되는 자료구조
- 여러개의 Series로 구성

In [30]:
# Column-oriented dict: each key is a column name mapped to that column's
# values. NOTE: this rebinds `city_dict`, which earlier held the city mapping.
city_dict = dict(name=['a', 'b', 'c'], age=[13, 21, 19])
city_dict

{'name': ['a', 'b', 'c'], 'age': [13, 21, 19]}

In [31]:
# Build a DataFrame from the column-oriented dict (keys become column labels).
# NOTE: this rebinds `city`, which earlier referred to a Series.
city = pd.DataFrame(data=city_dict)
city

Unnamed: 0,name,age
0,a,13
1,b,21
2,c,19


In [32]:
# Default RangeIndex, because no index was supplied to the DataFrame constructor.
city.index

RangeIndex(start=0, stop=3, step=1)

In [33]:
# Column labels come from the keys of the source dict.
city.columns

Index(['name', 'age'], dtype='object')

In [36]:
# Intentionally fails: DataFrame [] does not accept NumPy-style (row, column)
# tuples, so this raises InvalidIndexError. Use city.iloc[0, :] (by position)
# or city.loc[0, :] (by label) for row/column indexing.
city[0,:]

InvalidIndexError: (0, slice(None, None, None))

In [37]:
# Population table for four cities. The row and column labels are declared
# first, followed by the column-oriented data they apply to.
index = ["서울", "부산", "인천", "대구"]
columns = ["지역", "2015", "2010", "2005", "2000", "2010-2015 증가율"]
data = {
    "2015": [9904312, 3448737, 2890451, 2466052],
    "2010": [9631482, 3393191, 2632035, 2431774],
    "2005": [9762546, 3512547, 2517680, 2456016],
    "2000": [9853972, 3655437, 2466338, 2473990],
    "지역": ["수도권", "경상권", "수도권", "경상권"],
    "2010-2015 증가율": [0.0283, 0.0163, 0.0982, 0.0141],
}

# `columns` fixes the column order of the frame, independent of dict key order.
df = pd.DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


In [38]:
# .values returns a 2-D NumPy array; with mixed column types the array
# dtype is object.
df.values

array([['수도권', 9904312, 9631482, 9762546, 9853972, 0.0283],
       ['경상권', 3448737, 3393191, 3512547, 3655437, 0.0163],
       ['수도권', 2890451, 2632035, 2517680, 2466338, 0.0982],
       ['경상권', 2466052, 2431774, 2456016, 2473990, 0.0141]], dtype=object)

In [39]:
# Column labels, in the order given by the `columns` argument.
df.columns

Index(['지역', '2015', '2010', '2005', '2000', '2010-2015 증가율'], dtype='object')

In [40]:
# Row labels supplied through the `index` argument.
df.index

Index(['서울', '부산', '인천', '대구'], dtype='object')

In [41]:
# .T is the transpose: row labels and column labels are swapped.
df.T

Unnamed: 0,서울,부산,인천,대구
지역,수도권,경상권,수도권,경상권
2015,9904312,3448737,2890451,2466052
2010,9631482,3393191,2632035,2431774
2005,9762546,3512547,2517680,2456016
2000,9853972,3655437,2466338,2473990
2010-2015 증가율,0.0283,0.0163,0.0982,0.0141


In [42]:
# Re-display df: it keeps its original orientation, so the earlier .T call
# did not modify it in place.
df

Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2431774,2456016,2473990,0.0141


In [43]:
# Add a column: 2005 -> 2010 population growth as a percentage, rounded to
# two decimals. NOTE: the existing '2010-2015 증가율' column holds fractions
# (e.g. 0.0283), so the two growth columns are on different scales.
df['2005-2010 증가율'] = (
    df['2010'].sub(df['2005']).div(df['2005']).mul(100).round(2)
)
df

Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율,2005-2010 증가율
서울,수도권,9904312,9631482,9762546,9853972,0.0283,-1.34
부산,경상권,3448737,3393191,3512547,3655437,0.0163,-3.4
인천,수도권,2890451,2632035,2517680,2466338,0.0982,4.54
대구,경상권,2466052,2431774,2456016,2473990,0.0141,-0.99


In [44]:
# Delete a column. `drop` returns a new frame, and errors='ignore' keeps the
# cell safe to re-run; the in-place `del df['2010-2015 증가율']` raises
# KeyError on a second run because the column is already gone.
df = df.drop(columns=['2010-2015 증가율'], errors='ignore')
df

Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
서울,수도권,9904312,9631482,9762546,9853972,-1.34
부산,경상권,3448737,3393191,3512547,3655437,-3.4
인천,수도권,2890451,2632035,2517680,2466338,4.54
대구,경상권,2466052,2431774,2456016,2473990,-0.99


In [45]:
# Column indexing: a single column label returns a Series.
df['지역']


서울    수도권
부산    경상권
인천    수도권
대구    경상권
Name: 지역, dtype: object

In [46]:
# A list of column labels returns a DataFrame with those columns, in the
# order listed.
df[['2010','2015']]

Unnamed: 0,2010,2015
서울,9631482,9904312
부산,3393191,3448737
인천,2632035,2890451
대구,2431774,2466052


In [47]:
# A one-element list still returns a DataFrame (one column), not a Series.
df[['지역']]

Unnamed: 0,지역
서울,수도권
부산,경상권
인천,수도권
대구,경상권


In [48]:
# Confirms that list-of-labels indexing yields a DataFrame.
type(df[['지역']])

pandas.core.frame.DataFrame

In [49]:
# Confirms that single-label indexing yields a Series.
type(df['지역'])

pandas.core.series.Series

In [51]:
# 3x4 frame holding 0..11. Neither index nor columns are given, so both axes
# get default integer labels.
df2 = pd.DataFrame(data=np.arange(12).reshape((3, 4)))
df2

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [52]:
# The columns have integer labels, so df2[2] selects the COLUMN labelled 2,
# not the row at position 2.
df2[2]

0     2
1     6
2    10
Name: 2, dtype: int32

In [54]:
# A list of integer column labels selects those columns as a DataFrame.
df2[[1,2]]

Unnamed: 0,1,2
0,1,2
1,5,6
2,9,10


In [55]:
# Row indexing: with [], rows are selected only through slice syntax [:].
# The full frame is shown here for reference before slicing it.
df

Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
서울,수도권,9904312,9631482,9762546,9853972,-1.34
부산,경상권,3448737,3393191,3512547,3655437,-3.4
인천,수도권,2890451,2632035,2517680,2466338,4.54
대구,경상권,2466052,2431774,2456016,2473990,-0.99


In [57]:
# Positional slice: the first row only (the end position is exclusive).
df[:1]

Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
서울,수도권,9904312,9631482,9762546,9853972,-1.34


In [58]:
# Positional slice: the row at position 1 only.
df[1:2]

Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
부산,경상권,3448737,3393191,3512547,3655437,-3.4


In [59]:
# Positional slice: the rows at positions 1 and 2.
df[1:3]

Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
부산,경상권,3448737,3393191,3512547,3655437,-3.4
인천,수도권,2890451,2632035,2517680,2466338,4.54


In [63]:
# Label slice on the row index: both endpoints ('서울' and '부산') are included.
df['서울':'부산']

Unnamed: 0,지역,2015,2010,2005,2000,2005-2010 증가율
서울,수도권,9904312,9631482,9762546,9853972,-1.34
부산,경상권,3448737,3393191,3512547,3655437,-3.4


In [66]:
# Single-cell lookup by (row label, column label). One .loc call replaces the
# chained df['2015']['서울'], which performs two separate lookups and is not
# safe to use as an assignment target.
df.loc['서울', '2015']

9904312