In [34]:
import pandas as pd
import numpy as np
import seaborn as sns

# Keep rendered frames short: pandas truncates the display beyond 6 rows.
pd.set_option('display.max_rows', 6)

# Iris sample dataset, obtained through seaborn's dataset loader.
df = sns.load_dataset('iris')

In [35]:
# Show the iris frame; the max_rows option set in the first cell truncates the display.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
...,...,...,...,...,...
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


## 데이터 슬라이싱

* ### loc : row - index값 기준, column - column명 기준

주의 : 위치(순번)가 아니라 label을 input으로 받는다. (이 데이터는 row label이 정수, column label이 string이며, slice의 끝 label도 결과에 포함된다.)

In [9]:
# Every row (`:`) of one column; passing the label inside a list yields a DataFrame.
df.loc[:, ['species']]

Unnamed: 0,species
0,setosa
1,setosa
2,setosa
...,...
147,virginica
148,virginica
149,virginica


In [11]:
# Label-based row slice: .loc includes the end label, so rows 0 through 3 come back.
df.loc[0:3, ['species']]

Unnamed: 0,species
0,setosa
1,setosa
2,setosa
3,setosa


* ### iloc : row & column - 0부터 시작하는 정수 위치 기준 (slice의 끝 값은 제외)

In [14]:
# Positional selection: rows 3-7 (end excluded) and columns 0, 2, 3, counted from 0.
df.iloc[3:8, [0, 2, 3]]

Unnamed: 0,sepal_length,petal_length,petal_width
3,4.6,1.5,0.2
4,5.0,1.4,0.2
5,5.4,1.7,0.4
6,4.6,1.4,0.3
7,5.0,1.5,0.2


컬럼이 매우 많을 경우, np.r_을 활용하면 유용

In [43]:
# np.r_ joins the given slice ranges into a single integer array.
np.r_[0:2, 5:8]

array([0, 1, 5, 6, 7])

In [45]:
# Use the concatenated positions to pick columns 0-1 and 3-4 for every row.
df.iloc[:, np.r_[0:2, 3:5]]

Unnamed: 0,sepal_length,sepal_width,petal_width,species
0,5.1,3.5,0.2,setosa
1,4.9,3.0,0.2,setosa
2,4.7,3.2,0.2,setosa
...,...,...,...,...
147,6.5,3.0,2.0,virginica
148,6.2,3.4,2.3,virginica
149,5.9,3.0,1.8,virginica


* ### at : single value (label 기준)

single value를 조회할 때는 .loc보다 .at이 훨씬 빠르다.

In [36]:
# Scalar lookup by row label and column label.
df.at[0, 'species']

'setosa'

* ### iat : single value (정수 위치 기준)

In [38]:
# Scalar lookup by integer row and column position (column 3 is petal_width).
df.iat[0, 3]

0.2

* ### xs : cross-section, 주로 MultiIndex 슬라이싱에 유용

#### **MultiIndex 슬라이싱**

In [20]:
# NOTE: `df` is rebound here from the iris frame to the mpg frame.
df = sns.load_dataset('mpg')
# Mean of every numeric column per (model_year, origin) pair; the grouping keys
# become a two-level MultiIndex on the result.
# numeric_only=True is needed because the mpg frame also has a non-numeric column
# (the car name): pandas >= 2.0 raises TypeError on it instead of silently
# dropping it, which is what older versions did to produce the table below.
df_indexed = df.groupby(['model_year', 'origin']).mean(numeric_only=True)

In [21]:
# Inspect the grouped means; rows are indexed by (model_year, origin).
df_indexed

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration
model_year,origin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
70,europe,25.200000,4.000000,107.800000,86.200000,2309.200000,16.500000
70,japan,25.500000,4.000000,105.000000,91.500000,2251.000000,14.750000
70,usa,15.272727,7.636364,336.909091,166.954545,3716.500000,11.977273
...,...,...,...,...,...,...,...
82,europe,40.000000,4.000000,101.000000,63.000000,2055.000000,19.950000
82,japan,34.888889,4.000000,103.777778,74.000000,2132.777778,15.833333
82,usa,29.450000,4.300000,142.950000,86.947368,2637.750000,16.670000


하나의 index level에서 특정 value 기준 슬라이싱 ( level에 맞는 key 입력 )

In [26]:
# Rows whose model_year level equals 70; the matched level is dropped from the result.
# `level` may be omitted here: without it, xs matches the key against the first index level.
df_indexed.xs(key=70, level='model_year')

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
europe,25.2,4.0,107.8,86.2,2309.2,16.5
japan,25.5,4.0,105.0,91.5,2251.0,14.75
usa,15.272727,7.636364,336.909091,166.954545,3716.5,11.977273


tuple을 활용하면 여러 index level의 값을 한 번에 지정 가능 ( level 순서대로 입력 )

In [39]:
# A tuple key supplies one value per index level, in order: model_year 70, origin 'usa'.
# With every level fixed, the single matching row comes back as a Series.
df_indexed.xs(key=(70, 'usa'))

mpg               15.272727
cylinders          7.636364
displacement     336.909091
horsepower       166.954545
weight          3716.500000
acceleration      11.977273
Name: (70, usa), dtype: float64