# 資料的索引和選擇
複習存取、設定、以及修改NumPy陣列值的方法和工具
* 索引 (ex : arr[2,1])
* 切片 (ex : arr[:,1:5])
* 遮罩 (ex : arr[arr > 0])
* fancy索引 (ex : arr[0,[1,5]])
* 組合 (ex : arr[:,[1,5]])

In [4]:
# Selecting data from a Series: treat the Series like a dictionary
# that maps index labels to values.
import pandas as pd
data = pd.Series({'a': 0.25, 'b': 0.5, 'c': 0.75, 'd': 1.0})
data['b']

0.5

In [5]:
# Dict-style membership test: `in` checks the index labels (the "keys").
'a' in data


True

In [6]:
# keys() returns the index of the Series.
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [7]:
# items() yields (label, value) pairs; wrap in list() to display them all.
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [8]:
# Assigning to a label that does not exist yet appends a new entry,
# just like adding a new key to a dictionary.
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [10]:
# Treat the Series like a one-dimensional array.
# Slicing: a slice by explicit labels ('a':'c') includes the final label,
# while a slice by implicit integer position (0:2) excludes the final position.
data['a':'c'], data[0:2]

(a    0.25
 b    0.50
 c    0.75
 dtype: float64,
 a    0.25
 b    0.50
 dtype: float64)

In [11]:
# Masking: keep only the entries whose value is greater than 0.3 and less than 0.8.
data[(data>0.3)&(data<0.8)]

b    0.50
c    0.75
dtype: float64

In [19]:
# Fancy indexing: pass a list of labels to select several entries at once.
data[['a','e']]

a    0.25
e    1.25
dtype: float64

## 重要!! : Indexer : loc、iloc（ix 已在 pandas 1.0 移除，請改用 loc / iloc）


In [21]:
# With an explicit integer index, plain [] is ambiguous:
#   data[1]   -> looks up the explicit label 1 (returns 'a')
#   data[1:3] -> slices by implicit position (returns the rows labelled 3 and 5)
data = pd.Series(['a','b','c'],index=[1,3,5])
data,data[1],data[1:3]

(1    a
 3    b
 5    c
 dtype: object,
 'a',
 3    b
 5    c
 dtype: object)

In [32]:
# The loc attribute makes indexing and slicing always refer to the explicit index labels.
data.loc[1], data.loc[0:5] # goes by index label: labels from 0 through 5 (end label included)

('a',
 1    a
 3    b
 5    c
 dtype: object)

In [31]:
# The iloc attribute makes indexing and slicing always use the implicit
# Python-style positional index.
data.iloc[1], data.iloc[0:3] # ignores index labels: positions 0 up to (not including) 3

('b',
 1    a
 3    b
 5    c
 dtype: object)

### 在 DataFrame 中選取資料


In [39]:
# Build two Series that share the same state-name index, then combine
# them column-wise into one DataFrame.
area = pd.Series(
    [38332521, 16549845, 6549846, 1564987, 1234567],
    index=['California', 'Texas', 'New York', 'Florida', 'Illinois'],
)
pop = pd.Series(
    [5000, 6000, 7000, 8000, 9000],
    index=['California', 'Texas', 'New York', 'Florida', 'Illinois'],
)

# Each dict key becomes a column name; rows are aligned on the shared index.
data = pd.DataFrame({'area': area, 'pop': pop})
data

Unnamed: 0,area,pop
California,38332521,5000
Texas,16549845,6000
New York,6549846,7000
Florida,1564987,8000
Illinois,1234567,9000


In [40]:
# A column can be fetched dict-style by name or as an attribute;
# both forms return the same 'area' Series here.
data['area'], data.area

(California    38332521
 Texas         16549845
 New York       6549846
 Florida        1564987
 Illinois       1234567
 Name: area, dtype: int64,
 California    38332521
 Texas         16549845
 New York       6549846
 Florida        1564987
 Illinois       1234567
 Name: area, dtype: int64)

In [42]:
# Naming caveat: attribute access only reaches a column when the column name
# does not collide with an existing DataFrame attribute. `pop` is a DataFrame
# method, so data.pop is that method rather than the 'pop' column.
# Compare by value / type instead of with `is`: whether two column lookups
# return the very same object depends on pandas' internal caching and is not
# guaranteed (e.g. under Copy-on-Write each lookup yields a new object).
data.area.equals(data['area']), isinstance(data.pop, pd.Series)

(True, False)